[PATCH] libclc: Revert generic vload/vstore to pure CLC and move asm to R600

The assembly optimizations were making unsafe assumptions about which address
spaces had which identifiers.

Also, fix vload/vstore with 64-bit pointers. This was broken previously on
Radeon SI.

Signed-off-by: Aaron Watry <awatry@gmail.com>

---
generic/lib/SOURCES | 4 --
generic/lib/shared/vload.cl | 54 +------------------
generic/lib/shared/vload_if.ll | 60 ---------------------
generic/lib/shared/vload_impl.ll | 49 -----------------
generic/lib/shared/vstore.cl | 58 +-------------------
generic/lib/shared/vstore_if.ll | 59 ---------------------
generic/lib/shared/vstore_impl.ll | 50 ------------------
r600/lib/SOURCES | 6 +++
r600/lib/shared/vload.cl | 99 ++++++++++++++++++++++++++++++++++
r600/lib/shared/vload_if.ll | 60 +++++++++++++++++++++
r600/lib/shared/vload_impl.ll | 44 ++++++++++++++++
r600/lib/shared/vstore.cl | 108 ++++++++++++++++++++++++++++++++++++++
r600/lib/shared/vstore_if.ll | 59 +++++++++++++++++++++
r600/lib/shared/vstore_impl.ll | 45 ++++++++++++++++
14 files changed, 425 insertions(+), 330 deletions(-)
delete mode 100644 generic/lib/shared/vload_if.ll
delete mode 100644 generic/lib/shared/vload_impl.ll
delete mode 100644 generic/lib/shared/vstore_if.ll
delete mode 100644 generic/lib/shared/vstore_impl.ll
create mode 100644 r600/lib/shared/vload.cl
create mode 100644 r600/lib/shared/vload_if.ll
create mode 100644 r600/lib/shared/vload_impl.ll
create mode 100644 r600/lib/shared/vstore.cl
create mode 100644 r600/lib/shared/vstore_if.ll
create mode 100644 r600/lib/shared/vstore_impl.ll

diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 8cda14a..50cc9bd 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -24,10 +24,6 @@ shared/clamp.cl
shared/max.cl
shared/min.cl
shared/vload.cl
-shared/vload_if.ll
-shared/vload_impl.ll
shared/vstore.cl
-shared/vstore_if.ll
-shared/vstore_impl.ll
workitem/get_global_id.cl
workitem/get_global_size.cl
diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
index f6ebd37..e8439e7 100644
--- a/generic/lib/shared/vload.cl
+++ b/generic/lib/shared/vload.cl
@@ -27,12 +27,13 @@
     VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
     VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \

-//int/uint are special... see below
#define VLOAD_TYPES() \
     VLOAD_ADDR_SPACES(char) \
     VLOAD_ADDR_SPACES(uchar) \
     VLOAD_ADDR_SPACES(short) \
     VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(int) \
+ VLOAD_ADDR_SPACES(uint) \
     VLOAD_ADDR_SPACES(long) \
     VLOAD_ADDR_SPACES(ulong) \
     VLOAD_ADDR_SPACES(float) \
@@ -43,54 +44,3 @@ VLOAD_TYPES()
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
     VLOAD_ADDR_SPACES(double)
#endif
-
-VLOAD_VECTORIZE(int, __private)
-VLOAD_VECTORIZE(int, __local)
-VLOAD_VECTORIZE(int, __constant)
-VLOAD_VECTORIZE(uint, __private)
-VLOAD_VECTORIZE(uint, __local)
-VLOAD_VECTORIZE(uint, __constant)
-
-_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
- return (int2)(x[offset] , x[offset+1]);
-}
-_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
- return (int3)(vload2(offset, x), x[offset+2]);
-}
-_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
- return (uint2)(x[offset] , x[offset+1]);
-}
-_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
- return (uint3)(vload2(offset, x), x[offset+2]);
-}
-
-/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
- * they aren't actually overridden here
- */
-_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
-_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
-_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
-
-_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
- return __clc_vload4_int__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
- return __clc_vload8_int__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
- return __clc_vload16_int__global(offset, x);
-}
-
-_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
-_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
-_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
-
-_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
- return __clc_vload4_uint__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
- return __clc_vload8_uint__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
- return __clc_vload16_uint__global(offset, x);
-}
\ No newline at end of file
diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
deleted file mode 100644
index 2634d37..0000000
--- a/generic/lib/shared/vload_if.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-;Start int global vload
-
-declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
-declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
-declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
-declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
-declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
-
-define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
- ret <2 x i32> %call
-}
-
-define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
- ret <3 x i32> %call
-}
-
-define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
- ret <4 x i32> %call
-}
-
-define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
- ret <8 x i32> %call
-}
-
-define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
- ret <16 x i32> %call
-}
-
-
-;Start uint global vload
-
-define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
- ret <2 x i32> %call
-}
-
-define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
- ret <3 x i32> %call
-}
-
-define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
- ret <4 x i32> %call
-}
-
-define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
- ret <8 x i32> %call
-}
-
-define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
- %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
- ret <16 x i32> %call
-}
diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
deleted file mode 100644
index ae719e0..0000000
--- a/generic/lib/shared/vload_impl.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
-
-define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
- %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret <2 x i32> %4
-}
-
-define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
- %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret <3 x i32> %4
-}
-
-define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
- %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret <4 x i32> %4
-}
-
-define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
- %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret <8 x i32> %4
-}
-
-define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
- %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret <16 x i32> %4
-}
-
-!1 = metadata !{metadata !"char", metadata !5}
-!2 = metadata !{metadata !"short", metadata !5}
-!3 = metadata !{metadata !"int", metadata !5}
-!4 = metadata !{metadata !"long", metadata !5}
-!5 = metadata !{metadata !"omnipotent char", metadata !6}
-!6 = metadata !{metadata !"Simple C/C++ TBAA"}
-
diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
index 5b84f47..38f4d37 100644
--- a/generic/lib/shared/vstore.cl
+++ b/generic/lib/shared/vstore.cl
@@ -34,12 +34,13 @@
     VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
     VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \

-//int/uint are special... see below
#define VSTORE_TYPES() \
     VSTORE_ADDR_SPACES(char) \
     VSTORE_ADDR_SPACES(uchar) \
     VSTORE_ADDR_SPACES(short) \
     VSTORE_ADDR_SPACES(ushort) \
+ VSTORE_ADDR_SPACES(int) \
+ VSTORE_ADDR_SPACES(uint) \
     VSTORE_ADDR_SPACES(long) \
     VSTORE_ADDR_SPACES(ulong) \
     VSTORE_ADDR_SPACES(float) \
@@ -50,58 +51,3 @@ VSTORE_TYPES()
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
     VSTORE_ADDR_SPACES(double)
#endif
-
-VSTORE_VECTORIZE(int, __private)
-VSTORE_VECTORIZE(int, __local)
-VSTORE_VECTORIZE(uint, __private)
-VSTORE_VECTORIZE(uint, __local)
-
-_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *mem) {
- mem[offset] = vec.s0;
- mem[offset+1] = vec.s1;
-}
-_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
- mem[offset] = vec.s0;
- mem[offset+1] = vec.s1;
- mem[offset+2] = vec.s2;
-}
-_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *mem) {
- mem[offset] = vec.s0;
- mem[offset+1] = vec.s1;
-}
-_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
- mem[offset] = vec.s0;
- mem[offset+1] = vec.s1;
- mem[offset+2] = vec.s2;
-}
-
-/*Note: R600 probably doesn't support store <2 x ?> and <3 x ?>... so
- * they aren't actually overridden here... lowest-common-denominator
- */
-_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *);
-_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *);
-_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *);
-
-_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) {
- __clc_vstore4_int__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) {
- __clc_vstore8_int__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) {
- __clc_vstore16_int__global(vec, offset, x);
-}
-
-_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *);
-_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *);
-_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *);
-
-_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) {
- __clc_vstore4_uint__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) {
- __clc_vstore8_uint__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) {
- __clc_vstore16_uint__global(vec, offset, x);
-}
diff --git a/generic/lib/shared/vstore_if.ll b/generic/lib/shared/vstore_if.ll
deleted file mode 100644
index 30eb552..0000000
--- a/generic/lib/shared/vstore_if.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-;Start int global vstore
-
-declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
-
-define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-
-;Start uint global vstore
-define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
-
-define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
- call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
- ret void
-}
\ No newline at end of file
diff --git a/generic/lib/shared/vstore_impl.ll b/generic/lib/shared/vstore_impl.ll
deleted file mode 100644
index 3baab5e..0000000
--- a/generic/lib/shared/vstore_impl.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; This provides optimized implementations of vstore4/8/16 for 32-bit int/uint
-
-define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
- store <2 x i32> %vec, <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret void
-}
-
-define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
- store <3 x i32> %vec, <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret void
-}
-
-define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
- store <4 x i32> %vec, <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret void
-}
-
-define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
- store <8 x i32> %vec, <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret void
-}
-
-define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
- %1 = ptrtoint i32 addrspace(1)* %addr to i32
- %2 = add i32 %1, %offset
- %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
- store <16 x i32> %vec, <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
- ret void
-}
-
-
-!1 = metadata !{metadata !"char", metadata !5}
-!2 = metadata !{metadata !"short", metadata !5}
-!3 = metadata !{metadata !"int", metadata !5}
-!4 = metadata !{metadata !"long", metadata !5}
-!5 = metadata !{metadata !"omnipotent char", metadata !6}
-!6 = metadata !{metadata !"Simple C/C++ TBAA"}
-
diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
index af8c8c8..0637632 100644
--- a/r600/lib/SOURCES
+++ b/r600/lib/SOURCES
@@ -2,3 +2,9 @@ workitem/get_group_id.ll
workitem/get_local_size.ll
workitem/get_local_id.ll
workitem/get_global_size.ll
+shared/vload.cl
+shared/vload_if.ll
+shared/vload_impl.ll
+shared/vstore.cl
+shared/vstore_if.ll
+shared/vstore_impl.ll
diff --git a/r600/lib/shared/vload.cl b/r600/lib/shared/vload.cl
new file mode 100644
index 0000000..dd28347
--- /dev/null
+++ b/r600/lib/shared/vload.cl
@@ -0,0 +1,99 @@
+#include <clc/clc.h>
+
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##2)(x[offset] , x[offset+1]); \
+ } \
+\
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##3)(x[offset] , x[offset+1], x[offset+2]); \
+ } \
+\
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##4)(x[offset], x[offset+1], x[offset+2], x[offset+3]); \
+ } \
+\
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##8)(vload4(offset, x), vload4(offset+4, x)); \
+ } \
+\
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \
+ } \
+
+#define VLOAD_ADDR_SPACES(SCALAR_GENTYPE) \
+ VLOAD_VECTORIZE(SCALAR_GENTYPE, __private) \
+ VLOAD_VECTORIZE(SCALAR_GENTYPE, __local) \
+ VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
+ VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
+
+//int/uint are special... see below
+#define VLOAD_TYPES() \
+ VLOAD_ADDR_SPACES(char) \
+ VLOAD_ADDR_SPACES(uchar) \
+ VLOAD_ADDR_SPACES(short) \
+ VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(long) \
+ VLOAD_ADDR_SPACES(ulong) \
+ VLOAD_ADDR_SPACES(float) \
+
+VLOAD_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+ VLOAD_ADDR_SPACES(double)
+#endif
+
+VLOAD_VECTORIZE(int, __private)
+VLOAD_VECTORIZE(int, __local)
+VLOAD_VECTORIZE(int, __constant)
+VLOAD_VECTORIZE(uint, __private)
+VLOAD_VECTORIZE(uint, __local)
+VLOAD_VECTORIZE(uint, __constant)
+
+_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
+ return (int3)(vload2(offset, x), x[offset+2]);
+}
+_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
+ return (uint3)(vload2(offset, x), x[offset+2]);
+}
+
+/*Note: R600 doesn't support load <3 x ?>... so
+ * those functions aren't actually overridden here
+ */
+_CLC_DECL int2 __clc_vload2_int__global(size_t offset, const __global int *);
+_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
+_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
+_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
+
+_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
+ return __clc_vload2_int__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
+ return __clc_vload4_int__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
+ return __clc_vload8_int__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
+ return __clc_vload16_int__global(offset, x);
+}
+
+_CLC_DECL uint2 __clc_vload2_uint__global(size_t offset, const __global uint *);
+_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
+_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
+_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
+
+_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
+ return __clc_vload2_uint__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
+ return __clc_vload4_uint__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
+ return __clc_vload8_uint__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
+ return __clc_vload16_uint__global(offset, x);
+}
+
diff --git a/r600/lib/shared/vload_if.ll b/r600/lib/shared/vload_if.ll
new file mode 100644
index 0000000..bda592b
--- /dev/null
+++ b/r600/lib/shared/vload_if.ll

What is the purpose of the vload_if.ll file? All it does is wrap calls
to functions in vload_impl.ll, and I'm not sure why.

Trying again... first response got caught in moderator queue because
it was > 40Kb....

Always with the good questions... :)

I used the add_sat/sub_sat function implementations as a source
pattern, which was where I originally got the interface/implementation
separation from. Also, being a Java dev by day, it felt natural to
continue that existing pattern. My motivation was mostly just
maintaining the status quo, so you won't find much resistance to
change here on my part.

From a functional standpoint, the only thing that it currently gives
us is a bit more separation between the high-level and low-level
implementation details. In the case of loads/stores, we define
separate overloads, e.g.:
int4 vload4(size_t offset, const global int *in);
uint4 vload4(size_t offset, const global uint *in);

which get redirected to:
define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 addrspace(1)* nocapture %addr)
define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr)

which both get wrapped to:
define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr)

I guess the only time that this would actually make sense would be if
you had multiple chips/architectures which used the same assembly
prototypes and could swap out the implementation details based on the
chip (treating *_if.ll as headers essentially)... but we'd probably
just do that in the target/architecture-specific directories of libclc
anyway.

If you want, I'll strip out the _if.ll files and re-spin the patch to
make things simpler. Given that the functions themselves were marked
'alwaysinline', I don't think the generated code should be any
different.
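
To be concrete, dropping the _if.ll files would just mean declaring the
_impl symbols from the .cl side and calling them directly. A rough sketch,
reusing the current _impl names (and leaving aside the size_t-vs-i32 offset
mismatch that this patch fixes for 64-bit pointers):

_CLC_DECL int4 __clc_vload4_impl_i32__global(size_t offset, const __global int *);

_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
  return __clc_vload4_impl_i32__global(offset, x);
}

The uint overload could go through the same impl with a pointer cast plus
as_uint4(), since only the bit pattern matters for the load itself.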

What I think would make sense is to move v{load,store}_impl.ll back
into the generic directory and rename the functions based on the LLVM
address space rather than the OpenCL address space (e.g.
__clc_vload4_impl_i32__addr1 instead of __clc_vload4_impl_i32__global),
and then have the r600 version of v{load,store}.cl bypass vload_if.ll
entirely and call __clc_vload4_impl_i32__addr1 and the related functions
directly.

This way, the optimized version could be reused by multiple targets,
even if addrspace 1 is global memory on one and constant memory on
another.
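
Concretely, the r600 v{load,store}.cl would then look something like this
(just a sketch; the __addr1 name is hypothetical and assumes addrspace(1)
maps to global on r600):

_CLC_DECL int4 __clc_vload4_impl_i32__addr1(size_t offset, const __global int *);

_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
  return __clc_vload4_impl_i32__addr1(offset, x);
}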

What do you think?

-Tom

I like the idea, just one question/stream of thought...

If we do this, we'll need 4 separate address spaces to be declared,
only some of which may be legal to vstoreN() to.

So, let's say that R600 uses (note that these are probably wrong,
haven't had to deal with the others yet... but the example should
hold):
addrspace(1) = global
addrspace(2) = local
addrspace(3) = private
addrspace(4) = constant

And then nvptx uses:
addrspace(1) = global
addrspace(2) = constant
addrspace(3) = local
addrspace(4) = private

If we write generic implementations for:
__clc_vstore4_impl_i32__addr1
__clc_vstore4_impl_i32__addr2
__clc_vstore4_impl_i32__addr3
__clc_vstore4_impl_i32__addr4

Calling __clc_vstore4_impl_i32__addr2 is legal for R600, but illegal
on nvptx. As long as LLVM will recognize that the code is never being
called and will ignore the dead code when compiling from bytecode to
assembly/machine instructions, then we're good, and I'll be happy
going down this route.

I guess it's easy enough to test when I've got some spare time. I
really like this idea, so hopefully it works out.

--Aaron

I also just noticed that we are calculating addresses with non-zero
offsets incorrectly. Instead of just address[offset], we should be
using address[vector_size * offset] for all load/store address
calculations. I'll split the fix for that out into a separate patch
in v2 of this changeset.

We are currently doing the following for int4 vloads (paraphrasing):
int4 vload4(size_t offset, const global int *in) {
  return (int4)(in[offset], in[offset+1], in[offset+2], in[offset+3]);
}

We should be doing:
int4 vload4(size_t offset, const global int *in) {
  return (int4)(in[4*offset], in[4*offset+1], in[4*offset+2], in[4*offset+3]);
}

And the same follows for all other data types and vector sizes, as well
as for the vstore variants.

--Aaron