[PATCH 1/1] Implement vload_half{,n}

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
---
Fixes newly posted vload_half piglits on Turks, Carrizo, and Iceland.

generic/include/clc/shared/vload.h | 45 ++++++++++++++++-------------
generic/lib/SOURCES | 1 +
generic/lib/shared/vload.cl | 49 ++++++++++++++++++++++++++++++++
generic/lib/shared/vload_half.inc | 13 +++++++++
generic/lib/shared/vload_half_helpers.ll | 23 +++++++++++++++
5 files changed, 111 insertions(+), 20 deletions(-)
create mode 100644 generic/lib/shared/vload_half.inc
create mode 100644 generic/lib/shared/vload_half_helpers.ll

diff --git a/generic/include/clc/shared/vload.h b/generic/include/clc/shared/vload.h
index 93d0750..1da4e99 100644
--- a/generic/include/clc/shared/vload.h
+++ b/generic/include/clc/shared/vload.h
@@ -1,18 +1,21 @@
-#define _CLC_VLOAD_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
- _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##WIDTH(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
+#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x);

-#define _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, ADDR_SPACE) \
- _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
- _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
- _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
- _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
- _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
+#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
+ _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
+ _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
+ _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
+ _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
+ _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
+
+#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
+ _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
+ _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
+ _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
+ _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \

#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
- _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __private) \
- _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __local) \
- _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __constant) \
- _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __global) \
+ _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) \

#define _CLC_VECTOR_VLOAD_PRIM() \
     _CLC_VECTOR_VLOAD_PRIM1(char) \
@@ -24,14 +27,16 @@
     _CLC_VECTOR_VLOAD_PRIM1(long) \
     _CLC_VECTOR_VLOAD_PRIM1(ulong) \
     _CLC_VECTOR_VLOAD_PRIM1(float) \
-
+ _CLC_VECTOR_VLOAD_PRIM3(_half, half, float)
+
#ifdef cl_khr_fp64
-#define _CLC_VECTOR_VLOAD() \
- _CLC_VECTOR_VLOAD_PRIM1(double) \
- _CLC_VECTOR_VLOAD_PRIM()
-#else
-#define _CLC_VECTOR_VLOAD() \
- _CLC_VECTOR_VLOAD_PRIM()
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+ _CLC_VECTOR_VLOAD_PRIM1(double)
#endif

-_CLC_VECTOR_VLOAD()
+_CLC_VECTOR_VLOAD_PRIM()
+// Plain vload_half also needs to be declared
+_CLC_VLOAD_DECL(_half, half, float, , __constant)
+_CLC_VLOAD_DECL(_half, half, float, , __global)
+_CLC_VLOAD_DECL(_half, half, float, , __local)
+_CLC_VLOAD_DECL(_half, half, float, , __private)
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 9e0157b..8b080f5 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -144,6 +144,7 @@ shared/clamp.cl
shared/max.cl
shared/min.cl
shared/vload.cl
+shared/vload_half_helpers.ll
shared/vstore.cl
shared/vstore_half_helpers.ll
workitem/get_global_id.cl
diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
index 8897200..46d6486 100644
--- a/generic/lib/shared/vload.cl
+++ b/generic/lib/shared/vload.cl
@@ -50,3 +50,52 @@ VLOAD_TYPES()
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
     VLOAD_ADDR_SPACES(double)
#endif
+
+/* vload_half are legal even without cl_khr_fp16 */
+
+float __clc_vload_half_float_helper__constant(const __constant half *);
+float __clc_vload_half_float_helper__global(const __global half *);
+float __clc_vload_half_float_helper__local(const __local half *);
+float __clc_vload_half_float_helper__private(const __private half *);
+
+/* no vload_half for double */
+
+#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]);
+#define VEC_LOAD2(val, AS) \
+ VEC_LOAD1(val.lo, AS) \
+ VEC_LOAD1(val.hi, AS)
+#define VEC_LOAD3(val, AS) \
+ VEC_LOAD1(val.s0, AS) \
+ VEC_LOAD1(val.s1, AS) \
+ VEC_LOAD1(val.s2, AS)
+#define VEC_LOAD4(val, AS) \
+ VEC_LOAD2(val.lo, AS) \
+ VEC_LOAD2(val.hi, AS)
+#define VEC_LOAD8(val, AS) \
+ VEC_LOAD4(val.lo, AS) \
+ VEC_LOAD4(val.hi, AS)
+#define VEC_LOAD16(val, AS) \
+ VEC_LOAD8(val.lo, AS) \
+ VEC_LOAD8(val.hi, AS)
+
+#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS) \
+ _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
+ offset *= VEC_SIZE; \
+ TYPE __tmp; \
+ VEC_LOAD##VEC_SIZE(__tmp, AS) \
+ return __tmp; \
+ }
+
+#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, AS)
+
+#define __CLC_BODY "vload_half.inc"
+#include <clc/math/gentype.inc>
+#undef __CLC_BODY
+#undef FUNC
+#undef __FUNC
+#undef VEC_LOAD16
+#undef VEC_LOAD8
+#undef VEC_LOAD4
+#undef VEC_LOAD3
+#undef VEC_LOAD2
+#undef VEC_LOAD1
diff --git a/generic/lib/shared/vload_half.inc b/generic/lib/shared/vload_half.inc
new file mode 100644
index 0000000..00dae8a
--- /dev/null
+++ b/generic/lib/shared/vload_half.inc
@@ -0,0 +1,13 @@
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_VECSIZE
+ FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private);
+ FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local);
+ FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global);
+ FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __constant);
+#else
+ FUNC(, 1, __CLC_GENTYPE, __private);
+ FUNC(, 1, __CLC_GENTYPE, __local);
+ FUNC(, 1, __CLC_GENTYPE, __global);
+ FUNC(, 1, __CLC_GENTYPE, __constant);
+#endif
+#endif
diff --git a/generic/lib/shared/vload_half_helpers.ll b/generic/lib/shared/vload_half_helpers.ll
new file mode 100644
index 0000000..b8c905a
--- /dev/null
+++ b/generic/lib/shared/vload_half_helpers.ll
@@ -0,0 +1,23 @@
+define float @__clc_vload_half_float_helper__private(half addrspace(0)* nocapture %ptr) nounwind alwaysinline {
+ %data = load half, half addrspace(0)* %ptr

Not sure I'm a fan of hard-coding the address space mappings here. In
the past, we've defined functions with a name of _addr[1234], and then
had target-specific mappings between the named address spaces and the
numeric ones.

--Aaron

Address space mappings aside, I've confirmed that this also fixes the
new piglits on my pitcairn (gcn 1.0).

--Aaron

Address space mappings aside, I've confirmed that this also fixes the
new piglits on my pitcairn (gcn 1.0).

thanks for testing

--Aaron

> > Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
> > ---
> > Fixes newly posted vload_half piglits on Turks, Carrizo, and Iceland.
> >
> > generic/include/clc/shared/vload.h | 45 ++++++++++++++++-------------
> > generic/lib/SOURCES | 1 +
> > generic/lib/shared/vload.cl | 49 ++++++++++++++++++++++++++++++++
> > generic/lib/shared/vload_half.inc | 13 +++++++++
> > generic/lib/shared/vload_half_helpers.ll | 23 +++++++++++++++
> > 5 files changed, 111 insertions(+), 20 deletions(-)
> > create mode 100644 generic/lib/shared/vload_half.inc
> > create mode 100644 generic/lib/shared/vload_half_helpers.ll
> >
> > diff --git a/generic/include/clc/shared/vload.h b/generic/include/clc/shared/vload.h
> > index 93d0750..1da4e99 100644
> > --- a/generic/include/clc/shared/vload.h
> > +++ b/generic/include/clc/shared/vload.h
> > @@ -1,18 +1,21 @@
> > -#define _CLC_VLOAD_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
> > - _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##WIDTH(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
> > +#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
> > + _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x);
> >
> > -#define _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, ADDR_SPACE) \
> > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
> > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
> > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
> > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
> > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
> > +#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
> > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
> > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
> > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
> > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
> > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
> > +
> > +#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
> > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
> > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
> > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
> > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \
> >
> > #define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
> > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __private) \
> > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __local) \
> > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __constant) \
> > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __global) \
> > + _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) \
> >
> > #define _CLC_VECTOR_VLOAD_PRIM() \
> > _CLC_VECTOR_VLOAD_PRIM1(char) \
> > @@ -24,14 +27,16 @@
> > _CLC_VECTOR_VLOAD_PRIM1(long) \
> > _CLC_VECTOR_VLOAD_PRIM1(ulong) \
> > _CLC_VECTOR_VLOAD_PRIM1(float) \
> > -
> > + _CLC_VECTOR_VLOAD_PRIM3(_half, half, float)
> > +
> > #ifdef cl_khr_fp64
> > -#define _CLC_VECTOR_VLOAD() \
> > - _CLC_VECTOR_VLOAD_PRIM1(double) \
> > - _CLC_VECTOR_VLOAD_PRIM()
> > -#else
> > -#define _CLC_VECTOR_VLOAD() \
> > - _CLC_VECTOR_VLOAD_PRIM()
> > +#pragma OPENCL EXTENSION cl_khr_fp64: enable
> > + _CLC_VECTOR_VLOAD_PRIM1(double)
> > #endif
> >
> > -_CLC_VECTOR_VLOAD()
> > +_CLC_VECTOR_VLOAD_PRIM()
> > +// Plain vload_half also needs to be declared
> > +_CLC_VLOAD_DECL(_half, half, float, , __constant)
> > +_CLC_VLOAD_DECL(_half, half, float, , __global)
> > +_CLC_VLOAD_DECL(_half, half, float, , __local)
> > +_CLC_VLOAD_DECL(_half, half, float, , __private)
> > diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> > index 9e0157b..8b080f5 100644
> > --- a/generic/lib/SOURCES
> > +++ b/generic/lib/SOURCES
> > @@ -144,6 +144,7 @@ shared/clamp.cl
> > shared/max.cl
> > shared/min.cl
> > shared/vload.cl
> > +shared/vload_half_helpers.ll
> > shared/vstore.cl
> > shared/vstore_half_helpers.ll
> > workitem/get_global_id.cl
> > diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
> > index 8897200..46d6486 100644
> > --- a/generic/lib/shared/vload.cl
> > +++ b/generic/lib/shared/vload.cl
> > @@ -50,3 +50,52 @@ VLOAD_TYPES()
> > #pragma OPENCL EXTENSION cl_khr_fp64 : enable
> > VLOAD_ADDR_SPACES(double)
> > #endif
> > +
> > +/* vload_half are legal even without cl_khr_fp16 */
> > +
> > +float __clc_vload_half_float_helper__constant(const __constant half *);
> > +float __clc_vload_half_float_helper__global(const __global half *);
> > +float __clc_vload_half_float_helper__local(const __local half *);
> > +float __clc_vload_half_float_helper__private(const __private half *);
> > +
> > +/* no vload_half for double */
> > +
> > +#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]);
> > +#define VEC_LOAD2(val, AS) \
> > + VEC_LOAD1(val.lo, AS) \
> > + VEC_LOAD1(val.hi, AS)
> > +#define VEC_LOAD3(val, AS) \
> > + VEC_LOAD1(val.s0, AS) \
> > + VEC_LOAD1(val.s1, AS) \
> > + VEC_LOAD1(val.s2, AS)
> > +#define VEC_LOAD4(val, AS) \
> > + VEC_LOAD2(val.lo, AS) \
> > + VEC_LOAD2(val.hi, AS)
> > +#define VEC_LOAD8(val, AS) \
> > + VEC_LOAD4(val.lo, AS) \
> > + VEC_LOAD4(val.hi, AS)
> > +#define VEC_LOAD16(val, AS) \
> > + VEC_LOAD8(val.lo, AS) \
> > + VEC_LOAD8(val.hi, AS)
> > +
> > +#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS) \
> > + _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
> > + offset *= VEC_SIZE; \
> > + TYPE __tmp; \
> > + VEC_LOAD##VEC_SIZE(__tmp, AS) \
> > + return __tmp; \
> > + }
> > +
> > +#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, AS)
> > +
> > +#define __CLC_BODY "vload_half.inc"
> > +#include <clc/math/gentype.inc>
> > +#undef __CLC_BODY
> > +#undef FUNC
> > +#undef __FUNC
> > +#undef VEC_LOAD16
> > +#undef VEC_LOAD8
> > +#undef VEC_LOAD4
> > +#undef VEC_LOAD3
> > +#undef VEC_LOAD2
> > +#undef VEC_LOAD1
> > diff --git a/generic/lib/shared/vload_half.inc b/generic/lib/shared/vload_half.inc
> > new file mode 100644
> > index 0000000..00dae8a
> > --- /dev/null
> > +++ b/generic/lib/shared/vload_half.inc
> > @@ -0,0 +1,13 @@
> > +#if __CLC_FPSIZE == 32
> > +#ifdef __CLC_VECSIZE
> > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private);
> > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local);
> > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global);
> > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __constant);
> > +#else
> > + FUNC(, 1, __CLC_GENTYPE, __private);
> > + FUNC(, 1, __CLC_GENTYPE, __local);
> > + FUNC(, 1, __CLC_GENTYPE, __global);
> > + FUNC(, 1, __CLC_GENTYPE, __constant);
> > +#endif
> > +#endif
> > diff --git a/generic/lib/shared/vload_half_helpers.ll b/generic/lib/shared/vload_half_helpers.ll
> > new file mode 100644
> > index 0000000..b8c905a
> > --- /dev/null
> > +++ b/generic/lib/shared/vload_half_helpers.ll
> > @@ -0,0 +1,23 @@
> > +define float @__clc_vload_half_float_helper__private(half addrspace(0)* nocapture %ptr) nounwind alwaysinline {
> > + %data = load half, half addrspace(0)* %ptr
>
> Not sure I'm a fan of hard-coding the address space mappings here. In
> the past, we've defined functions with a name of _addr[1234], and then
> had target-specific mappings between the named address spaces and the
> numeric ones.

I'm not a fan either. for now this patch follows vstore_half for
consistency. I thinks there is a bigger problem with using LLVM AS in
libclc. I've been considering 2 ways to address it:

a) rewrite everything in CLC. This will require adding clang builtins
for missing functionality.

b) pass .ll files through clang preprocessor. This would allow .ll
includes (for target data layout), and target specific defines in
shared .ll code (like clang/llvm version dependent AS mapping).

I have a slight preference for a), but it might restrict what
clang/llvm version we support (or add workarounds like the current
waitcnt implementation). I'd also have to see how receptive the clang
ppl are to having builtins for everything that we need and can't be
done with pure CLC.

Do you think this should be addressed before any changes that use .ll
files land?

Jan

_

> > Address space mappings aside, I've confirmed that this also fixes the
> > new piglits on my pitcairn (gcn 1.0).
>
> thanks for testing
>
> >
> > --Aaron
> >
> > > > Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
> > > > ---
> > > > Fixes newly posted vload_half piglits on Turks, Carrizo, and Iceland.
> > > >
> > > > generic/include/clc/shared/vload.h | 45 ++++++++++++++++-------------
> > > > generic/lib/SOURCES | 1 +
> > > > generic/lib/shared/vload.cl | 49 ++++++++++++++++++++++++++++++++
> > > > generic/lib/shared/vload_half.inc | 13 +++++++++
> > > > generic/lib/shared/vload_half_helpers.ll | 23 +++++++++++++++
> > > > 5 files changed, 111 insertions(+), 20 deletions(-)
> > > > create mode 100644 generic/lib/shared/vload_half.inc
> > > > create mode 100644 generic/lib/shared/vload_half_helpers.ll
> > > >
> > > > diff --git a/generic/include/clc/shared/vload.h b/generic/include/clc/shared/vload.h
> > > > index 93d0750..1da4e99 100644
> > > > --- a/generic/include/clc/shared/vload.h
> > > > +++ b/generic/include/clc/shared/vload.h
> > > > @@ -1,18 +1,21 @@
> > > > -#define _CLC_VLOAD_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
> > > > - _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##WIDTH(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
> > > > +#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
> > > > + _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x);
> > > >
> > > > -#define _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, ADDR_SPACE) \
> > > > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
> > > > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
> > > > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
> > > > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
> > > > - _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
> > > > +#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
> > > > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
> > > > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
> > > > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
> > > > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
> > > > + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
> > > > +
> > > > +#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
> > > > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
> > > > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
> > > > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
> > > > + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \
> > > >
> > > > #define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
> > > > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __private) \
> > > > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __local) \
> > > > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __constant) \
> > > > - _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __global) \
> > > > + _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) \
> > > >
> > > > #define _CLC_VECTOR_VLOAD_PRIM() \
> > > > _CLC_VECTOR_VLOAD_PRIM1(char) \
> > > > @@ -24,14 +27,16 @@
> > > > _CLC_VECTOR_VLOAD_PRIM1(long) \
> > > > _CLC_VECTOR_VLOAD_PRIM1(ulong) \
> > > > _CLC_VECTOR_VLOAD_PRIM1(float) \
> > > > -
> > > > + _CLC_VECTOR_VLOAD_PRIM3(_half, half, float)
> > > > +
> > > > #ifdef cl_khr_fp64
> > > > -#define _CLC_VECTOR_VLOAD() \
> > > > - _CLC_VECTOR_VLOAD_PRIM1(double) \
> > > > - _CLC_VECTOR_VLOAD_PRIM()
> > > > -#else
> > > > -#define _CLC_VECTOR_VLOAD() \
> > > > - _CLC_VECTOR_VLOAD_PRIM()
> > > > +#pragma OPENCL EXTENSION cl_khr_fp64: enable
> > > > + _CLC_VECTOR_VLOAD_PRIM1(double)
> > > > #endif
> > > >
> > > > -_CLC_VECTOR_VLOAD()
> > > > +_CLC_VECTOR_VLOAD_PRIM()
> > > > +// Plain vload_half also needs to be declared
> > > > +_CLC_VLOAD_DECL(_half, half, float, , __constant)
> > > > +_CLC_VLOAD_DECL(_half, half, float, , __global)
> > > > +_CLC_VLOAD_DECL(_half, half, float, , __local)
> > > > +_CLC_VLOAD_DECL(_half, half, float, , __private)
> > > > diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> > > > index 9e0157b..8b080f5 100644
> > > > --- a/generic/lib/SOURCES
> > > > +++ b/generic/lib/SOURCES
> > > > @@ -144,6 +144,7 @@ shared/clamp.cl
> > > > shared/max.cl
> > > > shared/min.cl
> > > > shared/vload.cl
> > > > +shared/vload_half_helpers.ll
> > > > shared/vstore.cl
> > > > shared/vstore_half_helpers.ll
> > > > workitem/get_global_id.cl
> > > > diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
> > > > index 8897200..46d6486 100644
> > > > --- a/generic/lib/shared/vload.cl
> > > > +++ b/generic/lib/shared/vload.cl
> > > > @@ -50,3 +50,52 @@ VLOAD_TYPES()
> > > > #pragma OPENCL EXTENSION cl_khr_fp64 : enable
> > > > VLOAD_ADDR_SPACES(double)
> > > > #endif
> > > > +
> > > > +/* vload_half are legal even without cl_khr_fp16 */
> > > > +
> > > > +float __clc_vload_half_float_helper__constant(const __constant half *);
> > > > +float __clc_vload_half_float_helper__global(const __global half *);
> > > > +float __clc_vload_half_float_helper__local(const __local half *);
> > > > +float __clc_vload_half_float_helper__private(const __private half *);
> > > > +
> > > > +/* no vload_half for double */
> > > > +
> > > > +#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]);
> > > > +#define VEC_LOAD2(val, AS) \
> > > > + VEC_LOAD1(val.lo, AS) \
> > > > + VEC_LOAD1(val.hi, AS)
> > > > +#define VEC_LOAD3(val, AS) \
> > > > + VEC_LOAD1(val.s0, AS) \
> > > > + VEC_LOAD1(val.s1, AS) \
> > > > + VEC_LOAD1(val.s2, AS)
> > > > +#define VEC_LOAD4(val, AS) \
> > > > + VEC_LOAD2(val.lo, AS) \
> > > > + VEC_LOAD2(val.hi, AS)
> > > > +#define VEC_LOAD8(val, AS) \
> > > > + VEC_LOAD4(val.lo, AS) \
> > > > + VEC_LOAD4(val.hi, AS)
> > > > +#define VEC_LOAD16(val, AS) \
> > > > + VEC_LOAD8(val.lo, AS) \
> > > > + VEC_LOAD8(val.hi, AS)
> > > > +
> > > > +#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS) \
> > > > + _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
> > > > + offset *= VEC_SIZE; \
> > > > + TYPE __tmp; \
> > > > + VEC_LOAD##VEC_SIZE(__tmp, AS) \
> > > > + return __tmp; \
> > > > + }
> > > > +
> > > > +#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, AS)
> > > > +
> > > > +#define __CLC_BODY "vload_half.inc"
> > > > +#include <clc/math/gentype.inc>
> > > > +#undef __CLC_BODY
> > > > +#undef FUNC
> > > > +#undef __FUNC
> > > > +#undef VEC_LOAD16
> > > > +#undef VEC_LOAD8
> > > > +#undef VEC_LOAD4
> > > > +#undef VEC_LOAD3
> > > > +#undef VEC_LOAD2
> > > > +#undef VEC_LOAD1
> > > > diff --git a/generic/lib/shared/vload_half.inc b/generic/lib/shared/vload_half.inc
> > > > new file mode 100644
> > > > index 0000000..00dae8a
> > > > --- /dev/null
> > > > +++ b/generic/lib/shared/vload_half.inc
> > > > @@ -0,0 +1,13 @@
> > > > +#if __CLC_FPSIZE == 32
> > > > +#ifdef __CLC_VECSIZE
> > > > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private);
> > > > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local);
> > > > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global);
> > > > + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __constant);
> > > > +#else
> > > > + FUNC(, 1, __CLC_GENTYPE, __private);
> > > > + FUNC(, 1, __CLC_GENTYPE, __local);
> > > > + FUNC(, 1, __CLC_GENTYPE, __global);
> > > > + FUNC(, 1, __CLC_GENTYPE, __constant);
> > > > +#endif
> > > > +#endif
> > > > diff --git a/generic/lib/shared/vload_half_helpers.ll b/generic/lib/shared/vload_half_helpers.ll
> > > > new file mode 100644
> > > > index 0000000..b8c905a
> > > > --- /dev/null
> > > > +++ b/generic/lib/shared/vload_half_helpers.ll
> > > > @@ -0,0 +1,23 @@
> > > > +define float @__clc_vload_half_float_helper__private(half addrspace(0)* nocapture %ptr) nounwind alwaysinline {
> > > > + %data = load half, half addrspace(0)* %ptr
> > >
> > > Not sure I'm a fan of hard-coding the address space mappings here. In
> > > the past, we've defined functions with a name of _addr[1234], and then
> > > had target-specific mappings between the named address spaces and the
> > > numeric ones.
>
> I'm not a fan either. for now this patch follows vstore_half for
> consistency. I thinks there is a bigger problem with using LLVM AS in
> libclc. I've been considering 2 ways to address it:
>
> a) rewrite everything in CLC. This will require adding clang builtins
> for missing functionality.
>
> b) pass .ll files through clang preprocessor. This would allow .ll
> includes (for target data layout), and target specific defines in
> shared .ll code (like clang/llvm version dependent AS mapping).
>
> I have a slight preference for a), but it might restrict what
> clang/llvm version we support (or add workarounds like the current
> waitcnt implementation). I'd also have to see how receptive the clang
> ppl are to having builtins for everything that we need and can't be
> done with pure CLC.
>
> Do you think this should be addressed before any changes that use .ll
> files land?

For now, I'd settle for renaming __clc_vload_half_float_helper__* to
just indicate the numeric address space being loaded from (e.g.
__clc_vload_half_float_helper_addr1), and then map those to named
address spaces via the .cl/.inc files in generic/lib/shared/.

This is very awkward solution, because it introduces temporary solution
that is broken in a different way from existing vstore code. I'll stall
the patches until a proper solution lands in clang.

That way, if we have to change them later, there's less confusion
about calling a function named *__global for what is now the
local/whatever address space.

I don't think this is happening, otherwise the tests would fail calling
function on wrong type of pointer.

regards,
Jan

_

Address space mappings aside, I’ve confirmed that this also fixes the
new piglits on my pitcairn (gcn 1.0).

thanks for testing

–Aaron

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

Fixes newly posted vload_half piglits on Turks, Carrizo, and Iceland.

generic/include/clc/shared/vload.h | 45 +++++++++++++++±------------
generic/lib/SOURCES | 1 +
generic/lib/shared/vload.cl | 49 ++++++++++++++++++++++++++++++++
generic/lib/shared/vload_half.inc | 13 +++++++++
generic/lib/shared/vload_half_helpers.ll | 23 +++++++++++++++
5 files changed, 111 insertions(+), 20 deletions(-)
create mode 100644 generic/lib/shared/vload_half.inc
create mode 100644 generic/lib/shared/vload_half_helpers.ll

diff --git a/generic/include/clc/shared/vload.h b/generic/include/clc/shared/vload.h
index 93d0750…1da4e99 100644
— a/generic/include/clc/shared/vload.h
+++ b/generic/include/clc/shared/vload.h
@@ -1,18 +1,21 @@
-#define _CLC_VLOAD_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \

  • _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##WIDTH(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
    +#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
  • _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x);

-#define _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, ADDR_SPACE) \

  • _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
    +#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
  • _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)

+#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \

  • _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
  • _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
  • _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
  • _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \

#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \

  • _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __private) \
  • _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __local) \
  • _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __constant) \
  • _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __global) \
  • _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) \

#define _CLC_VECTOR_VLOAD_PRIM()
_CLC_VECTOR_VLOAD_PRIM1(char)
@@ -24,14 +27,16 @@
_CLC_VECTOR_VLOAD_PRIM1(long)
_CLC_VECTOR_VLOAD_PRIM1(ulong)
_CLC_VECTOR_VLOAD_PRIM1(float) \

  • _CLC_VECTOR_VLOAD_PRIM3(_half, half, float)

#ifdef cl_khr_fp64
-#define _CLC_VECTOR_VLOAD() \

  • _CLC_VECTOR_VLOAD_PRIM1(double) \
  • _CLC_VECTOR_VLOAD_PRIM()
    -#else
    -#define _CLC_VECTOR_VLOAD() \
  • _CLC_VECTOR_VLOAD_PRIM()
    +#pragma OPENCL EXTENSION cl_khr_fp64: enable
  • _CLC_VECTOR_VLOAD_PRIM1(double)
    #endif

-_CLC_VECTOR_VLOAD()
+_CLC_VECTOR_VLOAD_PRIM()
+// Plain vload_half also needs to be declared
+_CLC_VLOAD_DECL(_half, half, float, , __constant)
+_CLC_VLOAD_DECL(_half, half, float, , __global)
+_CLC_VLOAD_DECL(_half, half, float, , __local)
+_CLC_VLOAD_DECL(_half, half, float, , __private)
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 9e0157b…8b080f5 100644
— a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -144,6 +144,7 @@ shared/clamp.cl
shared/max.cl
shared/min.cl
shared/vload.cl
+shared/vload_half_helpers.ll
shared/vstore.cl
shared/vstore_half_helpers.ll
workitem/get_global_id.cl
diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
index 8897200…46d6486 100644
— a/generic/lib/shared/vload.cl
+++ b/generic/lib/shared/vload.cl
@@ -50,3 +50,52 @@ VLOAD_TYPES()
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
VLOAD_ADDR_SPACES(double)
#endif
+
+/* vload_half are legal even without cl_khr_fp16 */
+
+float __clc_vload_half_float_helper__constant(const __constant half *);
+float __clc_vload_half_float_helper__global(const __global half *);
+float __clc_vload_half_float_helper__local(const __local half *);
+float __clc_vload_half_float_helper__private(const __private half );
+
+/
no vload_half for double */
+
+#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]);
+#define VEC_LOAD2(val, AS) \

  • VEC_LOAD1(val.lo, AS) \
  • VEC_LOAD1(val.hi, AS)
    +#define VEC_LOAD3(val, AS) \
  • VEC_LOAD1(val.s0, AS) \
  • VEC_LOAD1(val.s1, AS) \
  • VEC_LOAD1(val.s2, AS)
    +#define VEC_LOAD4(val, AS) \
  • VEC_LOAD2(val.lo, AS) \
  • VEC_LOAD2(val.hi, AS)
    +#define VEC_LOAD8(val, AS) \
  • VEC_LOAD4(val.lo, AS) \
  • VEC_LOAD4(val.hi, AS)
    +#define VEC_LOAD16(val, AS) \
  • VEC_LOAD8(val.lo, AS) \
  • VEC_LOAD8(val.hi, AS)

+#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS) \

  • _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
  • offset *= VEC_SIZE; \
  • TYPE __tmp; \
  • VEC_LOAD##VEC_SIZE(__tmp, AS) \
  • return __tmp; \
  • }

+#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, AS)
+
+#define __CLC_BODY “vload_half.inc”
+#include <clc/math/gentype.inc>
+#undef __CLC_BODY
+#undef FUNC
+#undef __FUNC
+#undef VEC_LOAD16
+#undef VEC_LOAD8
+#undef VEC_LOAD4
+#undef VEC_LOAD3
+#undef VEC_LOAD2
+#undef VEC_LOAD1
diff --git a/generic/lib/shared/vload_half.inc b/generic/lib/shared/vload_half.inc
new file mode 100644
index 0000000…00dae8a
— /dev/null
+++ b/generic/lib/shared/vload_half.inc
@@ -0,0 +1,13 @@
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_VECSIZE

  • FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private);
  • FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local);
  • FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global);
  • FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __constant);
    +#else
  • FUNC(, 1, __CLC_GENTYPE, __private);
  • FUNC(, 1, __CLC_GENTYPE, __local);
  • FUNC(, 1, __CLC_GENTYPE, __global);
  • FUNC(, 1, __CLC_GENTYPE, __constant);
    +#endif
    +#endif
    diff --git a/generic/lib/shared/vload_half_helpers.ll b/generic/lib/shared/vload_half_helpers.ll
    new file mode 100644
    index 0000000…b8c905a
    — /dev/null
    +++ b/generic/lib/shared/vload_half_helpers.ll
    @@ -0,0 +1,23 @@
    +define float @__clc_vload_half_float_helper__private(half addrspace(0)* nocapture %ptr) nounwind alwaysinline {
  • %data = load half, half addrspace(0)* %ptr

Not sure I’m a fan of hard-coding the address space mappings here. In
the past, we’ve defined functions with a name of _addr[1234], and then
had target-specific mappings between the named address spaces and the
numeric ones.

I’m not a fan either. for now this patch follows vstore_half for
consistency. I thinks there is a bigger problem with using LLVM AS in
libclc. I’ve been considering 2 ways to address it:

a) rewrite everything in CLC. This will require adding clang builtins
for missing functionality.

b) pass .ll files through clang preprocessor. This would allow .ll
includes (for target data layout), and target specific defines in
shared .ll code (like clang/llvm version dependent AS mapping).

I have a slight preference for a), but it might restrict what
clang/llvm version we support (or add workarounds like the current
waitcnt implementation). I’d also have to see how receptive the clang
ppl are to having builtins for everything that we need and can’t be
done with pure CLC.

Do you think this should be addressed before any changes that use .ll
files land?

For now, I’d settle for renaming clc_vload_half_float_helper* to
just indicate the numeric address space being loaded from (e.g.
__clc_vload_half_float_helper_addr1), and then map those to named
address spaces via the .cl/.inc files in generic/lib/shared/.

This is very awkward solution, because it introduces temporary solution
that is broken in a different way from existing vstore code. I’ll stall
the patches until a proper solution lands in clang.

Up to you.

That way, if we have to change them later, there’s less confusion
about calling a function named *__global for what is now the
local/whatever address space.

I don’t think this is happening, otherwise the tests would fail calling
function on wrong type of pointer.

This is just me referring to the previous talk about how there are tentative plans to renumber the address space IDs in AMDGPU in the future. Not something that has happened yet. I was just trying to minimize the number of lines of code change if that happens.

For now, since we really only support R600/GCN GPUs, we haven’t really needed to dynamically map address space names to numbers.

What you’ve got is ok, just requires more change if that future renumbering happens, or if someone starts to care about ptx and libclc.

–Aaron

[SNIP]
> > >
> > > I'm not a fan either. for now this patch follows vstore_half for
> > > consistency. I thinks there is a bigger problem with using LLVM AS in
> > > libclc. I've been considering 2 ways to address it:
> > >
> > > a) rewrite everything in CLC. This will require adding clang builtins
> > > for missing functionality.
> > >
> > > b) pass .ll files through clang preprocessor. This would allow .ll
> > > includes (for target data layout), and target specific defines in
> > > shared .ll code (like clang/llvm version dependent AS mapping).
> > >
> > > I have a slight preference for a), but it might restrict what
> > > clang/llvm version we support (or add workarounds like the current
> > > waitcnt implementation). I'd also have to see how receptive the clang
> > > ppl are to having builtins for everything that we need and can't be
> > > done with pure CLC.
> > >
> > > Do you think this should be addressed before any changes that use .ll
> > > files land?
> >
> > For now, I'd settle for renaming __clc_vload_half_float_helper__* to
> > just indicate the numeric address space being loaded from (e.g.
> > __clc_vload_half_float_helper_addr1), and then map those to named
> > address spaces via the .cl/.inc files in generic/lib/shared/.
>
> This is very awkward solution, because it introduces temporary solution
> that is broken in a different way from existing vstore code. I'll stall
> the patches until a proper solution lands in clang.
>

Up to you.

> >
> > That way, if we have to change them later, there's less confusion
> > about calling a function named *__global for what is now the
> > local/whatever address space.
>
> I don't think this is happening, otherwise the tests would fail calling
> function on wrong type of pointer.
>
>
>
>

This is just me referring to the previous talk about how there are
tentative plans to renumber the address space IDs in AMDGPU in the future.
Not something that has happened yet. I was just trying to minimize the
number of lines of code change if that happens.

For now, since we really only support R600/GCN GPUs, we haven't really
needed to dynamically map address space names to numbers.

What you've got is ok, just requires more change if that future renumbering
happens, or if someone starts to care about ptx and libclc.

Guess I misread that. Anyway, it got me to actually start working on a
proper solution. My plan is to use clang builtin [0] to implement
vload/vstore_half. That would handle possible future AS numbering
changes, as well as proper support for nvptx.
*_helpers.ll will be kept around for llvm-4 (and 5, I think it's too
late to get [0] into 5.0). I can check nvptx AS numbering and move
helpers.ll to amdgpu if need be.

regards,
Jan

[0]https://reviews.llvm.org/D37231

--Aaron

> regards,
> Jan
>
> >
> > --Aaron
> >
> > >
> > > Jan
> > >

[SNIP]