[PATCH] relational: Implement shuffle builtin

This was added in CL 1.1

Tested with a Radeon HD 7850 (Pitcairn) using the CL CTS via:
test_conformance/relationals/test_relationals shuffle_built_in

Signed-off-by: Aaron Watry <awatry@gmail.com>

ping.

This was added in CL 1.1

Tested with a Radeon HD 7850 (Pitcairn) using the CL CTS via:
test_conformance/relationals/test_relationals shuffle_built_in

sorry it took so long. I think there are still parts missing but we
might be able to get away with it if clang can handle mask type
conversion implicitly.
it also needs to ignore the high bits of mask elements.
see inline comments.

I haven't tested it yet. I'll try to do that and provide shuffle2
piglits asap.

Signed-off-by: Aaron Watry <awatry@gmail.com>
---
generic/include/clc/clc.h | 1 +
generic/include/clc/relational/shuffle.h | 44 +++++++++
generic/lib/SOURCES | 1 +
generic/lib/relational/shuffle.cl | 153 +++++++++++++++++++++++++++++++
4 files changed, 199 insertions(+)
create mode 100644 generic/include/clc/relational/shuffle.h
create mode 100644 generic/lib/relational/shuffle.cl

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 4c29214..ac1dab5 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -173,6 +173,7 @@
#include <clc/relational/isordered.h>
#include <clc/relational/isunordered.h>
#include <clc/relational/select.h>
+#include <clc/relational/shuffle.h>

Not sure why CTS puts these in relational category. specs have a misc
chapter for them, so it'd be nice to add new dir in clc.

#include <clc/relational/signbit.h>

/* 6.11.8 Synchronization Functions */
diff --git a/generic/include/clc/relational/shuffle.h b/generic/include/clc/relational/shuffle.h
new file mode 100644
index 0000000..e10ac5e
--- /dev/null
+++ b/generic/include/clc/relational/shuffle.h
@@ -0,0 +1,44 @@
+//===-- generic/include/clc/relational/shuffle.h ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under both the University of Illinois Open Source
+// License and the MIT license. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define _CLC_SHUFFLE_DECL(TYPE, MASKTYPE, RETTYPE) \
+ _CLC_OVERLOAD _CLC_DECL RETTYPE shuffle(TYPE x, MASKTYPE mask);
+
+//Return type is same base type as the input type, with the same vector size as the mask.
+//Elements in the mask must be the same size (number of bits) as the input value.
+//E.g. char8 ret = shuffle(char2 x, uchar8 mask);
+
+#define _CLC_VECTOR_SHUFFLE_MASKSIZE(INBASE, INTYPE, MASKTYPE) \
+ _CLC_SHUFFLE_DECL(INTYPE, MASKTYPE##2, INBASE##2) \
+ _CLC_SHUFFLE_DECL(INTYPE, MASKTYPE##4, INBASE##4) \
+ _CLC_SHUFFLE_DECL(INTYPE, MASKTYPE##8, INBASE##8) \
+ _CLC_SHUFFLE_DECL(INTYPE, MASKTYPE##16, INBASE##16) \
+
+#define _CLC_VECTOR_SHUFFLE_INSIZE(TYPE, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, TYPE##2, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, TYPE##4, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, TYPE##8, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, TYPE##16, MASKTYPE) \
+
+_CLC_VECTOR_SHUFFLE_INSIZE(char, uchar)
+_CLC_VECTOR_SHUFFLE_INSIZE(short, ushort)
+_CLC_VECTOR_SHUFFLE_INSIZE(int, uint)
+_CLC_VECTOR_SHUFFLE_INSIZE(long, ulong)
+_CLC_VECTOR_SHUFFLE_INSIZE(uchar, uchar)
+_CLC_VECTOR_SHUFFLE_INSIZE(ushort, ushort)
+_CLC_VECTOR_SHUFFLE_INSIZE(uint, uint)
+_CLC_VECTOR_SHUFFLE_INSIZE(ulong, ulong)
+_CLC_VECTOR_SHUFFLE_INSIZE(float, uint)
+#ifdef cl_khr_fp64
+_CLC_VECTOR_SHUFFLE_INSIZE(double, ulong)
+#endif
+
+#undef _CLC_SHUFFLE_DECL
+#undef _CLC_VECTOR_SHUFFLE_MASKSIZE
+#undef _CLC_VECTOR_SHUFFLE_INSIZE
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 9e0157b..fe0df5a 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -139,6 +139,7 @@ relational/isnormal.cl
relational/isnotequal.cl
relational/isordered.cl
relational/isunordered.cl
+relational/shuffle.cl
relational/signbit.cl
shared/clamp.cl
shared/max.cl
diff --git a/generic/lib/relational/shuffle.cl b/generic/lib/relational/shuffle.cl
new file mode 100644
index 0000000..7d96f86
--- /dev/null
+++ b/generic/lib/relational/shuffle.cl
@@ -0,0 +1,153 @@
+//===-- generic/lib/relational/shuffle.cl ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under both the University of Illinois Open Source
+// License and the MIT license. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc.h>
+
+#define _CLC_ELEMENT_CASES2(VAR) \
+ case 0: return VAR.s0; \
+ case 1: return VAR.s1;
+
+#define _CLC_ELEMENT_CASES4(VAR) \
+ _CLC_ELEMENT_CASES2(VAR) \
+ case 2: return VAR.s2; \
+ case 3: return VAR.s3;
+
+#define _CLC_ELEMENT_CASES8(VAR) \
+ _CLC_ELEMENT_CASES4(VAR) \
+ case 4: return VAR.s4; \
+ case 5: return VAR.s5; \
+ case 6: return VAR.s6; \
+ case 7: return VAR.s7;
+
+#define _CLC_ELEMENT_CASES16(VAR) \
+ _CLC_ELEMENT_CASES8(VAR) \
+ case 8: return VAR.s8; \
+ case 9: return VAR.s9; \
+ case 10: return VAR.sA; \
+ case 11: return VAR.sB; \
+ case 12: return VAR.sC; \
+ case 13: return VAR.sD; \
+ case 14: return VAR.sE; \
+ case 15: return VAR.sF;
+
+#define _CLC_GET_ELEMENT_DEFINE(ARGTYPE, ARGSIZE, IDXTYPE) \
+ inline ARGTYPE __clc_get_el_##ARGTYPE##ARGSIZE##_##IDXTYPE(ARGTYPE##ARGSIZE x, IDXTYPE idx) {\
+ switch (idx){ \

I think you need "idx % #ARGSIZE" here. Specs explitcilty mention that
higher bits are ignored and the newly posted piglit tests check this.

+ _CLC_ELEMENT_CASES##ARGSIZE(x) \
+ default: return 0; \
+ } \
+ } \
+
+#define _CLC_SHUFFLE_SET_ONE_ELEMENT(ARGTYPE, ARGSIZE, INDEX, MASKTYPE) \
+ ret_val.s##INDEX = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s##INDEX); \
+
+#define _CLC_SHUFFLE_SET_2_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ ret_val.s0 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s0); \
+ ret_val.s1 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s1);
+
+#define _CLC_SHUFFLE_SET_4_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_SHUFFLE_SET_2_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ ret_val.s2 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s2); \
+ ret_val.s3 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s3);
+
+#define _CLC_SHUFFLE_SET_8_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_SHUFFLE_SET_4_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ ret_val.s4 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s4); \
+ ret_val.s5 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s5); \
+ ret_val.s6 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s6); \
+ ret_val.s7 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s7);
+
+#define _CLC_SHUFFLE_SET_16_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_SHUFFLE_SET_8_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ ret_val.s8 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s8); \
+ ret_val.s9 = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.s9); \
+ ret_val.sA = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.sA); \
+ ret_val.sB = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.sB); \
+ ret_val.sC = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.sC); \
+ ret_val.sD = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.sD); \
+ ret_val.sE = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.sE); \
+ ret_val.sF = __clc_get_el_##ARGTYPE##ARGSIZE##_##MASKTYPE(x, mask.sF); \
+
+#define _CLC_SHUFFLE_DEFINE2(ARGTYPE, ARGSIZE, MASKTYPE) \
+_CLC_DEF _CLC_OVERLOAD ARGTYPE##2 shuffle(ARGTYPE##ARGSIZE x, MASKTYPE##2 mask){ \
+ ARGTYPE##2 ret_val; \
+ mask &= (MASKTYPE##2)(ARGSIZE-1); \
+ _CLC_SHUFFLE_SET_2_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ return ret_val; \
+}
+
+#define _CLC_SHUFFLE_DEFINE4(ARGTYPE, ARGSIZE, MASKTYPE) \
+_CLC_DEF _CLC_OVERLOAD ARGTYPE##4 shuffle(ARGTYPE##ARGSIZE x, MASKTYPE##4 mask){ \
+ ARGTYPE##4 ret_val; \
+ mask &= (MASKTYPE##4)(ARGSIZE-1); \
+ _CLC_SHUFFLE_SET_4_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ return ret_val; \
+}
+
+#define _CLC_SHUFFLE_DEFINE8(ARGTYPE, ARGSIZE, MASKTYPE) \
+_CLC_DEF _CLC_OVERLOAD ARGTYPE##8 shuffle(ARGTYPE##ARGSIZE x, MASKTYPE##8 mask){ \
+ ARGTYPE##8 ret_val; \
+ mask &= (MASKTYPE##8)(ARGSIZE-1); \
+ _CLC_SHUFFLE_SET_8_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ return ret_val; \
+}
+
+#define _CLC_SHUFFLE_DEFINE16(ARGTYPE, ARGSIZE, MASKTYPE) \
+_CLC_DEF _CLC_OVERLOAD ARGTYPE##16 shuffle(ARGTYPE##ARGSIZE x, MASKTYPE##16 mask){ \
+ ARGTYPE##16 ret_val; \
+ mask &= (MASKTYPE##16)(ARGSIZE-1); \
+ _CLC_SHUFFLE_SET_16_ELEMENTS(ARGTYPE, ARGSIZE, MASKTYPE) \
+ return ret_val; \
+}
+
+#define _CLC_VECTOR_SHUFFLE_MASKSIZE(INTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_GET_ELEMENT_DEFINE(INTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_SHUFFLE_DEFINE2(INTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_SHUFFLE_DEFINE4(INTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_SHUFFLE_DEFINE8(INTYPE, ARGSIZE, MASKTYPE) \
+ _CLC_SHUFFLE_DEFINE16(INTYPE, ARGSIZE, MASKTYPE) \
+
+#define _CLC_VECTOR_SHUFFLE_INSIZE(TYPE, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, 2, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, 4, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, 8, MASKTYPE) \
+ _CLC_VECTOR_SHUFFLE_MASKSIZE(TYPE, 16, MASKTYPE) \
+
+
+
+_CLC_VECTOR_SHUFFLE_INSIZE(char, uchar)
+_CLC_VECTOR_SHUFFLE_INSIZE(short, ushort)
+_CLC_VECTOR_SHUFFLE_INSIZE(int, uint)
+_CLC_VECTOR_SHUFFLE_INSIZE(long, ulong)
+_CLC_VECTOR_SHUFFLE_INSIZE(uchar, uchar)
+_CLC_VECTOR_SHUFFLE_INSIZE(ushort, ushort)
+_CLC_VECTOR_SHUFFLE_INSIZE(uint, uint)
+_CLC_VECTOR_SHUFFLE_INSIZE(ulong, ulong)
+_CLC_VECTOR_SHUFFLE_INSIZE(float, uint)

Mask type can be any vector of unsigned type, so I think you need all
combinations.

+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_VECTOR_SHUFFLE_INSIZE(double, ulong)

I think this needs other mask types.

+#endif

add half/cl_khr_fp16 here.

thanks,
Jan

> This was added in CL 1.1
>
> Tested with a Radeon HD 7850 (Pitcairn) using the CL CTS via:
> test_conformance/relationals/test_relationals shuffle_built_in

sorry it took so long. I think there are still parts missing but we
might be able to get away with it if clang can handle mask type
conversion implicitly.
it also needs to ignore the high bits of mask elements.
see inline comments.

looks like I was wrong on both accounts.
If you add half version and move this to clc/misc:
Reviewed-by: Jan Vesely <jan.vesely@rutgers.edu>

I haven't tested it yet. I'll try to do that and provide shuffle2
piglits asap.

works at least on carrizo/iceland with llvm 5.0

Jan

> This was added in CL 1.1
>
> Tested with a Radeon HD 7850 (Pitcairn) using the CL CTS via:
> test_conformance/relationals/test_relationals shuffle_built_in

sorry it took so long. I think there are still parts missing but we
might be able to get away with it if clang can handle mask type
conversion implicitly.
it also needs to ignore the high bits of mask elements.
see inline comments.

looks like I was wrong on both accounts.

Yeah, I did this with an 'mask &= (MASKTYPE##N)(ARGSIZE-1)' as the
first piece of the function implementation, instead of in the switch
statement. I did test this using the CTS, which handled float/double,
but doesn't have fp16 tests. I'll see what I need to do to get this
working on my SI using your piglit tests (assuming that I can).

I'll also go ahead and move this to a new misc/ directory as
suggested. If I can manage to test the fp16 support myself, do you
want to see the new version,or would the review stand, assuming that
all I have to add is:

#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
_CLC_VECTOR_SHUFFLE_INSIZE(half, ushort)
#endif

--Aaron

> This was added in CL 1.1
>
> Tested with a Radeon HD 7850 (Pitcairn) using the CL CTS via:
> test_conformance/relationals/test_relationals shuffle_built_in

sorry it took so long. I think there are still parts missing but we
might be able to get away with it if clang can handle mask type
conversion implicitly.
it also needs to ignore the high bits of mask elements.
see inline comments.

looks like I was wrong on both accounts.

Yeah, I did this with an 'mask &= (MASKTYPE##N)(ARGSIZE-1)' as the
first piece of the function implementation, instead of in the switch
statement. I did test this using the CTS, which handled float/double,
but doesn't have fp16 tests. I'll see what I need to do to get this
working on my SI using your piglit tests (assuming that I can).

I'll also go ahead and move this to a new misc/ directory as
suggested. If I can manage to test the fp16 support myself, do you
want to see the new version,or would the review stand, assuming that
all I have to add is:

#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
_CLC_VECTOR_SHUFFLE_INSIZE(half, ushort)
#endif

--Aaron

Note: I had to also add the half type definitions in
float/definitions.h. I'll re-send this piece as 1 patch, and then v2
of the two shuffle patches a parts 2/3, just so you can verify the
type definitions. half-precision is not something I've even started
to look into yet, but it seems like something you've possibly got a
use for, and possibly an ability to test (I believe that VI was the
first generation with fp16 support).

--Aaron

> > > > This was added in CL 1.1
> > > >
> > > > Tested with a Radeon HD 7850 (Pitcairn) using the CL CTS via:
> > > > test_conformance/relationals/test_relationals shuffle_built_in
> > >
> > > sorry it took so long. I think there are still parts missing but we
> > > might be able to get away with it if clang can handle mask type
> > > conversion implicitly.
> > > it also needs to ignore the high bits of mask elements.
> > > see inline comments.
> >
> > looks like I was wrong on both accounts.
>
> Yeah, I did this with an 'mask &= (MASKTYPE##N)(ARGSIZE-1)' as the
> first piece of the function implementation, instead of in the switch
> statement. I did test this using the CTS, which handled float/double,
> but doesn't have fp16 tests. I'll see what I need to do to get this
> working on my SI using your piglit tests (assuming that I can).
>
> I'll also go ahead and move this to a new misc/ directory as
> suggested. If I can manage to test the fp16 support myself, do you
> want to see the new version,or would the review stand, assuming that
> all I have to add is:
>
> #ifdef cl_khr_fp16
> #pragma OPENCL EXTENSION cl_khr_fp16 : enable
> _CLC_VECTOR_SHUFFLE_INSIZE(half, ushort)
> #endif

afaik this should be enough wrt to shuffle.

>
> --Aaron

Note: I had to also add the half type definitions in
float/definitions.h.

hm, I assumed we already had some half precision support in libclc.
If it means that you need to add all infrastructure, feel free to skip
this for shuffle{,2}. Sorry, should have checked it.

I'll re-send this piece as 1 patch, and then v2
of the two shuffle patches a parts 2/3, just so you can verify the
type definitions. half-precision is not something I've even started
to look into yet, but it seems like something you've possibly got a
use for, and possibly an ability to test (I believe that VI was the
first generation with fp16 support).

clang enables cl_khr_fp16 on everything >=gfx6 (which I believe is SI),
so you should be able to test it. I think you're correct that VI added
fp16 instructions, but they're not strictly required.
We don't expose any of this in clover however. so you'll either need to
disable the piglit requirement or modify clover.
I can test on carrizo if you post the patches that include fp16.

Jan

Hi Aaron,

A bit late to the party, but with LLVM 5.0 the IR being generated for this (for at least NVPTX) is rather ugly. I see sequences like the following appear quite a few number of times:

%1 = extractelement <4 x i32> %and.i, i32 0, !dbg !70
%trunc.i = trunc i32 %1 to i2, !dbg !70
switch i2 %trunc.i, label %__clc_get_el_int4_uint.exit34.i [
i2 0, label %sw.bb.i.i
i2 1, label %__clc_get_el_int4_uint.exit.i
i2 -2, label %sw.bb2.i.i
i2 -1, label %sw.bb3.i.i
], !dbg !70

sw.bb.i.i: ; preds = %entry
br label %__clc_get_el_int4_uint.exit.i, !dbg !70

sw.bb2.i.i: ; preds = %entry
br label %__clc_get_el_int4_uint.exit.i, !dbg !70

sw.bb3.i.i: ; preds = %entry
br label %__clc_get_el_int4_uint.exit.i, !dbg !70

__clc_get_el_int4_uint.exit34.i: ; preds = %entry
unreachable, !dbg !70

__clc_get_el_int4_uint.exit.i: ; preds = %sw.bb3.i.i, %sw.bb2.i.i, %sw.bb.i.i, %entry

This seems to be a missed optimisation in the SimplifyCFG pass. Is this a known issue?

Thanks,

Jeroen

Hi Aaron,

A bit late to the party, but with LLVM 5.0 the IR being generated for this
(for at least NVPTX) is rather ugly. I see sequences like the following
appear quite a few number of times:

  %1 = extractelement <4 x i32> %and.i, i32 0, !dbg !70
  %trunc.i = trunc i32 %1 to i2, !dbg !70
  switch i2 %trunc.i, label %__clc_get_el_int4_uint.exit34.i [
    i2 0, label %sw.bb.i.i
    i2 1, label %__clc_get_el_int4_uint.exit.i
    i2 -2, label %sw.bb2.i.i
    i2 -1, label %sw.bb3.i.i
  ], !dbg !70

sw.bb.i.i: ; preds = %entry
  br label %__clc_get_el_int4_uint.exit.i, !dbg !70

sw.bb2.i.i: ; preds = %entry
  br label %__clc_get_el_int4_uint.exit.i, !dbg !70

sw.bb3.i.i: ; preds = %entry
  br label %__clc_get_el_int4_uint.exit.i, !dbg !70

__clc_get_el_int4_uint.exit34.i: ; preds = %entry
  unreachable, !dbg !70

__clc_get_el_int4_uint.exit.i: ; preds = %sw.bb3.i.i,
%sw.bb2.i.i, %sw.bb.i.i, %entry

This seems to be a missed optimisation in the SimplifyCFG pass. Is this a
known issue?

Not necessarily a known issue (at least to me). In the case of
shuffle, I didn't spend too much time looking at optimizing, so much
as getting something that produced the correct results. I'm open to
any suggestions on optimizing the libclc implementation, or any
changes to the compiler back-end that might improve the generated
binaries.

Once we have correct results from the functions in libclc, that's when
I had planned on looping back to the functions that seemed to be a
bottleneck. Correctness, then completeness, then performance I
figured...

--Aaron