[PATCH v2 1/1] Implement generic mad_sat

v2: Fix trailing whitespace
    Fix signed long overflow
    improve comment

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

v2: Fix trailing whitespace
    Fix signed long overflow
    improve comment

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
---

NOTE: Use http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20140804/229312.html to avoid llvm segfault

generic/include/clc/clc.h | 1 +
generic/include/clc/integer/mad_sat.h | 3 ++
generic/include/clc/integer/mad_sat.inc | 1 +
generic/lib/SOURCES | 1 +
generic/lib/clcmacro.h | 22 ++++++++++
generic/lib/integer/mad_sat.cl | 72 +++++++++++++++++++++++++++++++++
6 files changed, 100 insertions(+)
create mode 100644 generic/include/clc/integer/mad_sat.h
create mode 100644 generic/include/clc/integer/mad_sat.inc
create mode 100644 generic/lib/integer/mad_sat.cl

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 9815c56..aca9b53 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -82,6 +82,7 @@
#include <clc/integer/hadd.h>
#include <clc/integer/mad24.h>
#include <clc/integer/mad_hi.h>
+#include <clc/integer/mad_sat.h>
#include <clc/integer/mul24.h>
#include <clc/integer/mul_hi.h>
#include <clc/integer/rhadd.h>
diff --git a/generic/include/clc/integer/mad_sat.h b/generic/include/clc/integer/mad_sat.h
new file mode 100644
index 0000000..3e92372
--- /dev/null
+++ b/generic/include/clc/integer/mad_sat.h
@@ -0,0 +1,3 @@
+#define __CLC_BODY <clc/integer/mad_sat.inc>
+#include <clc/integer/gentype.inc>
+#undef __CLC_BODY
diff --git a/generic/include/clc/integer/mad_sat.inc b/generic/include/clc/integer/mad_sat.inc
new file mode 100644
index 0000000..5da2bdf
--- /dev/null
+++ b/generic/include/clc/integer/mad_sat.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad_sat(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index bfdec7b..7d3fa6b 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -19,6 +19,7 @@ integer/clz_if.ll
integer/clz_impl.ll
integer/hadd.cl
integer/mad24.cl
+integer/mad_sat.cl
integer/mul24.cl
integer/mul_hi.cl
integer/rhadd.cl
diff --git a/generic/lib/clcmacro.h b/generic/lib/clcmacro.h
index 730073a..ef102ea 100644
--- a/generic/lib/clcmacro.h
+++ b/generic/lib/clcmacro.h
@@ -41,6 +41,28 @@
     return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
   }

+#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \
+ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, ARG3_TYPE##2 z) { \
+ return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, ARG3_TYPE##3 z) { \
+ return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \
+ FUNCTION(x.z, y.z, z.z)); \
+ } \
+\
+ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, ARG3_TYPE##4 z) { \
+ return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, ARG3_TYPE##8 z) { \
+ return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
+ } \
+\
+ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
+ return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
+ }
+
#define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
   return BUILTIN(x, y); \
diff --git a/generic/lib/integer/mad_sat.cl b/generic/lib/integer/mad_sat.cl
new file mode 100644
index 0000000..7048931
--- /dev/null
+++ b/generic/lib/integer/mad_sat.cl
@@ -0,0 +1,72 @@
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF char mad_sat(char x, char y, char z) {
+ return clamp((short)mad24((short)x, (short)y, (short)z), (short)CHAR_MIN, (short) CHAR_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF uchar mad_sat(uchar x, uchar y, uchar z) {
+ return clamp((ushort)mad24((ushort)x, (ushort)y, (ushort)z), (ushort)0, (ushort) UCHAR_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF short mad_sat(short x, short y, short z) {
+ return clamp((int)mad24((int)x, (int)y, (int)z), (int)SHRT_MIN, (int) SHRT_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF ushort mad_sat(ushort x, ushort y, ushort z) {
+ return clamp((uint)mad24((uint)x, (uint)y, (uint)z), (uint)0, (uint) USHRT_MAX);
+}
+
+_CLC_OVERLOAD _CLC_DEF int mad_sat(int x, int y, int z) {
+ int mhi = mul_hi(x, y);
+ uint mlo = x * y;
+ long m = upsample(mhi, mlo);
+ m += z;
+ if (m > INT_MAX)
+ return INT_MAX;
+ if (m < INT_MIN)
+ return INT_MIN;
+ return m;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint mad_sat(uint x, uint y, uint z) {
+ if (mul_hi(x, y) != 0)
+ return UINT_MAX;
+ return add_sat(x * y, z);
+}
+
+_CLC_OVERLOAD _CLC_DEF long mad_sat(long x, long y, long z) {
+ long hi = mul_hi(x, y);
+ ulong ulo = x * y;
+ long slo = x * y;
+ /* Big overflow of more than 2 bits, add can't fix this */
+ if (((x < 0) == (y < 0)) && hi != 0)
+ return LONG_MAX;
+ /* Low overflow in mul and z not neg enough to correct it */
+ if (hi == 0 && ulo >= LONG_MAX && (z > 0 || (ulo + z) > LONG_MAX))
+ return LONG_MAX;
+ /* Big overflow of more than 2 bits, add can't fix this */
+ if (((x < 0) != (y < 0)) && hi != -1)
+ return LONG_MIN;
+ /* Low overflow in mul and z not pos enough to correct it */
+ if (hi == -1 && ulo <= ((ulong)LONG_MAX + 1UL) && (z < 0 || z < (LONG_MAX - ulo)))
+ return LONG_MIN;
+ /* We have checked all conditions, any overflow in addtion returns

s/addtion/addition/

The code returns all test passes on evergreen using the piglit unit
tests for mad_sat (all data types) after I applied your LLVM
workaround.

Otherwise, I'm trying to find out if there's any redundant boolean
logic in the above bits. Especially I'm wondering if we need to keep
checking if hi is greater than, less than, equal to, or not equal to 0
and -1, or if we can just pull that part out to a quick overflow test
at the beginning.

Either way, I've successfully tested this version of the code with
your LLVM FlattenCFG.cpp patch and gotten successful unit test passes
on CEDAR (Radeon 5400). I believe that radeonsi will probably still
fail due to the ulong instruction selection issue that I noted
yesterday (unless the FlattenCFG change also affects this in a
slightly different way), but that doesn't seem like an issue with this
patch so much as the back-end.

--Aaron

What operation is not selecting? I thought most of those were taken care of already

I've attached the bitcode and resulting LLVM Error from the mad_sat
ulong2 test kernel.

The kernel source is:
kernel void test_2_mad_sat_ulong(global ulong* out, global ulong* in0,
global ulong* in1, global ulong* in2){
  vstore2(mad_sat(vload2(0, in0), vload2(0, in1), vload2(0, in2)), 0, out);
}

Note that it's likely that mad_sat is fine, and the mul_hi and/or
add_sat call embedded in mad_sat is actually where the issue is
generated.

mad_sat_ulong2_test.txt (6.79 KB)

OK, I think this is because of the 64-bit ands in control flow which currently blocks selecting the scalar version. With the current workarounds for SGPRs and control flow, a 64-bit VALU and pattern needs to be added (which I recall seeing a patch for recently)

> v2: Fix trailing whitespace
> Fix signed long overflow
> improve comment
>
> Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
> ---
>
> NOTE: Use http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20140804/229312.html to avoid llvm segfault
>
> generic/include/clc/clc.h | 1 +
> generic/include/clc/integer/mad_sat.h | 3 ++
> generic/include/clc/integer/mad_sat.inc | 1 +
> generic/lib/SOURCES | 1 +
> generic/lib/clcmacro.h | 22 ++++++++++
> generic/lib/integer/mad_sat.cl | 72 +++++++++++++++++++++++++++++++++
> 6 files changed, 100 insertions(+)
> create mode 100644 generic/include/clc/integer/mad_sat.h
> create mode 100644 generic/include/clc/integer/mad_sat.inc
> create mode 100644 generic/lib/integer/mad_sat.cl
>
> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> index 9815c56..aca9b53 100644
> --- a/generic/include/clc/clc.h
> +++ b/generic/include/clc/clc.h
> @@ -82,6 +82,7 @@
> #include <clc/integer/hadd.h>
> #include <clc/integer/mad24.h>
> #include <clc/integer/mad_hi.h>
> +#include <clc/integer/mad_sat.h>
> #include <clc/integer/mul24.h>
> #include <clc/integer/mul_hi.h>
> #include <clc/integer/rhadd.h>
> diff --git a/generic/include/clc/integer/mad_sat.h b/generic/include/clc/integer/mad_sat.h
> new file mode 100644
> index 0000000..3e92372
> --- /dev/null
> +++ b/generic/include/clc/integer/mad_sat.h
> @@ -0,0 +1,3 @@
> +#define __CLC_BODY <clc/integer/mad_sat.inc>
> +#include <clc/integer/gentype.inc>
> +#undef __CLC_BODY
> diff --git a/generic/include/clc/integer/mad_sat.inc b/generic/include/clc/integer/mad_sat.inc
> new file mode 100644
> index 0000000..5da2bdf
> --- /dev/null
> +++ b/generic/include/clc/integer/mad_sat.inc
> @@ -0,0 +1 @@
> +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad_sat(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z);
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index bfdec7b..7d3fa6b 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -19,6 +19,7 @@ integer/clz_if.ll
> integer/clz_impl.ll
> integer/hadd.cl
> integer/mad24.cl
> +integer/mad_sat.cl
> integer/mul24.cl
> integer/mul_hi.cl
> integer/rhadd.cl
> diff --git a/generic/lib/clcmacro.h b/generic/lib/clcmacro.h
> index 730073a..ef102ea 100644
> --- a/generic/lib/clcmacro.h
> +++ b/generic/lib/clcmacro.h
> @@ -41,6 +41,28 @@
> return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
> }
>
> +#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \
> + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, ARG3_TYPE##2 z) { \
> + return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, ARG3_TYPE##3 z) { \
> + return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \
> + FUNCTION(x.z, y.z, z.z)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, ARG3_TYPE##4 z) { \
> + return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, ARG3_TYPE##8 z) { \
> + return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
> + return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
> + }
> +
> #define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
> _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
> return BUILTIN(x, y); \
> diff --git a/generic/lib/integer/mad_sat.cl b/generic/lib/integer/mad_sat.cl
> new file mode 100644
> index 0000000..7048931
> --- /dev/null
> +++ b/generic/lib/integer/mad_sat.cl
> @@ -0,0 +1,72 @@
> +#include <clc/clc.h>
> +#include "../clcmacro.h"
> +
> +_CLC_OVERLOAD _CLC_DEF char mad_sat(char x, char y, char z) {
> + return clamp((short)mad24((short)x, (short)y, (short)z), (short)CHAR_MIN, (short) CHAR_MAX);
> +}
> +
> +_CLC_OVERLOAD _CLC_DEF uchar mad_sat(uchar x, uchar y, uchar z) {
> + return clamp((ushort)mad24((ushort)x, (ushort)y, (ushort)z), (ushort)0, (ushort) UCHAR_MAX);
> +}
> +
> +_CLC_OVERLOAD _CLC_DEF short mad_sat(short x, short y, short z) {
> + return clamp((int)mad24((int)x, (int)y, (int)z), (int)SHRT_MIN, (int) SHRT_MAX);
> +}
> +
> +_CLC_OVERLOAD _CLC_DEF ushort mad_sat(ushort x, ushort y, ushort z) {
> + return clamp((uint)mad24((uint)x, (uint)y, (uint)z), (uint)0, (uint) USHRT_MAX);
> +}
> +
> +_CLC_OVERLOAD _CLC_DEF int mad_sat(int x, int y, int z) {
> + int mhi = mul_hi(x, y);
> + uint mlo = x * y;
> + long m = upsample(mhi, mlo);
> + m += z;
> + if (m > INT_MAX)
> + return INT_MAX;
> + if (m < INT_MIN)
> + return INT_MIN;
> + return m;
> +}
> +
> +_CLC_OVERLOAD _CLC_DEF uint mad_sat(uint x, uint y, uint z) {
> + if (mul_hi(x, y) != 0)
> + return UINT_MAX;
> + return add_sat(x * y, z);
> +}
> +
> +_CLC_OVERLOAD _CLC_DEF long mad_sat(long x, long y, long z) {
> + long hi = mul_hi(x, y);
> + ulong ulo = x * y;
> + long slo = x * y;
> + /* Big overflow of more than 2 bits, add can't fix this */
> + if (((x < 0) == (y < 0)) && hi != 0)
> + return LONG_MAX;
> + /* Low overflow in mul and z not neg enough to correct it */
> + if (hi == 0 && ulo >= LONG_MAX && (z > 0 || (ulo + z) > LONG_MAX))
> + return LONG_MAX;
> + /* Big overflow of more than 2 bits, add can't fix this */
> + if (((x < 0) != (y < 0)) && hi != -1)
> + return LONG_MIN;
> + /* Low overflow in mul and z not pos enough to correct it */
> + if (hi == -1 && ulo <= ((ulong)LONG_MAX + 1UL) && (z < 0 || z < (LONG_MAX - ulo)))
> + return LONG_MIN;
> + /* We have checked all conditions, any overflow in addtion returns

s/addtion/addition/

fixed in v3

The code returns all test passes on evergreen using the piglit unit
tests for mad_sat (all data types) after I applied your LLVM
workaround.

Otherwise, I'm trying to find out if there's any redundant boolean
logic in the above bits. Especially I'm wondering if we need to keep
checking if hi is greater than, less than, equal to, or not equal to 0
and -1, or if we can just pull that part out to a quick overflow test
at the beginning.

I've tried to combine the MIN/MAX branches, but the problem is that not
all overflows are equal. Some sign bit overflows (hi is 0 or -1), can be
'corrected' by the follow up addition, so I decided to make the code
more readable and keep the cases separate.

Either way, I've successfully tested this version of the code with
your LLVM FlattenCFG.cpp patch and gotten successful unit test passes
on CEDAR (Radeon 5400). I believe that radeonsi will probably still
fail due to the ulong instruction selection issue that I noted
yesterday (unless the FlattenCFG change also affects this in a
slightly different way), but that doesn't seem like an issue with this
patch so much as the back-end.

thanks for testing,
jan

Hi Jan,

I noticed that this isn't in yet (probably because the underlying LLVM
issue is still being worked on).

I found another issue while running a modified piglit run for my final
vload/vstore patch. I had enabled 3-element vectors in piglit to
fully test the vload3/vstore3 changes, and it turns out that mad_sat
doesn't currently support 3-element vectors.

The generic/lib/integer/mad_sat implementation defines mad_sat with
3-element vector types, but the
generic/include/clc/integer/integer-gentype.inc file doesn't define
int3 or uint3. This manifests itself as a crash when compiling
kernels that use mad_sat(int3, int3, int3). I suspect mul24 and mad24
also are similarly affected (and I can check that if needed when I
have some time).

Given that 3-element vectors were added in OpenCL 1.1, and the
libclc.llvm.org page lists libclc as supporting CL 1.1, we should
probably just add that definition in to that file while we're here.

--Aaron

Hi Aaron,

Hi Jan,

I noticed that this isn't in yet (probably because the underlying LLVM
issue is still being worked on).

The segfault issue is fixed in r215574. I don't know about the SI AND
select issue. I plan to post v3 when I return from the holiday in Sept.
the only difference is from v2 the typo you spotted.
I thought it would be good if someone could confirm that it works with
backend other than r600 (SI or nvptx).

I found another issue while running a modified piglit run for my final
vload/vstore patch. I had enabled 3-element vectors in piglit to
fully test the vload3/vstore3 changes, and it turns out that mad_sat
doesn't currently support 3-element vectors.

The generic/lib/integer/mad_sat implementation defines mad_sat with
3-element vector types, but the
generic/include/clc/integer/integer-gentype.inc file doesn't define
int3 or uint3. This manifests itself as a crash when compiling
kernels that use mad_sat(int3, int3, int3). I suspect mul24 and mad24
also are similarly affected (and I can check that if needed when I
have some time).
Given that 3-element vectors were added in OpenCL 1.1, and the
libclc.llvm.org page lists libclc as supporting CL 1.1, we should
probably just add that definition in to that file while we're here.

Can you be more specific about the error? I can't reproduce it locally.
simple kernel:
"__kernel void mad_sat_test( \n" \
" __global uint3* input1, \n" \
" __global uint3* input2, \n" \
" __global uint3* input3, \n" \
" __global uint3* output) \n" \
"{ \n" \
" int i = get_global_id(0); \n" \
" output[i] = mad_sat(input1[i], input2[i], input3[i]); \n" \
"} \n" \
"\n";

compiles and runs ok on my test machine. (you can try mad_sat test from
[0]).
mad_sat does not use integer/integer-gentype.inc but rather
integer/gentype.inc, which includes (u)int3.

I think (u)int3 missing from integer-gentype.inc is a separate problem
and should only affect mul24 and mad24.

regards,
Jan

[0] https://github.com/jvesely/ocl_tests

Ack... Just sent a response to Jan without reply-all... Short
summary: I was wrong and mad_sat doesn't suffer from the int3/uint3
issue... It's just mul24/mad24 which are affected. I'll be following
up with a patch once I've given it a full piglit test run.

--Aaron