[PATCH 1/2] math: Implement erfc

Signed-off-by: Aaron Watry <awatry@gmail.com>

Signed-off-by: Aaron Watry <awatry@gmail.com>

Signed-off-by: Aaron Watry <awatry@gmail.com>
---
generic/include/clc/clc.h | 1 +
generic/include/clc/math/binary_decl.inc | 18 ++++-
generic/include/clc/math/gentype_tss.inc | 108 +++++++++++++++++++++++++++++
generic/include/clc/math/ldexp.h | 9 +++
generic/lib/SOURCES | 1 +
generic/lib/clcmacro.h | 26 +++++++
generic/lib/math/ldexp.cl | 114 +++++++++++++++++++++++++++++++

I just sent out a patch implementing an optimized ldexp for R600/SI.
I was able to create the declarations and definitions without having to
modify too many of the *.inc files or add new macros to clcmacro.h.

I think we should try a similar approach for the generic version.

7 files changed, 274 insertions(+), 3 deletions(-)
create mode 100644 generic/include/clc/math/gentype_tss.inc
create mode 100644 generic/include/clc/math/ldexp.h
create mode 100644 generic/lib/math/ldexp.cl

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 1c12cf3..ecabcf1 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -51,6 +51,7 @@
#include <clc/math/fmin.h>
#include <clc/math/fmod.h>
#include <clc/math/hypot.h>
+#include <clc/math/ldexp.h>
#include <clc/math/log.h>
#include <clc/math/log10.h>
#include <clc/math/log1p.h>
diff --git a/generic/include/clc/math/binary_decl.inc b/generic/include/clc/math/binary_decl.inc
index 70a7114..1805527 100644
--- a/generic/include/clc/math/binary_decl.inc
+++ b/generic/include/clc/math/binary_decl.inc
@@ -1,6 +1,18 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b);
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, float b);
+#ifdef __CLC_INT_GENTYPE
+ #if !defined(__CLC_SCALAR)
+ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_INT_GENTYPE b);
+ #endif
+ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_SCALAR_INT_TYPE b);
+
+#else
+
+#if !defined(__CLC_SCALAR)
+ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b);
+#endif
+ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, float b);

#ifdef cl_khr_fp64
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, double b);
+ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, double b);
+#endif
+
#endif
diff --git a/generic/include/clc/math/gentype_tss.inc b/generic/include/clc/math/gentype_tss.inc
new file mode 100644
index 0000000..11ec9ff
--- /dev/null
+++ b/generic/include/clc/math/gentype_tss.inc
@@ -0,0 +1,108 @@
+/* Used to provide support for multi-arg functions where the argument types and/or sizes do NOT match
+ *
+ * e.g. ldexp(float16,int), ldexp(float16,int16), ldexp(double8, int)
+ *
+ * In general, consumers of this include will probably have versions with a vector first argument, and then
+ * vector/scalar 2nd argument which may have an entirely different base type.
+ */
+
+#define __CLC_SCALAR_GENTYPE float
+#ifndef __CLC_SCALAR_INT_TYPE
+ #define __CLC_UNDEF_SCALAR_INT_TYPE
+ #define __CLC_SCALAR_INT_TYPE int
+#endif
+#define __CLC_FPSIZE 32
+
+#define __CLC_GENTYPE float
+#define __CLC_INT_GENTYPE int
+#define __CLC_SCALAR
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+#undef __CLC_SCALAR
+
+#define __CLC_GENTYPE float2
+#define __CLC_INT_GENTYPE int2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE float3
+#define __CLC_INT_GENTYPE int3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE float4
+#define __CLC_INT_GENTYPE int4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE float8
+#define __CLC_INT_GENTYPE int8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE float16
+#define __CLC_INT_GENTYPE int16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#undef __CLC_FPSIZE
+#undef __CLC_SCALAR_GENTYPE
+
+#ifdef cl_khr_fp64
+#define __CLC_SCALAR_GENTYPE double
+#define __CLC_FPSIZE 64
+
+#define __CLC_SCALAR
+#define __CLC_GENTYPE double
+#define __CLC_INT_GENTYPE int
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_SCALAR
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE double2
+#define __CLC_INT_GENTYPE int2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE double3
+#define __CLC_INT_GENTYPE int3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE double4
+#define __CLC_INT_GENTYPE int4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE double8
+#define __CLC_INT_GENTYPE int8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#define __CLC_GENTYPE double16
+#define __CLC_INT_GENTYPE int16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_INT_GENTYPE
+
+#undef __CLC_FPSIZE
+#undef __CLC_SCALAR_GENTYPE
+#endif
+
+#ifdef __CLC_UNDEF_SCALAR_INT_TYPE
+ #undef __CLC_SCALAR_INT_TYPE
+ #undef __CLC_UNDEF_SCALAR_INT_TYPE
+#endif
+
+#undef __CLC_BODY
diff --git a/generic/include/clc/math/ldexp.h b/generic/include/clc/math/ldexp.h
new file mode 100644
index 0000000..2e3b502
--- /dev/null
+++ b/generic/include/clc/math/ldexp.h
@@ -0,0 +1,9 @@
+#define __CLC_BODY <clc/math/binary_decl.inc>
+#define __CLC_FUNCTION ldexp
+
+#include <clc/math/gentype_tss.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#undef __CLC_ARG2_BASE_TYPE
\ No newline at end of file
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 0110e15..be6865e 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -70,6 +70,7 @@ math/fmax.cl
math/fmin.cl
math/fmod.cl
math/hypot.cl
+math/ldexp.cl
math/log10.cl
math/log1p.cl
math/mad.cl
diff --git a/generic/lib/clcmacro.h b/generic/lib/clcmacro.h
index 346adf2..3f389e5 100644
--- a/generic/lib/clcmacro.h
+++ b/generic/lib/clcmacro.h
@@ -41,6 +41,28 @@
     return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
   }

+#define _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
+ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE y) { \
+ return (RET_TYPE##2)(FUNCTION(x.x, y), FUNCTION(x.y, y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE y) { \
+ return (RET_TYPE##3)(FUNCTION(x.x, y), FUNCTION(x.y, y), \
+ FUNCTION(x.z, y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE y) { \
+ return (RET_TYPE##4)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE y) { \
+ return (RET_TYPE##8)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
+ } \
+\
+ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE y) { \
+ return (RET_TYPE##16)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
+ }
+
#define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \
   DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
     return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
@@ -115,6 +137,10 @@ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
} \
_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)

+#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
+_CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
+_CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
+
#define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE) \
_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { \
   return BUILTIN(x); \
diff --git a/generic/lib/math/ldexp.cl b/generic/lib/math/ldexp.cl
new file mode 100644
index 0000000..b7c5a92
--- /dev/null
+++ b/generic/lib/math/ldexp.cl
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../clcmacro.h"
+#include "math.h"
+
+_CLC_DEF _CLC_OVERLOAD float ldexp(float x, int n) {

I think this is where we need to integrate with my subnormal helper
patches to add denormal flushing. I can give this a try if you want.

-Tom

Signed-off-by: Aaron Watry <awatry@gmail.com>
---
generic/include/clc/clc.h | 1 +
generic/include/clc/math/erfc.h | 9 +
generic/lib/SOURCES | 1 +
generic/lib/math/erfc.cl | 413 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 424 insertions(+)
create mode 100644 generic/include/clc/math/erfc.h
create mode 100644 generic/lib/math/erfc.cl

LGTM.

> Signed-off-by: Aaron Watry <awatry@gmail.com>
> ---
> generic/include/clc/clc.h | 1 +
> generic/include/clc/math/binary_decl.inc | 18 ++++-
> generic/include/clc/math/gentype_tss.inc | 108
+++++++++++++++++++++++++++++
> generic/include/clc/math/ldexp.h | 9 +++
> generic/lib/SOURCES | 1 +
> generic/lib/clcmacro.h | 26 +++++++
> generic/lib/math/ldexp.cl | 114
+++++++++++++++++++++++++++++++

I just sent out a patch implementing an optimized ldexp for R600/SI.
I was able to create the declarations and definitions without having to
modify too many of the *.inc files or add new macros to clcmacro.h.

I think we should try a similar approach for the generic version.

Fair enough. We might as well piggy-back on what you did for the R600/SI
version.

> 7 files changed, 274 insertions(+), 3 deletions(-)
> create mode 100644 generic/include/clc/math/gentype_tss.inc
> create mode 100644 generic/include/clc/math/ldexp.h
> create mode 100644 generic/lib/math/ldexp.cl
>
> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> index 1c12cf3..ecabcf1 100644
> --- a/generic/include/clc/clc.h
> +++ b/generic/include/clc/clc.h
> @@ -51,6 +51,7 @@
> #include <clc/math/fmin.h>
> #include <clc/math/fmod.h>
> #include <clc/math/hypot.h>
> +#include <clc/math/ldexp.h>
> #include <clc/math/log.h>
> #include <clc/math/log10.h>
> #include <clc/math/log1p.h>
> diff --git a/generic/include/clc/math/binary_decl.inc
b/generic/include/clc/math/binary_decl.inc
> index 70a7114..1805527 100644
> --- a/generic/include/clc/math/binary_decl.inc
> +++ b/generic/include/clc/math/binary_decl.inc
> @@ -1,6 +1,18 @@
> -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
__CLC_GENTYPE b);
> -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
float b);
> +#ifdef __CLC_INT_GENTYPE
> + #if !defined(__CLC_SCALAR)
> + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE
a, __CLC_INT_GENTYPE b);
> + #endif
> + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE
a, __CLC_SCALAR_INT_TYPE b);
> +
> +#else
> +
> +#if !defined(__CLC_SCALAR)
> + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
__CLC_GENTYPE b);
> +#endif
> + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
float b);
>
> #ifdef cl_khr_fp64
> -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
double b);
> + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
double b);
> +#endif
> +
> #endif
> diff --git a/generic/include/clc/math/gentype_tss.inc
b/generic/include/clc/math/gentype_tss.inc
> new file mode 100644
> index 0000000..11ec9ff
> --- /dev/null
> +++ b/generic/include/clc/math/gentype_tss.inc
> @@ -0,0 +1,108 @@
> +/* Used to provide support for multi-arg functions where the argument
types and/or sizes do NOT match
> + *
> + * e.g. ldexp(float16,int), ldexp(float16,int16), ldexp(double8, int)
> + *
> + * In general, consumers of this include will probably have versions
with a vector first argument, and then
> + * vector/scalar 2nd argument which may have an entirely different base
type.
> + */
> +
> +#define __CLC_SCALAR_GENTYPE float
> +#ifndef __CLC_SCALAR_INT_TYPE
> + #define __CLC_UNDEF_SCALAR_INT_TYPE
> + #define __CLC_SCALAR_INT_TYPE int
> +#endif
> +#define __CLC_FPSIZE 32
> +
> +#define __CLC_GENTYPE float
> +#define __CLC_INT_GENTYPE int
> +#define __CLC_SCALAR
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +#undef __CLC_SCALAR
> +
> +#define __CLC_GENTYPE float2
> +#define __CLC_INT_GENTYPE int2
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE float3
> +#define __CLC_INT_GENTYPE int3
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE float4
> +#define __CLC_INT_GENTYPE int4
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE float8
> +#define __CLC_INT_GENTYPE int8
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE float16
> +#define __CLC_INT_GENTYPE int16
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#undef __CLC_FPSIZE
> +#undef __CLC_SCALAR_GENTYPE
> +
> +#ifdef cl_khr_fp64
> +#define __CLC_SCALAR_GENTYPE double
> +#define __CLC_FPSIZE 64
> +
> +#define __CLC_SCALAR
> +#define __CLC_GENTYPE double
> +#define __CLC_INT_GENTYPE int
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_SCALAR
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE double2
> +#define __CLC_INT_GENTYPE int2
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE double3
> +#define __CLC_INT_GENTYPE int3
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE double4
> +#define __CLC_INT_GENTYPE int4
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE double8
> +#define __CLC_INT_GENTYPE int8
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#define __CLC_GENTYPE double16
> +#define __CLC_INT_GENTYPE int16
> +#include __CLC_BODY
> +#undef __CLC_GENTYPE
> +#undef __CLC_INT_GENTYPE
> +
> +#undef __CLC_FPSIZE
> +#undef __CLC_SCALAR_GENTYPE
> +#endif
> +
> +#ifdef __CLC_UNDEF_SCALAR_INT_TYPE
> + #undef __CLC_SCALAR_INT_TYPE
> + #undef __CLC_UNDEF_SCALAR_INT_TYPE
> +#endif
> +
> +#undef __CLC_BODY
> diff --git a/generic/include/clc/math/ldexp.h
b/generic/include/clc/math/ldexp.h
> new file mode 100644
> index 0000000..2e3b502
> --- /dev/null
> +++ b/generic/include/clc/math/ldexp.h
> @@ -0,0 +1,9 @@
> +#define __CLC_BODY <clc/math/binary_decl.inc>
> +#define __CLC_FUNCTION ldexp
> +
> +#include <clc/math/gentype_tss.inc>
> +
> +#undef __CLC_BODY
> +#undef __CLC_FUNCTION
> +
> +#undef __CLC_ARG2_BASE_TYPE
> \ No newline at end of file
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index 0110e15..be6865e 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -70,6 +70,7 @@ math/fmax.cl
> math/fmin.cl
> math/fmod.cl
> math/hypot.cl
> +math/ldexp.cl
> math/log10.cl
> math/log1p.cl
> math/mad.cl
> diff --git a/generic/lib/clcmacro.h b/generic/lib/clcmacro.h
> index 346adf2..3f389e5 100644
> --- a/generic/lib/clcmacro.h
> +++ b/generic/lib/clcmacro.h
> @@ -41,6 +41,28 @@
> return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
> }
>
> +#define _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(DECLSPEC, RET_TYPE,
FUNCTION, ARG1_TYPE, ARG2_TYPE) \
> + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE y) { \
> + return (RET_TYPE##2)(FUNCTION(x.x, y), FUNCTION(x.y, y)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE y) { \
> + return (RET_TYPE##3)(FUNCTION(x.x, y), FUNCTION(x.y, y), \
> + FUNCTION(x.z, y)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE y) { \
> + return (RET_TYPE##4)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE y) { \
> + return (RET_TYPE##8)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> + } \
> +\
> + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE y) { \
> + return (RET_TYPE##16)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> + }
> +
> #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,
ARG2_TYPE) \
> DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
> return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
> @@ -115,6 +137,10 @@ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE
x, ARG2_TYPE y) { \
> } \
> _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION,
ARG1_TYPE, ARG2_TYPE)
>
> +#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(RET_TYPE,
FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
> +_CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,
ARG2_TYPE) \
> +_CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF,
RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
> +
> #define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN,
ARG1_TYPE) \
> _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { \
> return BUILTIN(x); \
> diff --git a/generic/lib/math/ldexp.cl b/generic/lib/math/ldexp.cl
> new file mode 100644
> index 0000000..b7c5a92
> --- /dev/null
> +++ b/generic/lib/math/ldexp.cl
> @@ -0,0 +1,114 @@
> +/*
> + * Copyright (c) 2014 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person
obtaining a copy
> + * of this software and associated documentation files (the
"Software"), to deal
> + * in the Software without restriction, including without limitation
the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE
> + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include <clc/clc.h>
> +#include "../clcmacro.h"
> +#include "math.h"
> +
> +_CLC_DEF _CLC_OVERLOAD float ldexp(float x, int n) {

I think this is where we need to integrate with my subnormal helper
patches to add denormal flushing. I can give this a try if you want.

Go for it. I did notice that your denormal patch always reports false for
16/32-bit float denormal support, so float ldexp will always fall into the
same code path regardless. Only the double versions call into llvm to
check whether the hardware supports subnormals.

Bit of a question: If the hardware supports subnormals, do we just not
need the subnormal support code?

I noticed in quite a few of the AMD builtins that subnormal support is
explicitly tested for, but in the ldexp case, the code below is straight
from the AMD built-in (it has no check for whether the hardware supports
subnormals). I guess it's possible that the ldexp implementation
was done in such a way that it didn't matter if the hardware supports
subnormals or not, and we're missing out on possible performance by
always running the code as written when we have hardware with subnormal
support.

In summary, floating point isn't my strongest area, and without Wikipedia,
Wolfram, and other sites to explain some of these algorithms, I'd be
hopeless :)

--Aaron

-Tom

> > Signed-off-by: Aaron Watry <awatry@gmail.com>
> > ---
> > generic/include/clc/clc.h | 1 +
> > generic/include/clc/math/binary_decl.inc | 18 ++++-
> > generic/include/clc/math/gentype_tss.inc | 108
> +++++++++++++++++++++++++++++
> > generic/include/clc/math/ldexp.h | 9 +++
> > generic/lib/SOURCES | 1 +
> > generic/lib/clcmacro.h | 26 +++++++
> > generic/lib/math/ldexp.cl | 114
> +++++++++++++++++++++++++++++++
>
> I just sent out a patch implementing an optimized ldexp for R600/SI.
> I was able create the declarations and definitions without having to
> modify too many of the *.inc files or adding new macros to clcmacro.h
>
> I think we should try a similar approach for the generic version.
>
>
Fair enough. We might as well piggy-back on what you did for the R600/SI
version.

> > 7 files changed, 274 insertions(+), 3 deletions(-)
> > create mode 100644 generic/include/clc/math/gentype_tss.inc
> > create mode 100644 generic/include/clc/math/ldexp.h
> > create mode 100644 generic/lib/math/ldexp.cl
> >
> > diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> > index 1c12cf3..ecabcf1 100644
> > --- a/generic/include/clc/clc.h
> > +++ b/generic/include/clc/clc.h
> > @@ -51,6 +51,7 @@
> > #include <clc/math/fmin.h>
> > #include <clc/math/fmod.h>
> > #include <clc/math/hypot.h>
> > +#include <clc/math/ldexp.h>
> > #include <clc/math/log.h>
> > #include <clc/math/log10.h>
> > #include <clc/math/log1p.h>
> > diff --git a/generic/include/clc/math/binary_decl.inc
> b/generic/include/clc/math/binary_decl.inc
> > index 70a7114..1805527 100644
> > --- a/generic/include/clc/math/binary_decl.inc
> > +++ b/generic/include/clc/math/binary_decl.inc
> > @@ -1,6 +1,18 @@
> > -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
> __CLC_GENTYPE b);
> > -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
> float b);
> > +#ifdef __CLC_INT_GENTYPE
> > + #if !defined(__CLC_SCALAR)
> > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE
> a, __CLC_INT_GENTYPE b);
> > + #endif
> > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE
> a, __CLC_SCALAR_INT_TYPE b);
> > +
> > +#else
> > +
> > +#if !defined(__CLC_SCALAR)
> > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
> __CLC_GENTYPE b);
> > +#endif
> > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
> float b);
> >
> > #ifdef cl_khr_fp64
> > -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
> double b);
> > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
> double b);
> > +#endif
> > +
> > #endif
> > diff --git a/generic/include/clc/math/gentype_tss.inc
> b/generic/include/clc/math/gentype_tss.inc
> > new file mode 100644
> > index 0000000..11ec9ff
> > --- /dev/null
> > +++ b/generic/include/clc/math/gentype_tss.inc
> > @@ -0,0 +1,108 @@
> > +/* Used to provide support for multi-arg functions where the argument
> types and/or sizes do NOT match
> > + *
> > + * e.g. ldexp(float16,int), ldexp(float16,int16), ldexp(double8, int)
> > + *
> > + * In general, consumers of this include will probably have versions
> with a vector first argument, and then
> > + * vector/scalar 2nd argument which may have an entirely different base
> type.
> > + */
> > +
> > +#define __CLC_SCALAR_GENTYPE float
> > +#ifndef __CLC_SCALAR_INT_TYPE
> > + #define __CLC_UNDEF_SCALAR_INT_TYPE
> > + #define __CLC_SCALAR_INT_TYPE int
> > +#endif
> > +#define __CLC_FPSIZE 32
> > +
> > +#define __CLC_GENTYPE float
> > +#define __CLC_INT_GENTYPE int
> > +#define __CLC_SCALAR
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +#undef __CLC_SCALAR
> > +
> > +#define __CLC_GENTYPE float2
> > +#define __CLC_INT_GENTYPE int2
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE float3
> > +#define __CLC_INT_GENTYPE int3
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE float4
> > +#define __CLC_INT_GENTYPE int4
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE float8
> > +#define __CLC_INT_GENTYPE int8
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE float16
> > +#define __CLC_INT_GENTYPE int16
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#undef __CLC_FPSIZE
> > +#undef __CLC_SCALAR_GENTYPE
> > +
> > +#ifdef cl_khr_fp64
> > +#define __CLC_SCALAR_GENTYPE double
> > +#define __CLC_FPSIZE 64
> > +
> > +#define __CLC_SCALAR
> > +#define __CLC_GENTYPE double
> > +#define __CLC_INT_GENTYPE int
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_SCALAR
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE double2
> > +#define __CLC_INT_GENTYPE int2
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE double3
> > +#define __CLC_INT_GENTYPE int3
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE double4
> > +#define __CLC_INT_GENTYPE int4
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE double8
> > +#define __CLC_INT_GENTYPE int8
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#define __CLC_GENTYPE double16
> > +#define __CLC_INT_GENTYPE int16
> > +#include __CLC_BODY
> > +#undef __CLC_GENTYPE
> > +#undef __CLC_INT_GENTYPE
> > +
> > +#undef __CLC_FPSIZE
> > +#undef __CLC_SCALAR_GENTYPE
> > +#endif
> > +
> > +#ifdef __CLC_UNDEF_SCALAR_INT_TYPE
> > + #undef __CLC_SCALAR_INT_TYPE
> > + #undef __CLC_UNDEF_SCALAR_INT_TYPE
> > +#endif
> > +
> > +#undef __CLC_BODY
> > diff --git a/generic/include/clc/math/ldexp.h
> b/generic/include/clc/math/ldexp.h
> > new file mode 100644
> > index 0000000..2e3b502
> > --- /dev/null
> > +++ b/generic/include/clc/math/ldexp.h
> > @@ -0,0 +1,9 @@
> > +#define __CLC_BODY <clc/math/binary_decl.inc>
> > +#define __CLC_FUNCTION ldexp
> > +
> > +#include <clc/math/gentype_tss.inc>
> > +
> > +#undef __CLC_BODY
> > +#undef __CLC_FUNCTION
> > +
> > +#undef __CLC_ARG2_BASE_TYPE
> > \ No newline at end of file
> > diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> > index 0110e15..be6865e 100644
> > --- a/generic/lib/SOURCES
> > +++ b/generic/lib/SOURCES
> > @@ -70,6 +70,7 @@ math/fmax.cl
> > math/fmin.cl
> > math/fmod.cl
> > math/hypot.cl
> > +math/ldexp.cl
> > math/log10.cl
> > math/log1p.cl
> > math/mad.cl
> > diff --git a/generic/lib/clcmacro.h b/generic/lib/clcmacro.h
> > index 346adf2..3f389e5 100644
> > --- a/generic/lib/clcmacro.h
> > +++ b/generic/lib/clcmacro.h
> > @@ -41,6 +41,28 @@
> > return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
> > }
> >
> > +#define _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(DECLSPEC, RET_TYPE,
> FUNCTION, ARG1_TYPE, ARG2_TYPE) \
> > + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE y) { \
> > + return (RET_TYPE##2)(FUNCTION(x.x, y), FUNCTION(x.y, y)); \
> > + } \
> > +\
> > + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE y) { \
> > + return (RET_TYPE##3)(FUNCTION(x.x, y), FUNCTION(x.y, y), \
> > + FUNCTION(x.z, y)); \
> > + } \
> > +\
> > + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE y) { \
> > + return (RET_TYPE##4)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> > + } \
> > +\
> > + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE y) { \
> > + return (RET_TYPE##8)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> > + } \
> > +\
> > + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE y) { \
> > + return (RET_TYPE##16)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> > + }
> > +
> > #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,
> ARG2_TYPE) \
> > DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
> > return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
> > @@ -115,6 +137,10 @@ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE
> x, ARG2_TYPE y) { \
> > } \
> > _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION,
> ARG1_TYPE, ARG2_TYPE)
> >
> > +#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(RET_TYPE,
> FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
> > +_CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,
> ARG2_TYPE) \
> > +_CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF,
> RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
> > +
> > #define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN,
> ARG1_TYPE) \
> > _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { \
> > return BUILTIN(x); \
> > diff --git a/generic/lib/math/ldexp.cl b/generic/lib/math/ldexp.cl
> > new file mode 100644
> > index 0000000..b7c5a92
> > --- /dev/null
> > +++ b/generic/lib/math/ldexp.cl
> > @@ -0,0 +1,114 @@
> > +/*
> > + * Copyright (c) 2014 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person
> obtaining a copy
> > + * of this software and associated documentation files (the
> "Software"), to deal
> > + * in the Software without restriction, including without limitation
> the rights
> > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or
> sell
> > + * copies of the Software, and to permit persons to whom the Software is
> > + * furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be
> included in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
> SHALL THE
> > + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING FROM,
> > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> DEALINGS IN
> > + * THE SOFTWARE.
> > + */
> > +
> > +#include <clc/clc.h>
> > +#include "../clcmacro.h"
> > +#include "math.h"
> > +
> > +_CLC_DEF _CLC_OVERLOAD float ldexp(float x, int n) {
>
> I think this is where we need to integrate with my subnormal helper
> patches to add denormal flushing. I can give this a try if you want.
>
>
Go for it. I did notice that your denormal patch always reports false for
16/32-bit float denormal support, so float ldexp will always fall into the
same code path regardless. Only the double versions call into llvm to
check whether the hardware supports subnormals.

My patch just implemented the defaults required by the OpenCL spec. Targets
will be able to override these if they want.

Bit of a question: If the hardware supports subnormals, do we just not
need the subnormal support code?

We are not required by the spec to include the subnormal support code if
subnormals are supported. However, if we want to support the
-cl-denorms-are-zero flag, we will need the subnormal support code.

Note that the spec does not require us to do anything when this flag is
passed, so supporting this flag is not required for compliance; it is just
for enabling optimizations in some cases.
Supporting this flag is very low priority for me.

I noticed in quite a few of the amd builtins that subnormal support is
explicitly tested for, but in the ldexp case, the code below is straight
from the AMD built-in (doesn't have a check for if the hardware supports
subnormals or not). I guess it's possible that the ldexp implementation
was done in such a way that it didn't matter if the hardware supports
subnormals or not, and we're missing out on possible performance by
always running the code as written when we have hardware with subnormal
support.

In the ldexp implementation, I think the code inside the #if 0
block was handling denormals:

http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/math32/ldexpF.cl?view=co

-Tom

>
> > > Signed-off-by: Aaron Watry <awatry@gmail.com>
> > > ---
> > > generic/include/clc/clc.h | 1 +
> > > generic/include/clc/math/binary_decl.inc | 18 ++++-
> > > generic/include/clc/math/gentype_tss.inc | 108
> > +++++++++++++++++++++++++++++
> > > generic/include/clc/math/ldexp.h | 9 +++
> > > generic/lib/SOURCES | 1 +
> > > generic/lib/clcmacro.h | 26 +++++++
> > > generic/lib/math/ldexp.cl | 114
> > +++++++++++++++++++++++++++++++
> >
> > I just sent out a patch implementing an optimized ldexp for R600/SI.
> > I was able to create the declarations and definitions without having to
> > modify too many of the *.inc files or adding new macros to clcmacro.h
> >
> > I think we should try a similar approach for the generic version.
> >
> >
> Fair enough. We might as well piggy-back on what you did for the R600/SI
> version.
>
>
> > > 7 files changed, 274 insertions(+), 3 deletions(-)
> > > create mode 100644 generic/include/clc/math/gentype_tss.inc
> > > create mode 100644 generic/include/clc/math/ldexp.h
> > > create mode 100644 generic/lib/math/ldexp.cl
> > >
> > > diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> > > index 1c12cf3..ecabcf1 100644
> > > --- a/generic/include/clc/clc.h
> > > +++ b/generic/include/clc/clc.h
> > > @@ -51,6 +51,7 @@
> > > #include <clc/math/fmin.h>
> > > #include <clc/math/fmod.h>
> > > #include <clc/math/hypot.h>
> > > +#include <clc/math/ldexp.h>
> > > #include <clc/math/log.h>
> > > #include <clc/math/log10.h>
> > > #include <clc/math/log1p.h>
> > > diff --git a/generic/include/clc/math/binary_decl.inc
> > b/generic/include/clc/math/binary_decl.inc
> > > index 70a7114..1805527 100644
> > > --- a/generic/include/clc/math/binary_decl.inc
> > > +++ b/generic/include/clc/math/binary_decl.inc
> > > @@ -1,6 +1,18 @@
> > > -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE
a,
> > __CLC_GENTYPE b);
> > > -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE
a,
> > float b);
> > > +#ifdef __CLC_INT_GENTYPE
> > > + #if !defined(__CLC_SCALAR)
> > > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
__CLC_FUNCTION(__CLC_GENTYPE
> > a, __CLC_INT_GENTYPE b);
> > > + #endif
> > > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
__CLC_FUNCTION(__CLC_GENTYPE
> > a, __CLC_SCALAR_INT_TYPE b);
> > > +
> > > +#else
> > > +
> > > +#if !defined(__CLC_SCALAR)
> > > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
__CLC_FUNCTION(__CLC_GENTYPE a,
> > __CLC_GENTYPE b);
> > > +#endif
> > > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
__CLC_FUNCTION(__CLC_GENTYPE a,
> > float b);
> > >
> > > #ifdef cl_khr_fp64
> > > -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE
a,
> > double b);
> > > + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
__CLC_FUNCTION(__CLC_GENTYPE a,
> > double b);
> > > +#endif
> > > +
> > > #endif
> > > diff --git a/generic/include/clc/math/gentype_tss.inc
> > b/generic/include/clc/math/gentype_tss.inc
> > > new file mode 100644
> > > index 0000000..11ec9ff
> > > --- /dev/null
> > > +++ b/generic/include/clc/math/gentype_tss.inc
> > > @@ -0,0 +1,108 @@
> > > +/* Used to provide support for multi-arg functions where the
argument
> > types and/or sizes do NOT match
> > > + *
> > > + * e.g. ldexp(float16,int), ldexp(float16,int16), ldexp(double8,
int)
> > > + *
> > > + * In general, consumers of this include will probably have versions
> > with a vector first argument, and then
> > > + * vector/scalar 2nd argument which may have an entirely different
base
> > type.
> > > + */
> > > +
> > > +#define __CLC_SCALAR_GENTYPE float
> > > +#ifndef __CLC_SCALAR_INT_TYPE
> > > + #define __CLC_UNDEF_SCALAR_INT_TYPE
> > > + #define __CLC_SCALAR_INT_TYPE int
> > > +#endif
> > > +#define __CLC_FPSIZE 32
> > > +
> > > +#define __CLC_GENTYPE float
> > > +#define __CLC_INT_GENTYPE int
> > > +#define __CLC_SCALAR
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +#undef __CLC_SCALAR
> > > +
> > > +#define __CLC_GENTYPE float2
> > > +#define __CLC_INT_GENTYPE int2
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE float3
> > > +#define __CLC_INT_GENTYPE int3
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE float4
> > > +#define __CLC_INT_GENTYPE int4
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE float8
> > > +#define __CLC_INT_GENTYPE int8
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE float16
> > > +#define __CLC_INT_GENTYPE int16
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#undef __CLC_FPSIZE
> > > +#undef __CLC_SCALAR_GENTYPE
> > > +
> > > +#ifdef cl_khr_fp64
> > > +#define __CLC_SCALAR_GENTYPE double
> > > +#define __CLC_FPSIZE 64
> > > +
> > > +#define __CLC_SCALAR
> > > +#define __CLC_GENTYPE double
> > > +#define __CLC_INT_GENTYPE int
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_SCALAR
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE double2
> > > +#define __CLC_INT_GENTYPE int2
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE double3
> > > +#define __CLC_INT_GENTYPE int3
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE double4
> > > +#define __CLC_INT_GENTYPE int4
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE double8
> > > +#define __CLC_INT_GENTYPE int8
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#define __CLC_GENTYPE double16
> > > +#define __CLC_INT_GENTYPE int16
> > > +#include __CLC_BODY
> > > +#undef __CLC_GENTYPE
> > > +#undef __CLC_INT_GENTYPE
> > > +
> > > +#undef __CLC_FPSIZE
> > > +#undef __CLC_SCALAR_GENTYPE
> > > +#endif
> > > +
> > > +#ifdef __CLC_UNDEF_SCALAR_INT_TYPE
> > > + #undef __CLC_SCALAR_INT_TYPE
> > > + #undef __CLC_UNDEF_SCALAR_INT_TYPE
> > > +#endif
> > > +
> > > +#undef __CLC_BODY
> > > diff --git a/generic/include/clc/math/ldexp.h
> > b/generic/include/clc/math/ldexp.h
> > > new file mode 100644
> > > index 0000000..2e3b502
> > > --- /dev/null
> > > +++ b/generic/include/clc/math/ldexp.h
> > > @@ -0,0 +1,9 @@
> > > +#define __CLC_BODY <clc/math/binary_decl.inc>
> > > +#define __CLC_FUNCTION ldexp
> > > +
> > > +#include <clc/math/gentype_tss.inc>
> > > +
> > > +#undef __CLC_BODY
> > > +#undef __CLC_FUNCTION
> > > +
> > > +#undef __CLC_ARG2_BASE_TYPE
> > > \ No newline at end of file
> > > diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> > > index 0110e15..be6865e 100644
> > > --- a/generic/lib/SOURCES
> > > +++ b/generic/lib/SOURCES
> > > @@ -70,6 +70,7 @@ math/fmax.cl
> > > math/fmin.cl
> > > math/fmod.cl
> > > math/hypot.cl
> > > +math/ldexp.cl
> > > math/log10.cl
> > > math/log1p.cl
> > > math/mad.cl
> > > diff --git a/generic/lib/clcmacro.h b/generic/lib/clcmacro.h
> > > index 346adf2..3f389e5 100644
> > > --- a/generic/lib/clcmacro.h
> > > +++ b/generic/lib/clcmacro.h
> > > @@ -41,6 +41,28 @@
> > > return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi,
y.hi)); \
> > > }
> > >
> > > +#define _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(DECLSPEC, RET_TYPE,
> > FUNCTION, ARG1_TYPE, ARG2_TYPE) \
> > > + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE y) { \
> > > + return (RET_TYPE##2)(FUNCTION(x.x, y), FUNCTION(x.y, y)); \
> > > + } \
> > > +\
> > > + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE y) { \
> > > + return (RET_TYPE##3)(FUNCTION(x.x, y), FUNCTION(x.y, y), \
> > > + FUNCTION(x.z, y)); \
> > > + } \
> > > +\
> > > + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE y) { \
> > > + return (RET_TYPE##4)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> > > + } \
> > > +\
> > > + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE y) { \
> > > + return (RET_TYPE##8)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> > > + } \
> > > +\
> > > + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE y) { \
> > > + return (RET_TYPE##16)(FUNCTION(x.lo, y), FUNCTION(x.hi, y)); \
> > > + }
> > > +
> > > #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION,
ARG1_TYPE,
> > ARG2_TYPE) \
> > > DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
> > > return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
> > > @@ -115,6 +137,10 @@ _CLC_DEF _CLC_OVERLOAD RET_TYPE
FUNCTION(ARG1_TYPE
> > x, ARG2_TYPE y) { \
> > > } \
> > > _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION,
> > ARG1_TYPE, ARG2_TYPE)
> > >
> > > +#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG(RET_TYPE,
> > FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
> > > +_CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,
> > ARG2_TYPE) \
> > > +_CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF,
> > RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)
> > > +
> > > #define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN,
> > ARG1_TYPE) \
> > > _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { \
> > > return BUILTIN(x); \
> > > diff --git a/generic/lib/math/ldexp.cl b/generic/lib/math/ldexp.cl
> > > new file mode 100644
> > > index 0000000..b7c5a92
> > > --- /dev/null
> > > +++ b/generic/lib/math/ldexp.cl
> > > @@ -0,0 +1,114 @@
> > > +/*
> > > + * Copyright (c) 2014 Advanced Micro Devices, Inc.
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person
> > obtaining a copy
> > > + * of this software and associated documentation files (the
> > "Software"), to deal
> > > + * in the Software without restriction, including without limitation
> > the rights
> > > + * to use, copy, modify, merge, publish, distribute, sublicense,
and/or
> > sell
> > > + * copies of the Software, and to permit persons to whom the
Software is
> > > + * furnished to do so, subject to the following conditions:
> > > + *
> > > + * The above copyright notice and this permission notice shall be
> > included in
> > > + * all copies or substantial portions of the Software.
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
> > SHALL THE
> > > + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> > OTHER
> > > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> > ARISING FROM,
> > > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> > DEALINGS IN
> > > + * THE SOFTWARE.
> > > + */
> > > +
> > > +#include <clc/clc.h>
> > > +#include "../clcmacro.h"
> > > +#include "math.h"
> > > +
> > > +_CLC_DEF _CLC_OVERLOAD float ldexp(float x, int n) {
> >
> > I think this is where we need to integrate with my subnormal helper
> > patches to add denormal flushing. I can give this a try if you want.
> >
> >
> Go for it. I did notice that your denormal patch always reports false
for
> 16/32-bit float denormal support, so this will always fall into the same
> code path regardless for float ldexp. It's just doubles which have a
call
> into llvm to check if the hardware supports subnormals.
>

My patch just implemented the defaults required by the OpenCL spec.
Targets
will be able to override these if they want.

> Bit of a question: If the hardware supports subnormals, do we just not
> need the subnormal support code?

We are not required by the spec to include the subnormal support code if
subnormals are supported. However, if we want to support the
-cl-denorms-are-zero flag, we will need the subnormal support code.

Note that the spec does not require us to do anything when this flag is passed,
so supporting this flag is not required for compliance, it is just
for enabling optimizations in some cases.
Supporting this flag is very low priority for me.

>
> I noticed in quite a few of the amd builtins that subnormal support is
> explicitly tested for, but in the ldexp case, the code below is straight
> from the AMD built-in (doesn't have a check for if the hardware supports
> subnormals or not). I guess it's possible that the ldexp implementation
> was done in such a way that it didn't matter if the hardware supports
> subnormals or not, and we're missing out on possible performance
by
> always running the code as written when we have hardware with subnormal
> support.

In the ldexp implementation, I think the code inside the #if 0
block was handling denormals:

http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/math32/ldexpF.cl?view=co

Wow, I'm blind. I was looking at that exact file as I wrote up my previous
email, and I totally glossed over the #if 0 section. I'm guessing because
my IDE highlighted it as a comment/dead-code and, well, because it was
Friday morning before coffee....

--Aaron