[PATCH 1/1] r600: Fix get_work_dim range metadata

"The pair a,b represents the range [a,b)."

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

Last time I tried it, these functions failed the OpenCL conformance test. Have you considered porting from the amd-builtins branch instead?

LGTM

> Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
> ---
>
> Fixes myocyte rodinia benchmark.
> piglit test posted
>
> generic/include/clc/clc.h | 1 +
> generic/include/clc/math/log10.h | 9 +++++++++
> generic/lib/SOURCES | 1 +
> generic/lib/math/log10.cl | 8 ++++++++
> generic/lib/math/log10.inc | 10 ++++++++++
> 5 files changed, 29 insertions(+)
> create mode 100644 generic/include/clc/math/log10.h
> create mode 100644 generic/lib/math/log10.cl
> create mode 100644 generic/lib/math/log10.inc
>
> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> index bd92fdb..29c1855 100644
> --- a/generic/include/clc/clc.h
> +++ b/generic/include/clc/clc.h
> @@ -50,6 +50,7 @@
> #include <clc/math/fmod.h>
> #include <clc/math/hypot.h>
> #include <clc/math/log.h>
> +#include <clc/math/log10.h>
> #include <clc/math/log1p.h>
> #include <clc/math/log2.h>
> #include <clc/math/mad.h>
> diff --git a/generic/include/clc/math/log10.h b/generic/include/clc/math/log10.h
> new file mode 100644
> index 0000000..ec4e4ae
> --- /dev/null
> +++ b/generic/include/clc/math/log10.h
> @@ -0,0 +1,9 @@
> +#undef log10
> +
> +#define __CLC_BODY <clc/math/unary_decl.inc>
> +#define __CLC_FUNCTION log10
> +
> +#include <clc/math/gentype.inc>
> +
> +#undef __CLC_BODY
> +#undef __CLC_FUNCTION
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index b76fec9..1032606 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -63,6 +63,7 @@ math/fmax.cl
> math/fmin.cl
> math/fmod.cl
> math/hypot.cl
> +math/log10.cl
> math/log1p.cl
> math/mad.cl
> math/mix.cl
> diff --git a/generic/lib/math/log10.cl b/generic/lib/math/log10.cl
> new file mode 100644
> index 0000000..d65764a
> --- /dev/null
> +++ b/generic/lib/math/log10.cl
> @@ -0,0 +1,8 @@
> +#include <clc/clc.h>
> +
> +#ifdef cl_khr_fp64
> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
> +#endif
> +
> +#define __CLC_BODY <log10.inc>
> +#include <clc/math/gentype.inc>
> diff --git a/generic/lib/math/log10.inc b/generic/lib/math/log10.inc
> new file mode 100644
> index 0000000..82e77bc
> --- /dev/null
> +++ b/generic/lib/math/log10.inc
> @@ -0,0 +1,10 @@
> +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE log10(__CLC_GENTYPE val) {
> + // log10(x) = log2(x) / log2(10)
> +#if __CLC_FPSIZE == 32
> + return log2(val) / log2(10.0f);
> +#elif __CLC_FPSIZE == 64
> + return exp2(val) / log2(10.0);

             ^^^^ should have been log2

> +#else
> +#error unknown _CLC_FPSIZE
> +#endif
> +}

Last time I tried it, these functions failed the OpenCL conformance
test. Have you considered porting from the amd-builtins branch instead?

is precision the problem?

I could not find any information about relative errors of hw
instructions. I thought that _IEEE instructions produced correctly
rounded results.
I'm still not clear about how the relative error propagates across
operations:
if we have log1p implemented ( with error <= 2ulp ) would
log(x) = log1p(x-1.0) result in error <= 2 ulp since subtraction is
correctly rounded?
if so would log(x) / LOG_2 (correctly rounded constant) give correct
log2 (error <= 3 ulp)?

jan

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

Fixes myocyte rodinia benchmark.
piglit test posted

generic/include/clc/clc.h | 1 +
generic/include/clc/math/log10.h | 9 +++++++++
generic/lib/SOURCES | 1 +
generic/lib/math/log10.cl | 8 ++++++++
generic/lib/math/log10.inc | 10 ++++++++++
5 files changed, 29 insertions(+)
create mode 100644 generic/include/clc/math/log10.h
create mode 100644 generic/lib/math/log10.cl
create mode 100644 generic/lib/math/log10.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index bd92fdb…29c1855 100644
— a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -50,6 +50,7 @@
#include <clc/math/fmod.h>
#include <clc/math/hypot.h>
#include <clc/math/log.h>
+#include <clc/math/log10.h>
#include <clc/math/log1p.h>
#include <clc/math/log2.h>
#include <clc/math/mad.h>
diff --git a/generic/include/clc/math/log10.h b/generic/include/clc/math/log10.h
new file mode 100644
index 0000000…ec4e4ae
— /dev/null
+++ b/generic/include/clc/math/log10.h
@@ -0,0 +1,9 @@
+#undef log10
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION log10
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index b76fec9…1032606 100644
— a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -63,6 +63,7 @@ math/fmax.cl
math/fmin.cl
math/fmod.cl
math/hypot.cl
+math/log10.cl
math/log1p.cl
math/mad.cl
math/mix.cl
diff --git a/generic/lib/math/log10.cl b/generic/lib/math/log10.cl
new file mode 100644
index 0000000…d65764a
— /dev/null
+++ b/generic/lib/math/log10.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <log10.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/lib/math/log10.inc b/generic/lib/math/log10.inc
new file mode 100644
index 0000000…82e77bc
— /dev/null
+++ b/generic/lib/math/log10.inc
@@ -0,0 +1,10 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE log10(__CLC_GENTYPE val) {

  • // log10(x) = log2(x) / log2(10)
    +#if __CLC_FPSIZE == 32
  • return log2(val) / log2(10.0f);
    +#elif __CLC_FPSIZE == 64
  • return exp2(val) / log2(10.0);

^^^^ should have been log2

+#else
+#error unknown _CLC_FPSIZE
+#endif
+}

Last time I tried it, these functions failed the OpenCL conformance
test. Have you considered porting from the amd-builtins branch instead?

is precision the problem?

Yes. I’ll try to run these on the conformance test later to verify. The double version will definitely not pass since f64 division is currently not implemented correctly (which I’m working on, although it involves fixing a lot of special cases).

I also don’t think the log2(10) will constant fold correctly (I think LLVM will fold the intrinsic for it depending on if the host compiler has the libfunc for it, which also introduces variability since there is no standardization of precision in standard C), so assuming this will produce correct results it should use the computed constant.

I could not find any information about relative errors of hw
instructions. I thought that _IEEE instructions produced correctly
rounded results.
I’m still not clear about how the relative error propagates across
operations:

Which instructions? I’ve only found notes about precision on CI for some instructions in some internal documentation. The general rule is most of the f32 transcendental instructions should be good enough for the OpenCL library functions (without denormals), but not the f64 ones (which could only be used for the native_* versions)

if we have log1p implemented ( with error <= 2ulp ) would
log(x) = log1p(x-1.0) result in error <= 2 ulp since subtraction is
correctly rounded?
if so would log(x) / LOG_2 (correctly rounded constant) give correct
log2 (error <= 3 ulp)?

I’m not sure. I know you wouldn’t want to do that from a performance perspective since log1p is a longer function. With the AMD OpenCL builtins, log1p is more than twice as long and uses 10 more VGPRs

>
> [SNIP]
>>
>> Last time I tried it, these functions failed the OpenCL conformance
>> test. Have you considered porting from the amd-builtins branch instead?

can you point me to that branch? I tried searching for it but no luck
(though I do remember reading news about it)

>
> is precision the problem?

Yes. I’ll try to run these on the conformance test later to verify. The
double version will definitely not pass since f64 division is currently
not implemented correctly (which I’m working on, although it involves
fixing a lot of special cases).

I also don’t think the log2(10) will constant fold correctly (I think
LLVM will fold the intrinsic for it depending on if the host compiler
has the libfunc for it, which also introduces variability since there
is no standardization of precision in standard C), so assuming this
will produce correct results it should use the computed constant.

I agree, precomputed constant is better.

>
> I could not find any information about relative errors of hw
> instructions. I thought that _IEEE instructions produced correctly
> rounded results.
> I'm still not clear about how the relative error propagates across
> operations:

Which instructions? I’ve only found notes about precision on CI for
some instructions in some internal documentation. The general rule is
most of the f32 transcendental instructions should be good enough for
the OpenCL library functions (without denormals), but not the f64 ones
(which could only be used for the native_* versions)

In R600/EG/Cayman ISA there is bunch of instructions with _IEEE suffix
(in this case LOG_IEEE). the specs do not mention precision, MUL_IEEE
(and MULADD_IEEE*) says that it uses IEEE rules for 0*x.
I hoped that _IEEE also meant that the operation also follows the rest
of the rules including precision.

> if we have log1p implemented ( with error <= 2ulp ) would
> log(x) = log1p(x-1.0) result in error <= 2 ulp since subtraction is
> correctly rounded?
> if so would log(x) / LOG_2 (correctly rounded constant) give correct
> log2 (error <= 3 ulp)?

I’m not sure. I know you wouldn’t want to do that from a performance
perspective since log1p is a longer function. With the AMD OpenCL
builtins, log1p is more than twice as long and uses 10 more VGPRs

I meant whether there are error propagation rules in general (especially
for the second case). right now my understanding is that we'd need a
comment/proof for every operation.

jan

[SNIP]

Last time I tried it, these functions failed the OpenCL conformance
test. Have you considered porting from the amd-builtins branch instead?

can you point me to that branch? I tried searching for it but no luck
(though I do remember reading news about it)

It’s the amd-builtins branch in the libclc repo. It doesn’t seem to be mirrored in the git mirror, but I was able to clone it with:
git svn clone --branches=amd-builtins http://llvm.org/svn/llvm-project/libclc/ libclc_amd

I could not find any information about relative errors of hw
instructions. I thought that _IEEE instructions produced correctly
rounded results.
I'm still not clear about how the relative error propagates across
operations:

Which instructions? I’ve only found notes about precision on CI for
some instructions in some internal documentation. The general rule is
most of the f32 transcendental instructions should be good enough for
the OpenCL library functions (without denormals), but not the f64 ones
(which could only be used for the native_* versions)

In R600/EG/Cayman ISA there is bunch of instructions with _IEEE suffix
(in this case LOG_IEEE). the specs do not mention precision, MUL_IEEE
(and MULADD_IEEE*) says that it uses IEEE rules for 0*x.
I hoped that _IEEE also meant that the operation also follows the rest
of the rules including precision.

I believe all of the basic arithmetic instructions should always be correctly rounded. the IEEE part of the name refers to the handling of NaN and other special cases. In SI the convention appears to be the “normal” instruction names have the IEEE NaN behavior, and the _legacy_ ones do not.

v2: Use constant and multiplication instead of division

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

v2: Use constant and multiplication instead of division

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
---

Hi,
sorry it took so long. This issue has fallen low on priority. The new version
uses multiplication (correctly rounded) and builtin constant,
the constant is smaller than 1 so the resulting error should be smaller
or equal to log2 function.
This is the same approach as log() (although log does not handle DP correctly).

I have also pinged the corresponding piglit test.

jan

generic/include/clc/clc.h | 1 +
generic/include/clc/math/log10.h | 9 +++++++++
generic/lib/SOURCES | 1 +
generic/lib/math/log10.cl | 8 ++++++++
generic/lib/math/log10.inc | 13 +++++++++++++
5 files changed, 32 insertions(+)
create mode 100644 generic/include/clc/math/log10.h
create mode 100644 generic/lib/math/log10.cl
create mode 100644 generic/lib/math/log10.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index bd92fdb..29c1855 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -50,6 +50,7 @@
#include <clc/math/fmod.h>
#include <clc/math/hypot.h>
#include <clc/math/log.h>
+#include <clc/math/log10.h>
#include <clc/math/log1p.h>
#include <clc/math/log2.h>
#include <clc/math/mad.h>
diff --git a/generic/include/clc/math/log10.h b/generic/include/clc/math/log10.h
new file mode 100644
index 0000000..ec4e4ae
--- /dev/null
+++ b/generic/include/clc/math/log10.h
@@ -0,0 +1,9 @@
+#undef log10
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION log10
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index b76fec9..1032606 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -63,6 +63,7 @@ math/fmax.cl
math/fmin.cl
math/fmod.cl
math/hypot.cl
+math/log10.cl
math/log1p.cl
math/mad.cl
math/mix.cl
diff --git a/generic/lib/math/log10.cl b/generic/lib/math/log10.cl
new file mode 100644
index 0000000..d65764a
--- /dev/null
+++ b/generic/lib/math/log10.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <log10.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/lib/math/log10.inc b/generic/lib/math/log10.inc
new file mode 100644
index 0000000..bf8fa1a
--- /dev/null
+++ b/generic/lib/math/log10.inc
@@ -0,0 +1,13 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE log10(__CLC_GENTYPE val) {
+ // log10(x) = log2(x) / log2(10)
+ // 1 / log2(10) = 0.30102999566 = log10(2)
+ // SP representation is 0.30103
+ // DP representation is 0.301029995659999993762312442414
+#if __CLC_FPSIZE == 32
+ return log2(val) * 0.30102999566f;
+#elif __CLC_FPSIZE == 64
+ return log2(val) * 0.30102999566;

In other places we’re using hexadecimal floating point representations. Shouldn’t
those be used here too?

Jeroen

v2: Use constant and multiplication instead of division
v3: Use hex constants

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>

v2: Use constant and multiplication instead of division
v3: Use hex constants

LGTM.

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
---

So I understand the idea is to use only representable values as constants,
and not rely on the compiler to do the rounding, correct?

Yes, this is the idea.

-Tom

> v2: Use constant and multiplication instead of division
> v3: Use hex constants
>
LGTM.

Thanks.

Jeroen, you commented on v2, are you OK with v3?

v2: Use constant and multiplication instead of division
v3: Use hex constants

LGTM.

Thanks.

Jeroen, you commented on v2, are you OK with v3?

Yes, looks good to mee too.

Jeroen