Another compiler shootout

FYI. http://leonardo-m.livejournal.com/73732.html

If anyone is motivated, please file bugs for the losing cases. Also, it might make sense to incorporate the tests into our nightly tester test suite.

Thanks,

Evan

Most of them should already be in our nightly tester. I went through the Alioth tests a while back and added them.

-Owen

Ah, I see. It's BenchmarkGame. How are we doing there?

Evan

FWIW, I just ported my ray tracer benchmark to C and found that llvm-gcc gives
much worse performance than gcc on x86 but not on x86-64 on an Opteron:

2.1GHz Opteron

32-bit
     gcc 4.3.2: 5.60s (gcc -Wall -O3 -lm ray.c -o ray)
llvm-gcc 4.2.1: 9.00s (llvm-gcc -O3 -march=opteron -msse2 -lm ray.c -o ray)

64-bit
     gcc 4.3.2: 4.18s (gcc -Wall -O3 -lm ray.c -o ray)
llvm-gcc 4.2.1: 5.00s (llvm-gcc -O3 -march=opteron -msse2 -lm ray.c -o ray)

Note that the LLVM-generated code is 60% slower than GCC's in the first case.

I am unfamiliar with x86 assembler but I believe the problem is that LLVM is
calling a function for fsqrt rather than using the x86 op-code. Should I be
passing some command line arguments or using a newer llvm-gcc to get it to
emit fsqrt or is that not yet implemented?

Benchmark was:

  time ./ray 9 512 >image.pgm

Compile times go down from 0.36s to 0.13s on x86 and 0.35s to 0.19s on x86-64
as expected.

Here's the code:

#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Scalar type used throughout the tracer, and the matching machine epsilon.
 * Swap the commented pair in to run the benchmark in double precision. */
#define real float
#define epsilon FLT_EPSILON
//#define real double
//#define epsilon DBL_EPSILON

/* C99 and later define INFINITY as a macro in <math.h>.  This program uses
 * INFINITY as an ordinary global that main() assigns at start-up, so the
 * macro has to be removed first; otherwise the declaration below expands to
 * a floating constant and fails to compile. */
#undef INFINITY

/* delta:    small offset, assigned in main() as sqrt(epsilon).
 * INFINITY: "no intersection" sentinel, assigned in main() as 1.0 / 0.0. */
real delta, INFINITY;

/* Three-component vector of `real`; passed and returned by value everywhere. */
typedef struct { real x, y, z; } Vec;
/* Build a Vec from its three components. */
Vec vec(real x, real y, real z) {
  Vec result = { x, y, z };
  return result;
}
/* Component-wise sum a + b. */
Vec add(const Vec a, const Vec b) {
  Vec sum;
  sum.x = a.x + b.x;
  sum.y = a.y + b.y;
  sum.z = a.z + b.z;
  return sum;
}
/* Component-wise difference a - b. */
Vec sub(const Vec a, const Vec b) {
  Vec diff;
  diff.x = a.x - b.x;
  diff.y = a.y - b.y;
  diff.z = a.z - b.z;
  return diff;
}
/* Multiply every component of b by the scalar a. */
Vec scale(real a, const Vec b) {
  Vec scaled;
  scaled.x = a * b.x;
  scaled.y = a * b.y;
  scaled.z = a * b.z;
  return scaled;
}
/* Dot product of a and b.  Left as one expression: storing the partial
 * products in `real` temporaries could change intermediate rounding on
 * targets that evaluate with excess precision. */
real dot(const Vec a, const Vec b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
Vec unitise(const Vec a) { return scale((1.0 / sqrt(dot(a, a))), a); }

/* One node of the sphere hierarchy built by create().
 * ray_sphere() is run against (center, radius) for every node; intersect()
 * descends into `child` only when that test beats the current best hit. */
struct Scene {
  Vec center;           /* sphere centre */
  real radius;          /* sphere radius */
  struct Scene *child;  /* NULL for a leaf; otherwise an array of 5 nodes */
};

/* Intersect the ray (origin o, direction d) with the sphere (centre c,
 * radius r).
 *
 * Returns the smaller positive ray parameter if there is one, otherwise the
 * larger one, or the global INFINITY when the discriminant is negative or
 * both roots are negative.
 * NOTE(review): the formula appears to assume d is unit length; every caller
 * visible here passes a unitise()d direction. */
real ray_sphere(Vec o, Vec d, Vec c, real r) {
  const Vec to_center = sub(c, o);
  const real b = dot(to_center, d);
  real disc = b*b - dot(to_center, to_center) + r*r;
  real near_t, far_t;

  if (disc < 0.0)
    return INFINITY;

  disc = sqrt(disc);
  far_t = b + disc;
  if (far_t < 0.0)
    return INFINITY;

  near_t = b - disc;
  if (near_t > 0.0)
    return near_t;
  return far_t;
}

/* Recursively find the nearest leaf sphere hit by the ray (o, d).
 *
 * lambda: in/out, best ray parameter found so far; lowered on a closer hit.
 * t:      out, set to the leaf node that produced *lambda.
 * scene:  node to test; its children are visited only when its own sphere
 *         is hit closer than *lambda.
 */
void intersect(Vec o, Vec d, real *lambda, struct Scene **t,
               struct Scene *scene) {
  const real hit = ray_sphere(o, d, scene->center, scene->radius);
  struct Scene *kid;

  /* Negated '<' (not '>=') so a NaN result is rejected exactly as before. */
  if (!(hit < *lambda))
    return;

  if (scene->child == NULL) {
    /* Leaf: record the closer hit. */
    *lambda = hit;
    *t = scene;
    return;
  }

  /* Interior node: visit its five children in index order. */
  for (kid = scene->child; kid != scene->child + 5; ++kid)
    intersect(o, d, lambda, t, kid);
}

/* Direction used by ray_trace() both to shade (dotted with the surface
 * normal) and as the direction of the secondary ray; set once in main(). */
Vec neglight;

/* Shade the ray (o, d) against `scene`.
 *
 * Returns 0 when the ray hits nothing, when the hit surface faces away from
 * neglight, or when a second ray cast from the hit point along neglight hits
 * anything.  Otherwise returns dot(normal, neglight).
 */
real ray_trace(Vec o, Vec d, struct Scene scene) {
  struct Scene *hit = NULL;
  struct Scene *occluder = NULL;
  real nearest = INFINITY;
  real shadow = INFINITY;
  real g;
  Vec p, normal;

  /* Primary ray. */
  intersect(o, d, &nearest, &hit, &scene);
  if (nearest == INFINITY)
    return 0.0;

  p = add(o, scale(nearest, d));
  normal = unitise(sub(p, hit->center));
  g = dot(normal, neglight);
  if (g <= 0.0)
    return 0.0;

  /* Move the start point off the surface by delta along the normal before
   * casting the secondary ray. */
  p = add(p, scale(delta, normal));
  intersect(p, neglight, &shadow, &occluder, &scene);
  if (shadow < INFINITY)
    return 0.0;
  return g;
}

/* Build a sphere hierarchy `level` levels deep, rooted at centre c with base
 * radius r.
 *
 * A leaf (level <= 1) is a single sphere of radius r.  Any other node gets a
 * sphere of radius 3*r and five children: one leaf at c with radius r, and
 * four sub-hierarchies of depth level-1 and radius r/2, offset from c by rn
 * along (+-1, 1, +-1).
 *
 * Levels below 1 are treated as leaves; the original `level == 1` test never
 * matched them and recursed without bound.  On allocation failure the
 * program prints a message and exits.  The child arrays are heap-allocated
 * and are not freed by any code visible here.
 */
struct Scene create(int level, Vec c, real r) {
  struct Scene scene;
  scene.center = c;
  if (level <= 1) {
    scene.radius = r;
    scene.child = NULL;
  } else {
    real rn = 3*r/sqrt(12);
    scene.radius = 3*r;
    scene.child = malloc(5 * sizeof *scene.child);
    if (scene.child == NULL) {
      fprintf(stderr, "create: out of memory\n");
      exit(EXIT_FAILURE);
    }
    scene.child[0] = create(1, c, r);
    scene.child[1] = create(level-1, add(c, scale(rn, vec(-1, 1, -1))), r/2);
    scene.child[2] = create(level-1, add(c, scale(rn, vec( 1, 1, -1))), r/2);
    scene.child[3] = create(level-1, add(c, scale(rn, vec(-1, 1, 1))), r/2);
    scene.child[4] = create(level-1, add(c, scale(rn, vec( 1, 1, 1))), r/2);
  }
  return scene;
}

/* Render the scene as a binary PGM (P5) image on stdout.
 *
 * Usage: ray [level n]
 *   level  depth of the sphere hierarchy (default 9)
 *   n      image width and height in pixels (default 512)
 * Both arguments must be given together; otherwise the defaults are used.
 * Each pixel averages ss*ss sub-samples.
 */
int main(int argc, char **argv) {
  struct Scene scene;
  int level, n, ss=4, x, y;
  /* argv must be char **: with the original `char *argv`, argv[1] was a
   * single char and atoi() was handed a non-pointer. */
  level = (argc==3 ? atoi(argv[1]) : 9);
  n = (argc==3 ? atoi(argv[2]) : 512);
  delta = sqrt(epsilon);
  INFINITY = 1.0 / 0.0;
  neglight = unitise(vec(1, 3, -2));
  scene = create(level, vec(0, -1, 0), 1);
  printf("P5\n%d %d\n255\n", n, n);
  for (y=n-1; y>=0; --y)
    for (x=0; x<n; ++x) {
      real g=0.0;
      int dx, dy;
      for (dx=0; dx<ss; ++dx)
        for (dy=0; dy<ss; ++dy) {
          Vec d=unitise(vec(x+dx*1./ss-n/2., y+dy*1./ss-n/2., n));
          g += ray_trace(vec(0, 0, -4), d, scene);
        }
      /* Convert through int: a floating value of 255 converted directly to
       * a signed char is undefined behaviour. */
      printf("%c", (unsigned char)(int)(0.5 + 255.0 * g / (ss*ss)));
    }
  return 0;
}

This may also explain the one anomalous result here:

  http://leonardo-m.livejournal.com/73732.html

On the nbody shootout because that also has a sqrt at its core:

      double distance = sqrt(dx * dx + dy * dy + dz * dz);

FYI. http://leonardo-m.livejournal.com/73732.html

If anyone is motivated, please file bugs for the losing cases. Also,
it might make sense to incorporate the tests into our nightly tester
test suite.

FWIW, I just ported my ray tracer benchmark to C and found that llvm-gcc gives
much worse performance than gcc on x86 but not on x86-64 on an Opteron:

2.1GHz Opteron

32-bit
    gcc 4.3.2: 5.60s (gcc -Wall -O3 -lm ray.c -o ray)
llvm-gcc 4.2.1: 9.00s (llvm-gcc -O3 -march=opteron -msse2 -lm ray.c -o ray)

64-bit
    gcc 4.3.2: 4.18s (gcc -Wall -O3 -lm ray.c -o ray)
llvm-gcc 4.2.1: 5.00s (llvm-gcc -O3 -march=opteron -msse2 -lm ray.c -o ray)

Note that the LLVM-generated code is 60% slower than GCC's in the first case.

Wow that's bad :), nice relatively small testcase too.

Can you please file a bugzilla report with this .c file, and the output of the above compilations in -S mode (so attach the .s file from llvm and gcc in 32/64 bit modes)?

It would also be useful to attach the -emit-llvm -S output from the llvm-gcc compiles.

I am unfamiliar with x86 assembler but I believe the problem is that LLVM is
calling a function for fsqrt rather than using the x86 op-code. Should I be
passing some command line arguments or using a newer llvm-gcc to get it to
emit fsqrt or is that not yet implemented?

That should be very easy to add. Thanks!

-Chris

Oh, I just had another crazy thought. This is quite likely the result of llvm-gcc disabling the inline asm in your libm headers. Can you also attach the "-E -o foo.i" output from both llvm-gcc and gcc in 32-bit mode?

Thanks Jon,

-Chris