# clang: Manual unrolling doesn't match automatic unrolling

Here's the code and compilation steps:

#include <stdint.h>
/* 128-bit unsigned integer, spelled with the GNU mode(TI) attribute
 * (a GCC/Clang extension, not ISO C). */
typedef unsigned int uint128_t __attribute__((mode(TI)));

/* Five 64-bit words. NOTE(review): the 51-bit mask and the factor 19 used in
 * f() suggest radix-2^51 limbs of a value modulo 2^255 - 19; nothing in this
 * listing states that explicitly. */
typedef struct {
    uint64_t l[5];
} s;

/*
 * Combine the limbs of *x and *y into *r.
 *
 * Step 1: for every pair (i, j) with 0 <= i, j < 5, accumulate the 128-bit
 *         product x->l[min(i,j)] * y->l[max(i,j)] into t[(i + j) % 5].
 *         When i + j > 4, the y limb is first multiplied by 19 (in 64-bit
 *         arithmetic) before the widening multiply.
 * Step 2: r->l[i] = (low 51 bits of t[i]) + carry, where carry is
 *         t[(i + 4) % 5] >> 51 truncated to 64 bits; the carry that wraps
 *         around into limb 0 is multiplied by 19.
 *
 * r, x and y are restrict-qualified, so they must not alias one another.
 *
 * The for-loop in step 2 is deliberately left as a loop: this listing exists
 * to compare the compiler's unrolling of it with a hand-unrolled version.
 */
void f(s * restrict r, const s * restrict x, const s * restrict y) {
    uint128_t t[5] = {0, 0, 0, 0, 0};

/* One partial product. i and j are always integer literals here, so every
 * conditional below is a compile-time constant. Note that the operands are
 * indexed by min/max of (i, j), not by i and j themselves. */
#define BODY(i, j) do { \
        int i_ = (i) < (j) ? (i) : (j); \
        int j_ = (i) < (j) ? (j) : (i); \
        uint128_t m = (uint128_t) x->l[i_] * \
                      (y->l[j_] * ((i) + (j) > 4 ? 19 : 1)); \
        if ((i) + (j) > 4) { \
            t[(i) + (j) - 5] += m; \
        } else { \
            t[(i) + (j)] += m; \
        } \
    } while (0)
/* All five partial products for a fixed i. */
#define LOOP(i) do { \
        BODY(i, 0); BODY(i, 1); BODY(i, 2); BODY(i, 3); BODY(i, 4); \
    } while (0)

    LOOP(0); LOOP(1); LOOP(2); LOOP(3); LOOP(4);

#undef LOOP
#undef BODY

    /* Low 51 bits of a 64-bit word; built from an unsigned constant so the
     * shift never touches a sign bit. */
    const uint64_t mask = (UINT64_C(1) << 51) - 1;
    for (int i = 0; i < 5; i++) {
        r->l[i] = ((uint64_t) t[i] & mask) +
                  (i == 0 ? 19 : 1) * (uint64_t)(t[(i + 4) % 5] >> 51);
    }
}

% clang -O4 -S -o f.l f.c

If you change the loop to the unrolled version:
/* Hand-unrolled replacement for the for-loop at the end of f(); expects r, t
 * and mask from f() to be in scope. The backslash keeps the whole definition
 * in one logical line. */
#define FOLD1(i) r->l[(i)] = ((uint64_t) t[(i)] & mask) + \
    ((i) == 0 ? 19 : 1) * (uint64_t)(t[((i) + 4) % 5] >> 51)
FOLD1(0); FOLD1(1); FOLD1(2); FOLD1(3); FOLD1(4);

you get different generated code than for the loop version, which is very sad-making.

Any ideas?