clang generates way more code than - Optimizer bug?

void decipher(char* text_, int text_len_)
for (int i = text_len_ - 1; i >= 0; --i)
text_[i] ^= 0xff - (i << 2);

generates this code

decipher(char*, int):
sub esi, 1
js .L1
movsx rax, esi
add rdi, rax
lea eax, [0+rsi*4]
xor al, BYTE PTR [rdi]
sub esi, 1
sub rdi, 1
not eax
mov BYTE PTR [rdi+1], al
cmp esi, -1
jne .L3

generates this huge code - maybe its better, but i don't think so

.quad -3 # 0xfffffffffffffffd
.quad -4 # 0xfffffffffffffffc
.quad -5 # 0xfffffffffffffffb
.quad -6 # 0xfffffffffffffffa
.quad -7 # 0xfffffffffffffff9
.quad -8 # 0xfffffffffffffff8
.quad -9 # 0xfffffffffffffff7
.quad -10 # 0xfffffffffffffff6
.quad -11 # 0xfffffffffffffff5
.quad -12 # 0xfffffffffffffff4
.quad -13 # 0xfffffffffffffff3
.quad -14 # 0xfffffffffffffff2
.quad -15 # 0xfffffffffffffff1
.quad -16 # 0xfffffffffffffff0
.quad -1 # 0xffffffffffffffff
.quad -2 # 0xfffffffffffffffe
.byte 255 # 0xff
.byte 0 # 0x0
.byte 255 # 0xff
.byte 0 # 0x0
.zero 16,252
decipher(char*, int): # @decipher(char*, int)
test esi, esi
jle .LBB0_12
mov r9d, esi
cmp esi, 16
jae .LBB0_3
mov rcx, r9
lea rax, [rcx + 1]
shl cl, 2
mov dl, 3
sub dl, cl
.LBB0_11: # =>This Inner Loop Header: Depth=1
lea ecx, [rax - 2]
xor byte ptr [rdi + rcx], dl
add rax, -1
add dl, 4
cmp rax, 1
ja .LBB0_11
lea rax, [r9 - 1]
add esi, -1
cmp esi, eax
jb .LBB0_9
shr rax, 32
jne .LBB0_9
mov r8d, r9d
and r8d, -16
mov ecx, r9d
and ecx, 15
movdqa xmm8, xmmword ptr [rip + .LCPI0_0] # xmm8 = [18446744073709551613,18446744073709551612]
movdqa xmm9, xmmword ptr [rip + .LCPI0_1] # xmm9 = [18446744073709551611,18446744073709551610]
movdqa xmm10, xmmword ptr [rip + .LCPI0_2] # xmm10 = [18446744073709551609,18446744073709551608]
movdqa xmm11, xmmword ptr [rip + .LCPI0_3] # xmm11 = [18446744073709551607,18446744073709551606]
movdqa xmm12, xmmword ptr [rip + .LCPI0_4] # xmm12 = [18446744073709551605,18446744073709551604]
movdqa xmm13, xmmword ptr [rip + .LCPI0_5] # xmm13 = [18446744073709551603,18446744073709551602]
movdqa xmm14, xmmword ptr [rip + .LCPI0_6] # xmm14 = [18446744073709551601,18446744073709551600]
movdqa xmm15, xmmword ptr [rip + .LCPI0_7] # xmm15 = [18446744073709551615,18446744073709551614]
movdqa xmm0, xmmword ptr [rip + .LCPI0_8] # xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
movdqa xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
pcmpeqd xmm2, xmm2
pxor xmm3, xmm3
mov rsi, r8
mov rax, r9
.LBB0_6: # =>This Inner Loop Header: Depth=1
movq xmm4, rax
pshufd xmm4, xmm4, 68 # xmm4 = xmm4[0,1,0,1]
movdqa xmm5, xmm4
paddq xmm5, xmm11
movdqa xmm6, xmm4
paddq xmm6, xmm13
movdqa xmm7, xmm4
paddq xmm7, xmm14
pand xmm7, xmm0
pand xmm6, xmm0
packuswb xmm6, xmm7
movdqa xmm7, xmm4
paddq xmm7, xmm12
pand xmm7, xmm0
pand xmm5, xmm0
packuswb xmm5, xmm7
movdqa xmm7, xmm4
paddq xmm7, xmm9
packuswb xmm5, xmm6
movdqa xmm6, xmm4
paddq xmm6, xmm10
pand xmm6, xmm0
pand xmm7, xmm0
packuswb xmm7, xmm6
movdqa xmm6, xmm4
paddq xmm6, xmm8
paddq xmm4, xmm15
movq rdx, xmm4
pand xmm6, xmm0
pand xmm4, xmm0
packuswb xmm4, xmm6
packuswb xmm4, xmm7
mov edx, edx
packuswb xmm4, xmm5
psllw xmm4, 2
pand xmm4, xmm1
pxor xmm4, xmm2
movdqa xmm5, xmm4
punpcklbw xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
pshufd xmm5, xmm5, 78 # xmm5 = xmm5[2,3,0,1]
pshuflw xmm5, xmm5, 27 # xmm5 = xmm5[3,2,1,0,4,5,6,7]
pshufhw xmm5, xmm5, 27 # xmm5 = xmm5[0,1,2,3,7,6,5,4]
punpckhbw xmm4, xmm3 # xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
pshufd xmm4, xmm4, 78 # xmm4 = xmm4[2,3,0,1]
pshuflw xmm4, xmm4, 27 # xmm4 = xmm4[3,2,1,0,4,5,6,7]
pshufhw xmm4, xmm4, 27 # xmm4 = xmm4[0,1,2,3,7,6,5,4]
packuswb xmm4, xmm5
movdqu xmm5, xmmword ptr [rdi + rdx - 15]
pxor xmm4, xmm5
movdqu xmmword ptr [rdi + rdx - 15], xmm4
add rax, -16
add rsi, -16
jne .LBB0_6
cmp r8, r9
jne .LBB0_10
jmp .LBB0_12

Hi Dennis,

I believe GCC at optimisation level O2 doesn’t enable the vectoriser, that’s why you only get a scalar loop. With Clang at O2 vectorisation is enabled and you’ll get a scalar and vector loop. Yes, it’s a lot of code, but that is not the best metric for fast code. I guess the only way to tell is to run and measure it.


Yes, it's a lot of code, but that is not the best metric for fast code.
I guess the only way to tell is to run and measure it.

90% of the generated code is nearly same to gcc and then comes a huge block of code

its a trivial for with xor and shift

gcc -O2 or -O3 generates the same small code

yeah, the clang code is faster than gcc - even with the much larger code
- as you told :slight_smile:

thank you

gcc O1: 9.782262 seconds
gcc O2: 8.115871 seconds
gcc O3: 3.092142 seconds

clang O1: 9.905967 seconds
clang O2: 1.629295 seconds
clang O3: 1.629502 seconds


#include <stdio.h>
#include <time.h>

void decipher(unsigned char* text_, int text_len_)
for (int i = text_len_ - 1; i >= 0; --i)
text_[i] ^= 0xff - (i << 2);

int benchmark()
unsigned char text[] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF };

int anti_optimizer = 0;
for (int i = 0; i < 1000000000; ++i)
decipher(text, sizeof(text));

 for \(int x = 0; x &lt; sizeof\(text\); \+\+x\)
   anti\_optimizer \+= text\[x\];

return anti_optimizer;

int main()
clock_t start_time = clock();
int result = benchmark();
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Done in %f seconds\n", elapsed_time);
return result;

Thanks for checking. :slight_smile:
What matters most is where in the app the run time is spent (i.e. which code paths are hot) and that must be the vectorised loop which processes 16 elements in parallel using SIMD instructions. If we compare GCC O2 with Clang O3, then we compare an efficient scalar loop with a vectorised loop and see a ~5x improvement. That’s pretty decent, but is some way off from a theoretical 16x speed up and there could be many reasons for that. First, not all time is spent in the kernel, and some time is spent in all the setup code before it enters the loop, or the vectorised codegen is not efficient enough, or it is waiting for data. I haven’t looked into details (the code and experimental setup) so this is a bit hand waivy, but I guess this must be the gist of it.