LLVM 10 is miscompiling this code on x86_64 with an invalid shift count for `shrd`

It appears that LLVM 10 is miscompiling the following test on x86_64 but the trunk version seems to work fine. Before I spend any time debugging this, it would be helpful if someone can point me to a bug fix if there was one.

#include <stdio.h>

#include <stdint.h>

attribute ((noinline)) uint64_t foo4(unsigned* arr)

{

unsigned chunk = 34 >> 5;

unsigned bitIndex = 34 - chunk * 32;

arr = arr + chunk;

__int128_t* arr1 = (__int128_t*) arr;

__int128_t v1 = *arr1;

v1 = v1 >> bitIndex;

return (uint64_t) v1;

}

attribute ((noinline)) uint64_t foo3(unsigned* arr, unsigned index)

{

unsigned chunk = index >> 5;

unsigned bitIndex = index - chunk * 32;

arr = arr + chunk;

__int128_t* arr1 = (__int128_t*) arr;

__int128_t v1 = *arr1;

v1 = v1 >> bitIndex;

return (uint64_t) v1;

}

int main() {

unsigned arr[5];

arr[0]= 0x6f7cfd6f;

arr[1]= 0xd96c9806;

arr[2]= 0x89420144;

arr[3]= 0x8a20f548;

printf(“foo3 %lx\n”,foo3(arr,34) & 0x3ffffffff); // prints 222508051 instead of 1365b2601

printf(“foo4 %lx\n”,foo4(arr) & 0x3ffffffff); // prints 1365b2601 as expected

}

If you look at the assembly, there is an ‘and cl, 31’ missing in foo3 which is needed to make sure that the shift count is in the range [0…31]. This is missing in the generated code. Note that if the count in CL register is greater than 31, x86_64 manual states that the behavior of shrd is undefined.

clang 10

foo3(unsigned int*, unsigned int): # @foo3(unsigned int*, unsigned int)

mov ecx, esi

mov edx, esi

shr edx, 5

mov rax, qword ptr [rdi + 4*rdx]

mov rdx, qword ptr [rdi + 4*rdx + 8]

shrd rax, rdx, cl

ret

Trunk

foo3(unsigned int*, unsigned int): # @foo3(unsigned int*, unsigned int)

mov ecx, esi

mov edx, esi

shr edx, 5

mov rax, qword ptr [rdi + 4*rdx]

mov rdx, qword ptr [rdi + 4*rdx + 8]

and cl, 31

shrd rax, rdx, cl

ret

It appears that LLVM 10 is miscompiling the following test on x86_64 but the trunk version seems to work fine. Before I spend any time debugging this, it would be helpful if someone can point me to a bug fix if there was one.

#include <stdio.h>

#include <stdint.h>

__attribute__ ((noinline)) uint64_t foo4(unsigned* arr)

{

    unsigned chunk = 34 >> 5;

    unsigned bitIndex = 34 - chunk * 32;

    arr = arr + chunk;

    __int128_t* arr1 = (__int128_t*) arr;

I'm pretty sure this is UB, because the alignment of __int128_t is
bigger than that of unsigned.

    __int128_t v1 = *arr1;

    v1 = v1 >> bitIndex;

    return (uint64_t) v1;

}

__attribute__ ((noinline)) uint64_t foo3(unsigned* arr, unsigned index)

{

    unsigned chunk = index >> 5;

    unsigned bitIndex = index - chunk * 32;

    arr = arr + chunk;

    __int128_t* arr1 = (__int128_t*) arr;

    __int128_t v1 = *arr1;

    v1 = v1 >> bitIndex;

    return (uint64_t) v1;

}

int main() {

    unsigned arr[5];

    arr[0]= 0x6f7cfd6f;

    arr[1]= 0xd96c9806;

    arr[2]= 0x89420144;

    arr[3]= 0x8a20f548;

    printf("foo3 %lx\n",foo3(arr,34) & 0x3ffffffff); // prints 222508051 instead of 1365b2601

    printf("foo4 %lx\n",foo4(arr) & 0x3ffffffff); // prints 1365b2601 as expected

}

If you look at the assembly, there is an ‘and cl, 31’ missing in foo3 which is needed to make sure that the shift count is in the range [0...31]. This is missing in the generated code. Note that if the count in CL register is greater than 31, x86_64 manual states that the behavior of shrd is undefined.

# clang 10

foo3(unsigned int*, unsigned int): # @foo3(unsigned int*, unsigned int)

        mov ecx, esi

        mov edx, esi

        shr edx, 5

        mov rax, qword ptr [rdi + 4*rdx]

        mov rdx, qword ptr [rdi + 4*rdx + 8]

        shrd rax, rdx, cl

        ret

# Trunk

foo3(unsigned int*, unsigned int): # @foo3(unsigned int*, unsigned int)

        mov ecx, esi

        mov edx, esi

        shr edx, 5

        mov rax, qword ptr [rdi + 4*rdx]

        mov rdx, qword ptr [rdi + 4*rdx + 8]

        and cl, 31

        shrd rax, rdx, cl

        ret

Roman.

Do you see any UB in the following modified test? Here also, I see a difference between foo3 and foo4. Again, trunk and gcc match. But not Clang-10

#include <stdio.h>

#include <stdint.h>

#include <string.h>

attribute ((noinline)) uint64_t foo4(__int128_t* arr1)

{

unsigned chunk = 34 >> 5;

unsigned bitIndex = 34 - chunk * 32;

unsigned* arr = (unsigned *)arr + chunk;

__int128_t v1 = *arr1;

v1 = v1 >> bitIndex;

return (uint64_t) v1;

}

attribute ((noinline)) uint64_t foo3(__int128_t* arr1, unsigned index)

{

unsigned chunk = index >> 5;

unsigned bitIndex = index - chunk * 32;

unsigned *arr = (unsigned *)arr + chunk;

__int128_t v1 = *arr1;

v1 = v1 >> bitIndex;

return (uint64_t) v1;

}

int main() {

unsigned arr[4];

arr[0]= 0x6f7cfd6f;

arr[1]= 0xd96c9806;

arr[2]= 0x89420144;

arr[3]= 0x8a20f548;

__int128_t arr1;

memcpy(&arr1, arr, sizeof(arr1));

printf(“foo3 %lx\n”,foo3(&arr1,34) & 0x3ffffffff); // prints 1365b2601 (clang-10) 19bdf3f5b (gcc, clang-12)

printf(“foo4 %lx\n”,foo4(&arr1) & 0x3ffffffff); // prints 19bdf3f5b

}

Apologies, outlook messed up the formatting in the previous email. Here is it again.

Do you see any UB in the following modified test? Here also, I see a difference between foo3 and foo4. Again, trunk and gcc match. But not Clang-10.

#include <stdio.h>

#include <stdint.h>

#include <string.h>

attribute ((noinline)) uint64_t foo4(__int128_t* arr1)

{

unsigned chunk = 34 >> 5;

unsigned bitIndex = 34 - chunk * 32;

unsigned* arr = (unsigned *)arr + chunk;

__int128_t v1 = *arr1;

v1 = v1 >> bitIndex;

return (uint64_t) v1;

}

attribute ((noinline)) uint64_t foo3(__int128_t* arr1, unsigned index)

{

unsigned chunk = index >> 5;

unsigned bitIndex = index - chunk * 32;

unsigned *arr = (unsigned *)arr + chunk;

__int128_t v1 = *arr1;

v1 = v1 >> bitIndex;

return (uint64_t) v1;

}

int main() {

unsigned arr[4];

arr[0]= 0x6f7cfd6f;

arr[1]= 0xd96c9806;

arr[2]= 0x89420144;

arr[3]= 0x8a20f548;

__int128_t arr1;

memcpy(&arr1, arr, sizeof(arr1));

printf(“foo3 %lx\n”,foo3(&arr1,34) & 0x3ffffffff);

printf(“foo4 %lx\n”,foo4(&arr1) & 0x3ffffffff);

}

example.c:43:37: runtime error: subtraction of unsigned offset from
0x000000000022 overflowed to 0x7fffd01217c8
SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior example.c:43:37 in
example.c:19:37: runtime error: applying non-zero offset
140576899311424 to null pointer
SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior example.c:19:37 in
foo3 19bdf3f5b
foo4 19bdf3f5b

My bad! There was a typo that caused the UBSAN error. Here is the corrected test. This time UBSAN reports no errors. But clang-10 still produces unexpected result whereas trunk does not.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

__attribute__ ((noinline)) uint64_t foo4(__int128_t* arr1)
{
    unsigned chunk = 34 >> 5;
    unsigned bitIndex = 34 - chunk * 32;

    unsigned* arr = (unsigned *)arr1 + chunk;
    __int128_t v1 = *arr1;
    v1 = v1 >> bitIndex;

    return (uint64_t) v1;
}

__attribute__ ((noinline)) uint64_t foo3(__int128_t* arr1, unsigned index)
{
    unsigned chunk = index >> 5;
    unsigned bitIndex = index - chunk * 32;

    unsigned *arr = (unsigned *)arr1 + chunk;
    __int128_t v1 = *arr1;
    v1 = v1 >> bitIndex;

    return (uint64_t) v1;
}

int main() {

    unsigned arr[4];

    arr[0]= 0x6f7cfd6f;
    arr[1]= 0xd96c9806;
    arr[2]= 0x89420144;
    arr[3]= 0x8a20f548;

    __int128_t arr1;

    memcpy(&arr1, arr, sizeof(arr1));

    printf("foo3 %lx\n",foo3(&arr1,34) & 0x3ffffffff);
    printf("foo4 %lx\n",foo4(&arr1) & 0x3ffffffff);
}

Curiously, for the following test, with -fsanitize=undefined -O3, clang-10 reports no UB errors and produces the correct run-time result. But without -fsanitize=undefined, it produces wrong output. Clang (trunk) produces correct results either way. I think there is definitely a bug that was fixed recently (in the last year or so). Could it be this one:

https://reviews.llvm.org/D75748?id=

Roman, you were a reviewer for this.

There’s a typo in this line of X86InstrCompiler.td in llvm 10. It should be shiftMask64 not shiftMask32.

// (shift x (and y, 63)) ==> (shift x, y)
def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)),
(!cast(name # “64rrCL”) GR64:$src1, GR64:$src2)>;

Which was flagged in post-commit review about a year later. https://reviews.llvm.org/D58475

Thanks Craig!