#include <arm_neon.h>
extern void abort (void);
/*
 * Out-of-line wrapper around the vld1q_lane_u8 intrinsic.
 *
 * load: address of the single byte to insert.
 * vec:  source vector; every lane other than lane 12 is passed through.
 *
 * Returns `vec` with lane 12 replaced by the byte read from `load`.
 * The noinline attribute asks the compiler to keep this as a real call,
 * so the vector crosses a function-call boundary in both directions.
 */
__attribute__((noinline)) uint8x16_t
wrap_vld1q_lane_u8 (const uint8_t *load, uint8x16_t vec) {
return vld1q_lane_u8 (load, vec, 12);
}
/*
 * Checks that inserting a byte into lane 12 leaves lanes 0-11 untouched
 * and puts the new byte in lane 12.
 *
 * data: pointer to the input bytes; vld1q_u8 reads 16 bytes from it.
 *
 * Returns 0 when every checked lane matches; calls abort() on the first
 * mismatch.
 */
int test_vld1q_lane_u8(const uint8_t *data) {
uint8_t out[16];
/* Byte that is inserted into lane 12. */
uint8_t overwrite = 7;
int j;
uint8x16_t in = vld1q_u8 (data);
in = wrap_vld1q_lane_u8 (&overwrite, in);
/* Spill the vector back to memory so it can be compared byte by byte. */
vst1q_u8 (out, in);
/* Only lanes 0..12 are verified; lanes 13..15 are not inspected. */
for (j = 0; j < 13; j++)
/* Lane 12 must hold the inserted byte; all earlier lanes the original data. */
if (out[j] != (j == 12 ? overwrite : data[j])) {
abort();
}
return 0;
}
/*
 * Test driver: runs test_vld1q_lane_u8 once on a fixed 16-byte pattern.
 * argc and argv are unused. Always returns 0; a failing check terminates
 * the program through abort() inside test_vld1q_lane_u8 instead.
 */
int main (int argc, char **argv)
{
/* Two 64-bit constants provide the 16 input bytes; the byte order seen
   through the uint8_t pointer depends on the target's endianness. */
uint64_t orig_data[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
/* The return value is ignored. */
test_vld1q_lane_u8((const uint8_t *)orig_data);
return 0;
}
This code fails when compiled with -O3 -fno-inline.
I have read "Using ARM NEON instructions in big endian mode" (LLVM 18.0.0git documentation), but I am still confused by this assembly:
rev64 v0.16b, v0.16b
ext v0.16b, v0.16b, v0.16b, #8
rev64 v0.16b, v0.16b
ext v0.16b, v0.16b, v0.16b, #8
This sequence seems to do nothing, yet it appears many times.
I bisected it with opt-bisect; the failure first appears in SLPVectorizerPass.
One odd observation: when the lane is in the range 0-11, the code works.
But when the lane is 12 or greater, the code fails.
I read through the generated assembly and annotated it with comments:
test_vld1q_lane_u8: // @test_vld1q_lane_u8
// %bb.0: // %entry
sub sp, sp, #48
ld1 { v0.16b }, [x0]
// v0 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce}
stp x29, x30, [sp, #32] // 16-byte Folded Spill
add x29, sp, #32
mov w8, #7
sub x0, x29, #4
sturb w8, [x29, #-4]
str q0, [sp] // 16-byte Folded Spill
rev64 v0.16b, v0.16b
ext v0.16b, v0.16b, v0.16b, #8
rev64 v0.16b, v0.16b
ext v0.16b, v0.16b, v0.16b, #8
bl wrap_vld1q_lane_u8
rev64 v0.16b, v0.16b
// v0 = {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13}
adrp x8, .LCPI1_0
add x8, x8, :lo12:.LCPI1_0
ldr q5, [sp] // 16-byte Folded Reload
// v5 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce}
ext v0.16b, v0.16b, v0.16b, #8
// v0 = {0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13, 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12}
ld1 { v2.8b }, [x8] // v2: [6, 7, 5, 4, 3, 2, 1, 0, ...]
adrp x8, .LCPI1_1
add x8, x8, :lo12:.LCPI1_1
mov v1.16b, v5.16b // v1 = v5
mov v1.d[1], v5.d[0]
// v1 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef}
ext v3.16b, v0.16b, v0.16b, #8 // v3 <- v0
// v3 = {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13}
ld1 { v4.8b }, [x8] // v4: [1, 0, 2, 3, 4, 5, 6, 7, ...]
umov w8, v0.b[7]
umov w9, v0.b[6]
ext v5.16b, v5.16b, v5.16b, #8
// v5 = {0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce, 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef}
mov v3.d[1], v3.d[0]
// v3 = {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12}
fmov s6, w8 // v6 handle [8-11]
tbl v1.8b, { v1.16b }, v4.8b // v1 <- v1[v4]
umov w8, v0.b[5]
tbl v2.8b, { v3.16b }, v2.8b // v2 <- v3[v2]
zip1 v3.8b, v0.8b, v5.8b
// v3 = {0x13, 0xce, 0x57, 0x8a, 0x9b, 0x46, 0xdf, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
mov v6.h[1], w9
umov w9, v0.b[4]
cmeq v1.8b, v2.8b, v1.8b
// {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef} 0-7 done
rev16 v2.8b, v3.8b // v2 <- v3
// v2 = {0xce13, 0x8a57, 0x469b, 0x7df, 0x0, 0x0, 0x0, 0x0}
mov v6.h[2], w8
umov w8, v1.b[0]
umov w10, v1.b[1]
umov w11, v1.b[2]
umov w12, v1.b[3]
umov w13, v1.b[4]
mov v6.h[3], w9
bic v2.4h, #255, lsl #8
// v2 = {0xce00, 0x8a00, 0x4600, 0x700, 0x0, 0x0, 0x0, 0x0}
and w8, w8, #0x1
and w9, w10, #0x1
umov w10, v1.b[5]
lsl w9, w9, #6
and w11, w11, #0x1
bfi w9, w8, #7, #1
umov w8, v1.b[6]
and w12, w12, #0x1
bic v6.4h, #255, lsl #8
and w13, w13, #0x1
// v6 = {0x1300, 0x5700, 0x9b00, 0xdf00, 0x0, 0x0, 0x0, 0x0}
cmeq v2.4h, v6.4h, v2.4h
Focus on the comparison of v2 with v6 (cmeq v2.4h, v6.4h, v2.4h), which compares indices [8-11]. Here v2 is
{0xce00, 0x8a00, 0x4600, 0x700, 0x0, 0x0, 0x0, 0x0}
and v6 is
{0x1300, 0x5700, 0x9b00, 0xdf00, 0x0, 0x0, 0x0, 0x0}
The mismatch is caused by either zip1 v3.8b, v0.8b, v5.8b or rev16 v2.8b, v3.8b.
If I change zip1 v3.8b, v0.8b, v5.8b to zip1 v3.8b, v5.8b, v0.8b, it works.
Alternatively, if I change rev16 v2.8b, v3.8b to mov v2.8b, v3.8b, it also works.
Can anyone give me a clue about the cause of this failure?