[AArch64] NEON vectorized code miscompiled

#include <arm_neon.h>

extern void abort (void);

__attribute__((noinline)) uint8x16_t
wrap_vld1q_lane_u8 (const uint8_t *load, uint8x16_t vec) {
  return vld1q_lane_u8 (load, vec, 12);
}

int test_vld1q_lane_u8(const uint8_t *data) {
  uint8_t out[16];
  uint8_t overwrite = 7;
  int j;
  uint8x16_t in = vld1q_u8 (data);
  in = wrap_vld1q_lane_u8 (&overwrite, in);
  vst1q_u8 (out, in);
  for (j = 0; j < 13; j++)
    if (out[j] != (j == 12 ? overwrite : data[j])) {
      abort();
    }
  return 0;
}

int main (int argc, char **argv)
{
  uint64_t orig_data[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
  test_vld1q_lane_u8((const uint8_t *)orig_data);
  return 0;
}

This code fails (abort is called) when compiled with -O3 -fno-inline for a big-endian AArch64 target.

I have read "Using ARM NEON instructions in big endian mode" from the LLVM 18.0.0git documentation, but I am still confused by this asm:

        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8

This sequence appears to do nothing, yet it shows up many times.
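
As far as I can tell, the pair is one full 16-byte reversal: rev64 reverses the bytes inside each 64-bit half, and ext #8 then swaps the two halves, so applying the pair twice is an identity. Here is a minimal C model of the two instructions (the helper names rev64_16b/ext8_16b are mine), assuming plain lane semantics:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* rev64 v.16b, v.16b: reverse the byte order inside each 64-bit half. */
static void rev64_16b (uint8_t v[16]) {
  for (int half = 0; half < 16; half += 8)
    for (int i = 0; i < 4; i++) {
      uint8_t t = v[half + i];
      v[half + i] = v[half + 7 - i];
      v[half + 7 - i] = t;
    }
}

/* ext v.16b, v.16b, v.16b, #8: swap the two 64-bit halves. */
static void ext8_16b (uint8_t v[16]) {
  uint8_t t[16];
  for (int i = 0; i < 16; i++)
    t[i] = v[(i + 8) % 16];
  memcpy (v, t, 16);
}

int main (void) {
  uint8_t v[16], orig[16];
  for (int i = 0; i < 16; i++)
    v[i] = orig[i] = (uint8_t) i;
  rev64_16b (v); ext8_16b (v);   /* one pair: v[i] == orig[15 - i] */
  for (int i = 0; i < 16; i++)
    assert (v[i] == orig[15 - i]);
  rev64_16b (v); ext8_16b (v);   /* second pair: back to the start */
  for (int i = 0; i < 16; i++)
    assert (v[i] == orig[i]);
  return 0;
}

If that model is right, the pair is the big-endian conversion between memory byte order and register lane order, and emitting two of them back to back is dead work that presumably should have been folded away.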

I ran an -opt-bisect-limit bisection; the failure is introduced by SLPVectorizerPass.

A strange observation: with a lane index in the range 0-11 the code works, but with a lane index of 12 or greater it fails.
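
To sweep the lanes systematically, one wrapper per lane can be instantiated with a macro, since the lane argument of vld1q_lane_u8 must be a compile-time constant. A sketch (the DEF_WRAP/check helpers are mine):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Instantiate a noinline wrapper for one fixed lane index. */
#define DEF_WRAP(LANE)                                        \
  __attribute__((noinline)) static uint8x16_t                 \
  wrap_##LANE (const uint8_t *load, uint8x16_t vec) {         \
    return vld1q_lane_u8 (load, vec, LANE);                   \
  }

DEF_WRAP (11)
DEF_WRAP (12)

/* Returns nonzero if any byte other than `lane` was corrupted
   or the inserted byte did not land in `lane`. */
static int check (uint8x16_t (*wrap) (const uint8_t *, uint8x16_t),
                  int lane, const uint8_t *data) {
  uint8_t out[16], overwrite = 7;
  vst1q_u8 (out, wrap (&overwrite, vld1q_u8 (data)));
  for (int j = 0; j < 16; j++)
    if (out[j] != (j == lane ? overwrite : data[j]))
      return 1;
  return 0;
}

int main (void) {
  uint64_t d[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
  const uint8_t *p = (const uint8_t *) d;
  printf ("lane 11: %s\n", check (wrap_11, 11, p) ? "FAIL" : "ok");
  printf ("lane 12: %s\n", check (wrap_12, 12, p) ? "FAIL" : "ok");
  return 0;
}

With -O3 -fno-inline on the big-endian target, this should print ok for lane 11 and FAIL for lane 12 if the observation above holds.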

Reading the asm, I annotated it with my understanding:

test_vld1q_lane_u8:                     // @test_vld1q_lane_u8
// %bb.0:                               // %entry
        sub     sp, sp, #48
        ld1     { v0.16b }, [x0]
  //  v0 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce}
        stp     x29, x30, [sp, #32]             // 16-byte Folded Spill
        add     x29, sp, #32
        mov     w8, #7
        sub     x0, x29, #4
        sturb   w8, [x29, #-4]
        str     q0, [sp]                        // 16-byte Folded Spill
        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        bl      wrap_vld1q_lane_u8
        rev64   v0.16b, v0.16b
  // v0 = {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13}
        adrp    x8, .LCPI1_0
        add     x8, x8, :lo12:.LCPI1_0
        ldr     q5, [sp]                        // 16-byte Folded Reload
  // v5 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce}
        ext     v0.16b, v0.16b, v0.16b, #8
  // v0 = {0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13, 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12}
        ld1     { v2.8b }, [x8]         // v2: [6, 7, 5, 4, 3, 2, 1, 0, ...]
        adrp    x8, .LCPI1_1
        add     x8, x8, :lo12:.LCPI1_1
        mov     v1.16b, v5.16b          // v1 = v5
        mov     v1.d[1], v5.d[0]
  // v1 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef}
        ext     v3.16b, v0.16b, v0.16b, #8  // v3 <- v0
  // v3 = {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13}
        ld1     { v4.8b }, [x8]         // v4: [1, 0, 2, 3, 4, 5, 6, 7, ...]
        umov    w8, v0.b[7]
        umov    w9, v0.b[6]
        ext     v5.16b, v5.16b, v5.16b, #8
  // v5 = {0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce, 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef}
        mov     v3.d[1], v3.d[0]
  // v3 =  {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12}
        fmov    s6, w8                  // v6 handle [8-11]
        tbl     v1.8b, { v1.16b }, v4.8b  // v1 <- v1[v4]
        umov    w8, v0.b[5]
        tbl     v2.8b, { v3.16b }, v2.8b  // v2 <- v3[v2]
        zip1    v3.8b, v0.8b, v5.8b
  // v3 =  {0x13, 0xce, 0x57, 0x8a, 0x9b, 0x46, 0xdf, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
        mov     v6.h[1], w9
        umov    w9, v0.b[4]
        cmeq    v1.8b, v2.8b, v1.8b
  // {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef} 0-7 done
        rev16   v2.8b, v3.8b            // v2 <- v3
  // v2 = {0xce13, 0x8a57, 0x469b, 0x7df, 0x0, 0x0, 0x0, 0x0}
        mov     v6.h[2], w8
        umov    w8, v1.b[0]
        umov    w10, v1.b[1]
        umov    w11, v1.b[2]
        umov    w12, v1.b[3]
        umov    w13, v1.b[4]
        mov     v6.h[3], w9
        bic     v2.4h, #255, lsl #8
  // v2 = {0xce00, 0x8a00, 0x4600, 0x700, 0x0, 0x0, 0x0, 0x0}
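  // (my reading) the and/lsl/bfi sequence below packs the v1 compare bytes (0xff/0x00) into a scalar bitmask for lanes 0-7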
        and     w8, w8, #0x1
        and     w9, w10, #0x1
        umov    w10, v1.b[5]
        lsl     w9, w9, #6
        and     w11, w11, #0x1
        bfi     w9, w8, #7, #1
        umov    w8, v1.b[6]
        and     w12, w12, #0x1
        bic     v6.4h, #255, lsl #8
        and     w13, w13, #0x1
  // v6 = {0x1300, 0x5700, 0x9b00, 0xdf00, 0x0, 0x0, 0x0, 0x0}
        cmeq    v2.4h, v6.4h, v2.4h

Focus on the cmeq of v2 against v6, which compares lanes [8-11]. v2 is

{0xce00, 0x8a00, 0x4600, 0x700, 0x0, 0x0, 0x0, 0x0}

and v6 is

{0x1300, 0x5700, 0x9b00, 0xdf00, 0x0, 0x0, 0x0, 0x0}

The mismatch is caused by zip1 v3.8b, v0.8b, v5.8b or by rev16 v2.8b, v3.8b.

If I change zip1 v3.8b, v0.8b, v5.8b to zip1 v3.8b, v5.8b, v0.8b, it works.
Alternatively, changing rev16 v2.8b, v3.8b to mov v2.8b, v3.8b also works.
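
These two fixes are actually the same transformation: for 8-bit data, rev16 swaps the two bytes inside every 16-bit group, which is exactly what swapping the operands of zip1 does, i.e. rev16(zip1(b, a)) == zip1(a, b). A small C model of the lane semantics (the helper names are mine), checked with the v0/v5 low halves from the asm above:

#include <assert.h>
#include <stdint.h>

/* zip1 d.8b, n.8b, m.8b: interleave the low four bytes of n and m. */
static void zip1_8b (uint8_t d[8], const uint8_t n[8], const uint8_t m[8]) {
  for (int i = 0; i < 4; i++) {
    d[2 * i]     = n[i];
    d[2 * i + 1] = m[i];
  }
}

/* rev16 d.8b, n.8b: swap the two bytes inside each 16-bit group. */
static void rev16_8b (uint8_t d[8], const uint8_t n[8]) {
  for (int i = 0; i < 8; i += 2) {
    d[i]     = n[i + 1];
    d[i + 1] = n[i];
  }
}

int main (void) {
  const uint8_t v0[8] = {0xce, 0x8a, 0x46, 0x07, 0xdf, 0x9b, 0x57, 0x13};
  const uint8_t v5[8] = {0x13, 0x57, 0x9b, 0xdf, 0x02, 0x46, 0x8a, 0xce};
  uint8_t tmp[8], fix1[8], fix2[8];

  zip1_8b (tmp, v5, v0);   /* fix 1: zip1 with swapped operands ... */
  rev16_8b (fix1, tmp);    /* ... followed by the original rev16    */
  zip1_8b (fix2, v0, v5);  /* fix 2: original zip1, rev16 -> mov    */

  for (int i = 0; i < 8; i++)
    assert (fix1[i] == fix2[i]);   /* both fixes compute the same bytes */
  return 0;
}

If that identity holds, the net effect of the bug is a single spurious byte swap within each 16-bit element on this path, which would explain why either fix repairs the comparison.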

Can anyone give me a clue about this failure?