Registers xmm6 to xmm15 are not saved in Clang 16 with -march=skylake-avx512 and -march=x86-64-v4

Registers xmm6 to xmm15 are not saved in Clang 16 for the following code snippet on Windows. Clang 9.0.1, and also Clang 16 with -march=haswell, seem to save them.

Is this behavior valid?

void foo()
{
    asm
    (
        "vpxord %%zmm0, %%zmm0, %%zmm0\n"
        "vpxord %%zmm1, %%zmm1, %%zmm1\n"
        "vpxord %%zmm2, %%zmm2, %%zmm2\n"
        "vpxord %%zmm3, %%zmm3, %%zmm3\n"
        "vpxord %%zmm4, %%zmm4, %%zmm4\n"
        "vpxord %%zmm5, %%zmm5, %%zmm5\n"
        "vpxord %%zmm6, %%zmm6, %%zmm6\n"
        "vpxord %%zmm7, %%zmm7, %%zmm7\n"
        "vpxord %%zmm8, %%zmm8, %%zmm8\n"
        "vpxord %%zmm9, %%zmm9, %%zmm9\n"
        "vpxord %%zmm10, %%zmm10, %%zmm10\n"
        "vpxord %%zmm11, %%zmm11, %%zmm11\n"
        "vpxord %%zmm12, %%zmm12, %%zmm12\n"
        "vpxord %%zmm13, %%zmm13, %%zmm13\n"
        "vpxord %%zmm14, %%zmm14, %%zmm14\n"
        "vpxord %%zmm15, %%zmm15, %%zmm15\n"
        "vpxord %%zmm16, %%zmm16, %%zmm16\n"
        "vpxord %%zmm17, %%zmm17, %%zmm17\n"
        "vpxord %%zmm18, %%zmm18, %%zmm18\n"
        "vpxord %%zmm19, %%zmm19, %%zmm19\n"
        "vpxord %%zmm20, %%zmm20, %%zmm20\n"
        "vpxord %%zmm21, %%zmm21, %%zmm21\n"
        "vpxord %%zmm22, %%zmm22, %%zmm22\n"
        "vpxord %%zmm23, %%zmm23, %%zmm23\n"
        "vpxord %%zmm24, %%zmm24, %%zmm24\n"
        "vpxord %%zmm25, %%zmm25, %%zmm25\n"
        "vpxord %%zmm26, %%zmm26, %%zmm26\n"
        "vpxord %%zmm27, %%zmm27, %%zmm27\n"
        "vpxord %%zmm28, %%zmm28, %%zmm28\n"
        "vpxord %%zmm29, %%zmm29, %%zmm29\n"
        "vpxord %%zmm30, %%zmm30, %%zmm30\n"
        "vpxord %%zmm31, %%zmm31, %%zmm31\n"
        ::: "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31"
    );
}

With Clang 16.0.0.

-O3 -march=skylake-avx512 -mavx512f --target=x86_64-pc-windows-msvc

"?foo@@YAXXZ":                          # @"?foo@@YAXXZ"
        vpxord  zmm0, zmm0, zmm0
        vpxord  zmm1, zmm1, zmm1
        vpxord  zmm2, zmm2, zmm2
        vpxord  zmm3, zmm3, zmm3
        vpxord  zmm4, zmm4, zmm4
        vpxord  zmm5, zmm5, zmm5
        vpxord  zmm6, zmm6, zmm6
        vpxord  zmm7, zmm7, zmm7
        vpxord  zmm8, zmm8, zmm8
        vpxord  zmm9, zmm9, zmm9
        vpxord  zmm10, zmm10, zmm10
        vpxord  zmm11, zmm11, zmm11
        vpxord  zmm12, zmm12, zmm12
        vpxord  zmm13, zmm13, zmm13
        vpxord  zmm14, zmm14, zmm14
        vpxord  zmm15, zmm15, zmm15
        vpxord  zmm16, zmm16, zmm16
        vpxord  zmm17, zmm17, zmm17
        vpxord  zmm18, zmm18, zmm18
        vpxord  zmm19, zmm19, zmm19
        vpxord  zmm20, zmm20, zmm20
        vpxord  zmm21, zmm21, zmm21
        vpxord  zmm22, zmm22, zmm22
        vpxord  zmm23, zmm23, zmm23
        vpxord  zmm24, zmm24, zmm24
        vpxord  zmm25, zmm25, zmm25
        vpxord  zmm26, zmm26, zmm26
        vpxord  zmm27, zmm27, zmm27
        vpxord  zmm28, zmm28, zmm28
        vpxord  zmm29, zmm29, zmm29
        vpxord  zmm30, zmm30, zmm30
        vpxord  zmm31, zmm31, zmm31

        ret

With Clang 9.0.1
-O3 -march=skylake-avx512 -mavx512f --target=x86_64-pc-windows-msvc

"?foo@@YAXXZ":                          # @"?foo@@YAXXZ"
        sub     rsp, 168
        vmovaps xmmword ptr [rsp + 144], xmm15 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 128], xmm14 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 112], xmm13 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 96], xmm12 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 80], xmm11 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 64], xmm10 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 48], xmm9 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 32], xmm8 # 16-byte Spill
        vmovaps xmmword ptr [rsp + 16], xmm7 # 16-byte Spill
        vmovaps xmmword ptr [rsp], xmm6 # 16-byte Spill
        vpxord  zmm0, zmm0, zmm0
        vpxord  zmm1, zmm1, zmm1
        vpxord  zmm2, zmm2, zmm2
        vpxord  zmm3, zmm3, zmm3
        vpxord  zmm4, zmm4, zmm4
        vpxord  zmm5, zmm5, zmm5
        vpxord  zmm6, zmm6, zmm6
        vpxord  zmm7, zmm7, zmm7
        vpxord  zmm8, zmm8, zmm8
        vpxord  zmm9, zmm9, zmm9
        vpxord  zmm10, zmm10, zmm10
        vpxord  zmm11, zmm11, zmm11
        vpxord  zmm12, zmm12, zmm12
        vpxord  zmm13, zmm13, zmm13
        vpxord  zmm14, zmm14, zmm14
        vpxord  zmm15, zmm15, zmm15
        vpxord  zmm16, zmm16, zmm16
        vpxord  zmm17, zmm17, zmm17
        vpxord  zmm18, zmm18, zmm18
        vpxord  zmm19, zmm19, zmm19
        vpxord  zmm20, zmm20, zmm20
        vpxord  zmm21, zmm21, zmm21
        vpxord  zmm22, zmm22, zmm22
        vpxord  zmm23, zmm23, zmm23
        vpxord  zmm24, zmm24, zmm24
        vpxord  zmm25, zmm25, zmm25
        vpxord  zmm26, zmm26, zmm26
        vpxord  zmm27, zmm27, zmm27
        vpxord  zmm28, zmm28, zmm28
        vpxord  zmm29, zmm29, zmm29
        vpxord  zmm30, zmm30, zmm30
        vpxord  zmm31, zmm31, zmm31

        vmovaps xmm6, xmmword ptr [rsp] # 16-byte Reload
        vmovaps xmm7, xmmword ptr [rsp + 16] # 16-byte Reload
        vmovaps xmm8, xmmword ptr [rsp + 32] # 16-byte Reload
        vmovaps xmm9, xmmword ptr [rsp + 48] # 16-byte Reload
        vmovaps xmm10, xmmword ptr [rsp + 64] # 16-byte Reload
        vmovaps xmm11, xmmword ptr [rsp + 80] # 16-byte Reload
        vmovaps xmm12, xmmword ptr [rsp + 96] # 16-byte Reload
        vmovaps xmm13, xmmword ptr [rsp + 112] # 16-byte Reload
        vmovaps xmm14, xmmword ptr [rsp + 128] # 16-byte Reload
        vmovaps xmm15, xmmword ptr [rsp + 144] # 16-byte Reload
        add     rsp, 168
        vzeroupper
        ret

With Clang 16.0.0
-O3 -march=haswell -mavx512f --target=x86_64-pc-windows-msvc

"?foo@@YAXXZ":                          # @"?foo@@YAXXZ"
        sub     rsp, 168
        vmovaps xmmword ptr [rsp + 144], xmm15  # 16-byte Spill
        vmovaps xmmword ptr [rsp + 128], xmm14  # 16-byte Spill
        vmovaps xmmword ptr [rsp + 112], xmm13  # 16-byte Spill
        vmovaps xmmword ptr [rsp + 96], xmm12   # 16-byte Spill
        vmovaps xmmword ptr [rsp + 80], xmm11   # 16-byte Spill
        vmovaps xmmword ptr [rsp + 64], xmm10   # 16-byte Spill
        vmovaps xmmword ptr [rsp + 48], xmm9    # 16-byte Spill
        vmovaps xmmword ptr [rsp + 32], xmm8    # 16-byte Spill
        vmovaps xmmword ptr [rsp + 16], xmm7    # 16-byte Spill
        vmovaps xmmword ptr [rsp], xmm6         # 16-byte Spill
        vpxord  zmm0, zmm0, zmm0
        vpxord  zmm1, zmm1, zmm1
        vpxord  zmm2, zmm2, zmm2
        vpxord  zmm3, zmm3, zmm3
        vpxord  zmm4, zmm4, zmm4
        vpxord  zmm5, zmm5, zmm5
        vpxord  zmm6, zmm6, zmm6
        vpxord  zmm7, zmm7, zmm7
        vpxord  zmm8, zmm8, zmm8
        vpxord  zmm9, zmm9, zmm9
        vpxord  zmm10, zmm10, zmm10
        vpxord  zmm11, zmm11, zmm11
        vpxord  zmm12, zmm12, zmm12
        vpxord  zmm13, zmm13, zmm13
        vpxord  zmm14, zmm14, zmm14
        vpxord  zmm15, zmm15, zmm15
        vpxord  zmm16, zmm16, zmm16
        vpxord  zmm17, zmm17, zmm17
        vpxord  zmm18, zmm18, zmm18
        vpxord  zmm19, zmm19, zmm19
        vpxord  zmm20, zmm20, zmm20
        vpxord  zmm21, zmm21, zmm21
        vpxord  zmm22, zmm22, zmm22
        vpxord  zmm23, zmm23, zmm23
        vpxord  zmm24, zmm24, zmm24
        vpxord  zmm25, zmm25, zmm25
        vpxord  zmm26, zmm26, zmm26
        vpxord  zmm27, zmm27, zmm27
        vpxord  zmm28, zmm28, zmm28
        vpxord  zmm29, zmm29, zmm29
        vpxord  zmm30, zmm30, zmm30
        vpxord  zmm31, zmm31, zmm31

        vmovaps xmm6, xmmword ptr [rsp]         # 16-byte Reload
        vmovaps xmm7, xmmword ptr [rsp + 16]    # 16-byte Reload
        vmovaps xmm8, xmmword ptr [rsp + 32]    # 16-byte Reload
        vmovaps xmm9, xmmword ptr [rsp + 48]    # 16-byte Reload
        vmovaps xmm10, xmmword ptr [rsp + 64]   # 16-byte Reload
        vmovaps xmm11, xmmword ptr [rsp + 80]   # 16-byte Reload
        vmovaps xmm12, xmmword ptr [rsp + 96]   # 16-byte Reload
        vmovaps xmm13, xmmword ptr [rsp + 112]  # 16-byte Reload
        vmovaps xmm14, xmmword ptr [rsp + 128]  # 16-byte Reload
        vmovaps xmm15, xmmword ptr [rsp + 144]  # 16-byte Reload
        add     rsp, 168
        vzeroupper
        ret

The spill is unnecessary, there are no other uses of ZMM registers in your function. If you add an argument of type __m512 you should see ZMM0 get spilled.

I think srides’ point is that according to Windows x64 ABI XMM6 to XMM15 must be preserved by the function.
I don’t see any clause there that would limit this requirement to functions with a vector argument.

Somehow Clang 16 seems to comply with the ms_abi calling convention when YMM registers are clobbered, but not when clobbering ZMM registers.

Sorry, my bad, I can reproduce this behaviour on current main as well and it seems to me like a bug, can you please create a bug report?

I’ve created [X86] Callee saved ZMM[6-15] not preserved before being clobbered by inline asm as required by the Windows x64 ABI · Issue #63615 · llvm/llvm-project · GitHub.

Thank you n-omer.
Another datapoint: registers get correctly spilled when -mprefer-vector-width is >=512, which explains the regression since clang 9.0.1.

I think the issue is in the frontend. The inline assembly handling should update LargestVectorWidth in CodeGenFunction to 512 when it detects the clobber of any zmm register. This will cause the "min-legal-vector-width" attribute to be 512 instead of 0, which in turn will make 512-bit vectors a legal type in the backend.