Hello,
I am using LLVM on my Core i7 laptop, which has no AVX support.
My goal is to generate AVX-512 code (loop vectorization) for Knights Landing/Skylake.
My .c code is:
/* Number of elements in each of the three global arrays. */
#define N 256

int a[N], b[N], c[N];

/*
 * Element-wise sum of the global arrays: a[i] = b[i] + c[i] for every
 * index in [0, N).  Reads b and c, writes a; takes no arguments and
 * returns nothing.
 *
 * Declared as `void foo(void)`: the original `foo ()` relied on the
 * implicit-int return type (removed in C99) while never returning a
 * value, and its empty parameter list was not a prototype.
 */
void foo(void) {
    int i;

    for (i = 0; i < N; i++) {
        a[i] = b[i] + c[i];
    }
}
I first generated its .ll file via clang:
clang -S -emit-llvm test.c -o test.ll
Then I optimized it:
opt -S -O3 test.ll -o test_o3.ll
Then I used llc for code generation:
llc -mcpu=skylake-avx512 -mattr=+avx512f test_o3.ll -o test_o3.s
llc -mcpu=knl -mattr=+avx512f test_o3.ll -o test_o3.s
Here is my generated code:
#-----------------------------------------------------------------------
# foo -- compiler-generated (llc) listing, AT&T / GAS syntax, x86-64 ELF.
#
# Adds the 1024-byte arrays b and c element-wise into a.
# In/Out: no register arguments; no return value is set up in %rax.
# Clobb:  %rax, %xmm0, %xmm1, flags (%rbp is saved and restored).
#
# Every vector instruction below operates on a 128-bit %xmm register,
# i.e. 16 bytes per load/add/store; one loop iteration issues four such
# groups and therefore advances 64 bytes.
#-----------------------------------------------------------------------
        .text
        .file   "filer_o3.ll"
        .globl  foo
        .p2align        4, 0x90
        .type   foo,@function
foo:                                    # @foo
        .cfi_startproc
# BB#0:                                 # %min.iters.checked
        pushq   %rbp
.Ltmp0:
        .cfi_def_cfa_offset 16
.Ltmp1:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
.Ltmp2:
        .cfi_def_cfa_register %rbp
        # %rax is a negative byte offset that counts up from -1024 to 0,
        # so sym+1024(%rax) walks each array from its first byte onward.
        movq    $-1024, %rax            # imm = 0xFC00
        .p2align        4, 0x90
.LBB0_1:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        # Bytes 0..31 of this 64-byte step: load c, add b, store to a.
        vmovdqa32       c+1024(%rax), %xmm0
        vmovdqa32       c+1040(%rax), %xmm1
        vpaddd  b+1024(%rax), %xmm0, %xmm0
        vpaddd  b+1040(%rax), %xmm1, %xmm1
        vmovdqa32       %xmm0, a+1024(%rax)
        vmovdqa32       %xmm1, a+1040(%rax)
        # Bytes 32..63 of this step, same pattern.
        vmovdqa32       c+1056(%rax), %xmm0
        vmovdqa32       c+1072(%rax), %xmm1
        vpaddd  b+1056(%rax), %xmm0, %xmm0
        vpaddd  b+1072(%rax), %xmm1, %xmm1
        vmovdqa32       %xmm0, a+1056(%rax)
        vmovdqa32       %xmm1, a+1072(%rax)
        # The addq sets ZF when the offset reaches 0, which ends the loop.
        addq    $64, %rax
        jne     .LBB0_1
# BB#2:                                 # %middle.block
        popq    %rbp
        retq
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc

        # Each array is a 1024-byte common symbol with 16-byte alignment,
        # matching the 16-byte aligned vmovdqa32 accesses above.
        .type   b,@object               # @b
        .comm   b,1024,16
        .type   c,@object               # @c
        .comm   c,1024,16
        .type   a,@object               # @a
        .comm   a,1024,16

        .ident  "clang version 3.9.0 (tags/RELEASE_390/final)"
        .section        ".note.GNU-stack","",@progbits
In the generated code, although vmov… instructions are used, there are no zmm registers — only xmm registers.
Can you please tell me where I am going wrong? I have tried several times with different parameters, but I always get xmm registers.
Thank you.