Vectorization does not work

Hello, I’m new to vectorization, and I’m feeling confused about it.

Despite having LLVM version 17.0.0git, when I run opt --help, it doesn’t show any information about vectorization options. Additionally, my test file remains unvectorized.

Could you let me know what is going wrong?
Any advice is appreciated :slight_smile:

My build script is

cmake -G Ninja ../llvm \
   -DLLVM_ENABLE_PROJECTS="mlir;clang"\
   -DLLVM_BUILD_EXAMPLES=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLLVM_USE_LINKER=lld -DLLVM_CCACHE_BUILD=ON   \
   -DLLVM_TARGETS_TO_BUILD="Native;" \
   -DCMAKE_BUILD_TYPE=Release \
   -DLLVM_ENABLE_ASSERTIONS=ON

My running script is

build/bin/clang -S -emit-llvm -O0  myfolder/hello.c -o myfolder/hello.ll
build/bin/opt -vectorize-loops -vectorize-slp myfolder/hello.ll -o myfolder/hello-vec.bc
llvm-dis myfolder/hello-vec.bc -o myfolder/hello-vec.ll

My opt --help is

$ build/bin/opt --help | grep vec
    =data-without-lane-mask                                             -   Create lane mask with compare/stepvector
      --expandvp                                                           - Expand vector predication intrinsics
      --interleaved-load-combine                                           - Combine interleaved loads into wide loads and shufflevector instructions
      --load-store-vectorizer                                              - Vectorize load and store instructions
      --replace-with-veclib                                                - Replace intrinsics with calls to vector library
      --scalarizer                                                         - Scalarize vector operations
  --vec-extabi                                                          - Enable the AIX Extended Altivec ABI.

My input file is

#include <stdio.h>
int main(int argc, char * argv[]) {

    int a[100];
    for(int i = 0; i < 100; i++) {
        a[i] = i;
    }

    return 0;
}

myfolder/hello.ll is (and it remains unchanged after opt -vectorize-loops)

; ModuleID = 'input/hello.c'
source_filename = "input/hello.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @main(i32 noundef %argc, ptr noundef %argv) #0 {
entry:
  %retval = alloca i32, align 4
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca ptr, align 8
  %a = alloca [100 x i32], align 16
  %i = alloca i32, align 4
  store i32 0, ptr %retval, align 4
  store i32 %argc, ptr %argc.addr, align 4
  store ptr %argv, ptr %argv.addr, align 8
  store i32 0, ptr %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, ptr %i, align 4
  %cmp = icmp slt i32 %0, 100
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %1 = load i32, ptr %i, align 4
  %2 = load i32, ptr %i, align 4
  %idxprom = sext i32 %2 to i64
  %arrayidx = getelementptr inbounds [100 x i32], ptr %a, i64 0, i64 %idxprom
  store i32 %1, ptr %arrayidx, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %3 = load i32, ptr %i, align 4
  %inc = add nsw i32 %3, 1
  store i32 %inc, ptr %i, align 4
  br label %for.cond, !llvm.loop !6

for.end:                                          ; preds = %for.cond
  ret i32 0
}

attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="alderlake" "target-features"="+64bit,+adx,+aes,+avx,+avx2,+avxvnni,+bmi,+bmi2,+clflushopt,+clwb,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+gfni,+hreset,+invpcid,+kl,+lzcnt,+mmx,+movbe,+movdir64b,+movdiri,+pclmul,+pconfig,+pku,+popcnt,+prfchw,+ptwrite,+rdpid,+rdrnd,+rdseed,+sahf,+serialize,+sha,+shstk,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+waitpkg,+widekl,+x87,+xsave,+xsavec,+xsaveopt,+xsaves,-amx-bf16,-amx-fp16,-amx-int8,-amx-tile,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnniint8,-cldemote,-clzero,-cmpccxadd,-enqcmd,-fma4,-lwp,-mwaitx,-prefetchi,-prefetchwt1,-raoint,-rdpru,-rtm,-sgx,-sse4a,-tbm,-tsxldtrk,-uintr,-wbnoinvd,-xop" }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git 029313cc979ae71877b65794b1063d4e51184cc8)"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}

You compiled the C source with -O0, which will have some consequences for the generated bitcode: it will mark all functions in the module as optnone (which will turn off optimizations for those functions), plus it will not generate any aliasing metadata (which will cause legality checks to fail).

In general, if you’re seeing undesirable output from the vectorizer, you can run it with a debug flag

opt -passes=loop-vectorize -S < hello.ll -debug-only=loop-vectorize

which will show you what the vectorizer is doing in more detail.

PS. Use the -passes= flag to run a pass (or multiple passes).

1 Like

Thank you for your reply!

After trying this, I found that my code is still not vectorized.
Could you please tell me what I need to do to find out why my code is not vectorized?
I really have no clue :smiling_face_with_tear:

Thank you for letting me know.

; ModuleID = '<stdin>'
source_filename = "input/hello.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @main(i32 noundef %argc, ptr noundef %argv) #0 {
entry:
  %retval = alloca i32, align 4
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca ptr, align 8
  %a = alloca [100 x i32], align 16
  %i = alloca i32, align 4
  store i32 0, ptr %retval, align 4
  store i32 %argc, ptr %argc.addr, align 4
  store ptr %argv, ptr %argv.addr, align 8
  store i32 0, ptr %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, ptr %i, align 4
  %cmp = icmp slt i32 %0, 100
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %1 = load i32, ptr %i, align 4
  %2 = load i32, ptr %i, align 4
  %idxprom = sext i32 %2 to i64
  %arrayidx = getelementptr inbounds [100 x i32], ptr %a, i64 0, i64 %idxprom
  store i32 %1, ptr %arrayidx, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %3 = load i32, ptr %i, align 4
  %inc = add nsw i32 %3, 1
  store i32 %inc, ptr %i, align 4
  br label %for.cond, !llvm.loop !6

for.end:                                          ; preds = %for.cond
  ret i32 0
}

attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="alderlake" "target-features"="+64bit,+adx,+aes,+avx,+avx2,+avxvnni,+bmi,+bmi2,+clflushopt,+clwb,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+gfni,+hreset,+invpcid,+kl,+lzcnt,+mmx,+movbe,+movdir64b,+movdiri,+pclmul,+pconfig,+pku,+popcnt,+prfchw,+ptwrite,+rdpid,+rdrnd,+rdseed,+sahf,+serialize,+sha,+shstk,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+waitpkg,+widekl,+x87,+xsave,+xsavec,+xsaveopt,+xsaves,-amx-bf16,-amx-fp16,-amx-int8,-amx-tile,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnniint8,-cldemote,-clzero,-cmpccxadd,-enqcmd,-fma4,-lwp,-mwaitx,-prefetchi,-prefetchwt1,-raoint,-rdpru,-rtm,-sgx,-sse4a,-tbm,-tsxldtrk,-uintr,-wbnoinvd,-xop" }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git 029313cc979ae71877b65794b1063d4e51184cc8)"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}

The IR you shared here still has the optnone attribute.

Most passes won’t work effectively directly on the IR that clang generates, but require certain other passes to run before.

When running the full pipeline you can see that your code gets optimized away (you are storing to a local array that is never used):

1 Like

Oh, it was my mistake as I am not familiar with it.
I tried it on Godbolt, and it works!
I really appreciate your help.

In addition to what was already said, you can change your source to the following (to avoid the loop being optimized away):

int a[100];

int main(int argc, char * argv[]) {
    for(int i = 0; i < 100; i++) {
        a[i] = i;
    }
    return 0;
}

Then you can get the .ll file with -O1 (to prevent unrolling and vectorization from running by default):

clang -c hello.c -O1 -S -emit-llvm

And then you can use opt to run the vectorizer pass:

opt -passes=loop-vectorize -S < hello.ll

And you should see vectorized code.

Obviously, if you don’t want to run opt by hand, you can use -O2 or -O3 and have clang run the vectorizer automatically. You can use -Rpass=loop-vectorize to get optimization remarks about vectorization:

clang -c hello.c -O2 -Rpass=loop-vectorize
hello.c:5:5: remark: vectorized loop (vectorization width: 4, interleaved count: 2) [-Rpass=loop-vectorize]
    5 |     for(int i = 0; i < 100; i++) {
      |     ^
1 Like