How does the auto-vectorizer choose the LMUL value for the RISC-V V extension? It always seems to pick LMUL=1, even when the option --riscv-v-fixed-length-vector-lmul-max=8 is specified.
The sample IR below (MLIR LLVM dialect) was generated internally by IREE for a fixed tensor size.
// Module targeting riscv64 (see llvm.target_triple) holding a single dispatch
// function that performs an element-wise f32 subtraction: out[i] = a[i] - b[i].
builtin.module attributes {llvm.data_layout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128", llvm.target_triple = "riscv64"} {
// forward_dispatch_0
//   %arg0: pointer to the dispatch state struct; fields 0 and 5 are read below.
//   %arg1: pointer to an array<3 x i32>; only element 0 is read
//          (presumably the workgroup id -- TODO confirm against IREE HAL).
//   %arg2: not referenced anywhere in this function body.
//   Returns the i32 constant 0.
llvm.func internal @forward_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
// Shared constants: %0 = 0, %1 = 4 (outer-loop upper bound), %2 = 1 (inner-loop step).
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(4 : index) : i64
%2 = llvm.mlir.constant(1 : index) : i64
// First operand pointer: state field 5 (a ptr<ptr<i8>> table), entry 0,
// offset by 0 bytes and reinterpreted as ptr<f32> -> %9.
// (Field 5 is presumably the binding-pointer table -- TODO confirm.)
%3 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%4 = llvm.extractvalue %3[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%5 = llvm.mlir.constant(0 : i64) : i64
%6 = llvm.getelementptr %4[%5] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.getelementptr %7[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%9 = llvm.bitcast %8 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Second operand pointer: same sequence with table entry 1 -> %16.
// The state struct is reloaded from %arg0 rather than reusing %3.
%10 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%11 = llvm.extractvalue %10[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%12 = llvm.mlir.constant(1 : i64) : i64
%13 = llvm.getelementptr %11[%12] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%14 = llvm.load %13 : !llvm.ptr<ptr<i8>>
%15 = llvm.getelementptr %14[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Result pointer: same sequence with table entry 2 -> %23 (the store target in ^bb4).
%17 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.mlir.constant(2 : i64) : i64
%20 = llvm.getelementptr %18[%19] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%21 = llvm.load %20 : !llvm.ptr<ptr<i8>>
%22 = llvm.getelementptr %21[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%23 = llvm.bitcast %22 : !llvm.ptr<i8> to !llvm.ptr<f32>
// %26 = zero-extended element 0 of the array behind %arg1.
%24 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%25 = llvm.extractvalue %24[0] : !llvm.array<3 x i32>
%26 = llvm.zext %25 : i32 to i64
// %30 = zero-extended element 0 of state field 0
// (presumably the workgroup count along the first dimension -- TODO confirm).
%27 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%28 = llvm.extractvalue %27[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%29 = llvm.extractvalue %28[0] : !llvm.array<3 x i32>
%30 = llvm.zext %29 : i32 to i64
// Tile size 64: the outer loop starts at %32 = %26 * 64 and advances by %33 = %30 * 64.
%31 = llvm.mlir.constant(64 : index) : i64
%32 = llvm.mul %26, %31 : i64
%33 = llvm.mul %30, %31 : i64
llvm.br ^bb1(%32 : i64)
// Outer loop header: %34 is the tile start offset; iterate while %34 < 4 (signed).
^bb1(%34: i64): // 2 preds: ^bb0, ^bb5
%35 = llvm.icmp "slt" %34, %1 : i64
llvm.cond_br %35, ^bb2, ^bb6
// Outer loop body: compute the inner trip count and base offset for this tile.
^bb2: // pred: ^bb1
// %38 = 4 - %34 (elements remaining); %40 = signed min(64, %38) is the inner trip count.
// Both bounds are runtime SSA values here, not literal constants in the compare.
%36 = llvm.mlir.constant(-1 : index) : i64
%37 = llvm.mul %34, %36 : i64
%38 = llvm.add %37, %1 : i64
%39 = llvm.icmp "slt" %31, %38 : i64
%40 = llvm.select %39, %31, %38 : i1, i64
// The three bitcasts in this block are same-type (ptr<f32> to ptr<f32>) and only rename values.
%41 = llvm.bitcast %9 : !llvm.ptr<f32> to !llvm.ptr<f32>
// %43 = 0 + %34 * 1, i.e. the tile's base element offset equals %34.
%42 = llvm.mul %34, %2 : i64
%43 = llvm.add %0, %42 : i64
%44 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<f32>
%45 = llvm.bitcast %23 : !llvm.ptr<f32> to !llvm.ptr<f32>
llvm.br ^bb3(%0 : i64)
// Inner loop header: %46 counts from 0 while %46 < %40 (signed).
^bb3(%46: i64): // 2 preds: ^bb2, ^bb4
%47 = llvm.icmp "slt" %46, %40 : i64
llvm.cond_br %47, ^bb4, ^bb5
// Inner loop body: one scalar f32 element per iteration at index %48 = %43 + %46.
^bb4: // pred: ^bb3
%48 = llvm.add %43, %46 : i64
%49 = llvm.getelementptr %41[%48] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%50 = llvm.load %49 : !llvm.ptr<f32>
%51 = llvm.getelementptr %44[%48] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%52 = llvm.load %51 : !llvm.ptr<f32>
// Subtract the second operand from the first and store to the result pointer.
%53 = llvm.fsub %50, %52 : f32
%54 = llvm.getelementptr %45[%48] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %53, %54 : !llvm.ptr<f32>
%55 = llvm.add %46, %2 : i64
llvm.br ^bb3(%55 : i64)
// Outer loop latch: advance the tile start by %33 and re-test in ^bb1.
^bb5: // pred: ^bb3
%56 = llvm.add %34, %33 : i64
llvm.br ^bb1(%56 : i64)
// Exit: return 0.
^bb6: // pred: ^bb1
%57 = llvm.mlir.constant(0 : i32) : i32
llvm.return %57 : i32
}
}
This produces:
vsetvli s1, zero, e16, m1
for a fixed-size 128-element array, where m4 or m8 might be a better choice.