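I want to add some new RISC-V vector intrinsic functions and implement them with inline assembly. I found that the order of the generated assembly instructions does not change: the inline-assembly instructions are emitted exactly in source order.
Is this because inline assembly cannot be scheduled?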
This is my simple test:
// Normal Code
/*
 * Intrinsic-based variant: adds the float arrays A and B element-wise and
 * stores the result in C, n elements in total, one strip of l elements per
 * loop iteration.
 *
 * A, B: input arrays, read with __riscv_vle32_v_f32m8.
 * C:    output array, written with __riscv_vse32_v_f32m8.
 * n:    number of elements still to process; the loop ends when it reaches 0.
 *
 * The statement order below is the order under test in this experiment and is
 * kept exactly as written.
 */
void add_rvv(float *A, float *B, float *C, size_t n) {
// l: strip length returned by __riscv_vsetvl_e32m8 for the current iteration.
size_t l;
vfloat32m8_t va, vb, vc;
// n is reduced by the strip length after every iteration.
for (; n > 0; n -= l) {
l = __riscv_vsetvl_e32m8(n);
// Load one strip from A, then advance A past it.
va = __riscv_vle32_v_f32m8(A, l);
A += l;
// Load one strip from B, then advance B past it.
vb = __riscv_vle32_v_f32m8(B, l);
B += l;
// vc = va + vb for the current strip.
vc = __riscv_vfadd_vv_f32m8(va, vb, l);
// Store the strip to C, then advance C past it.
__riscv_vse32_v_f32m8(C, vc, l);
C += l;
}
}
// Asm Code
/*
 * Inline-assembly variant of the same loop: every vector instruction is issued
 * through its own `asm volatile` statement instead of an intrinsic.
 *
 * The vector registers are hard-coded in the asm strings (v8 and v16); only the
 * base address of each memory access is passed in, as an "r" input operand.
 *
 * NOTE(review): none of the asm statements declares outputs or clobbers, so the
 * compiler is not told that v8/v16 are written, that memory is read (vle32.v)
 * or written (vse32.v), or that the instructions rely on the vl/vtype state set
 * by __riscv_vsetvl_e32m8. Presumably acceptable for this scheduling
 * experiment; confirm before reusing the pattern in real code.
 */
void add_rvv(float *A, float *B, float *C, size_t n) {
// l: strip length returned by __riscv_vsetvl_e32m8 for the current iteration.
size_t l;
// n is reduced by the strip length after every iteration.
for (; n > 0; n -= l) {
l = __riscv_vsetvl_e32m8(n);
// vle32.v from the address in A into v8, then advance A past the strip.
asm volatile("vle32.v v8, (%0)" : : "r" (A) );
A += l;
// vle32.v from the address in B into v16, then advance B past the strip.
asm volatile("vle32.v v16, (%0)" :: "r" (B));
B += l;
// v8 = v8 + v16; no operands, the registers are fixed in the asm string.
asm volatile("vfadd.vv v8, v8, v16" ::);
// vse32.v of v8 to the address in C, then advance C past the strip.
asm volatile("vse32.v v8, (%0)" :: "r" (C));
C += l;
}
}
Using -O3, the generated instructions are as follows:
// Normal Code generated
add_rvv: # @add_rvv
# %bb.0: # %entry
beqz a3, .LBB0_2
.LBB0_1: # %for.body
# =>This Inner Loop Header: Depth=1
vsetvli a4, a3, e32, m8, ta, mu
vle32.v v8, (a0)
vle32.v v16, (a1)
slli a5, a4, 2
add a0, a0, a5
add a1, a1, a5
vfadd.vv v8, v8, v16
add a1, a1, a5
vse32.v v8, (a2)
sub a3, a3, a4
add a2, a2, a5
bnez a3, .LBB0_1
.LBB0_2: # %for.end
ret
// Asm Code generated
add_rvv: # @add_rvv
# %bb.0: # %entry
beqz a3, .LBB0_2
.LBB0_1: # %for.body
# =>This Inner Loop Header: Depth=1
vsetvli a4, a3, e32, m8, ta, mu
vle32.v v8, (a0)
slli a5, a4, 2
add a0, a0, a5
vle32.v v16, (a1)
add a1, a1, a5
vfadd.vv v8, v8, v16
vse32.v v8, (a2)
sub a3, a3, a4
add a2, a2, a5
bnez a3, .LBB0_1
.LBB0_2: # %for.end
ret
SelectionDAG has 55 nodes:
t0: ch,glue = EntryToken
t3: i64,ch = CopyFromReg t0, Register:i64 %3
t6: i64,ch = llvm.riscv.vsetvli t0, TargetConstant:i64<8204>, t3, TargetConstant:i64<2>, TargetConstant:i64<3>
t8: i64,ch = CopyFromReg t0, Register:i64 %0
t13: ch,glue = CopyToReg t6:1, Register:i64 %12, t8
t17: i64 = shl t6, Constant:i64<2>
t22: i64,ch = CopyFromReg t0, Register:i64 %1
t15: ch,glue = inlineasm t13, TargetExternalSymbol:i64'vle32.v v8, ($0)', MDNode:ch<0x557d74564c48>, TargetConstant:i64<1>, TargetConstant:i32<327689>, Register:i64 %12, t13:1
t26: ch,glue = CopyToReg t15, Register:i64 %13, t22
t35: i64,ch = CopyFromReg t0, Register:i64 %2
t27: ch,glue = inlineasm t26, TargetExternalSymbol:i64'vle32.v v16, ($0)', MDNode:ch<0x557d74564d48>, TargetConstant:i64<1>, TargetConstant:i32<327689>, Register:i64 %13, t26:1
t33: ch,glue = inlineasm t27, TargetExternalSymbol:i64'vfadd.vv v8, v8, v16', MDNode:ch<0x557d745583f8>, TargetConstant:i64<1>
t39: ch,glue = CopyToReg t33, Register:i64 %14, t35
t44: i64 = sub t3, t6
t18: i64 = add t8, t17
t20: ch = CopyToReg t0, Register:i64 %4, t18
t28: i64 = add t22, t17
t30: ch = CopyToReg t0, Register:i64 %5, t28
t41: i64 = add t35, t17
t43: ch = CopyToReg t0, Register:i64 %6, t41
t46: ch = CopyToReg t0, Register:i64 %7, t44
t40: ch,glue = inlineasm t39, TargetExternalSymbol:i64'vse32.v v8, ($0)', MDNode:ch<0x557d74558748>, TargetConstant:i64<1>, TargetConstant:i32<327689>, Register:i64 %14, t39:1
t52: ch = TokenFactor t20, t30, t43, t46, t40
t58: i1 = setcc t44, Constant:i64<0>, setne:ch
t59: ch = brcond t52, t58, BasicBlock:ch<for.body 0x557d745f5848>
t56: ch = br t59, BasicBlock:ch<for.end 0x557d745f5ae8>
In the SelectionDAG, I found that the inline-assembly nodes carry an additional 'glue' result that ties each one to its neighbouring nodes. Is this what prevents them from being scheduled normally? And will it affect optimizations in other stages? Thank you!