Will inline assembly code be scheduled normally?

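I want to add some new RISC-V vector intrinsic functions and implement them with inline assembly. However, I found that the order of the generated assembly instructions does not change: the inline assembly instructions stay in exactly their source order.
Is this because inline assembly cannot be scheduled?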
This is my simple test:

//  Normal Code
/*
 * Element-wise float addition written with RVV intrinsics.
 *
 * Every pass of the loop loads l elements from A and from B, adds them and
 * stores the result to C, then advances all three pointers by l.
 *
 * A, B: input arrays.
 * C:    output array.
 * n:    number of elements still to process; the loop ends when it reaches 0.
 */
void add_rvv(float *A, float *B, float *C,  size_t n) {  
    /* Element count handled in the current pass (result of the vsetvl call). */
    size_t l;
    vfloat32m8_t va, vb, vc;

    /* n shrinks by l after every pass. */
    for (; n > 0; n -= l)  {
         /* Ask for up to n 32-bit elements; l is what was actually granted. */
         l = __riscv_vsetvl_e32m8(n);
         va = __riscv_vle32_v_f32m8(A, l);
         A += l;
         vb = __riscv_vle32_v_f32m8(B, l);
         B += l;
         vc = __riscv_vfadd_vv_f32m8(va, vb, l);
         __riscv_vse32_v_f32m8(C, vc, l); 
         C += l;
     }
}

// Asm Code
/*
 * Same loop as the intrinsic version, but the two loads, the add and the
 * store are issued as four separate `asm volatile` statements that name the
 * vector registers v8 and v16 directly in the instruction text.
 *
 * A, B: input arrays.
 * C:    output array.
 * n:    number of elements still to process; the loop ends when it reaches 0.
 *
 * NOTE(review): each asm statement lists at most one input operand (the
 * pointer) and no outputs or clobbers. The writes to v8/v16, the memory
 * accessed through A, B and C, and the reliance on the preceding
 * __riscv_vsetvl_e32m8 call are therefore not described to the compiler.
 * Confirm this is acceptable before reusing the pattern.
 */
void add_rvv(float *A, float *B, float *C,  size_t n) {  
    /* Element count handled in the current pass (result of the vsetvl call). */
    size_t l;

    /* n shrinks by l after every pass. */
    for (; n > 0; n -= l)  {
         l = __riscv_vsetvl_e32m8(n);
         /* Load from A into v8. */
         asm volatile("vle32.v v8, (%0)" : : "r" (A) );
         A += l;
         /* Load from B into v16. */
         asm volatile("vle32.v v16, (%0)" :: "r" (B));
         B += l;
        /* v8 = v8 + v16; no operands are passed, registers are hard-coded. */
        asm volatile("vfadd.vv v8, v8, v16" ::);
        /* Store v8 to C. */
        asm volatile("vse32.v v8, (%0)" :: "r" (C));
         C += l;
     }
}

Using -O3, the generated instructions are as follows:

// Normal Code generated 
add_rvv:                                # @add_rvv
# %bb.0:                                # %entry
    beqz    a3, .LBB0_2
.LBB0_1:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
    vsetvli a4, a3, e32, m8, ta, mu
    vle32.v v8, (a0)
    vle32.v v16, (a1)
    slli    a5, a4, 2
    add a0, a0, a5
    add a1, a1, a5
    vfadd.vv    v8, v8, v16
    add a1, a1, a5
    vse32.v v8, (a2)
    sub a3, a3, a4
    add a2, a2, a5
    bnez    a3, .LBB0_1
.LBB0_2:                                # %for.end
    ret
// Asm Code generated 
add_rvv:                                # @add_rvv
# %bb.0:                                # %entry
    beqz    a3, .LBB0_2
.LBB0_1:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
    vsetvli a4, a3, e32, m8, ta, mu
    vle32.v v8, (a0)
    slli    a5, a4, 2                   
    add a0, a0, a5
    vle32.v v16, (a1)
    add a1, a1, a5
    vfadd.vv    v8, v8, v16
    vse32.v v8, (a2)
    sub a3, a3, a4
    add a2, a2, a5
    bnez    a3, .LBB0_1
.LBB0_2:                                # %for.end
    ret
SelectionDAG has 55 nodes:
 t0: ch,glue = EntryToken
 t3: i64,ch = CopyFromReg t0, Register:i64 %3
 t6: i64,ch = llvm.riscv.vsetvli t0, TargetConstant:i64<8204>, t3, TargetConstant:i64<2>, TargetConstant:i64<3>
 t8: i64,ch = CopyFromReg t0, Register:i64 %0
 t13: ch,glue = CopyToReg t6:1, Register:i64 %12, t8
 t17: i64 = shl t6, Constant:i64<2>
 t22: i64,ch = CopyFromReg t0, Register:i64 %1
   t15: ch,glue = inlineasm t13, TargetExternalSymbol:i64'vle32.v v8, ($0)', MDNode:ch<0x557d74564c48>, TargetConstant:i64<1>, TargetConstant:i32<327689>, Register:i64 %12, t13:1
 t26: ch,glue = CopyToReg t15, Register:i64 %13, t22
 t35: i64,ch = CopyFromReg t0, Register:i64 %2
     t27: ch,glue = inlineasm t26, TargetExternalSymbol:i64'vle32.v v16, ($0)', MDNode:ch<0x557d74564d48>, TargetConstant:i64<1>, TargetConstant:i32<327689>, Register:i64 %13, t26:1
   t33: ch,glue = inlineasm t27, TargetExternalSymbol:i64'vfadd.vv v8, v8, v16', MDNode:ch<0x557d745583f8>, TargetConstant:i64<1>
 t39: ch,glue = CopyToReg t33, Register:i64 %14, t35
 t44: i64 = sub t3, t6
         t18: i64 = add t8, t17
       t20: ch = CopyToReg t0, Register:i64 %4, t18
         t28: i64 = add t22, t17
       t30: ch = CopyToReg t0, Register:i64 %5, t28
         t41: i64 = add t35, t17
       t43: ch = CopyToReg t0, Register:i64 %6, t41
       t46: ch = CopyToReg t0, Register:i64 %7, t44
       t40: ch,glue = inlineasm t39, TargetExternalSymbol:i64'vse32.v v8, ($0)', MDNode:ch<0x557d74558748>, TargetConstant:i64<1>, TargetConstant:i32<327689>, Register:i64 %14, t39:1
     t52: ch = TokenFactor t20, t30, t43, t46, t40
     t58: i1 = setcc t44, Constant:i64<0>, setne:ch
   t59: ch = brcond t52, t58, BasicBlock:ch<for.body 0x557d745f5848>
 t56: ch = br t59, BasicBlock:ch<for.end 0x557d745f5ae8>

In the SelectionDAG, I found that the inline assembly nodes carry an additional ‘glue’ result. Is this what prevents them from being scheduled normally? And will it affect optimizations in other stages? Thank you!

1 Like