Cannot auto-vectorize half-precision floating point (FP16) on RISC-V

As far as I know, there should be no difference between FP16 and INT16 from the perspective of the RVV instructions. For example, the two instruction sequences below should look almost equally profitable when the loop vectorizer evaluates its cost model.

vsetvli     t0,zero,e16,m1,ta,mu
vfadd.vv    v8,v8,v9
-------------------------------
vsetvli     t0,zero,e16,m1,ta,mu
vadd.vv     v8,v8,v9

However, the interesting thing is that only the INT16 loop gets auto-vectorized, while the FP16 loop does not. I can reproduce this with the sample code below, built with the options shown in its leading comments.

// clang -Wall -Wextra -Werror -I/home/pli/bin/gnu-multilib-linux/bin/riscv64-unknown-linux-gnu-gcc/../include --target=riscv64-unknown-linux-gnu -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize -DRV64 -march=rv64gcv1p0_zbb_zfh -O2 -c ./main.c -o __BUILD_/./main.o
// riscv64-unknown-linux-gnu-gcc -Ofast -static __BUILD_/./main.o -o __BUILD_/hfp-auto-vectorization.elf -lm
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define NOINLINE __attribute__ ((noinline)) // Shorthand for the compiler's noinline function attribute.

typedef __fp16 half; // Element type of the FP16 kernels; __fp16 is a compiler-provided (non-standard) type.
typedef short int int16; // Element type of the integer kernel. NOTE(review): assumes short is 16 bits on the target - confirm.

static void NOINLINE add_fp16_vector(half* a, half *b, int length) { // FP16 kernel: a[i] += b[i] for every i in [0, length).
  for (int i = 0; i < length; i++) { // Body never runs when length <= 0.
    a[i] += b[i]; // Sum is written back into a; b is only read.
  }
}

static void NOINLINE add_int16_vector(int16 *a, int16 *b, int length) { // Integer kernel: a[i] += b[i] for every i in [0, length).
  for (int i = 0; i < length; i++) { // Same loop shape as add_fp16_vector; only the element type differs.
    a[i] += b[i]; // Sum is written back into a; b is only read.
  }
}

/*
 * Hand-written RVV reference for the FP16 add.
 *
 * Loads one vector register of 16-bit elements from each of `a` and `b`
 * (vl1re16.v), adds them element-wise with vfadd.vv, and stores the sum
 * back over the start of `a` (vs1r.v).  The number of elements touched is
 * therefore fixed by the hardware vector register size, not by a length
 * argument; both buffers must be at least one vector register long.
 *
 * a: destination and first source operand.
 * b: second source operand (only read).
 */
static void NOINLINE add_fp16_asm(half *a, half *b) {
  asm volatile (
    "fence\n\t"
    "vsetvli    t0,zero,e16,m1,ta,mu\n\t"
    "vl1re16.v  v8, (%[src1])\n\t"
    "vl1re16.v  v9, (%[src2])\n\t"
    "vfadd.vv  v8, v8, v9\n\t"
    "vs1r.v  v8, (%[src1])\n\t"
    "fence\n\t"
    :
    : [src1]"r"(a), [src2]"r"(b)
    // The template overwrites t0, v8 and v9, and reads/writes the memory
    // behind the pointer operands, none of which the operand lists reveal.
    // Declaring them stops the compiler from keeping live values in those
    // registers or caching a[]/b[] contents across the statement.
    // NOTE(review): vsetvli also rewrites vl/vtype; add them as clobbers
    // if the toolchain in use accepts those register names.
    : "t0", "v8", "v9", "memory"
  );
}

/*
 * Driver for the reproducer: runs the FP16 loop, the hand-written RVV
 * FP16 add and the INT16 loop on 1 + 1 and prints element 0 of each result.
 *
 * Returns EXIT_SUCCESS after all three kernels ran, EXIT_FAILURE if a
 * buffer could not be allocated.
 */
int main(void)
{
  int size = 4096;
  int status = EXIT_FAILURE;

  // Zero-filled allocations: the kernels read every one of the `size`
  // elements, so none of them may be left indeterminate.
  half *a = calloc((size_t)size, sizeof *a);
  half *b = calloc((size_t)size, sizeof *b);
  // Dedicated integer buffers instead of viewing the FP16 storage through
  // an int16 pointer, which would read objects last written as __fp16.
  int16 *ia = calloc((size_t)size, sizeof *ia);
  int16 *ib = calloc((size_t)size, sizeof *ib);

  if (a == NULL || b == NULL || ia == NULL || ib == NULL) {
    fprintf(stderr, "failed to allocate test buffers\n");
    goto cleanup;
  }

  a[0] = 1.0;
  b[0] = 1.0;
  add_fp16_vector(a, b, size);

  printf("add_fp16_vector = %f == 2.0\n", a[0]);

  a[0] = 1.0;
  b[0] = 1.0;
  add_fp16_asm(a, b);
  printf("add_fp16_asm %f == 2.0\n", a[0]);

  ia[0] = 1;
  ib[0] = 1;
  add_int16_vector(ia, ib, size);
  printf("add_int16_vector %d == 2\n", ia[0]);

  status = EXIT_SUCCESS;

cleanup:
  // free(NULL) is a no-op, so a partially failed allocation is handled too.
  free(ib);
  free(ia);
  free(b);
  free(a);

  return status;
}

Moreover, clang reports that, according to the cost model, vectorization is not beneficial in this case, even though the only difference between the two loops in the code above is the element type. I am not sure why that influences the result of the cost model.

./main.c:11:3: remark: Scalable vectorization is not supported for all element types found in this loop. [-Rpass-analysis=loop-vectorize]
  for (int i = 0; i < length; i++) {
  ^
./main.c:11:3: remark: the cost-model indicates that vectorization is not beneficial [-Rpass-missed=loop-vectorize]
./main.c:11:3: remark: the cost-model indicates that interleaving is not beneficial [-Rpass-missed=loop-vectorize]
./main.c:17:3: remark: vectorized loop (vectorization width: vscale x 4, interleaved count: 2) [-Rpass=loop-vectorize]

Here is the objdump output for reference.

00000000000106ba <add_fp16_vector>:
   106ba:   6605                    lui       a2,0x1
   106bc:   00059007                flh       ft0,0(a1)
   106c0:   00051087                flh       ft1,0(a0)
   106c4:   04107053                fadd.h    ft0,ft0,ft1
   106c8:   00051027                fsh       ft0,0(a0)
   106cc:   0509                    addi      a0,a0,2
   106ce:   167d                    addi      a2,a2,-1 # fff <__libc_tsd_CTYPE_B+0xfa7>
   106d0:   0589                    addi      a1,a1,2
   106d2:   f66d                    bnez      a2,106bc <add_fp16_vector+0x2>
   106d4:   8082                    ret

00000000000106d6 <add_fp16_asm>:
   106d6:   0ff0000f                fence
   106da:   048072d7                vsetvli   t0,zero,e16,m1,ta,mu
   106de:   02855407                vl1re16.v v8,(a0)
   106e2:   0285d487                vl1re16.v v9,(a1)
   106e6:   02849457                vfadd.vv  v8,v8,v9
   106ea:   02850427                vs1r.v    v8,(a0)
   106ee:   0ff0000f                fence
   106f2:   8082                    ret

00000000000106f4 <add_int16_vector>:
   ...
   10758:   04807757                vsetvli   a4,zero,e16,m1,ta,mu
   1075c:   8746                    mv        a4,a7
   1075e:   00f586b3                add       a3,a1,a5
   10762:   0286d407                vl1re16.v v8,(a3)
   10766:   00f386b3                add       a3,t2,a5
   1076a:   0286d487                vl1re16.v v9,(a3)
   1076e:   00f506b3                add       a3,a0,a5
   10772:   0286d507                vl1re16.v v10,(a3)
   10776:   00fe0633                add       a2,t3,a5
   1077a:   02865587                vl1re16.v v11,(a2)
   1077e:   02a40457                vadd.vv   v8,v10,v8
   10782:   02b484d7                vadd.vv   v9,v11,v9
   10786:   02868427                vs1r.v    v8,(a3)
   1078a:   028604a7                vs1r.v    v9,(a2)
   1078e:   40570733                sub       a4,a4,t0
   10792:   979a                    add       a5,a5,t1
   10794:   f769                    bnez      a4,1075e <add_int16_vector+0x6a>
   10796:   f6081fe3                bnez      a6,10714 <add_int16_vector+0x20>
   1079a:   b745                    j         1073a <add_int16_vector+0x46>

FP16 in vector requires the Zvfh extension.

1 Like

Oops, I had assumed that zfh alone was sufficient. Let me give it a try and get back to you later.

@topperc Thank you very much. It works well when given rv64gcv1p0_zfh_zvfh0p1. I am wondering how we could find this kind of information more easily the next time a similar issue comes up. Do you have any suggestions?