[SOLVED] Performance: hand-optimized matmul in MLIR is slow

Hi everyone,

I'm currently trying to optimize matmul. More specifically, I followed the OpenCV fastGEMM implementation and wrote this:

#aptr_map_1 = affine_map<(d0)[s0] -> (d0 + 1, s0 - 1)>
#aptr_map_2 = affine_map<(d0)[s0] -> (d0 + 2, s0 - 1)>
#aptr_map_3 = affine_map<(d0)[s0] -> (d0 + 3, s0 - 1)>
#bptr_map = affine_map<(d0) -> (d0 + 16)>
#sub = affine_map<()[s0, s1] -> (s0 - s1 + 1)>
#map_broadcast = affine_map<(d0, d1) -> (0)>
module {
  func.func @gemm(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c32 = arith.constant 32 : index
    %cst = arith.constant 0.000000e+00 : f32
    %cf0 = arith.constant 0.000000e+00 : f32
    %c1 = arith.constant 1 : index
    %0 = memref.dim %arg0, %c0 : memref<?x?xf32> // i
    %1 = memref.dim %arg0, %c1 : memref<?x?xf32> // k
    %2 = memref.dim %arg1, %c1 : memref<?x?xf32> // j
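    // Micro-kernel layout: %n walks the columns of C 32 at a time (two
    // vector<16xf32> per row), %m walks the rows of A/C 4 at a time,
    // so each (%m, %n) tile carries eight vector accumulators.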
    affine.for %n = 0 to #sub()[%1, %c32] step 32 {
      affine.for %m = 0 to %0 step 4 {
        %aptr0 = memref.subview %arg0[%m, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	%mark_1 = affine.min #aptr_map_1(%m)[%0]
        %aptr1 = memref.subview %arg0[%mark_1, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	%mark_2 = affine.min #aptr_map_2(%m)[%0]
        %aptr2 = memref.subview %arg0[%mark_2, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	%mark_3 = affine.min #aptr_map_3(%m)[%0]
        %aptr3 = memref.subview %arg0[%mark_3, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr0 = memref.subview %arg2[%m, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr1 = memref.subview %arg2[%mark_1, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr2 = memref.subview %arg2[%mark_2, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr3 = memref.subview %arg2[%mark_3, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	%md00 = memref.alloc() : memref<1xvector<16xf32>>
        %_d00 = vector.transfer_read %cptr0[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d00, %md00[%c0] : memref<1xvector<16xf32>>

	%md01 = memref.alloc() : memref<1xvector<16xf32>>
        %l_c_sub0 = affine.apply #bptr_map(%n)
        %_d01 = vector.transfer_read %cptr0[%c0, %l_c_sub0], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d01, %md01[%c0] : memref<1xvector<16xf32>>

	%md10 = memref.alloc() : memref<1xvector<16xf32>>
        %_d10 = vector.transfer_read %cptr1[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d10, %md10[%c0] : memref<1xvector<16xf32>>

	%md11 = memref.alloc() : memref<1xvector<16xf32>>
        %l_c_sub1 = affine.apply #bptr_map(%n)
        %_d11 = vector.transfer_read %cptr1[%c0, %l_c_sub1], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d11, %md11[%c0] : memref<1xvector<16xf32>>

	%md20 = memref.alloc() : memref<1xvector<16xf32>>
        %_d20 = vector.transfer_read %cptr2[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d20, %md20[%c0] : memref<1xvector<16xf32>>

	%md21 = memref.alloc() : memref<1xvector<16xf32>>
        %l_c_sub2 = affine.apply #bptr_map(%n)
        %_d21 = vector.transfer_read %cptr2[%c0, %l_c_sub2], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d21, %md21[%c0] : memref<1xvector<16xf32>>

	%md30 = memref.alloc() : memref<1xvector<16xf32>>
        %_d30 = vector.transfer_read %cptr3[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d30, %md30[%c0] : memref<1xvector<16xf32>>

	%md31 = memref.alloc() : memref<1xvector<16xf32>>
        %l_c_sub3 = affine.apply #bptr_map(%n)
        %_d31 = vector.transfer_read %cptr3[%c0, %l_c_sub3], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        memref.store %_d31, %md31[%c0] : memref<1xvector<16xf32>>

	affine.for %k = 0 to %1 {
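          // Broadcast one A element from each of the four rows and FMA it
          // against two 16-wide slices of the current B row, accumulating
          // into the scratch buffers.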
          %a0 = vector.transfer_read %aptr0[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
          %a1 = vector.transfer_read %aptr1[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
          %a2 = vector.transfer_read %aptr2[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
          %a3 = vector.transfer_read %aptr3[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

          %b0 = vector.load %arg1[%k, %n] : memref<?x?xf32>, vector<16xf32>
          %b_sub = affine.apply #bptr_map(%n)
          %b1 = vector.load %arg1[%k, %b_sub] : memref<?x?xf32>, vector<16xf32>

	  %d00 = memref.load %md00[%c0] : memref<1xvector<16xf32>>
	  %d01 = memref.load %md01[%c0] : memref<1xvector<16xf32>>
	  %d10 = memref.load %md10[%c0] : memref<1xvector<16xf32>>
	  %d11 = memref.load %md11[%c0] : memref<1xvector<16xf32>>
	  %d20 = memref.load %md20[%c0] : memref<1xvector<16xf32>>
	  %d21 = memref.load %md21[%c0] : memref<1xvector<16xf32>>
	  %d30 = memref.load %md30[%c0] : memref<1xvector<16xf32>>
	  %d31 = memref.load %md31[%c0] : memref<1xvector<16xf32>>

	  %d00_ = vector.fma %a0, %b0, %d00 : vector<16xf32>
	  %d01_ = vector.fma %a0, %b1, %d01 : vector<16xf32>
	  %d10_ = vector.fma %a1, %b0, %d10 : vector<16xf32>
	  %d11_ = vector.fma %a1, %b1, %d11 : vector<16xf32>
	  %d20_ = vector.fma %a2, %b0, %d20 : vector<16xf32>
	  %d21_ = vector.fma %a2, %b1, %d21 : vector<16xf32>
	  %d30_ = vector.fma %a3, %b0, %d30 : vector<16xf32>
	  %d31_ = vector.fma %a3, %b1, %d31 : vector<16xf32>
		
          memref.store %d00_, %md00[%c0] : memref<1xvector<16xf32>>
          memref.store %d01_, %md01[%c0] : memref<1xvector<16xf32>>
          memref.store %d10_, %md10[%c0] : memref<1xvector<16xf32>>
          memref.store %d11_, %md11[%c0] : memref<1xvector<16xf32>>
          memref.store %d20_, %md20[%c0] : memref<1xvector<16xf32>>
          memref.store %d21_, %md21[%c0] : memref<1xvector<16xf32>>
          memref.store %d30_, %md30[%c0] : memref<1xvector<16xf32>>
          memref.store %d31_, %md31[%c0] : memref<1xvector<16xf32>>
	}

	%l_d00 = memref.load %md00[%c0] : memref<1xvector<16xf32>>
	%l_d01 = memref.load %md01[%c0] : memref<1xvector<16xf32>>
	%l_d10 = memref.load %md10[%c0] : memref<1xvector<16xf32>>
	%l_d11 = memref.load %md11[%c0] : memref<1xvector<16xf32>>
	%l_d20 = memref.load %md20[%c0] : memref<1xvector<16xf32>>
	%l_d21 = memref.load %md21[%c0] : memref<1xvector<16xf32>>
	%l_d30 = memref.load %md30[%c0] : memref<1xvector<16xf32>>
	%l_d31 = memref.load %md31[%c0] : memref<1xvector<16xf32>>

        vector.transfer_write %l_d00, %cptr0[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %l_d01, %cptr0[%c0, %l_c_sub0] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %l_d10, %cptr1[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %l_d11, %cptr1[%c0, %l_c_sub1] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %l_d20, %cptr2[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %l_d21, %cptr2[%c0, %l_c_sub2] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %l_d30, %cptr3[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %l_d31, %cptr3[%c0, %l_c_sub3] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	memref.dealloc %md00 : memref<1xvector<16xf32>>
	memref.dealloc %md01 : memref<1xvector<16xf32>>
	memref.dealloc %md10 : memref<1xvector<16xf32>>
	memref.dealloc %md11 : memref<1xvector<16xf32>>
	memref.dealloc %md20 : memref<1xvector<16xf32>>
	memref.dealloc %md21 : memref<1xvector<16xf32>>
	memref.dealloc %md30 : memref<1xvector<16xf32>>
	memref.dealloc %md31 : memref<1xvector<16xf32>>

      }
    }
    return
  }
}

I tested it with Google Benchmark and found that its performance is poor.

[benchmark plot: x axis is the problem size, e.g. 512x512x512]

Any suggestions here?

Thanks so much!

Here is the objdump result of @gemm.
The command used:

mlir-opt opt_gemm.mlir \
    -convert-vector-to-llvm \
    -convert-memref-to-llvm \
    --lower-affine \
    -convert-scf-to-cf \
    -convert-linalg-to-llvm \
    -llvm-request-c-wrappers \
    -convert-func-to-llvm \
    -reconcile-unrealized-casts \
| mlir-translate --mlir-to-llvmir \
| llc -mtriple='x86_64-unknown-linux-gnu' -mattr='avx512f' --filetype=obj -O3 -o mlir-gemm.o
llvm-objdump -d mlir-gemm.o > dump.ll
rm mlir-gemm.o
cat dump.ll | less

mlir-gemm.o:	file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <gemm>:
       0: 55                           	pushq	%rbp
       1: 41 57                        	pushq	%r15
       3: 41 56                        	pushq	%r14
       5: 41 55                        	pushq	%r13
       7: 41 54                        	pushq	%r12
       9: 53                           	pushq	%rbx
       a: 48 81 ec 28 01 00 00         	subq	$296, %rsp              # imm = 0x128
      11: 4c 89 4c 24 68               	movq	%r9, 104(%rsp)
      16: 48 89 54 24 60               	movq	%rdx, 96(%rsp)
      1b: 48 89 74 24 58               	movq	%rsi, 88(%rsp)
      20: 49 8d 40 e1                  	leaq	-31(%r8), %rax
      24: 48 89 44 24 48               	movq	%rax, 72(%rsp)
      29: 48 89 4c 24 70               	movq	%rcx, 112(%rsp)
      2e: 48 8d 41 ff                  	leaq	-1(%rcx), %rax
      32: 45 31 f6                     	xorl	%r14d, %r14d
      35: 4c 89 44 24 20               	movq	%r8, 32(%rsp)
      3a: 48 89 44 24 50               	movq	%rax, 80(%rsp)
      3f: eb 13                        	jmp	0x54 <gemm+0x54>
      41: 66 2e 0f 1f 84 00 00 00 00 00	nopw	%cs:(%rax,%rax)
      4b: 0f 1f 44 00 00               	nopl	(%rax,%rax)
      50: 49 83 c6 20                  	addq	$32, %r14
      54: 4c 3b 74 24 48               	cmpq	72(%rsp), %r14
      59: 0f 8d 6b 06 00 00            	jge	0x6ca <gemm+0x6ca>
      5f: 41 b8 03 00 00 00            	movl	$3, %r8d
      65: bb 02 00 00 00               	movl	$2, %ebx
      6a: b9 01 00 00 00               	movl	$1, %ecx
      6f: 31 ff                        	xorl	%edi, %edi
      71: 48 89 7c 24 28               	movq	%rdi, 40(%rsp)
      76: 31 ff                        	xorl	%edi, %edi
      78: 4c 89 74 24 30               	movq	%r14, 48(%rsp)
      7d: e9 e3 01 00 00               	jmp	0x265 <gemm+0x265>
      82: 66 2e 0f 1f 84 00 00 00 00 00	nopw	%cs:(%rax,%rax)
      8c: 0f 1f 40 00                  	nopl	(%rax)
      90: 48 8b 4c 24 40               	movq	64(%rsp), %rcx
      95: 62 f1 7c 48 28 01            	vmovaps	(%rcx), %zmm0
      9b: 48 8b 4c 24 38               	movq	56(%rsp), %rcx
      a0: 62 f1 7c 48 28 09            	vmovaps	(%rcx), %zmm1
      a6: 62 d1 7c 48 28 17            	vmovaps	(%r15), %zmm2
      ac: 62 d1 7c 48 28 5d 00         	vmovaps	(%r13), %zmm3
      b3: 62 f1 7c 48 28 23            	vmovaps	(%rbx), %zmm4
      b9: 62 d1 7c 48 28 2e            	vmovaps	(%r14), %zmm5
      bf: 62 d1 7c 48 28 34 24         	vmovaps	(%r12), %zmm6
      c6: 62 f1 7c 48 28 38            	vmovaps	(%rax), %zmm7
      cc: 48 8b 74 24 20               	movq	32(%rsp), %rsi
      d1: 89 f0                        	movl	%esi, %eax
      d3: 4c 8b 74 24 30               	movq	48(%rsp), %r14
      d8: 44 29 f0                     	subl	%r14d, %eax
      db: 62 72 7d 48 7c c0            	vpbroadcastd	%eax, %zmm8
      e1: 62 71 fd 48 6f 0d 00 00 00 00	vmovdqa64	(%rip), %zmm9   # 0xeb <gemm+0xeb>
      eb: 62 d1 3d 48 66 c9            	vpcmpgtd	%zmm9, %zmm8, %k1
      f1: 48 8b 8c 24 a8 01 00 00      	movq	424(%rsp), %rcx
      f9: 48 8b 84 24 c0 00 00 00      	movq	192(%rsp), %rax
     101: 62 f1 7c 49 11 04 81         	vmovups	%zmm0, (%rcx,%rax,4) {%k1}
     108: 89 f0                        	movl	%esi, %eax
     10a: 48 8b 94 24 d8 00 00 00      	movq	216(%rsp), %rdx
     112: 29 d0                        	subl	%edx, %eax
     114: 62 f2 7d 48 7c c0            	vpbroadcastd	%eax, %zmm0
     11a: 62 d1 7d 48 66 d1            	vpcmpgtd	%zmm9, %zmm0, %k2
     120: 48 8b 84 24 e8 00 00 00      	movq	232(%rsp), %rax
     128: 48 01 d0                     	addq	%rdx, %rax
     12b: 62 f1 7c 4a 11 0c 81         	vmovups	%zmm1, (%rcx,%rax,4) {%k2}
     132: 48 8b 84 24 a0 00 00 00      	movq	160(%rsp), %rax
     13a: 62 f1 7c 49 11 14 81         	vmovups	%zmm2, (%rcx,%rax,4) {%k1}
     141: 89 f0                        	movl	%esi, %eax
     143: 29 d0                        	subl	%edx, %eax
     145: 62 f2 7d 48 7c c0            	vpbroadcastd	%eax, %zmm0
     14b: 62 d1 7d 48 66 d1            	vpcmpgtd	%zmm9, %zmm0, %k2
     151: 48 8b 84 24 f8 00 00 00      	movq	248(%rsp), %rax
     159: 48 01 d0                     	addq	%rdx, %rax
     15c: 62 f1 7c 4a 11 1c 81         	vmovups	%zmm3, (%rcx,%rax,4) {%k2}
     163: 48 8b 84 24 98 00 00 00      	movq	152(%rsp), %rax
     16b: 62 f1 7c 49 11 24 81         	vmovups	%zmm4, (%rcx,%rax,4) {%k1}
     172: 89 f0                        	movl	%esi, %eax
     174: 29 d0                        	subl	%edx, %eax
     176: 62 f2 7d 48 7c c0            	vpbroadcastd	%eax, %zmm0
     17c: 62 d1 7d 48 66 d1            	vpcmpgtd	%zmm9, %zmm0, %k2
     182: 48 8b 84 24 00 01 00 00      	movq	256(%rsp), %rax
     18a: 48 01 d0                     	addq	%rdx, %rax
     18d: 62 f1 7c 4a 11 2c 81         	vmovups	%zmm5, (%rcx,%rax,4) {%k2}
     194: 48 8b 84 24 90 00 00 00      	movq	144(%rsp), %rax
     19c: 62 f1 7c 49 11 34 81         	vmovups	%zmm6, (%rcx,%rax,4) {%k1}
     1a3: 89 f0                        	movl	%esi, %eax
     1a5: 29 d0                        	subl	%edx, %eax
     1a7: 62 f2 7d 48 7c c0            	vpbroadcastd	%eax, %zmm0
     1ad: 48 8b 44 24 18               	movq	24(%rsp), %rax
     1b2: 48 01 d0                     	addq	%rdx, %rax
     1b5: 62 d1 7d 48 66 c9            	vpcmpgtd	%zmm9, %zmm0, %k1
     1bb: 62 f1 7c 49 11 3c 81         	vmovups	%zmm7, (%rcx,%rax,4) {%k1}
     1c2: 48 8b bc 24 f0 00 00 00      	movq	240(%rsp), %rdi
     1ca: c5 f8 77                     	vzeroupper
     1cd: e8 00 00 00 00               	callq	0x1d2 <gemm+0x1d2>
     1d2: 48 8b bc 24 e0 00 00 00      	movq	224(%rsp), %rdi
     1da: e8 00 00 00 00               	callq	0x1df <gemm+0x1df>
     1df: 48 8b bc 24 d0 00 00 00      	movq	208(%rsp), %rdi
     1e7: e8 00 00 00 00               	callq	0x1ec <gemm+0x1ec>
     1ec: 48 8b bc 24 c8 00 00 00      	movq	200(%rsp), %rdi
     1f4: e8 00 00 00 00               	callq	0x1f9 <gemm+0x1f9>
     1f9: 48 8b bc 24 b8 00 00 00      	movq	184(%rsp), %rdi
     201: e8 00 00 00 00               	callq	0x206 <gemm+0x206>
     206: 48 8b bc 24 b0 00 00 00      	movq	176(%rsp), %rdi
     20e: e8 00 00 00 00               	callq	0x213 <gemm+0x213>
     213: 48 8b bc 24 a8 00 00 00      	movq	168(%rsp), %rdi
     21b: e8 00 00 00 00               	callq	0x220 <gemm+0x220>
     220: 48 8b 7c 24 10               	movq	16(%rsp), %rdi
     225: e8 00 00 00 00               	callq	0x22a <gemm+0x22a>
     22a: 48 8b bc 24 08 01 00 00      	movq	264(%rsp), %rdi
     232: 48 83 c7 04                  	addq	$4, %rdi
     236: 4c 8b 84 24 20 01 00 00      	movq	288(%rsp), %r8
     23e: 49 83 c0 04                  	addq	$4, %r8
     242: 48 8b 9c 24 18 01 00 00      	movq	280(%rsp), %rbx
     24a: 48 83 c3 04                  	addq	$4, %rbx
     24e: 48 8b 8c 24 10 01 00 00      	movq	272(%rsp), %rcx
     256: 48 83 c1 04                  	addq	$4, %rcx
     25a: 48 83 44 24 28 10            	addq	$16, 40(%rsp)
     260: 48 8b 44 24 50               	movq	80(%rsp), %rax
     265: 49 39 c0                     	cmpq	%rax, %r8
     268: 48 89 c6                     	movq	%rax, %rsi
     26b: 49 0f 4c f0                  	cmovlq	%r8, %rsi
     26f: 48 39 c3                     	cmpq	%rax, %rbx
     272: 48 89 c2                     	movq	%rax, %rdx
     275: 48 0f 4c d3                  	cmovlq	%rbx, %rdx
     279: 48 39 c1                     	cmpq	%rax, %rcx
     27c: 48 89 c5                     	movq	%rax, %rbp
     27f: 48 0f 4c e9                  	cmovlq	%rcx, %rbp
     283: 48 3b 7c 24 70               	cmpq	112(%rsp), %rdi
     288: 0f 8d c2 fd ff ff            	jge	0x50 <gemm+0x50>
     28e: 48 89 8c 24 10 01 00 00      	movq	%rcx, 272(%rsp)
     296: 48 89 9c 24 18 01 00 00      	movq	%rbx, 280(%rsp)
     29e: 4c 89 84 24 20 01 00 00      	movq	%r8, 288(%rsp)
     2a6: 4c 8d 6f 01                  	leaq	1(%rdi), %r13
     2aa: 49 39 c5                     	cmpq	%rax, %r13
     2ad: 4c 0f 4d e8                  	cmovgeq	%rax, %r13
     2b1: 4c 8d 67 02                  	leaq	2(%rdi), %r12
     2b5: 49 39 c4                     	cmpq	%rax, %r12
     2b8: 4c 0f 4d e0                  	cmovgeq	%rax, %r12
     2bc: 48 8d 4f 03                  	leaq	3(%rdi), %rcx
     2c0: 48 39 c1                     	cmpq	%rax, %rcx
     2c3: 48 0f 4d c8                  	cmovgeq	%rax, %rcx
     2c7: 49 89 c8                     	movq	%rcx, %r8
     2ca: 48 89 bc 24 08 01 00 00      	movq	%rdi, 264(%rsp)
     2d2: 48 89 fb                     	movq	%rdi, %rbx
     2d5: 48 8b 84 24 c8 01 00 00      	movq	456(%rsp), %rax
     2dd: 48 0f af d8                  	imulq	%rax, %rbx
     2e1: 48 8b 8c 24 b0 01 00 00      	movq	432(%rsp), %rcx
     2e9: 48 01 cb                     	addq	%rcx, %rbx
     2ec: 4c 0f af e8                  	imulq	%rax, %r13
     2f0: 49 01 cd                     	addq	%rcx, %r13
     2f3: 4c 0f af e0                  	imulq	%rax, %r12
     2f7: 49 01 cc                     	addq	%rcx, %r12
     2fa: 4c 0f af c0                  	imulq	%rax, %r8
     2fe: 49 01 c8                     	addq	%rcx, %r8
     301: 4c 89 44 24 18               	movq	%r8, 24(%rsp)
     306: bf 80 00 00 00               	movl	$128, %edi
     30b: 48 89 ac 24 88 00 00 00      	movq	%rbp, 136(%rsp)
     313: 48 89 94 24 80 00 00 00      	movq	%rdx, 128(%rsp)
     31b: 48 89 74 24 78               	movq	%rsi, 120(%rsp)
     320: e8 00 00 00 00               	callq	0x325 <gemm+0x325>
     325: 48 89 84 24 f0 00 00 00      	movq	%rax, 240(%rsp)
     32d: 48 89 c1                     	movq	%rax, %rcx
     330: 48 83 c1 3f                  	addq	$63, %rcx
     334: 48 83 e1 c0                  	andq	$-64, %rcx
     338: 4c 8b 7c 24 20               	movq	32(%rsp), %r15
     33d: 44 89 f8                     	movl	%r15d, %eax
     340: 44 29 f0                     	subl	%r14d, %eax
     343: 62 f2 7d 48 7c c0            	vpbroadcastd	%eax, %zmm0
     349: 62 f1 fd 48 6f 0d 00 00 00 00	vmovdqa64	(%rip), %zmm1   # 0x353 <gemm+0x353>
     353: 62 f1 7d 48 66 c9            	vpcmpgtd	%zmm1, %zmm0, %k1
     359: c5 f8 91 4c 24 10            	kmovw	%k1, 16(%rsp)
     35f: 4a 8d 04 33                  	leaq	(%rbx,%r14), %rax
     363: 48 8b ac 24 a8 01 00 00      	movq	424(%rsp), %rbp
     36b: 48 89 84 24 c0 00 00 00      	movq	%rax, 192(%rsp)
     373: 62 f1 7c c9 10 44 85 00      	vmovups	(%rbp,%rax,4), %zmm0 {%k1} {z}
     37b: 48 89 4c 24 40               	movq	%rcx, 64(%rsp)
     380: 62 f1 7c 48 29 01            	vmovaps	%zmm0, (%rcx)
     386: bf 80 00 00 00               	movl	$128, %edi
     38b: c5 f8 77                     	vzeroupper
     38e: e8 00 00 00 00               	callq	0x393 <gemm+0x393>
     393: 48 89 84 24 e0 00 00 00      	movq	%rax, 224(%rsp)
     39b: 48 89 c2                     	movq	%rax, %rdx
     39e: 48 83 c2 3f                  	addq	$63, %rdx
     3a2: 48 83 e2 c0                  	andq	$-64, %rdx
     3a6: 49 8d 4e 10                  	leaq	16(%r14), %rcx
     3aa: 44 89 f8                     	movl	%r15d, %eax
     3ad: 48 89 8c 24 d8 00 00 00      	movq	%rcx, 216(%rsp)
     3b5: 29 c8                        	subl	%ecx, %eax
     3b7: 62 f2 7d 48 7c c0            	vpbroadcastd	%eax, %zmm0
     3bd: 62 f1 7d 48 66 0d 00 00 00 00	vpcmpgtd	(%rip), %zmm0, %k1      # 0x3c7 <gemm+0x3c7>
     3c7: c5 f8 91 4c 24 0e            	kmovw	%k1, 14(%rsp)
     3cd: 4c 89 f5                     	movq	%r14, %rbp
     3d0: 48 89 9c 24 e8 00 00 00      	movq	%rbx, 232(%rsp)
     3d8: 4d 8d 74 1e 10               	leaq	16(%r14,%rbx), %r14
     3dd: 48 8b 9c 24 a8 01 00 00      	movq	424(%rsp), %rbx
     3e5: 62 b1 7c c9 10 04 b3         	vmovups	(%rbx,%r14,4), %zmm0 {%k1} {z}
     3ec: 48 89 54 24 38               	movq	%rdx, 56(%rsp)
     3f1: 62 f1 7c 48 29 02            	vmovaps	%zmm0, (%rdx)
     3f7: bf 80 00 00 00               	movl	$128, %edi
     3fc: c5 f8 77                     	vzeroupper
     3ff: e8 00 00 00 00               	callq	0x404 <gemm+0x404>
     404: 49 89 c7                     	movq	%rax, %r15
     407: 48 89 84 24 d0 00 00 00      	movq	%rax, 208(%rsp)
     40f: 49 83 c7 3f                  	addq	$63, %r15
     413: 49 83 e7 c0                  	andq	$-64, %r15
     417: 4c 89 ac 24 f8 00 00 00      	movq	%r13, 248(%rsp)
     41f: 49 8d 44 2d 00               	leaq	(%r13,%rbp), %rax
     424: 48 89 84 24 a0 00 00 00      	movq	%rax, 160(%rsp)
     42c: c5 f8 90 4c 24 10            	kmovw	16(%rsp), %k1
     432: 62 f1 7c c9 10 04 83         	vmovups	(%rbx,%rax,4), %zmm0 {%k1} {z}
     439: 62 d1 7c 48 29 07            	vmovaps	%zmm0, (%r15)
     43f: bf 80 00 00 00               	movl	$128, %edi
     444: c5 f8 77                     	vzeroupper
     447: e8 00 00 00 00               	callq	0x44c <gemm+0x44c>
     44c: 49 89 c5                     	movq	%rax, %r13
     44f: 48 89 84 24 c8 00 00 00      	movq	%rax, 200(%rsp)
     457: 49 83 c5 3f                  	addq	$63, %r13
     45b: 49 83 e5 c0                  	andq	$-64, %r13
     45f: c5 f8 90 4c 24 0e            	kmovw	14(%rsp), %k1
     465: 62 b1 7c c9 10 04 b3         	vmovups	(%rbx,%r14,4), %zmm0 {%k1} {z}
     46c: 49 89 de                     	movq	%rbx, %r14
     46f: 62 d1 7c 48 29 45 00         	vmovaps	%zmm0, (%r13)
     476: bf 80 00 00 00               	movl	$128, %edi
     47b: c5 f8 77                     	vzeroupper
     47e: e8 00 00 00 00               	callq	0x483 <gemm+0x483>
     483: 48 89 c3                     	movq	%rax, %rbx
     486: 48 89 84 24 b8 00 00 00      	movq	%rax, 184(%rsp)
     48e: 48 83 c3 3f                  	addq	$63, %rbx
     492: 48 83 e3 c0                  	andq	$-64, %rbx
     496: 49 8d 04 2c                  	leaq	(%r12,%rbp), %rax
     49a: 48 89 84 24 98 00 00 00      	movq	%rax, 152(%rsp)
     4a2: c5 f8 90 4c 24 10            	kmovw	16(%rsp), %k1
     4a8: 62 d1 7c c9 10 04 86         	vmovups	(%r14,%rax,4), %zmm0 {%k1} {z}
     4af: 62 f1 7c 48 29 03            	vmovaps	%zmm0, (%rbx)
     4b5: bf 80 00 00 00               	movl	$128, %edi
     4ba: c5 f8 77                     	vzeroupper
     4bd: e8 00 00 00 00               	callq	0x4c2 <gemm+0x4c2>
     4c2: 49 89 c6                     	movq	%rax, %r14
     4c5: 48 89 84 24 b0 00 00 00      	movq	%rax, 176(%rsp)
     4cd: 49 83 c6 3f                  	addq	$63, %r14
     4d1: 49 83 e6 c0                  	andq	$-64, %r14
     4d5: 4c 89 a4 24 00 01 00 00      	movq	%r12, 256(%rsp)
     4dd: 4a 8d 44 25 10               	leaq	16(%rbp,%r12), %rax
     4e2: c5 f8 90 4c 24 0e            	kmovw	14(%rsp), %k1
     4e8: 48 8b 8c 24 a8 01 00 00      	movq	424(%rsp), %rcx
     4f0: 62 f1 7c c9 10 04 81         	vmovups	(%rcx,%rax,4), %zmm0 {%k1} {z}
     4f7: 62 d1 7c 48 29 06            	vmovaps	%zmm0, (%r14)
     4fd: bf 80 00 00 00               	movl	$128, %edi
     502: c5 f8 77                     	vzeroupper
     505: e8 00 00 00 00               	callq	0x50a <gemm+0x50a>
     50a: 49 89 c4                     	movq	%rax, %r12
     50d: 48 89 84 24 a8 00 00 00      	movq	%rax, 168(%rsp)
     515: 49 83 c4 3f                  	addq	$63, %r12
     519: 49 83 e4 c0                  	andq	$-64, %r12
     51d: 48 8b 44 24 18               	movq	24(%rsp), %rax
     522: 48 01 e8                     	addq	%rbp, %rax
     525: 48 89 84 24 90 00 00 00      	movq	%rax, 144(%rsp)
     52d: c5 f8 90 4c 24 10            	kmovw	16(%rsp), %k1
     533: 48 8b 8c 24 a8 01 00 00      	movq	424(%rsp), %rcx
     53b: 62 f1 7c c9 10 04 81         	vmovups	(%rcx,%rax,4), %zmm0 {%k1} {z}
     542: 62 d1 7c 48 29 04 24         	vmovaps	%zmm0, (%r12)
     549: bf 80 00 00 00               	movl	$128, %edi
     54e: c5 f8 77                     	vzeroupper
     551: e8 00 00 00 00               	callq	0x556 <gemm+0x556>
     556: 48 89 44 24 10               	movq	%rax, 16(%rsp)
     55b: 48 83 c0 3f                  	addq	$63, %rax
     55f: 48 83 e0 c0                  	andq	$-64, %rax
     563: 48 8b 4c 24 18               	movq	24(%rsp), %rcx
     568: 48 8d 4c 0d 10               	leaq	16(%rbp,%rcx), %rcx
     56d: c5 f8 90 4c 24 0e            	kmovw	14(%rsp), %k1
     573: 48 8b 94 24 a8 01 00 00      	movq	424(%rsp), %rdx
     57b: 62 f1 7c c9 10 04 8a         	vmovups	(%rdx,%rcx,4), %zmm0 {%k1} {z}
     582: 62 f1 7c 48 29 00            	vmovaps	%zmm0, (%rax)
     588: 48 8b 4c 24 68               	movq	104(%rsp), %rcx
     58d: 48 8b 54 24 78               	movq	120(%rsp), %rdx
     592: 48 0f af d1                  	imulq	%rcx, %rdx
     596: 48 8b 7c 24 60               	movq	96(%rsp), %rdi
     59b: 48 01 fa                     	addq	%rdi, %rdx
     59e: 48 8b 6c 24 58               	movq	88(%rsp), %rbp
     5a3: 4c 8d 44 95 00               	leaq	(%rbp,%rdx,4), %r8
     5a8: 48 8b 94 24 80 00 00 00      	movq	128(%rsp), %rdx
     5b0: 48 0f af d1                  	imulq	%rcx, %rdx
     5b4: 48 01 fa                     	addq	%rdi, %rdx
     5b7: 4c 8d 54 95 00               	leaq	(%rbp,%rdx,4), %r10
     5bc: 48 8b 94 24 88 00 00 00      	movq	136(%rsp), %rdx
     5c4: 48 0f af d1                  	imulq	%rcx, %rdx
     5c8: 48 01 fa                     	addq	%rdi, %rdx
     5cb: 4c 8d 5c 95 00               	leaq	(%rbp,%rdx,4), %r11
     5d0: 48 0f af 4c 24 28            	imulq	40(%rsp), %rcx
     5d6: 48 8d 3c b9                  	leaq	(%rcx,%rdi,4), %rdi
     5da: 48 01 ef                     	addq	%rbp, %rdi
     5dd: 31 ed                        	xorl	%ebp, %ebp
     5df: 48 8b 94 24 90 01 00 00      	movq	400(%rsp), %rdx
     5e7: 48 8b b4 24 70 01 00 00      	movq	368(%rsp), %rsi
     5ef: 48 3b 6c 24 20               	cmpq	32(%rsp), %rbp
     5f4: 0f 8d 96 fa ff ff            	jge	0x90 <gemm+0x90>
     5fa: 66 0f 1f 44 00 00            	nopw	(%rax,%rax)
     600: 48 89 e9                     	movq	%rbp, %rcx
     603: 48 0f af ca                  	imulq	%rdx, %rcx
     607: 48 03 4c 24 30               	addq	48(%rsp), %rcx
     60c: 62 f1 7c 48 10 04 8e         	vmovups	(%rsi,%rcx,4), %zmm0
     613: 62 f1 7c 48 10 4c 8e 01      	vmovups	64(%rsi,%rcx,4), %zmm1
     61b: 62 f2 7d 48 18 14 af         	vbroadcastss	(%rdi,%rbp,4), %zmm2
     622: 62 d2 7d 48 18 1c ab         	vbroadcastss	(%r11,%rbp,4), %zmm3
     629: 62 d2 7d 48 18 24 aa         	vbroadcastss	(%r10,%rbp,4), %zmm4
     630: 62 d2 7d 48 18 2c a8         	vbroadcastss	(%r8,%rbp,4), %zmm5
     637: 48 8b 4c 24 40               	movq	64(%rsp), %rcx
     63c: 62 f1 7c 48 28 31            	vmovaps	(%rcx), %zmm6
     642: 62 f2 6d 48 b8 f0            	vfmadd231ps	%zmm0, %zmm2, %zmm6 # zmm6 = (zmm2 * zmm0) + zmm6
     648: 4c 8b 4c 24 38               	movq	56(%rsp), %r9
     64d: 62 d2 75 48 a8 11            	vfmadd213ps	(%r9), %zmm1, %zmm2 # zmm2 = (zmm1 * zmm2) + mem
     653: 62 d1 7c 48 28 3f            	vmovaps	(%r15), %zmm7
     659: 62 f2 65 48 b8 f8            	vfmadd231ps	%zmm0, %zmm3, %zmm7 # zmm7 = (zmm3 * zmm0) + zmm7
     65f: 62 d2 75 48 a8 5d 00         	vfmadd213ps	(%r13), %zmm1, %zmm3 # zmm3 = (zmm1 * zmm3) + mem
     666: 62 71 7c 48 28 03            	vmovaps	(%rbx), %zmm8
     66c: 62 72 5d 48 b8 c0            	vfmadd231ps	%zmm0, %zmm4, %zmm8 # zmm8 = (zmm4 * zmm0) + zmm8
     672: 62 d2 75 48 a8 26            	vfmadd213ps	(%r14), %zmm1, %zmm4 # zmm4 = (zmm1 * zmm4) + mem
     678: 62 d2 55 48 a8 04 24         	vfmadd213ps	(%r12), %zmm5, %zmm0 # zmm0 = (zmm5 * zmm0) + mem
     67f: 62 f2 55 48 a8 08            	vfmadd213ps	(%rax), %zmm5, %zmm1 # zmm1 = (zmm5 * zmm1) + mem
     685: 62 f1 7c 48 29 31            	vmovaps	%zmm6, (%rcx)
     68b: 62 d1 7c 48 29 11            	vmovaps	%zmm2, (%r9)
     691: 62 d1 7c 48 29 3f            	vmovaps	%zmm7, (%r15)
     697: 62 d1 7c 48 29 5d 00         	vmovaps	%zmm3, (%r13)
     69e: 62 71 7c 48 29 03            	vmovaps	%zmm8, (%rbx)
     6a4: 62 d1 7c 48 29 26            	vmovaps	%zmm4, (%r14)
     6aa: 62 d1 7c 48 29 04 24         	vmovaps	%zmm0, (%r12)
     6b1: 62 f1 7c 48 29 08            	vmovaps	%zmm1, (%rax)
     6b7: 48 ff c5                     	incq	%rbp
     6ba: 48 3b 6c 24 20               	cmpq	32(%rsp), %rbp
     6bf: 0f 8c 3b ff ff ff            	jl	0x600 <gemm+0x600>
     6c5: e9 c6 f9 ff ff               	jmp	0x90 <gemm+0x90>
     6ca: 48 81 c4 28 01 00 00         	addq	$296, %rsp              # imm = 0x128
     6d1: 5b                           	popq	%rbx
     6d2: 41 5c                        	popq	%r12
     6d4: 41 5d                        	popq	%r13
     6d6: 41 5e                        	popq	%r14
     6d8: 41 5f                        	popq	%r15
     6da: 5d                           	popq	%rbp
     6db: c3                           	retq
     6dc: 0f 1f 40 00                  	nopl	(%rax)

00000000000006e0 <_mlir_ciface_gemm>:
     6e0: 48 83 ec 78                  	subq	$120, %rsp
     6e4: 4c 8b 4f 28                  	movq	40(%rdi), %r9
     6e8: 4c 8b 47 20                  	movq	32(%rdi), %r8
     6ec: 48 8b 4f 18                  	movq	24(%rdi), %rcx
     6f0: 4c 8b 57 10                  	movq	16(%rdi), %r10
     6f4: 48 8b 07                     	movq	(%rdi), %rax
     6f7: 4c 8b 5f 08                  	movq	8(%rdi), %r11
     6fb: 48 8b 7f 30                  	movq	48(%rdi), %rdi
     6ff: c5 fc 10 06                  	vmovups	(%rsi), %ymm0
     703: c5 f8 10 4e 20               	vmovups	32(%rsi), %xmm1
     708: 48 8b 76 30                  	movq	48(%rsi), %rsi
     70c: c5 fc 10 12                  	vmovups	(%rdx), %ymm2
     710: c5 f8 10 5a 20               	vmovups	32(%rdx), %xmm3
     715: 48 8b 52 30                  	movq	48(%rdx), %rdx
     719: 48 89 54 24 70               	movq	%rdx, 112(%rsp)
     71e: c5 f8 11 5c 24 60            	vmovups	%xmm3, 96(%rsp)
     724: c5 fc 11 54 24 40            	vmovups	%ymm2, 64(%rsp)
     72a: 48 89 74 24 38               	movq	%rsi, 56(%rsp)
     72f: c5 f8 11 4c 24 28            	vmovups	%xmm1, 40(%rsp)
     735: c5 fc 11 44 24 08            	vmovups	%ymm0, 8(%rsp)
     73b: 48 89 3c 24                  	movq	%rdi, (%rsp)
     73f: 48 89 c7                     	movq	%rax, %rdi
     742: 4c 89 de                     	movq	%r11, %rsi
     745: 4c 89 d2                     	movq	%r10, %rdx
     748: c5 f8 77                     	vzeroupper
     74b: e8 00 00 00 00               	callq	0x750 <_mlir_ciface_gemm+0x70>
     750: 48 83 c4 78                  	addq	$120, %rsp
     754: c3                           	retq

Oh! I found what was going wrong:
According to Linalg.generic for full BLAS matmul expression - #7 by giuseros, the memref.alloc scratch buffers lower to malloc/free calls inside the hot loop (the callq instructions in the dump above), so I removed them:

#aptr_map_1 = affine_map<(d0)[s0] -> (d0 + 1, s0 - 1)>
#aptr_map_2 = affine_map<(d0)[s0] -> (d0 + 2, s0 - 1)>
#aptr_map_3 = affine_map<(d0)[s0] -> (d0 + 3, s0 - 1)>
#bptr_map = affine_map<(d0) -> (d0 + 16)>
#sub = affine_map<()[s0, s1] -> (s0 - s1 + 1)>
#map_broadcast = affine_map<(d0, d1) -> (0)>
module {
  func.func @gemm(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c32 = arith.constant 32 : index
    %cst = arith.constant 0.000000e+00 : f32
    %cf0 = arith.constant 0.000000e+00 : f32
    %c1 = arith.constant 1 : index
    %0 = memref.dim %arg0, %c0 : memref<?x?xf32> // i
    %1 = memref.dim %arg0, %c1 : memref<?x?xf32> // k
    %2 = memref.dim %arg1, %c1 : memref<?x?xf32> // j
    affine.for %n = 0 to #sub()[%1, %c32] step 32 {
      affine.for %m = 0 to %0 step 4 {
        %aptr0 = memref.subview %arg0[%m, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	%mark_1 = affine.min #aptr_map_1(%m)[%0]
        %aptr1 = memref.subview %arg0[%mark_1, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	%mark_2 = affine.min #aptr_map_2(%m)[%0]
        %aptr2 = memref.subview %arg0[%mark_2, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	%mark_3 = affine.min #aptr_map_3(%m)[%0]
        %aptr3 = memref.subview %arg0[%mark_3, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr0 = memref.subview %arg2[%m, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr1 = memref.subview %arg2[%mark_1, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr2 = memref.subview %arg2[%mark_2, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

        %cptr3 = memref.subview %arg2[%mark_3, 0][1, %1][1, 1] : memref<?x?xf32> to memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>

	// %md00 = memref.alloc() : memref<1xvector<16xf32>>
        // %_d00 = vector.transfer_read %cptr0[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d00, %md00[%c0] : memref<1xvector<16xf32>>

	// %md01 = memref.alloc() : memref<1xvector<16xf32>>
        // %l_c_sub0 = affine.apply #bptr_map(%n)
        // %_d01 = vector.transfer_read %cptr0[%c0, %l_c_sub0], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d01, %md01[%c0] : memref<1xvector<16xf32>>

	// %md10 = memref.alloc() : memref<1xvector<16xf32>>
        // %_d10 = vector.transfer_read %cptr1[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d10, %md10[%c0] : memref<1xvector<16xf32>>

	// %md11 = memref.alloc() : memref<1xvector<16xf32>>
        // %l_c_sub1 = affine.apply #bptr_map(%n)
        // %_d11 = vector.transfer_read %cptr1[%c0, %l_c_sub1], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d11, %md11[%c0] : memref<1xvector<16xf32>>

	// %md20 = memref.alloc() : memref<1xvector<16xf32>>
        // %_d20 = vector.transfer_read %cptr2[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d20, %md20[%c0] : memref<1xvector<16xf32>>

	// %md21 = memref.alloc() : memref<1xvector<16xf32>>
        // %l_c_sub2 = affine.apply #bptr_map(%n)
        // %_d21 = vector.transfer_read %cptr2[%c0, %l_c_sub2], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d21, %md21[%c0] : memref<1xvector<16xf32>>

	// %md30 = memref.alloc() : memref<1xvector<16xf32>>
        // %_d30 = vector.transfer_read %cptr3[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d30, %md30[%c0] : memref<1xvector<16xf32>>

	// %md31 = memref.alloc() : memref<1xvector<16xf32>>
        // %l_c_sub3 = affine.apply #bptr_map(%n)
        // %_d31 = vector.transfer_read %cptr3[%c0, %l_c_sub3], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
        // memref.store %_d31, %md31[%c0] : memref<1xvector<16xf32>>

	affine.for %k = 0 to %1 {
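          // The C tile is now read and updated directly inside the %k loop;
          // the heap-allocated scratch buffers from the first version are gone.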
          %a0 = vector.transfer_read %aptr0[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
          %a1 = vector.transfer_read %aptr1[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
          %a2 = vector.transfer_read %aptr2[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
          %a3 = vector.transfer_read %aptr3[%c0, %k], %cst {permutation_map = #map_broadcast} : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>
         %d00 = vector.transfer_read %cptr0[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

         %l_c_sub0 = affine.apply #bptr_map(%n)
         %d01 = vector.transfer_read %cptr0[%c0, %l_c_sub0], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

         %d10 = vector.transfer_read %cptr1[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

         %l_c_sub1 = affine.apply #bptr_map(%n)
         %d11 = vector.transfer_read %cptr1[%c0, %l_c_sub1], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

         %d20 = vector.transfer_read %cptr2[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

         %l_c_sub2 = affine.apply #bptr_map(%n)
         %d21 = vector.transfer_read %cptr2[%c0, %l_c_sub2], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

         %d30 = vector.transfer_read %cptr3[%c0, %n], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

         %l_c_sub3 = affine.apply #bptr_map(%n)
         %d31 = vector.transfer_read %cptr3[%c0, %l_c_sub3], %cst : memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<16xf32>

          %b0 = vector.load %arg1[%k, %n] : memref<?x?xf32>, vector<16xf32>
          %b_sub = affine.apply #bptr_map(%n)
          %b1 = vector.load %arg1[%k, %b_sub] : memref<?x?xf32>, vector<16xf32>

	  %d00_ = vector.fma %a0, %b0, %d00 : vector<16xf32>
	  %d01_ = vector.fma %a0, %b1, %d01 : vector<16xf32>
	  %d10_ = vector.fma %a1, %b0, %d10 : vector<16xf32>
	  %d11_ = vector.fma %a1, %b1, %d11 : vector<16xf32>
	  %d20_ = vector.fma %a2, %b0, %d20 : vector<16xf32>
	  %d21_ = vector.fma %a2, %b1, %d21 : vector<16xf32>
	  %d30_ = vector.fma %a3, %b0, %d30 : vector<16xf32>
	  %d31_ = vector.fma %a3, %b1, %d31 : vector<16xf32>
		
        vector.transfer_write %d00_, %cptr0[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %d01_, %cptr0[%c0, %l_c_sub0] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %d10_, %cptr1[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %d11_, %cptr1[%c0, %l_c_sub1] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %d20_, %cptr2[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %d21_, %cptr2[%c0, %l_c_sub2] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %d30_, %cptr3[%c0, %n] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
        vector.transfer_write %d31_, %cptr3[%c0, %l_c_sub3] : vector<16xf32>, memref<1x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
	}



      }
    }
    return
  }
}

Now the performance matches the OpenCV version.

Side note: I recommend writing your abstractions at a higher level (in particular, not unrolling by hand) and using the relevant transform dialect abstractions to generate good code.

This should be much nicer to drive, and if certain transformations or lowerings are missing we can add them and improve everyone's experience.
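
For example, a minimal sketch of that higher-level starting point, assuming the same dynamic memref signature as the hand-written kernel above, is a single named linalg op; the tiling into a 4x32 micro-kernel, the unrolling, and the vectorization are then derived by transformations instead of being written by hand:

func.func @gemm(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
  // C += A * B on dynamically sized buffers; the loop structure and
  // vector shapes are left to later tiling/vectorization transformations.
  linalg.matmul ins(%A, %B : memref<?x?xf32>, memref<?x?xf32>)
                outs(%C : memref<?x?xf32>)
  return
}

The exact transform dialect ops for driving the tiling and vectorization vary between MLIR versions, so the current transform dialect documentation is the place to pick the matching script.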