I tried a small code sample using vector.transfer_read
and would like to know whether the code generated for it is expected.
// Reads a 4x4 vector from %A starting at the dynamic offsets (%base1, %base2).
// -42.0 is the padding operand of the transfer_read.
func @transfer_read_2d(%A : memref<4x4xf32>, %base1: index, %base2: index) -> vector<4x4xf32> {
  %pad = constant -42.0 : f32
  %tile = vector.transfer_read %A[%base1, %base2], %pad : memref<4x4xf32>, vector<4x4xf32>
  return %tile : vector<4x4xf32>
}
Lowering it with mlir-opt --convert-vector-to-scf --lower-affine --convert-scf-to-std --convert-vector-to-llvm
generates this:
// LLVM-dialect lowering of @transfer_read_2d.
// %arg0-%arg6 are repacked below into the descriptor struct for the memref operand.
// %arg7 and %arg8 are the two remaining i64 arguments; they are used as the row offset and
// the column offset of the read (presumably %base1 and %base2 of the original function).
llvm.func @transfer_read_2d(%arg0: !llvm<"float*">, %arg1: !llvm<"float*">, %arg2: !llvm.i64, %arg3: !llvm.i64, %arg4: !llvm.i64, %arg5: !llvm.i64, %arg6: !llvm.i64, %arg7: !llvm.i64, %arg8: !llvm.i64) -> !llvm<"[4 x <4 x float>]"> {
// Rebuild the struct { float*, float*, i64, [2 x i64], [2 x i64] } from the scalar arguments
// (presumably MLIR's memref descriptor: allocated ptr, aligned ptr, offset, sizes, strides — confirm).
%0 = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%1 = llvm.insertvalue %arg0, %0[0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%2 = llvm.insertvalue %arg1, %1[1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%3 = llvm.insertvalue %arg2, %2[2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%5 = llvm.insertvalue %arg5, %4[4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%7 = llvm.insertvalue %arg6, %6[4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
// Padding value as a scalar (%8) and as a <4 x float> splat (%9); %9 is what ^bb4 stores
// for a row that is skipped entirely.
%8 = llvm.mlir.constant(-4.200000e+01 : f32) : !llvm.float
%9 = llvm.mlir.constant(dense<-4.200000e+01> : vector<4xf32>) : !llvm<"<4 x float>">
// Row-loop constants: start 0 (%10), bound 4 (%11), step 1 (%12). %13 = 4 is the size
// recorded for the temporary row buffer below.
%10 = llvm.mlir.constant(0 : index) : !llvm.i64
%11 = llvm.mlir.constant(4 : index) : !llvm.i64
%12 = llvm.mlir.constant(1 : index) : !llvm.i64
%13 = llvm.mlir.constant(4 : index) : !llvm.i64
// sizeof(<4 x float>) computed as ptrtoint(getelementptr(null, 1)).
%14 = llvm.mlir.null : !llvm<"<4 x float>*">
%15 = llvm.mlir.constant(1 : index) : !llvm.i64
%16 = llvm.getelementptr %14[%15] : (!llvm<"<4 x float>*">, !llvm.i64) -> !llvm<"<4 x float>*">
%17 = llvm.ptrtoint %16 : !llvm<"<4 x float>*"> to !llvm.i64
// NOTE(review): %18 = 4 * sizeof(<4 x float>) is a byte count, yet it is passed to llvm.alloca
// as the number of <4 x float> elements; this looks like it reserves more stack than the four
// row vectors that are actually stored — confirm.
%18 = llvm.mul %13, %17 : !llvm.i64
%19 = llvm.alloca %18 x !llvm<"<4 x float>"> {alignment = 128 : i64} : (!llvm.i64) -> !llvm<"<4 x float>*">
// Wrap the stack buffer in a rank-1 struct: both pointers = %19, then 0, size 4, stride 1.
%20 = llvm.mlir.undef : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%21 = llvm.insertvalue %19, %20[0] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%22 = llvm.insertvalue %19, %21[1] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%23 = llvm.mlir.constant(0 : index) : !llvm.i64
%24 = llvm.insertvalue %23, %22[2] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%25 = llvm.mlir.constant(1 : index) : !llvm.i64
%26 = llvm.insertvalue %13, %24[3, 0] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%27 = llvm.insertvalue %25, %26[4, 0] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
// Loop over the 4 rows of the result; %28 is the row index within the result vector.
llvm.br ^bb1(%10 : !llvm.i64)
^bb1(%28: !llvm.i64): // 2 preds: ^bb0, ^bb5
%29 = llvm.icmp "slt" %28, %11 : !llvm.i64
llvm.cond_br %29, ^bb2, ^bb6
^bb2: // pred: ^bb1
// Source row = loop index + row offset; the row is only read (^bb3) when that is < 4,
// otherwise the padding splat is stored (^bb4).
%30 = llvm.add %28, %arg7 : !llvm.i64
%31 = llvm.icmp "slt" %30, %11 : !llvm.i64
llvm.cond_br %31, ^bb3, ^bb4
^bb3: // pred: ^bb2
// Row address: pointer from field 1 of %7, advanced by (source row * 4 + column offset * 1) floats.
%32 = llvm.extractvalue %7[1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
%33 = llvm.mlir.constant(0 : index) : !llvm.i64
%34 = llvm.mlir.constant(4 : index) : !llvm.i64
%35 = llvm.mul %30, %34 : !llvm.i64
%36 = llvm.add %33, %35 : !llvm.i64
%37 = llvm.mlir.constant(1 : index) : !llvm.i64
%38 = llvm.mul %arg8, %37 : !llvm.i64
%39 = llvm.add %36, %38 : !llvm.i64
%40 = llvm.getelementptr %32[%39] : (!llvm<"float*">, !llvm.i64) -> !llvm<"float*">
%41 = llvm.bitcast %40 : !llvm<"float*"> to !llvm<"<4 x float>*">
// Per-lane column indices: splat(column offset) + [0, 1, 2, 3].
%42 = llvm.mlir.constant(dense<[0, 1, 2, 3]> : vector<4xi64>) : !llvm<"<4 x i64>">
%43 = llvm.mlir.undef : !llvm<"<4 x i64>">
%44 = llvm.mlir.constant(0 : i32) : !llvm.i32
%45 = llvm.insertelement %arg8, %43[%44 : !llvm.i32] : !llvm<"<4 x i64>">
%46 = llvm.shufflevector %45, %43 [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x i64>">, !llvm<"<4 x i64>">
%47 = llvm.add %46, %42 : !llvm<"<4 x i64>">
// Lane mask: column index < splat(4).
%48 = llvm.mlir.constant(1 : index) : !llvm.i64
%49 = llvm.mlir.constant(4 : index) : !llvm.i64
%50 = llvm.mlir.undef : !llvm<"<4 x i64>">
%51 = llvm.mlir.constant(0 : i32) : !llvm.i32
%52 = llvm.insertelement %49, %50[%51 : !llvm.i32] : !llvm<"<4 x i64>">
%53 = llvm.shufflevector %52, %50 [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x i64>">, !llvm<"<4 x i64>">
%54 = llvm.icmp "slt" %47, %53 : !llvm<"<4 x i64>">
// Masked load of the row; %55 (the padding splat) is the pass-through operand for masked-off lanes.
%55 = llvm.mlir.constant(dense<-4.200000e+01> : vector<4xf32>) : !llvm<"<4 x float>">
%56 = llvm.intr.masked.load %41, %54, %55 {alignment = 4 : i32} : (!llvm<"<4 x float>*">, !llvm<"<4 x i1>">, !llvm<"<4 x float>">) -> !llvm<"<4 x float>">
// Store the loaded row into slot %28 of the temporary buffer.
%57 = llvm.extractvalue %27[1] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%58 = llvm.mlir.constant(0 : index) : !llvm.i64
%59 = llvm.mlir.constant(1 : index) : !llvm.i64
%60 = llvm.mul %28, %59 : !llvm.i64
%61 = llvm.add %58, %60 : !llvm.i64
%62 = llvm.getelementptr %57[%61] : (!llvm<"<4 x float>*">, !llvm.i64) -> !llvm<"<4 x float>*">
llvm.store %56, %62 : !llvm<"<4 x float>*">
llvm.br ^bb5
^bb4: // pred: ^bb2
// Row not read: store the padding splat %9 into slot %28 of the temporary buffer.
%63 = llvm.extractvalue %27[1] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%64 = llvm.mlir.constant(0 : index) : !llvm.i64
%65 = llvm.mlir.constant(1 : index) : !llvm.i64
%66 = llvm.mul %28, %65 : !llvm.i64
%67 = llvm.add %64, %66 : !llvm.i64
%68 = llvm.getelementptr %63[%67] : (!llvm<"<4 x float>*">, !llvm.i64) -> !llvm<"<4 x float>*">
llvm.store %9, %68 : !llvm<"<4 x float>*">
llvm.br ^bb5
^bb5: // 2 preds: ^bb3, ^bb4
// Advance the row index by the step and re-test the loop condition.
%69 = llvm.add %28, %12 : !llvm.i64
llvm.br ^bb1(%69 : !llvm.i64)
^bb6: // pred: ^bb1
// Loop exit: bitcast the buffer of <4 x float> rows to [4 x <4 x float>]*, then load the
// whole array through it and return it.
%70 = llvm.mlir.undef : !llvm<"{ [4 x <4 x float>]*, [4 x <4 x float>]*, i64 }">
%71 = llvm.extractvalue %27[0] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%72 = llvm.bitcast %71 : !llvm<"<4 x float>*"> to !llvm<"[4 x <4 x float>]*">
%73 = llvm.insertvalue %72, %70[0] : !llvm<"{ [4 x <4 x float>]*, [4 x <4 x float>]*, i64 }">
%74 = llvm.extractvalue %27[1] : !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }">
%75 = llvm.bitcast %74 : !llvm<"<4 x float>*"> to !llvm<"[4 x <4 x float>]*">
%76 = llvm.insertvalue %75, %73[1] : !llvm<"{ [4 x <4 x float>]*, [4 x <4 x float>]*, i64 }">
%77 = llvm.mlir.constant(0 : index) : !llvm.i64
%78 = llvm.insertvalue %77, %76[2] : !llvm<"{ [4 x <4 x float>]*, [4 x <4 x float>]*, i64 }">
%79 = llvm.extractvalue %78[1] : !llvm<"{ [4 x <4 x float>]*, [4 x <4 x float>]*, i64 }">
%80 = llvm.mlir.constant(0 : index) : !llvm.i64
%81 = llvm.getelementptr %79[%80] : (!llvm<"[4 x <4 x float>]*">, !llvm.i64) -> !llvm<"[4 x <4 x float>]*">
%82 = llvm.load %81 : !llvm<"[4 x <4 x float>]*">
llvm.return %82 : !llvm<"[4 x <4 x float>]">
}
Compiling that to assembly (piping the previous MLIR output through mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3),
I get:
## Constant pool referenced by _transfer_read_2d.
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 10, 15
.section __TEXT,__literal16,16byte_literals
.p2align 4 ## -- Begin function transfer_read_2d
## Quads [2, 3]: added to the broadcast column offset to form %xmm8.
LCPI0_0:
.quad 2 ## 0x2
.quad 3 ## 0x3
## 16 bytes that read as the two little-endian quads [0, 1]: added to the broadcast
## column offset to form %xmm9.
LCPI0_1:
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 1 ## 0x1
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
## Padding splat: four copies of -42.0f.
LCPI0_2:
.long 0xc2280000 ## float -42
.long 0xc2280000 ## float -42
.long 0xc2280000 ## float -42
.long 0xc2280000 ## float -42
## Bias 0x80000000 that is xor-ed into the lane indices before the 32-bit compares.
LCPI0_3:
.quad 2147483648 ## 0x80000000
.quad 2147483648 ## 0x80000000
## Compare bound 0x80000004, i.e. the value 4 with the same 0x80000000 bias applied.
LCPI0_4:
.quad 2147483652 ## 0x80000004
.quad 2147483652 ## 0x80000004
## Padding splat whose lane 0 is left unspecified; it is loaded only on the paths where
## lane 0 is immediately overwritten with an element read from memory.
LCPI0_5:
.space 4
.long 0xc2280000 ## float -42
.long 0xc2280000 ## float -42
.long 0xc2280000 ## float -42
## _transfer_read_2d: the 4-iteration row loop appears fully unrolled here. Each of the four
## sections below builds one 4-float row, in %xmm0, %xmm1, %xmm2 and %xmm3 respectively.
## The "lane 2" load blocks (LBB0_7, LBB0_17, LBB0_27, LBB0_37) are laid out out of line,
## after the following section's lane-0 code.
.section __TEXT,__text,regular,pure_instructions
.globl _transfer_read_2d
.p2align 4, 0x90
_transfer_read_2d: ## @transfer_read_2d
Lfunc_begin0:
.file 1 "/Users/alexe/MLIR/atests/simd/<stdin>"
.loc 1 5 0 ## <stdin>:5:0
.cfi_sections .debug_frame
.cfi_startproc
## %bb.0:
## Two stack-passed arguments: %rax is used below as the column offset and %rcx as the
## row offset (presumably %arg8 and %arg7 of the LLVM-dialect function — confirm).
movq 24(%rsp), %rax
movq 16(%rsp), %rcx
Ltmp0:
.loc 1 0 0 prologue_end ## <stdin>:0:0
## Broadcast the column offset, then form the 64-bit column index of every lane:
## %xmm8 = offset + [2, 3], %xmm9 = offset + [0, 1].
movq %rax, %xmm0
pshufd $68, %xmm0, %xmm9 ## xmm9 = xmm0[0,1,0,1]
movdqa LCPI0_0(%rip), %xmm8 ## xmm8 = [2,3]
paddq %xmm9, %xmm8
paddq LCPI0_1(%rip), %xmm9
## %xmm3 keeps the padding splat; it is copied into a row register before each row's bound check.
movaps LCPI0_2(%rip), %xmm3 ## xmm3 = [-4.2E+1,-4.2E+1,-4.2E+1,-4.2E+1]
## ---- Row 0 (result in %xmm0): skipped, leaving the padding, when row offset > 3.
.loc 1 40 11 ## <stdin>:40:11
cmpq $3, %rcx
.loc 1 0 0 is_stmt 0 ## <stdin>:0:0
movaps %xmm3, %xmm0
.loc 1 41 5 is_stmt 1 ## <stdin>:41:5
jg LBB0_10
## %bb.1:
## %rdx = %rsi + 4 * (column offset + 4 * row index): address of the row's first element.
.loc 1 50 11 ## <stdin>:50:11
leaq (%rax,%rcx,4), %rdx
.loc 1 51 11 ## <stdin>:51:11
leaq (%rsi,%rdx,4), %rdx
.loc 1 67 11 ## <stdin>:67:11
## Lane mask "column index < 4" on 64-bit lanes, assembled from 32-bit pcmpgtd/pcmpeqd on
## the biased values; movmskps leaves one bit per lane in %edi.
## NOTE(review): the inputs (%xmm8, %xmm9 and constants) do not change between rows, yet
## this sequence is repeated in every row section — looks loop-invariant; confirm.
movdqa LCPI0_3(%rip), %xmm0 ## xmm0 = [2147483648,2147483648]
movdqa %xmm9, %xmm1
pxor %xmm0, %xmm1
movdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [2147483652,2147483652]
movdqa %xmm2, %xmm6
pcmpeqd %xmm1, %xmm6
movdqa %xmm2, %xmm7
pcmpgtd %xmm1, %xmm7
pshufd $160, %xmm7, %xmm1 ## xmm1 = xmm7[0,0,2,2]
pand %xmm6, %xmm1
por %xmm7, %xmm1
pxor %xmm8, %xmm0
movdqa %xmm2, %xmm6
pcmpeqd %xmm0, %xmm6
pcmpgtd %xmm0, %xmm2
pshufd $160, %xmm2, %xmm0 ## xmm0 = xmm2[0,0,2,2]
pand %xmm6, %xmm0
por %xmm2, %xmm0
packssdw %xmm0, %xmm1
movmskps %xmm1, %edi
## The masked load is expanded into one scalar load per lane, each guarded by its mask bit.
## Lane 0:
testb $1, %dil
je LBB0_2
## %bb.3: ## %cond.load
movss (%rdx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
movaps LCPI0_5(%rip), %xmm0 ## xmm0 = <u,-4.2E+1,-4.2E+1,-4.2E+1>
movss %xmm1, %xmm0 ## xmm0 = xmm1[0],xmm0[1,2,3]
testb $2, %dil
jne LBB0_5
jmp LBB0_6
LBB0_2:
## Lane 0 masked off: start from the full padding splat.
.loc 1 0 11 is_stmt 0 ## <stdin>:0:11
movaps LCPI0_2(%rip), %xmm0 ## xmm0 = [-4.2E+1,-4.2E+1,-4.2E+1,-4.2E+1]
.loc 1 67 11 ## <stdin>:67:11
testb $2, %dil
je LBB0_6
## Row 0, lane 1:
LBB0_5: ## %cond.load1
movss 4(%rdx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
shufps $0, %xmm0, %xmm1 ## xmm1 = xmm1[0,0],xmm0[0,0]
shufps $226, %xmm0, %xmm1 ## xmm1 = xmm1[2,0],xmm0[2,3]
movaps %xmm1, %xmm0
LBB0_6: ## %else2
testb $4, %dil
jne LBB0_7
## %bb.8: ## %else5
testb $8, %dil
je LBB0_10
## Row 0, lane 3:
LBB0_9: ## %cond.load7
movss 12(%rdx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
shufps $32, %xmm0, %xmm1 ## xmm1 = xmm1[0,0],xmm0[2,0]
shufps $36, %xmm1, %xmm0 ## xmm0 = xmm0[0,1],xmm1[2,0]
## ---- Row 1 (result in %xmm1): same sequence with row index = row offset + 1.
LBB0_10:
.loc 1 39 11 is_stmt 1 ## <stdin>:39:11
leaq 1(%rcx), %rdx
.loc 1 40 11 ## <stdin>:40:11
cmpq $3, %rdx
.loc 1 0 0 is_stmt 0 ## <stdin>:0:0
movaps %xmm3, %xmm1
.loc 1 41 5 is_stmt 1 ## <stdin>:41:5
jg LBB0_20
## %bb.11:
.loc 1 50 11 ## <stdin>:50:11
leaq (%rax,%rdx,4), %rdx
.loc 1 51 11 ## <stdin>:51:11
leaq (%rsi,%rdx,4), %rdx
.loc 1 67 11 ## <stdin>:67:11
## Lane mask recomputed (same inputs as in row 0); result bits in %edi.
movdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [2147483648,2147483648]
movdqa %xmm9, %xmm2
pxor %xmm1, %xmm2
movdqa LCPI0_4(%rip), %xmm6 ## xmm6 = [2147483652,2147483652]
movdqa %xmm6, %xmm7
pcmpeqd %xmm2, %xmm7
movdqa %xmm6, %xmm4
pcmpgtd %xmm2, %xmm4
pshufd $160, %xmm4, %xmm2 ## xmm2 = xmm4[0,0,2,2]
pand %xmm7, %xmm2
por %xmm4, %xmm2
pxor %xmm8, %xmm1
movdqa %xmm6, %xmm4
pcmpeqd %xmm1, %xmm4
pcmpgtd %xmm1, %xmm6
pshufd $160, %xmm6, %xmm1 ## xmm1 = xmm6[0,0,2,2]
pand %xmm4, %xmm1
por %xmm6, %xmm1
packssdw %xmm1, %xmm2
movmskps %xmm2, %edi
## Row 1, lane 0:
testb $1, %dil
je LBB0_12
## %bb.13: ## %cond.load11
movss (%rdx), %xmm2 ## xmm2 = mem[0],zero,zero,zero
movaps LCPI0_5(%rip), %xmm1 ## xmm1 = <u,-4.2E+1,-4.2E+1,-4.2E+1>
movss %xmm2, %xmm1 ## xmm1 = xmm2[0],xmm1[1,2,3]
testb $2, %dil
jne LBB0_15
jmp LBB0_16
## Row 0, lane 2 (out-of-line block reached from LBB0_6):
LBB0_7: ## %cond.load4
movss 8(%rdx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
shufps $48, %xmm0, %xmm1 ## xmm1 = xmm1[0,0],xmm0[3,0]
shufps $132, %xmm1, %xmm0 ## xmm0 = xmm0[0,1],xmm1[0,2]
testb $8, %dil
jne LBB0_9
jmp LBB0_10
LBB0_12:
## Row 1, lane 0 masked off: start from the full padding splat.
.loc 1 0 11 is_stmt 0 ## <stdin>:0:11
movaps LCPI0_2(%rip), %xmm1 ## xmm1 = [-4.2E+1,-4.2E+1,-4.2E+1,-4.2E+1]
.loc 1 67 11 ## <stdin>:67:11
testb $2, %dil
je LBB0_16
## Row 1, lane 1:
LBB0_15: ## %cond.load14
movss 4(%rdx), %xmm2 ## xmm2 = mem[0],zero,zero,zero
shufps $0, %xmm1, %xmm2 ## xmm2 = xmm2[0,0],xmm1[0,0]
shufps $226, %xmm1, %xmm2 ## xmm2 = xmm2[2,0],xmm1[2,3]
movaps %xmm2, %xmm1
LBB0_16: ## %else15
testb $4, %dil
jne LBB0_17
## %bb.18: ## %else18
testb $8, %dil
je LBB0_20
## Row 1, lane 3:
LBB0_19: ## %cond.load20
movss 12(%rdx), %xmm2 ## xmm2 = mem[0],zero,zero,zero
shufps $32, %xmm1, %xmm2 ## xmm2 = xmm2[0,0],xmm1[2,0]
shufps $36, %xmm2, %xmm1 ## xmm1 = xmm1[0,1],xmm2[2,0]
## ---- Row 2 (result in %xmm2): same sequence with row index = row offset + 2.
LBB0_20:
.loc 1 39 11 is_stmt 1 ## <stdin>:39:11
leaq 2(%rcx), %rdx
.loc 1 40 11 ## <stdin>:40:11
cmpq $3, %rdx
.loc 1 0 0 is_stmt 0 ## <stdin>:0:0
movaps %xmm3, %xmm2
.loc 1 41 5 is_stmt 1 ## <stdin>:41:5
jg LBB0_30
## %bb.21:
.loc 1 50 11 ## <stdin>:50:11
leaq (%rax,%rdx,4), %rdx
.loc 1 51 11 ## <stdin>:51:11
leaq (%rsi,%rdx,4), %rdx
.loc 1 67 11 ## <stdin>:67:11
## Lane mask recomputed (same inputs as in row 0); result bits in %edi.
movdqa LCPI0_3(%rip), %xmm2 ## xmm2 = [2147483648,2147483648]
movdqa %xmm9, %xmm4
pxor %xmm2, %xmm4
movdqa LCPI0_4(%rip), %xmm6 ## xmm6 = [2147483652,2147483652]
movdqa %xmm6, %xmm7
pcmpeqd %xmm4, %xmm7
movdqa %xmm6, %xmm5
pcmpgtd %xmm4, %xmm5
pshufd $160, %xmm5, %xmm4 ## xmm4 = xmm5[0,0,2,2]
pand %xmm7, %xmm4
por %xmm5, %xmm4
pxor %xmm8, %xmm2
movdqa %xmm6, %xmm5
pcmpeqd %xmm2, %xmm5
pcmpgtd %xmm2, %xmm6
pshufd $160, %xmm6, %xmm2 ## xmm2 = xmm6[0,0,2,2]
pand %xmm5, %xmm2
por %xmm6, %xmm2
packssdw %xmm2, %xmm4
movmskps %xmm4, %edi
## Row 2, lane 0:
testb $1, %dil
je LBB0_22
## %bb.23: ## %cond.load24
movss (%rdx), %xmm4 ## xmm4 = mem[0],zero,zero,zero
movaps LCPI0_5(%rip), %xmm2 ## xmm2 = <u,-4.2E+1,-4.2E+1,-4.2E+1>
movss %xmm4, %xmm2 ## xmm2 = xmm4[0],xmm2[1,2,3]
testb $2, %dil
jne LBB0_25
jmp LBB0_26
## Row 1, lane 2 (out-of-line block reached from LBB0_16):
LBB0_17: ## %cond.load17
movss 8(%rdx), %xmm2 ## xmm2 = mem[0],zero,zero,zero
shufps $48, %xmm1, %xmm2 ## xmm2 = xmm2[0,0],xmm1[3,0]
shufps $132, %xmm2, %xmm1 ## xmm1 = xmm1[0,1],xmm2[0,2]
testb $8, %dil
jne LBB0_19
jmp LBB0_20
LBB0_22:
## Row 2, lane 0 masked off: start from the full padding splat.
.loc 1 0 11 is_stmt 0 ## <stdin>:0:11
movaps LCPI0_2(%rip), %xmm2 ## xmm2 = [-4.2E+1,-4.2E+1,-4.2E+1,-4.2E+1]
.loc 1 67 11 ## <stdin>:67:11
testb $2, %dil
je LBB0_26
## Row 2, lane 1:
LBB0_25: ## %cond.load27
movss 4(%rdx), %xmm4 ## xmm4 = mem[0],zero,zero,zero
shufps $0, %xmm2, %xmm4 ## xmm4 = xmm4[0,0],xmm2[0,0]
shufps $226, %xmm2, %xmm4 ## xmm4 = xmm4[2,0],xmm2[2,3]
movaps %xmm4, %xmm2
LBB0_26: ## %else28
testb $4, %dil
jne LBB0_27
## %bb.28: ## %else31
testb $8, %dil
je LBB0_30
## Row 2, lane 3:
LBB0_29: ## %cond.load33
movss 12(%rdx), %xmm4 ## xmm4 = mem[0],zero,zero,zero
shufps $32, %xmm2, %xmm4 ## xmm4 = xmm4[0,0],xmm2[2,0]
shufps $36, %xmm4, %xmm2 ## xmm2 = xmm2[0,1],xmm4[2,0]
## ---- Row 3 (result in %xmm3, which already holds the padding): row index = row offset + 3.
## This last section reuses %rcx/%rax for the row address and %ecx for the mask bits.
LBB0_30:
.loc 1 39 11 is_stmt 1 ## <stdin>:39:11
addq $3, %rcx
.loc 1 40 11 ## <stdin>:40:11
cmpq $3, %rcx
.loc 1 41 5 ## <stdin>:41:5
jg LBB0_40
## %bb.31:
.loc 1 50 11 ## <stdin>:50:11
leaq (%rax,%rcx,4), %rax
.loc 1 51 11 ## <stdin>:51:11
leaq (%rsi,%rax,4), %rax
.loc 1 67 11 ## <stdin>:67:11
## Lane mask recomputed one last time; %xmm9 and %xmm8 are overwritten in place here.
movdqa LCPI0_3(%rip), %xmm3 ## xmm3 = [2147483648,2147483648]
pxor %xmm3, %xmm9
movdqa LCPI0_4(%rip), %xmm4 ## xmm4 = [2147483652,2147483652]
movdqa %xmm4, %xmm5
pcmpeqd %xmm9, %xmm5
movdqa %xmm4, %xmm6
pcmpgtd %xmm9, %xmm6
pshufd $160, %xmm6, %xmm7 ## xmm7 = xmm6[0,0,2,2]
pand %xmm5, %xmm7
por %xmm6, %xmm7
pxor %xmm3, %xmm8
movdqa %xmm4, %xmm3
pcmpeqd %xmm8, %xmm3
pcmpgtd %xmm8, %xmm4
pshufd $160, %xmm4, %xmm5 ## xmm5 = xmm4[0,0,2,2]
pand %xmm3, %xmm5
por %xmm4, %xmm5
packssdw %xmm5, %xmm7
movmskps %xmm7, %ecx
## Row 3, lane 0:
testb $1, %cl
je LBB0_32
## %bb.33: ## %cond.load37
movss (%rax), %xmm4 ## xmm4 = mem[0],zero,zero,zero
movaps LCPI0_5(%rip), %xmm3 ## xmm3 = <u,-4.2E+1,-4.2E+1,-4.2E+1>
movss %xmm4, %xmm3 ## xmm3 = xmm4[0],xmm3[1,2,3]
testb $2, %cl
jne LBB0_35
jmp LBB0_36
## Row 2, lane 2 (out-of-line block reached from LBB0_26):
LBB0_27: ## %cond.load30
movss 8(%rdx), %xmm4 ## xmm4 = mem[0],zero,zero,zero
shufps $48, %xmm2, %xmm4 ## xmm4 = xmm4[0,0],xmm2[3,0]
shufps $132, %xmm4, %xmm2 ## xmm2 = xmm2[0,1],xmm4[0,2]
testb $8, %dil
jne LBB0_29
jmp LBB0_30
LBB0_32:
## Row 3, lane 0 masked off: start from the full padding splat.
.loc 1 0 11 is_stmt 0 ## <stdin>:0:11
movaps LCPI0_2(%rip), %xmm3 ## xmm3 = [-4.2E+1,-4.2E+1,-4.2E+1,-4.2E+1]
.loc 1 67 11 ## <stdin>:67:11
testb $2, %cl
je LBB0_36
## Row 3, lane 1:
LBB0_35: ## %cond.load40
movss 4(%rax), %xmm4 ## xmm4 = mem[0],zero,zero,zero
shufps $0, %xmm3, %xmm4 ## xmm4 = xmm4[0,0],xmm3[0,0]
shufps $226, %xmm3, %xmm4 ## xmm4 = xmm4[2,0],xmm3[2,3]
movaps %xmm4, %xmm3
LBB0_36: ## %else41
testb $4, %cl
jne LBB0_37
## %bb.38: ## %else44
testb $8, %cl
je LBB0_40
## Row 3, lane 3:
LBB0_39: ## %cond.load46
movss 12(%rax), %xmm4 ## xmm4 = mem[0],zero,zero,zero
shufps $32, %xmm3, %xmm4 ## xmm4 = xmm4[0,0],xmm3[2,0]
shufps $36, %xmm4, %xmm3 ## xmm3 = xmm3[0,1],xmm4[2,0]
## Return; at this point the four rows are in %xmm0-%xmm3.
LBB0_40:
.loc 1 102 5 is_stmt 1 ## <stdin>:102:5
retq
## Row 3, lane 2 (out-of-line block reached from LBB0_36):
LBB0_37: ## %cond.load43
.loc 1 67 11 ## <stdin>:67:11
movss 8(%rax), %xmm4 ## xmm4 = mem[0],zero,zero,zero
shufps $48, %xmm3, %xmm4 ## xmm4 = xmm4[0,0],xmm3[3,0]
shufps $132, %xmm4, %xmm3 ## xmm3 = xmm3[0,1],xmm4[0,2]
testb $8, %cl
jne LBB0_39
jmp LBB0_40
Ltmp1:
Lfunc_end0:
Is that expected? Would this operation perform differently when inlined into a nested loop, e.g. for a matrix multiply?
Thanks