KNL Assembly Code for Matrix Multiplication

Hello, I want some help in understanding knl intel assembly of matrix multiplication code. some of the things are not clear;

here .c file:

#include <stdio.h>
#define N 1000

// This function multiplies A and B, and stores
// the result in C
void multiply(int A[N], int B[N], int C[N])
{
int i, j, k, r;
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
r = 0;
for (k = 0; k < N; k++) {
r += A[i][k]*B[k][j];}
C[i][j] = r;

}

}
}

here .s file: the code that i want to ask is in red color.

.text
.intel_syntax noprefix
.file “matn_o3.ll”
.section .rodata,“a”,@progbits
.p2align 6
.LCPI0_0:
.quad 8 # 0x8
.quad 9 # 0x9
.quad 10 # 0xa
.quad 11 # 0xb
.quad 12 # 0xc
.quad 13 # 0xd
.quad 14 # 0xe
.quad 15 # 0xf
.LCPI0_1:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 7 # 0x7
.section .rodata.cst8,“aM”,@progbits,8
.p2align 3
.LCPI0_2:
.quad 4000 # 0xfa0
.LCPI0_3:
.quad 64000 # 0xfa00
.LCPI0_4:
.quad 128000 # 0x1f400
.LCPI0_5:
.quad 192000 # 0x2ee00
.LCPI0_6:
.quad 64 # 0x40
.text
.globl multiply
.p2align 4, 0x90
.type multiply,@function
multiply: # @multiply
.cfi_startproc

BB#0:

push rbp
.Lcfi0:
.cfi_def_cfa_offset 16
push r15
.Lcfi1:
.cfi_def_cfa_offset 24
push r14
.Lcfi2:
.cfi_def_cfa_offset 32
push r12
.Lcfi3:
.cfi_def_cfa_offset 40
push rbx
.Lcfi4:
.cfi_def_cfa_offset 48
.Lcfi5:
.cfi_offset rbx, -48
.Lcfi6:
.cfi_offset r12, -40
.Lcfi7:
.cfi_offset r14, -32
.Lcfi8:
.cfi_offset r15, -24
.Lcfi9:
.cfi_offset rbp, -16
lea r8, [rdi + 3856]
xor r9d, r9d
vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15]
vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 = [0,1,2,3,4,5,6,7]
vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]
vpbroadcastq zmm3, rsi
add rsi, 3856000
vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3]
vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4]
vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5]
kxnorw k1, k0, k0
kshiftrw k1, k1, 8
vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6]
.p2align 4, 0x90
.LBB0_1: # %.preheader26

=>This Loop Header: Depth=1

Child Loop BB0_2 Depth 2

Child Loop BB0_3 Depth 3

Child Loop BB0_5 Depth 3

xor r11d, r11d
.p2align 4, 0x90
.LBB0_2: # %.preheader

Parent Loop BB0_1 Depth=1

=> This Loop Header: Depth=2

Child Loop BB0_3 Depth 3

Child Loop BB0_5 Depth 3

vpxord zmm8, zmm8, zmm8
mov ecx, 960
vmovdqa64 zmm9, zmm23
vmovdqa64 zmm10, zmm22
vpxord zmm11, zmm11, zmm11
vpxord zmm12, zmm12, zmm12
vpxord zmm13, zmm13, zmm13
.p2align 4, 0x90
.LBB0_3: # %vector.body

Parent Loop BB0_1 Depth=1

Parent Loop BB0_2 Depth=2

=> This Inner Loop Header: Depth=3

this bb will run 15 times

vmovq rax, xmm9
imul r10, r9, 4000
lea rbx, [rdi + r10]
vpmuludq zmm14, zmm10, zmm2 ; this is BB for vector here we have to do gather for B due to arbitrary addresses so here zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in these indexes? and zmm2=[4000, 4000,…4000]. these are the indexes for B we need to multiple indexes with stride=4000. i know here these indexes are 64 bit but the values stored in these locations are 32 bits then the load using zmm10 index will give 8 elements of 32 bits present in these locations, so do the registers contain 8 elements of 32 bits present at specified indexes?? so after multiplication we get indexes for higher 8 elements of B i.e [3200,3600,40000,…54000].

vpsrlq zmm15, zmm10, 32 ; i dont understand the need for this step, please explain the purpose of all these steps. here vpsrlq will shift right zmm10 values by 256 bits (32*8)…zmmm10 initially=[8,9,10,11,12,13,14,15]. it will now become [0,0,0,0,8,9,10,11]…Am I correct? Please explain me the purpose of this step.
vpmuludq zmm15, zmm15, zmm2 ; similarly dont understand the need for this step.
vpsllq zmm15, zmm15, 32 ; dont understand the need for this step
vpaddq zmm14, zmm14, zmm3 ;
vpaddq zmm14, zmm15, zmm14 ; dont understand the need for this step
vpbroadcastq zmm15, r11 ; r11 changes when loop variable j changes whats the need of this step?
vpsllq zmm15, zmm15, 2 ; dont understand the need for this step
vpaddq zmm14, zmm14, zmm15 ; dont understand the need for this step
vpmuludq zmm16, zmm9, zmm2 ; here same as before the lower 8 elements of B indexes are computed as Zmm16=[0,4000,8000,…28000]
vpsrlq zmm17, zmm9, 32 ; dont understand the need for this step
vpmuludq zmm17, zmm17, zmm2 ; dont understand the need for this step
vpsllq zmm17, zmm17, 32 ; dont understand the need for this step
vpaddq zmm16, zmm16, zmm3
vpaddq zmm16, zmm17, zmm16 ; dont understand the need for this step
vpaddq zmm15, zmm16, zmm15 ; dont understand the need for this step
vpaddq zmm16, zmm15, zmm4
vpaddq zmm17, zmm14, zmm4
vpaddq zmm18, zmm15, zmm5
vpaddq zmm19, zmm14, zmm5
vpaddq zmm20, zmm15, zmm6
vpaddq zmm21, zmm14, zmm6
kmovw k2, k1 ; dont understand the need for this step
vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8 indexes ( or values at these 8 indexes???) so it will load 8 elements not 16. here it should be zmm14**=[3200,3600,40000,…54000]. but by the above computation these indexes are changes??**
kxnorw k2, k0, k0 ; dont understand the need for this step
vpgatherqd ymm14 {k2}, zmmword ptr [zmm15] ; here again issues with index zmm15. it should be [0,4000,8000,…28000] but its different due to above computation.
vinserti64x4 zmm0, zmm14, ymm0, 1
kmovw k2, k1
vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]
kxnorw k2, k0, k0
vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]
vinserti64x4 zmm14, zmm15, ymm14, 1
kmovw k2, k1
vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]
kxnorw k2, k0, k0
vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]
vinserti64x4 zmm15, zmm16, ymm15, 1
kmovw k2, k1
vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]
kxnorw k2, k0, k0
vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]
vinserti64x4 zmm1, zmm16, ymm1, 1
vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]
vpmulld zmm14, zmm14, zmmword ptr [rbx + 4rax + 64]
vpmulld zmm15, zmm15, zmmword ptr [rbx + 4
rax + 128]
vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]
vpaddd zmm8, zmm0, zmm8
vpaddd zmm11, zmm14, zmm11
vpaddd zmm12, zmm15, zmm12
vpaddd zmm13, zmm1, zmm13
vpaddq zmm9, zmm9, zmm7 #zmm7=64
vpaddq zmm10, zmm10, zmm7
add rcx, -64 #decrement counter by 64
jne .LBB0_3 # if rcx not equal to zero goto .lbbo_3

BB#4: # %middle.block

in Loop: Header=BB0_2 Depth=2

vpaddd zmm0, zmm11, zmm8
vpaddd zmm0, zmm12, zmm0
vpaddd zmm0, zmm13, zmm0
vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1] ; please explain how shuffle instructions work here. i know of llvm ir shuffle, but these assembly ones are difficult for me to understand
vpaddd zmm0, zmm0, zmm1
vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]
vpaddd zmm0, zmm0, zmm1
vpshufd zmm1, zmm0, 238 # zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
vpaddd zmm0, zmm0, zmm1
vpshufd zmm1, zmm0, 229 # zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
vpaddd zmm0, zmm0, zmm1
vmovd ebx, xmm0
mov rax, r8
xor r14d, r14d
.p2align 4, 0x90
.LBB0_5: # Parent Loop BB0_1 Depth=1

Parent Loop BB0_2 Depth=2

=> This Inner Loop Header: Depth=3

lea r15, [rsi + r14]
mov r12d, dword ptr [r15 + 4r11 - 16000]
imul r12d, dword ptr [rax - 16]
mov ecx, dword ptr [r15 + 4
r11 - 12000]
imul ecx, dword ptr [rax - 12]
mov ebp, dword ptr [r15 + 4r11 - 8000]
imul ebp, dword ptr [rax - 8]
add r12d, ebx
add ecx, r12d
add ebp, ecx
mov ecx, dword ptr [r15 + 4
r11 - 4000]
imul ecx, dword ptr [rax - 4]
add ecx, ebp
mov ebx, dword ptr [r15 + 4*r11]
imul ebx, dword ptr [rax]
add ebx, ecx
add r14, 20000
add rax, 20
cmp r14, 160000
jne .LBB0_5

BB#6: # %.loopexit

in Loop: Header=BB0_2 Depth=2

add r10, rdx #rdx is c
mov dword ptr [r10 + 4*r11], ebx
inc r11
cmp r11, 1000
jne .LBB0_2

BB#7: # in Loop: Header=BB0_1 Depth=1

inc r9
add r8, 4000
cmp r9, 1000
jne .LBB0_1

BB#8:

pop rbx
pop r12
pop r14
pop r15
pop rbp
ret

Looking forward to your reply

Thank You

Some comments inline, I’ll need to look more later.

Thank You.

in this step;

vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15]

the indexes are 64 bit but the element stored at these position is 32 bit since we are dealing with integers and ir also shows this.
here we are loading 32 bit value from those 64 bit indexes which means zmm22 will hold values 32 bit from these 64 bit position so there is capacity of 16 32 bit elements then why all this??

this is mentioned in IR as

%5 = getelementptr inbounds [1000 x i32], [1000 x i32]* %0, i64 %indvars.iv34, i64 %4
%6 = bitcast i32* %5 to <16 x i32>*
%wide.load = load <16 x i32>, <16 x i32>* %6, align 4, !tbaa !1

here indvars are 64 bit values but the values loaded from these indexes (step 3) is 32 bit???

Please correct me.