# KNL Assembly Code for Matrix Multiplication

Hello, I want some help in understanding knl intel assembly of matrix multiplication code. some of the things are not clear;

here .c file:

#include <stdio.h>
#define N 1000

// This function multiplies A and B, and stores
// the result in C
void multiply(int A[N], int B[N], int C[N])
{
int i, j, k, r;
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
r = 0;
for (k = 0; k < N; k++) {
r += A[i][k]*B[k][j];}
C[i][j] = r;

}

}
}

here .s file: the code that i want to ask is in red color.

.text
.intel_syntax noprefix
.file “matn_o3.ll”
.section .rodata,“a”,@progbits
.p2align 6
.LCPI0_0:
.LCPI0_1:
.section .rodata.cst8,“aM”,@progbits,8
.p2align 3
.LCPI0_2:
.LCPI0_3:
.LCPI0_4:
.LCPI0_5:
.LCPI0_6:
.text
.globl multiply
.p2align 4, 0x90
.type multiply,@function
multiply: # @multiply
.cfi_startproc

# BB#0:

push rbp
.Lcfi0:
.cfi_def_cfa_offset 16
push r15
.Lcfi1:
.cfi_def_cfa_offset 24
push r14
.Lcfi2:
.cfi_def_cfa_offset 32
push r12
.Lcfi3:
.cfi_def_cfa_offset 40
push rbx
.Lcfi4:
.cfi_def_cfa_offset 48
.Lcfi5:
.cfi_offset rbx, -48
.Lcfi6:
.cfi_offset r12, -40
.Lcfi7:
.cfi_offset r14, -32
.Lcfi8:
.cfi_offset r15, -24
.Lcfi9:
.cfi_offset rbp, -16
lea r8, [rdi + 3856]
xor r9d, r9d
vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15]
vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 = [0,1,2,3,4,5,6,7]
vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]
vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3]
vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4]
vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5]
kxnorw k1, k0, k0
kshiftrw k1, k1, 8
vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6]
.p2align 4, 0x90

xor r11d, r11d
.p2align 4, 0x90

# Child Loop BB0_5 Depth 3

vpxord zmm8, zmm8, zmm8
mov ecx, 960
vmovdqa64 zmm9, zmm23
vmovdqa64 zmm10, zmm22
vpxord zmm11, zmm11, zmm11
vpxord zmm12, zmm12, zmm12
vpxord zmm13, zmm13, zmm13
.p2align 4, 0x90
.LBB0_3: # %vector.body

# this bb will run 15 times

vmovq rax, xmm9
imul r10, r9, 4000
lea rbx, [rdi + r10]
vpmuludq zmm14, zmm10, zmm2 ; this is BB for vector here we have to do gather for B due to arbitrary addresses so here zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in these indexes? and zmm2=[4000, 4000,…4000]. these are the indexes for B we need to multiple indexes with stride=4000. i know here these indexes are 64 bit but the values stored in these locations are 32 bits then the load using zmm10 index will give 8 elements of 32 bits present in these locations, so do the registers contain 8 elements of 32 bits present at specified indexes?? so after multiplication we get indexes for higher 8 elements of B i.e [3200,3600,40000,…54000].

vpsrlq zmm15, zmm10, 32 ; i dont understand the need for this step, please explain the purpose of all these steps. here vpsrlq will shift right zmm10 values by 256 bits (32*8)…zmmm10 initially=[8,9,10,11,12,13,14,15]. it will now become [0,0,0,0,8,9,10,11]…Am I correct? Please explain me the purpose of this step.
vpmuludq zmm15, zmm15, zmm2 ; similarly dont understand the need for this step.
vpsllq zmm15, zmm15, 32 ; dont understand the need for this step
vpaddq zmm14, zmm15, zmm14 ; dont understand the need for this step
vpbroadcastq zmm15, r11 ; r11 changes when loop variable j changes whats the need of this step?
vpsllq zmm15, zmm15, 2 ; dont understand the need for this step
vpaddq zmm14, zmm14, zmm15 ; dont understand the need for this step
vpmuludq zmm16, zmm9, zmm2 ; here same as before the lower 8 elements of B indexes are computed as Zmm16=[0,4000,8000,…28000]
vpsrlq zmm17, zmm9, 32 ; dont understand the need for this step
vpmuludq zmm17, zmm17, zmm2 ; dont understand the need for this step
vpsllq zmm17, zmm17, 32 ; dont understand the need for this step
vpaddq zmm16, zmm17, zmm16 ; dont understand the need for this step
vpaddq zmm15, zmm16, zmm15 ; dont understand the need for this step
kmovw k2, k1 ; dont understand the need for this step
vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8 indexes ( or values at these 8 indexes???) so it will load 8 elements not 16. here it should be zmm14**=[3200,3600,40000,…54000]. but by the above computation these indexes are changes??**
kxnorw k2, k0, k0 ; dont understand the need for this step
vpgatherqd ymm14 {k2}, zmmword ptr [zmm15] ; here again issues with index zmm15. it should be [0,4000,8000,…28000] but its different due to above computation.
vinserti64x4 zmm0, zmm14, ymm0, 1
kmovw k2, k1
vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]
kxnorw k2, k0, k0
vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]
vinserti64x4 zmm14, zmm15, ymm14, 1
kmovw k2, k1
vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]
kxnorw k2, k0, k0
vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]
vinserti64x4 zmm15, zmm16, ymm15, 1
kmovw k2, k1
vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]
kxnorw k2, k0, k0
vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]
vinserti64x4 zmm1, zmm16, ymm1, 1
vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]
vpmulld zmm14, zmm14, zmmword ptr [rbx + 4rax + 64]
vpmulld zmm15, zmm15, zmmword ptr [rbx + 4
rax + 128]
vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]
add rcx, -64 #decrement counter by 64
jne .LBB0_3 # if rcx not equal to zero goto .lbbo_3

# BB#4: # %middle.block

vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1] ; please explain how shuffle instructions work here. i know of llvm ir shuffle, but these assembly ones are difficult for me to understand
vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]
vpshufd zmm1, zmm0, 238 # zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
vpshufd zmm1, zmm0, 229 # zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
vmovd ebx, xmm0
mov rax, r8
xor r14d, r14d
.p2align 4, 0x90
.LBB0_5: # Parent Loop BB0_1 Depth=1

# => This Inner Loop Header: Depth=3

lea r15, [rsi + r14]
mov r12d, dword ptr [r15 + 4r11 - 16000]
imul r12d, dword ptr [rax - 16]
mov ecx, dword ptr [r15 + 4
r11 - 12000]
imul ecx, dword ptr [rax - 12]
mov ebp, dword ptr [r15 + 4r11 - 8000]
imul ebp, dword ptr [rax - 8]
mov ecx, dword ptr [r15 + 4
r11 - 4000]
imul ecx, dword ptr [rax - 4]
mov ebx, dword ptr [r15 + 4*r11]
imul ebx, dword ptr [rax]
cmp r14, 160000
jne .LBB0_5

# BB#6: # %.loopexit

add r10, rdx #rdx is c
mov dword ptr [r10 + 4*r11], ebx
inc r11
cmp r11, 1000
jne .LBB0_2

inc r9
cmp r9, 1000
jne .LBB0_1

# BB#8:

pop rbx
pop r12
pop r14
pop r15
pop rbp
ret

Thank You

Some comments inline, I’ll need to look more later.

Thank You.

in this step;

vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15]

the indexes are 64 bit but the element stored at these position is 32 bit since we are dealing with integers and ir also shows this.
here we are loading 32 bit value from those 64 bit indexes which means zmm22 will hold values 32 bit from these 64 bit position so there is capacity of 16 32 bit elements then why all this??

this is mentioned in IR as

%5 = getelementptr inbounds [1000 x i32], [1000 x i32]* %0, i64 %indvars.iv34, i64 %4
%6 = bitcast i32* %5 to <16 x i32>*
%wide.load = load <16 x i32>, <16 x i32>* %6, align 4, !tbaa !1

here indvars are 64 bit values but the values loaded from these indexes (step 3) is 32 bit???