I tweaked things. The asm is a little better:
L720:
leaq (,%rbx,8), %rcx
addq %rbp, %rcx
prefetcht0 -256(%r14,%r8)
vbroadcastsd -8(%rbp,%rbx,8), %zmm30
vmovups -256(%r14), %zmm29
vmovups -192(%r14), %zmm28
vmovups -128(%r14), %zmm27
vmovups -64(%r14), %zmm26
vmovupd (%r14), %zmm25
prefetcht0 -192(%r14,%r8)
prefetcht0 -128(%r14,%r8)
prefetcht0 -64(%r14,%r8)
prefetcht0 (%r14,%r8)
vfmadd231pd %zmm29, %zmm30, %zmm0 # zmm0 = (zmm30 * zmm29) + zmm0
vfmadd231pd %zmm28, %zmm30, %zmm1 # zmm1 = (zmm30 * zmm28) + zmm1
vfmadd231pd %zmm27, %zmm30, %zmm2 # zmm2 = (zmm30 * zmm27) + zmm2
vfmadd231pd %zmm26, %zmm30, %zmm3 # zmm3 = (zmm30 * zmm26) + zmm3
vfmadd231pd %zmm25, %zmm30, %zmm4 # zmm4 = (zmm30 * zmm25) + zmm4
leaq -8(%r11,%rcx), %rdx
vbroadcastsd -8(%rcx,%rsi,8), %zmm30
vfmadd231pd %zmm29, %zmm30, %zmm5 # zmm5 = (zmm30 * zmm29) + zmm5
vfmadd231pd %zmm28, %zmm30, %zmm6 # zmm6 = (zmm30 * zmm28) + zmm6
vfmadd231pd %zmm27, %zmm30, %zmm7 # zmm7 = (zmm30 * zmm27) + zmm7
vfmadd231pd %zmm26, %zmm30, %zmm8 # zmm8 = (zmm30 * zmm26) + zmm8
vfmadd231pd %zmm25, %zmm30, %zmm9 # zmm9 = (zmm30 * zmm25) + zmm9
vbroadcastsd (%rdx,%rsi,8), %zmm30
addq %r11, %rdx
vfmadd231pd %zmm29, %zmm30, %zmm10 # zmm10 = (zmm30 * zmm29) + zmm10
vfmadd231pd %zmm28, %zmm30, %zmm11 # zmm11 = (zmm30 * zmm28) + zmm11
vfmadd231pd %zmm27, %zmm30, %zmm12 # zmm12 = (zmm30 * zmm27) + zmm12
vfmadd231pd %zmm26, %zmm30, %zmm13 # zmm13 = (zmm30 * zmm26) + zmm13
vfmadd231pd %zmm25, %zmm30, %zmm14 # zmm14 = (zmm30 * zmm25) + zmm14
vbroadcastsd (%rdx,%rsi,8), %zmm30
addq %r11, %rdx
vfmadd231pd %zmm29, %zmm30, %zmm15 # zmm15 = (zmm30 * zmm29) + zmm15
vfmadd231pd %zmm28, %zmm30, %zmm16 # zmm16 = (zmm30 * zmm28) + zmm16
vfmadd231pd %zmm27, %zmm30, %zmm17 # zmm17 = (zmm30 * zmm27) + zmm17
vfmadd231pd %zmm26, %zmm30, %zmm18 # zmm18 = (zmm30 * zmm26) + zmm18
vfmadd231pd %zmm25, %zmm30, %zmm19 # zmm19 = (zmm30 * zmm25) + zmm19
vbroadcastsd (%rdx,%rsi,8), %zmm30
vfmadd231pd %zmm29, %zmm30, %zmm20 # zmm20 = (zmm30 * zmm29) + zmm20
vfmadd231pd %zmm28, %zmm30, %zmm21 # zmm21 = (zmm30 * zmm28) + zmm21
vfmadd231pd %zmm27, %zmm30, %zmm22 # zmm22 = (zmm30 * zmm27) + zmm22
vfmadd231pd %zmm26, %zmm30, %zmm23 # zmm23 = (zmm30 * zmm26) + zmm23
vfmadd231pd %zmm25, %zmm30, %zmm24 # zmm24 = (zmm30 * zmm25) + zmm24
addq %r15, %r14
cmpq %rdi, %rbx
leaq 1(%rbx), %rbx
jb L720
The associated optimized LLVM IR:
L111: ; preds = %L111, %L111.preheader
%value_phi2 = phi <8 x double> [ %res.i2399, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi4 = phi i64 [ %res.i2708, %L111 ], [ 0, %L111.preheader ]
%value_phi5 = phi <8 x double> [ %res.i2400, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi7 = phi <8 x double> [ %res.i2401, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi9 = phi <8 x double> [ %res.i2402, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi11 = phi <8 x double> [ %res.i2403, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi13 = phi <8 x double> [ %res.i2543, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi15 = phi <8 x double> [ %res.i2544, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi17 = phi <8 x double> [ %res.i2545, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi19 = phi <8 x double> [ %res.i2546, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi21 = phi <8 x double> [ %res.i2547, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi23 = phi <8 x double> [ %res.i2563, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi25 = phi <8 x double> [ %res.i2564, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi27 = phi <8 x double> [ %res.i2565, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi29 = phi <8 x double> [ %res.i2566, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi31 = phi <8 x double> [ %res.i2567, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi33 = phi <8 x double> [ %res.i2582, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi35 = phi <8 x double> [ %res.i2583, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi37 = phi <8 x double> [ %res.i2584, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi39 = phi <8 x double> [ %res.i2585, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi41 = phi <8 x double> [ %res.i2586, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi43 = phi <8 x double> [ %res.i2703, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi45 = phi <8 x double> [ %res.i2704, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi47 = phi <8 x double> [ %res.i2705, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi49 = phi <8 x double> [ %res.i2706, %L111 ], [ zeroinitializer, %L111.preheader ]
%value_phi51 = phi <8 x double> [ %res.i2707, %L111 ], [ zeroinitializer, %L111.preheader ]
%res.i1789 = add nsw i64 %value_phi4, %res.i1787
%offsetptr.i1792 = getelementptr inbounds double, double* %ptr.i1791, i64 %res.i1789
%res.i1803 = load double, double* %offsetptr.i1792, align 8
%res.i1811 = mul nsw i64 %value_phi4, %.sroa.2.0.copyload
%res.i1813 = add nsw i64 %res.i1811, %value_phi15290
%offsetptr.i1816 = getelementptr inbounds double, double* %ptr.i1815, i64 %res.i1813
%ptr.i1826 = bitcast double* %offsetptr.i1816 to <8 x double>*
%res.i1827 = load <8 x double>, <8 x double>* %ptr.i1826, align 8
%offsetptr.i1930 = getelementptr inbounds double, double* %offsetptr.i1816, i64 8
%ptr.i1931 = bitcast double* %offsetptr.i1930 to <8 x double>*
%res.i1932 = load <8 x double>, <8 x double>* %ptr.i1931, align 8
%offsetptr.i1948 = getelementptr inbounds double, double* %offsetptr.i1816, i64 16
%ptr.i1949 = bitcast double* %offsetptr.i1948 to <8 x double>*
%res.i1950 = load <8 x double>, <8 x double>* %ptr.i1949, align 8
%offsetptr.i2164 = getelementptr inbounds double, double* %offsetptr.i1816, i64 24
%ptr.i2165 = bitcast double* %offsetptr.i2164 to <8 x double>*
%res.i2166 = load <8 x double>, <8 x double>* %ptr.i2165, align 8
%offsetptr.i2177 = getelementptr inbounds double, double* %offsetptr.i1816, i64 32
%ptr.i2178 = bitcast double* %offsetptr.i2177 to <8 x double>*
%res.i2179 = load <8 x double>, <8 x double>* %ptr.i2178, align 8
%offsetptr.i2185 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2183
%addr.i2194 = bitcast double* %offsetptr.i2185 to i8*
call void @llvm.prefetch.p0i8(i8* %addr.i2194, i32 0, i32 3, i32 1)
%offsetptr.i2200 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2198
%addr.i2203 = bitcast double* %offsetptr.i2200 to i8*
call void @llvm.prefetch.p0i8(i8* %addr.i2203, i32 0, i32 3, i32 1)
%offsetptr.i2215 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2213
%addr.i2218 = bitcast double* %offsetptr.i2215 to i8*
call void @llvm.prefetch.p0i8(i8* %addr.i2218, i32 0, i32 3, i32 1)
%offsetptr.i2370 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2368
%addr.i2373 = bitcast double* %offsetptr.i2370 to i8*
call void @llvm.prefetch.p0i8(i8* %addr.i2373, i32 0, i32 3, i32 1)
%offsetptr.i2386 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2384
%addr.i2389 = bitcast double* %offsetptr.i2386 to i8*
call void @llvm.prefetch.p0i8(i8* %addr.i2389, i32 0, i32 3, i32 1)
%ie.i2396 = insertelement <8 x double> undef, double %res.i1803, i32 0
%v.i2397 = shufflevector <8 x double> %ie.i2396, <8 x double> undef, <8 x i32> zeroinitializer
%res.i2399 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi2, <8 x double> %v.i2397, <8 x double> %res.i1827) #8
%res.i2400 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi5, <8 x double> %v.i2397, <8 x double> %res.i1932) #8
%res.i2401 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi7, <8 x double> %v.i2397, <8 x double> %res.i1950) #8
%res.i2402 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi9, <8 x double> %v.i2397, <8 x double> %res.i2166) #8
%res.i2403 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi11, <8 x double> %v.i2397, <8 x double> %res.i2179) #8
%ptr.i2406 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %.sroa.5.16.copyload
%res.i2407 = load double, double* %ptr.i2406, align 8
%ie.i2409 = insertelement <8 x double> undef, double %res.i2407, i32 0
%v.i2410 = shufflevector <8 x double> %ie.i2409, <8 x double> undef, <8 x i32> zeroinitializer
%res.i2543 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi13, <8 x double> %v.i2410, <8 x double> %res.i1827) #8
%res.i2544 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi15, <8 x double> %v.i2410, <8 x double> %res.i1932) #8
%res.i2545 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi17, <8 x double> %v.i2410, <8 x double> %res.i1950) #8
%res.i2546 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi19, <8 x double> %v.i2410, <8 x double> %res.i2166) #8
%res.i2547 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi21, <8 x double> %v.i2410, <8 x double> %res.i2179) #8
%ptr.i2551 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %res.i2548
%res.i2552 = load double, double* %ptr.i2551, align 8
%ie.i2554 = insertelement <8 x double> undef, double %res.i2552, i32 0
%v.i2555 = shufflevector <8 x double> %ie.i2554, <8 x double> undef, <8 x i32> zeroinitializer
%res.i2563 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi23, <8 x double> %v.i2555, <8 x double> %res.i1827) #8
%res.i2564 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi25, <8 x double> %v.i2555, <8 x double> %res.i1932) #8
%res.i2565 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi27, <8 x double> %v.i2555, <8 x double> %res.i1950) #8
%res.i2566 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi29, <8 x double> %v.i2555, <8 x double> %res.i2166) #8
%res.i2567 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi31, <8 x double> %v.i2555, <8 x double> %res.i2179) #8
%ptr.i2571 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %res.i2568
%res.i2572 = load double, double* %ptr.i2571, align 8
%ie.i2574 = insertelement <8 x double> undef, double %res.i2572, i32 0
%v.i2575 = shufflevector <8 x double> %ie.i2574, <8 x double> undef, <8 x i32> zeroinitializer
%res.i2582 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi33, <8 x double> %v.i2575, <8 x double> %res.i1827) #8
%res.i2583 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi35, <8 x double> %v.i2575, <8 x double> %res.i1932) #8
%res.i2584 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi37, <8 x double> %v.i2575, <8 x double> %res.i1950) #8
%res.i2585 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi39, <8 x double> %v.i2575, <8 x double> %res.i2166) #8
%res.i2586 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi41, <8 x double> %v.i2575, <8 x double> %res.i2179) #8
%ptr.i2590 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %res.i2587
%res.i2591 = load double, double* %ptr.i2590, align 8
%ie.i2593 = insertelement <8 x double> undef, double %res.i2591, i32 0
%v.i2594 = shufflevector <8 x double> %ie.i2593, <8 x double> undef, <8 x i32> zeroinitializer
%res.i2703 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi43, <8 x double> %v.i2594, <8 x double> %res.i1827) #8
%res.i2704 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi45, <8 x double> %v.i2594, <8 x double> %res.i1932) #8
%res.i2705 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi47, <8 x double> %v.i2594, <8 x double> %res.i1950) #8
%res.i2706 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi49, <8 x double> %v.i2594, <8 x double> %res.i2166) #8
%res.i2707 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi51, <8 x double> %v.i2594, <8 x double> %res.i2179) #8
%res.i2708 = add nuw nsw i64 %value_phi4, 1
%24 = icmp ult i64 %res.i2708, %15
%actual.i2711 = call i1 @llvm.expect.i1(i1 %24, i1 true)
br i1 %actual.i2711, label %L111, label %L414
There are now 3 unnecessary `addq` and 2 unnecessary `leaq`.
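These are the address computations redone inside the loop body:

leaq (,%rbx,8), %rcx
addq %rbp, %rcx
leaq -8(%r11,%rcx), %rdx
addq %r11, %rdx
addq %r11, %rdx

The `addq %r15, %r14` pointer bump and the `leaq 1(%rbx), %rbx` counter increment at the bottom of the loop are expected; the good version below needs their equivalents too (`addq $320, %r12`, `addq $8, %r10`, `decq %rdx`).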
It should look a little more like this:
L2688:
prefetcht0 2570(%r12)
vbroadcastsd (%r10), %zmm25
vmovupd (%r12), %zmm26
vmovupd 64(%r12), %zmm27
vmovupd 128(%r12), %zmm28
vmovupd 192(%r12), %zmm29
vfmadd231pd %zmm26, %zmm25, %zmm0 # zmm0 = (zmm25 * zmm26) + zmm0
vfmadd231pd %zmm27, %zmm25, %zmm1 # zmm1 = (zmm25 * zmm27) + zmm1
vfmadd231pd %zmm28, %zmm25, %zmm2 # zmm2 = (zmm25 * zmm28) + zmm2
vfmadd231pd %zmm29, %zmm25, %zmm3 # zmm3 = (zmm25 * zmm29) + zmm3
vmovupd 256(%r12), %zmm30
vbroadcastsd (%r10,%rsi,8), %zmm31
vfmadd231pd %zmm30, %zmm25, %zmm4 # zmm4 = (zmm25 * zmm30) + zmm4
vfmadd231pd %zmm26, %zmm31, %zmm5 # zmm5 = (zmm31 * zmm26) + zmm5
vfmadd231pd %zmm27, %zmm31, %zmm6 # zmm6 = (zmm31 * zmm27) + zmm6
vfmadd231pd %zmm28, %zmm31, %zmm7 # zmm7 = (zmm31 * zmm28) + zmm7
vfmadd231pd %zmm29, %zmm31, %zmm8 # zmm8 = (zmm31 * zmm29) + zmm8
vfmadd231pd %zmm30, %zmm31, %zmm9 # zmm9 = (zmm31 * zmm30) + zmm9
vbroadcastsd (%r10,%rdi), %zmm25
vfmadd231pd %zmm26, %zmm25, %zmm10 # zmm10 = (zmm25 * zmm26) + zmm10
vfmadd231pd %zmm27, %zmm25, %zmm11 # zmm11 = (zmm25 * zmm27) + zmm11
vfmadd231pd %zmm28, %zmm25, %zmm12 # zmm12 = (zmm25 * zmm28) + zmm12
vfmadd231pd %zmm29, %zmm25, %zmm13 # zmm13 = (zmm25 * zmm29) + zmm13
vfmadd231pd %zmm30, %zmm25, %zmm14 # zmm14 = (zmm25 * zmm30) + zmm14
vbroadcastsd (%r10,%rbx), %zmm25
vfmadd231pd %zmm26, %zmm25, %zmm15 # zmm15 = (zmm25 * zmm26) + zmm15
vfmadd231pd %zmm27, %zmm25, %zmm16 # zmm16 = (zmm25 * zmm27) + zmm16
vfmadd231pd %zmm28, %zmm25, %zmm17 # zmm17 = (zmm25 * zmm28) + zmm17
vfmadd231pd %zmm29, %zmm25, %zmm18 # zmm18 = (zmm25 * zmm29) + zmm18
vfmadd231pd %zmm30, %zmm25, %zmm19 # zmm19 = (zmm25 * zmm30) + zmm19
vbroadcastsd (%r10,%rcx), %zmm25
vfmadd231pd %zmm26, %zmm25, %zmm20 # zmm20 = (zmm25 * zmm26) + zmm20
vfmadd231pd %zmm27, %zmm25, %zmm21 # zmm21 = (zmm25 * zmm27) + zmm21
vfmadd231pd %zmm28, %zmm25, %zmm22 # zmm22 = (zmm25 * zmm28) + zmm22
vfmadd231pd %zmm29, %zmm25, %zmm23 # zmm23 = (zmm25 * zmm29) + zmm23
vfmadd231pd %zmm30, %zmm25, %zmm24 # zmm24 = (zmm25 * zmm30) + zmm24
addq $320, %r12 # imm = 0x140
addq $8, %r10
decq %rdx
jne L2688
jmp L2952
With associated optimized LLVM IR:
L1007: ; preds = %L1007, %L945
%value_phi92 = phi i64 [ %200, %L1007 ], [ %value_phi38, %L945 ]
%value_phi93 = phi i64 [ %202, %L1007 ], [ %value_phi39, %L945 ]
%value_phi94 = phi <8 x double> [ %res.i1084, %L1007 ], [ %value_phi40, %L945 ]
%value_phi95 = phi <8 x double> [ %res.i1085, %L1007 ], [ %value_phi42, %L945 ]
%value_phi96 = phi <8 x double> [ %res.i1086, %L1007 ], [ %value_phi44, %L945 ]
%value_phi97 = phi <8 x double> [ %res.i1087, %L1007 ], [ %value_phi46, %L945 ]
%value_phi98 = phi <8 x double> [ %res.i1088, %L1007 ], [ %value_phi48, %L945 ]
%value_phi99 = phi <8 x double> [ %res.i1107, %L1007 ], [ %value_phi50, %L945 ]
%value_phi100 = phi <8 x double> [ %res.i1108, %L1007 ], [ %value_phi52, %L945 ]
%value_phi101 = phi <8 x double> [ %res.i1109, %L1007 ], [ %value_phi54, %L945 ]
%value_phi102 = phi <8 x double> [ %res.i1110, %L1007 ], [ %value_phi56, %L945 ]
%value_phi103 = phi <8 x double> [ %res.i1111, %L1007 ], [ %value_phi58, %L945 ]
%value_phi104 = phi <8 x double> [ %res.i1118, %L1007 ], [ %value_phi60, %L945 ]
%value_phi105 = phi <8 x double> [ %res.i1119, %L1007 ], [ %value_phi62, %L945 ]
%value_phi106 = phi <8 x double> [ %res.i1120, %L1007 ], [ %value_phi64, %L945 ]
%value_phi107 = phi <8 x double> [ %res.i1121, %L1007 ], [ %value_phi66, %L945 ]
%value_phi108 = phi <8 x double> [ %res.i1122, %L1007 ], [ %value_phi68, %L945 ]
%value_phi109 = phi <8 x double> [ %res.i1131, %L1007 ], [ %value_phi70, %L945 ]
%value_phi110 = phi <8 x double> [ %res.i1132, %L1007 ], [ %value_phi72, %L945 ]
%value_phi111 = phi <8 x double> [ %res.i1133, %L1007 ], [ %value_phi74, %L945 ]
%value_phi112 = phi <8 x double> [ %res.i1134, %L1007 ], [ %value_phi76, %L945 ]
%value_phi113 = phi <8 x double> [ %res.i1135, %L1007 ], [ %value_phi78, %L945 ]
%value_phi114 = phi <8 x double> [ %res.i1142, %L1007 ], [ %value_phi80, %L945 ]
%value_phi115 = phi <8 x double> [ %res.i1143, %L1007 ], [ %value_phi82, %L945 ]
%value_phi116 = phi <8 x double> [ %res.i1144, %L1007 ], [ %value_phi84, %L945 ]
%value_phi117 = phi <8 x double> [ %res.i1145, %L1007 ], [ %value_phi86, %L945 ]
%value_phi118 = phi <8 x double> [ %res.i1146, %L1007 ], [ %value_phi88, %L945 ]
%value_phi119 = phi i64 [ %204, %L1007 ], [ 1, %L945 ]
%188 = inttoptr i64 %value_phi92 to i8*
%189 = getelementptr i8, i8* %188, i64 2570
call void @llvm.prefetch.p0i8(i8* %189, i32 0, i32 3, i32 1)
%ptr.i1061 = inttoptr i64 %value_phi92 to <8 x double>*
%unmaskedload1911 = load <8 x double>, <8 x double>* %ptr.i1061, align 8
%190 = getelementptr i8, i8* %188, i64 64
%ptr.i1064 = bitcast i8* %190 to <8 x double>*
%unmaskedload1912 = load <8 x double>, <8 x double>* %ptr.i1064, align 8
%191 = getelementptr i8, i8* %188, i64 128
%ptr.i1067 = bitcast i8* %191 to <8 x double>*
%unmaskedload1913 = load <8 x double>, <8 x double>* %ptr.i1067, align 8
%192 = getelementptr i8, i8* %188, i64 192
%ptr.i1072 = bitcast i8* %192 to <8 x double>*
%unmaskedload1914 = load <8 x double>, <8 x double>* %ptr.i1072, align 8
%193 = getelementptr i8, i8* %188, i64 256
%ptr.i1075 = bitcast i8* %193 to <8 x double>*
%unmaskedload1915 = load <8 x double>, <8 x double>* %ptr.i1075, align 8
%ptr.i1078 = inttoptr i64 %value_phi93 to double*
%res.i1079 = load double, double* %ptr.i1078, align 8
%ie.i1081 = insertelement <8 x double> undef, double %res.i1079, i32 0
%v.i1082 = shufflevector <8 x double> %ie.i1081, <8 x double> undef, <8 x i32> zeroinitializer
%res.i1084 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi94, <8 x double> %v.i1082, <8 x double> %unmaskedload1911) #8
%res.i1085 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi95, <8 x double> %v.i1082, <8 x double> %unmaskedload1912) #8
%res.i1086 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi96, <8 x double> %v.i1082, <8 x double> %unmaskedload1913) #8
%res.i1087 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi97, <8 x double> %v.i1082, <8 x double> %unmaskedload1914) #8
%res.i1088 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi98, <8 x double> %v.i1082, <8 x double> %unmaskedload1915) #8
%194 = inttoptr i64 %value_phi93 to i8*
%195 = getelementptr i8, i8* %194, i64 %73
%ptr.i1089 = bitcast i8* %195 to double*
%res.i1090 = load double, double* %ptr.i1089, align 8
%ie.i1104 = insertelement <8 x double> undef, double %res.i1090, i32 0
%v.i1105 = shufflevector <8 x double> %ie.i1104, <8 x double> undef, <8 x i32> zeroinitializer
%res.i1107 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi99, <8 x double> %v.i1105, <8 x double> %unmaskedload1911) #8
%res.i1108 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi100, <8 x double> %v.i1105, <8 x double> %unmaskedload1912) #8
%res.i1109 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi101, <8 x double> %v.i1105, <8 x double> %unmaskedload1913) #8
%res.i1110 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi102, <8 x double> %v.i1105, <8 x double> %unmaskedload1914) #8
%res.i1111 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi103, <8 x double> %v.i1105, <8 x double> %unmaskedload1915) #8
%196 = getelementptr i8, i8* %194, i64 %75
%ptr.i1112 = bitcast i8* %196 to double*
%res.i1113 = load double, double* %ptr.i1112, align 8
%ie.i1115 = insertelement <8 x double> undef, double %res.i1113, i32 0
%v.i1116 = shufflevector <8 x double> %ie.i1115, <8 x double> undef, <8 x i32> zeroinitializer
%res.i1118 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi104, <8 x double> %v.i1116, <8 x double> %unmaskedload1911) #8
%res.i1119 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi105, <8 x double> %v.i1116, <8 x double> %unmaskedload1912) #8
%res.i1120 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi106, <8 x double> %v.i1116, <8 x double> %unmaskedload1913) #8
%res.i1121 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi107, <8 x double> %v.i1116, <8 x double> %unmaskedload1914) #8
%res.i1122 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi108, <8 x double> %v.i1116, <8 x double> %unmaskedload1915) #8
%197 = getelementptr i8, i8* %194, i64 %76
%ptr.i1123 = bitcast i8* %197 to double*
%res.i1124 = load double, double* %ptr.i1123, align 8
%ie.i1126 = insertelement <8 x double> undef, double %res.i1124, i32 0
%v.i1127 = shufflevector <8 x double> %ie.i1126, <8 x double> undef, <8 x i32> zeroinitializer
%res.i1131 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi109, <8 x double> %v.i1127, <8 x double> %unmaskedload1911) #8
%res.i1132 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi110, <8 x double> %v.i1127, <8 x double> %unmaskedload1912) #8
%res.i1133 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi111, <8 x double> %v.i1127, <8 x double> %unmaskedload1913) #8
%res.i1134 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi112, <8 x double> %v.i1127, <8 x double> %unmaskedload1914) #8
%res.i1135 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi113, <8 x double> %v.i1127, <8 x double> %unmaskedload1915) #8
%198 = getelementptr i8, i8* %194, i64 %77
%ptr.i1136 = bitcast i8* %198 to double*
%res.i1137 = load double, double* %ptr.i1136, align 8
%ie.i1139 = insertelement <8 x double> undef, double %res.i1137, i32 0
%v.i1140 = shufflevector <8 x double> %ie.i1139, <8 x double> undef, <8 x i32> zeroinitializer
%res.i1142 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi114, <8 x double> %v.i1140, <8 x double> %unmaskedload1911) #8
%res.i1143 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi115, <8 x double> %v.i1140, <8 x double> %unmaskedload1912) #8
%res.i1144 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi116, <8 x double> %v.i1140, <8 x double> %unmaskedload1913) #8
%res.i1145 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi117, <8 x double> %v.i1140, <8 x double> %unmaskedload1914) #8
%res.i1146 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi118, <8 x double> %v.i1140, <8 x double> %unmaskedload1915) #8
%199 = getelementptr i8, i8* %188, i64 320
%200 = ptrtoint i8* %199 to i64
%201 = getelementptr i8, i8* %194, i64 8
%202 = ptrtoint i8* %201 to i64
%203 = icmp eq i64 %value_phi119, %65
%204 = add nuw i64 %value_phi119, 1
br i1 %203, label %L1194, label %L1007
The main difference I see is that the good version increments pointers between iterations:
%value_phi92 = phi i64 [ %200, %L1007 ], [ %value_phi38, %L945 ]
%value_phi93 = phi i64 [ %202, %L1007 ], [ %value_phi39, %L945 ]
# ...
%199 = getelementptr i8, i8* %188, i64 320
%200 = ptrtoint i8* %199 to i64
%201 = getelementptr i8, i8* %194, i64 8
%202 = ptrtoint i8* %201 to i64
%203 = icmp eq i64 %value_phi119, %65
%204 = add nuw i64 %value_phi119, 1
br i1 %203, label %L1194, label %L1007
The bad version, by contrast, increments a loop counter and uses it to recompute the pointers each iteration:
%value_phi4 = phi i64 [ %res.i2708, %L111 ], [ 0, %L111.preheader ]
# ...
%res.i1789 = add nsw i64 %value_phi4, %res.i1787
%offsetptr.i1792 = getelementptr inbounds double, double* %ptr.i1791, i64 %res.i1789
%res.i1803 = load double, double* %offsetptr.i1792, align 8
%res.i1811 = mul nsw i64 %value_phi4, %.sroa.2.0.copyload
%res.i1813 = add nsw i64 %res.i1811, %value_phi15290
%offsetptr.i1816 = getelementptr inbounds double, double* %ptr.i1815, i64 %res.i1813
# ...
%res.i2708 = add nuw nsw i64 %value_phi4, 1
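To make the contrast concrete, here is a minimal scalar Julia sketch of the two loop shapes. The function names and the scalar formulation are only illustrative (this is not what LoopVectorization.jl actually emits), and for a loop this simple LLVM's own strength reduction may already turn the first form into the second; the point is just the shape of the induction variables.

# "Bad" shape: the counter i is the induction variable, and every address is
# recomputed from it each iteration (offset + i*stride).
function kernel_by_index(A::Vector{Float64}, offset::Int, stride::Int, n::Int)
    acc = 0.0
    @inbounds for i in 0:n-1
        acc += A[offset + i*stride + 1]   # index arithmetic redone every iteration
    end
    return acc
end

# "Good" shape: the pointer itself is the induction variable and is bumped by a
# constant number of bytes; nothing is re-derived from a counter inside the loop.
function kernel_by_pointer(A::Vector{Float64}, offset::Int, stride::Int, n::Int)
    acc = 0.0
    GC.@preserve A begin
        p = pointer(A) + offset * sizeof(Float64)
        for _ in 1:n
            acc += unsafe_load(p)
            p += stride * sizeof(Float64)  # constant byte increment
        end
    end
    return acc
end

In the second form the only loop-carried integer state is the pointer itself plus a trip count, which is exactly what the phi nodes of the good IR show.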
Is there anything I can do to help LLVM trigger this optimization?
The “bad” LLVM IR is what my Julia library LoopVectorization.jl produces for matrix multiplication, while the “good” IR comes from the kernels of a dedicated library.
I would like to know how to fix my code-gen. Any guidance on that front would be greatly appreciated.
I’m also happy to provide more information, e.g. all of the LLVM IR (optimized or unoptimized), or any other information that may help.
I’ve been throwing mud at a wall recently.
For example, I’ve spent the better part of today converting all my libraries to use 0-based indexing instead of 1-based indexing, like a cargo-cultist:
- The code generating the good IR was 0-based.
- Most languages targeting LLVM are 0-based, so perhaps the optimizer has trouble with 1-based indexing.
Result: an extra `addq`/`leaq`, and the resulting code performs about 3% worse in benchmarks.
But I suspect this is just me overfitting to noise, and flailing cluelessly for lack of any understanding or intuition about why LLVM does what it does.