Sanjay,
I'm looking at some missed optimizations caused by D70246. Here's a test case:
; Reproducer: @f loads four consecutive <4 x float> values starting at %t24,
; multiplies the first three by splats of lanes 0, 1 and 2 of %t44 and the
; fourth by a splat of the constant 1.0, and returns the sum of the products.
define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
; Only lane 2 of %t43 is written here; lanes 0 and 1 remain undef.
%t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
%t44 = bitcast <3 x i32> %t43 to <3 x float>
; Widen %t44 to <4 x float> one lane at a time: each of the next three
; shuffles keeps a single lane of %t44 in place and leaves the rest undef.
%t45 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 0, i32 undef, i32 undef, i32 undef>
%t46 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 undef, i32 1, i32 undef, i32 undef>
%t47 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 undef, i32 undef, i32 2, i32 undef>
; %t48 = { %t44[0], undef, undef, 1.0 }
%t48 = insertelement <4 x float> %t45, float 1.000000e+00, i32 3
; %t49 = { %t44[0], %t44[1], undef, 1.0 } (mask index 5 is lane 1 of %t46)
%t49 = shufflevector <4 x float> %t48, <4 x float> %t46, <4 x i32>
<i32 0, i32 5, i32 undef, i32 3>
; %t50 = { %t44[0], %t44[1], %t44[2], 1.0 } (mask index 6 is lane 2 of %t47)
%t50 = shufflevector <4 x float> %t49, <4 x float> %t47, <4 x i32>
<i32 0, i32 1, i32 6, i32 3>
; Load the four vectors at %t24[0] .. %t24[3].
%t55 = load <4 x float>, <4 x float>* %t24, align 16
%t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
%t59 = load <4 x float>, <4 x float>* %t58, align 16
%t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
%t63 = load <4 x float>, <4 x float>* %t62, align 16
%t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
%t67 = load <4 x float>, <4 x float>* %t66, align 16
; Splat lane 0 of %t50 (i.e. %t44[0]) and scale the first load by it.
%t69 = shufflevector <4 x float> %t50, <4 x float> undef, <4 x i32>
zeroinitializer
%t71 = fmul <4 x float> %t55, %t69
; Splat lane 1 of %t44, scale the second load, accumulate.
%t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
%t74 = fmul <4 x float> %t59, %t72
%t75 = fadd <4 x float> %t71, %t74
; Splat lane 2 of %t44, scale the third load, accumulate.
%t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
%t78 = fmul <4 x float> %t63, %t76
%t79 = fadd <4 x float> %t75, %t78
; Splat lane 3 of %t50, which is the constant 1.0 inserted by %t48, so the
; multiply below is a multiply by a splat of 1.0.
%t80 = shufflevector <4 x float> %t50, <4 x float> undef, <4 x i32>
<i32 3, i32 3, i32 3, i32 3>
%t82 = fmul <4 x float> %t67, %t80
%t83 = fadd <4 x float> %t79, %t82
ret <4 x float> %t83
}
Before D70246, opt -instcombine gives this:
; instcombine output before D70246. Compared with the input test case, the
; whole %t45 .. %t50 widening chain has been removed, and so has the multiply
; of %t67 by the splat of 1.0.
define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
%t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
%t44 = bitcast <3 x i32> %t43 to <3 x float>
%t55 = load <4 x float>, <4 x float>* %t24, align 16
%t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
%t59 = load <4 x float>, <4 x float>* %t58, align 16
%t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
%t63 = load <4 x float>, <4 x float>* %t62, align 16
%t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
%t67 = load <4 x float>, <4 x float>* %t66, align 16
; The lane-0 splat now reads %t44 directly instead of going through %t50.
%t69 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
zeroinitializer
%t71 = fmul <4 x float> %t55, %t69
%t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
%t74 = fmul <4 x float> %t59, %t72
%t75 = fadd <4 x float> %t71, %t74
%t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
%t78 = fmul <4 x float> %t63, %t76
%t79 = fadd <4 x float> %t75, %t78
; %t80 and %t82 no longer exist: %t67 is added to the running sum unscaled.
%t83 = fadd <4 x float> %t79, %t67
ret <4 x float> %t83
}
After D70246, opt -instcombine gives this:
; instcombine output after D70246. Compared with the input test case, %t46,
; %t47, %t49 and %t50 are gone, but %t45 and %t48 survive and the multiply
; by the splat of 1.0 (%t80 / %t82) has not been folded away.
define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
%t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
%t44 = bitcast <3 x i32> %t43 to <3 x float>
; %t45 = { %t44[0], undef, undef, undef }
%t45 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 0, i32 undef, i32 undef, i32 undef>
; %t48 = { %t44[0], undef, undef, 1.0 }
%t48 = insertelement <4 x float> %t45, float 1.000000e+00, i32 3
%t55 = load <4 x float>, <4 x float>* %t24, align 16
%t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
%t59 = load <4 x float>, <4 x float>* %t58, align 16
%t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
%t63 = load <4 x float>, <4 x float>* %t62, align 16
%t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
%t67 = load <4 x float>, <4 x float>* %t66, align 16
; Splat of lane 0 of %t48 (i.e. %t44[0]); the source is now %t48, not %t50.
%t69 = shufflevector <4 x float> %t48, <4 x float> undef, <4 x i32>
zeroinitializer
%t71 = fmul <4 x float> %t55, %t69
%t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
%t74 = fmul <4 x float> %t59, %t72
%t75 = fadd <4 x float> %t71, %t74
%t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
%t78 = fmul <4 x float> %t63, %t76
%t79 = fadd <4 x float> %t75, %t78
; Missed fold: lane 3 of %t48 is the constant 1.0 inserted above, so %t80 is
; a splat of 1.0, yet both the shuffle and the fmul that uses it remain.
%t80 = shufflevector <4 x float> %t48, <4 x float> undef, <4 x i32>
<i32 3, i32 3, i32 3, i32 3>
%t82 = fmul <4 x float> %t67, %t80
%t83 = fadd <4 x float> %t79, %t82
ret <4 x float> %t83
}
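Notice that it has failed to simplify %t80, which splats element 3 of
%t48. That element is obviously just the constant 1.0 (it is written by
the insertelement that defines %t48), so %t80 is a splat of 1.0 and the
fmul in %t82 should have folded away, as it did before D70246.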
Would you expect the change you committed to have this kind of effect?
Do you agree that simplifying %t80 is still valid, even with the new
interpretation of the rules about shufflevector and undef and poison?
Thanks,
Jay.