[PATCH] D70246: [InstCombine] remove identity shuffle simplification for mask with undefs

Sanjay,

I'm looking at some missed optimizations caused by D70246. Here's a test case:

define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
  %t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
  %t44 = bitcast <3 x i32> %t43 to <3 x float>
  %t45 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 0, i32 undef, i32 undef, i32 undef>
  %t46 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 undef, i32 1, i32 undef, i32 undef>
  %t47 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 undef, i32 undef, i32 2, i32 undef>
  %t48 = insertelement <4 x float> %t45, float 1.000000e+00, i32 3
  %t49 = shufflevector <4 x float> %t48, <4 x float> %t46, <4 x i32>
<i32 0, i32 5, i32 undef, i32 3>
  %t50 = shufflevector <4 x float> %t49, <4 x float> %t47, <4 x i32>
<i32 0, i32 1, i32 6, i32 3>
  %t55 = load <4 x float>, <4 x float>* %t24, align 16
  %t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
  %t59 = load <4 x float>, <4 x float>* %t58, align 16
  %t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
  %t63 = load <4 x float>, <4 x float>* %t62, align 16
  %t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
  %t67 = load <4 x float>, <4 x float>* %t66, align 16
  %t69 = shufflevector <4 x float> %t50, <4 x float> undef, <4 x i32>
zeroinitializer
  %t71 = fmul <4 x float> %t55, %t69
  %t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
  %t74 = fmul <4 x float> %t59, %t72
  %t75 = fadd <4 x float> %t71, %t74
  %t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
  %t78 = fmul <4 x float> %t63, %t76
  %t79 = fadd <4 x float> %t75, %t78
  %t80 = shufflevector <4 x float> %t50, <4 x float> undef, <4 x i32>
<i32 3, i32 3, i32 3, i32 3>
  %t82 = fmul <4 x float> %t67, %t80
  %t83 = fadd <4 x float> %t79, %t82
  ret <4 x float> %t83
}

Before D70246, opt -instcombine gives this:

define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
  %t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
  %t44 = bitcast <3 x i32> %t43 to <3 x float>
  %t55 = load <4 x float>, <4 x float>* %t24, align 16
  %t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
  %t59 = load <4 x float>, <4 x float>* %t58, align 16
  %t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
  %t63 = load <4 x float>, <4 x float>* %t62, align 16
  %t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
  %t67 = load <4 x float>, <4 x float>* %t66, align 16
  %t69 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
zeroinitializer
  %t71 = fmul <4 x float> %t55, %t69
  %t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
  %t74 = fmul <4 x float> %t59, %t72
  %t75 = fadd <4 x float> %t71, %t74
  %t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
  %t78 = fmul <4 x float> %t63, %t76
  %t79 = fadd <4 x float> %t75, %t78
  %t83 = fadd <4 x float> %t79, %t67
  ret <4 x float> %t83
}

After D70246, opt -instcombine gives this:

define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
  %t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
  %t44 = bitcast <3 x i32> %t43 to <3 x float>
  %t45 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 0, i32 undef, i32 undef, i32 undef>
  %t48 = insertelement <4 x float> %t45, float 1.000000e+00, i32 3
  %t55 = load <4 x float>, <4 x float>* %t24, align 16
  %t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
  %t59 = load <4 x float>, <4 x float>* %t58, align 16
  %t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
  %t63 = load <4 x float>, <4 x float>* %t62, align 16
  %t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
  %t67 = load <4 x float>, <4 x float>* %t66, align 16
  %t69 = shufflevector <4 x float> %t48, <4 x float> undef, <4 x i32>
zeroinitializer
  %t71 = fmul <4 x float> %t55, %t69
  %t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
  %t74 = fmul <4 x float> %t59, %t72
  %t75 = fadd <4 x float> %t71, %t74
  %t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
  %t78 = fmul <4 x float> %t63, %t76
  %t79 = fadd <4 x float> %t75, %t78
  %t80 = shufflevector <4 x float> %t48, <4 x float> undef, <4 x i32>
<i32 3, i32 3, i32 3, i32 3>
  %t82 = fmul <4 x float> %t67, %t80
  %t83 = fadd <4 x float> %t79, %t82
  ret <4 x float> %t83
}

Notice that it has failed to simplify %t80, which extracts element 3
from %t48, which is obviously just the constant 1.0.

Would you expect the change you committed to have this kind of effect?
Do you agree that simplifying %t80 is still valid, even with the new
interpretation of the rules about shufflevector and undef and poison?

Thanks,
Jay.

Hi Jay -

Yes, this change carried some risk of regressions, and we should still be able to optimize this case. We may need to add or loosen a fold for splat-of-insert:
define <4 x float> @f(<4 x float> %x) {
%ins3 = insertelement <4 x float> %x, float 3.000000e+00, i32 3
%splat0 = shufflevector <4 x float> %ins3, <4 x float> undef, <4 x i32> zeroinitializer
%splat3 = shufflevector <4 x float> %ins3, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%r = fadd <4 x float> %splat0, %splat3
ret <4 x float> %r
}

SimplifyDemandedVectorElts() may not get this because %ins3 has multiple uses, but I didn’t step through that yet to confirm. I can work on this if you haven’t started already. Let me know.

Sanjay,

I haven't started looking into it, so please go ahead!

Thanks,
Jay.