Please tell me whether the following implementation is correct…
My target supports a 64-bit mask, i.e. a shuffle immediate in the range 0 to 2^63.
I have implemented it, but I do not know whether it is correct. Please review the changes I have made in X86ISelLowering.cpp below.
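Before the diff itself, to make explicit what I mean by a 64-bit mask immediate: I am assuming the immediate packs 16 lane indices at 4 bits each into a single i64 value. The snippet below only illustrates my assumed encoding; packShuffleImm64 is a throwaway name, not code from the tree.

#include <cassert>
#include <cstdint>

// Illustration only: my assumed encoding of the 64-bit shuffle immediate --
// 16 lane indices, 4 bits per index, element 0 in the lowest bits.
static uint64_t packShuffleImm64(const int (&Mask)[16]) {
  uint64_t Imm = 0;
  for (int i = 0; i != 16; ++i) {
    assert(Mask[i] >= 0 && Mask[i] < 16 && "index must fit in 4 bits");
    Imm |= uint64_t(Mask[i]) << (i * 4);
  }
  return Imm;
}

int main() {
  // A full reversal of the 16 elements packs to 0x0123456789ABCDEF.
  const int Reverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                           7, 6, 5, 4, 3, 2, 1, 0};
  assert(packShuffleImm64(Reverse) == 0x0123456789ABCDEFULL);
  return 0;
}

With that encoding in mind, the changes are: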
static SDValue lower2048BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                         MVT VT, SDValue V1, SDValue V2,
                                         const SmallBitVector &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast =
          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Broadcast;

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 2048 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v32f64:
    return lowerV32F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64f32:
    return lowerV64F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i64:
    return lowerV32I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i32:
    return lowerV64I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid P x86 vector type!");
  }
}
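For context, this is roughly where I call the new routine from in lowerVectorShuffle(). This is only a sketch of my local dispatch next to the existing size checks, not a claim about the exact upstream code; the 2048-bit check is my addition.

  // Sketch of my local dispatch in lowerVectorShuffle(), after the existing
  // 128/256/512-bit checks. The 2048-bit case is mine.
  if (VT.is512BitVector())
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.getSizeInBits() == 2048)
    return lower2048BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                     DAG);

The per-element-type routines themselves are below.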
static SDValue lowerV64I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v64i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i32 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");

  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  SmallVector<int, 16> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v64i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    // assert(RepeatedMask.size() == 16 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32, V1,
                         getV16X86ShuffleImm64ForMask(/*Repeated*/Mask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v64i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v64i32, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Assume that a single SHUFPS is faster than using a permv shuffle.
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v64i32, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  return lowerVectorShuffleWithPERMV(DL, MVT::v64i32, Mask, V1, V2, DAG);
}
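As an example of the kind of mask I expect to take the repeated-lane PSHUFD_P64 path above (the same per-lane pattern in every 128-bit chunk of the v64i32, where each 128-bit lane holds four i32 elements), I have been thinking of masks like this; ExampleMask is only illustrative, not part of the patch:

  // Example only: a v64i32 shuffle mask that repeats the same pattern
  // ({1, 0, 3, 2}, i.e. swap adjacent elements) in each of the sixteen
  // 128-bit lanes, which is what is128BitLaneRepeatedShuffleMask detects.
  SmallVector<int, 64> ExampleMask;
  for (int Lane = 0; Lane != 16; ++Lane)
    for (int Elt : {1, 0, 3, 2})
      ExampleMask.push_back(Lane * 4 + Elt);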
static SDValue lowerV32I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const SmallBitVector &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i64 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");

  if (SDValue Shuf128 =
          lowerV16X128VectorShuffle(DL, MVT::v32i64, Mask, V1, V2, DAG))
    return Shuf128;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all of the
    // 128-bit lanes.
    SmallVector<int, 8> Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i64, Mask, Repeated128Mask)) {
      SmallVector<int, 64> PSHUFDMask;
      scaleShuffleMask(8, Repeated128Mask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v32i64,
          DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32,
                      DAG.getBitcast(MVT::v64i32, V1),
                      getV16X86ShuffleImm64ForMask(PSHUFDMask, DL, DAG)));
    }

    SmallVector<int, 16> Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v32i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v32i64, V1,
                         getV16X86ShuffleImm64ForMask(Repeated256Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v32i64, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v32i64, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i64, Mask, V1, V2, DAG))
    return Unpck;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v32i64, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  return lowerVectorShuffleWithPERMV(DL, MVT::v32i64, Mask, V1, V2, DAG);
}
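On the scaleShuffleMask(8, ...) call in the v32i64 path above: my understanding of that helper is that each index expands into Scale consecutive scaled indices, with negative sentinels simply repeated. Here is a stand-alone restatement (scaleMaskSketch is my own throwaway function, not the LLVM helper) so the Scale = 8 case is easy to trace by hand:

#include <vector>

// My reading of scaleShuffleMask, restated outside the LLVM tree: the index
// at position i expands into Scale consecutive scaled indices, so with
// Scale = 8 a repeated mask of {1, 0} becomes {8, 9, ..., 15, 0, 1, ..., 7}.
static std::vector<int> scaleMaskSketch(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Scaled(Mask.size() * Scale, -1);
  for (size_t i = 0; i != Mask.size(); ++i)
    for (int s = 0; s != Scale; ++s)
      Scaled[Scale * i + s] = Mask[i] < 0 ? Mask[i] : Scale * Mask[i] + s;
  return Scaled;
}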
static SDValue getV64X86ShuffleImm64ForMask(ArrayRef<int> Mask, SDLoc DL,