mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-19 04:32:19 +00:00
[AArch64] Match interleaved memory accesses into ldN/stN instructions.
Re-commit after adding "-aarch64-neon-syntax=generic" to fix the failure on OS X. This patch was firstly committed in r239514, then reverted in r239544 because of a syntax incompatible failure on OS X. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239711 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f6da33d457
commit
6024ab3b8f
@ -38,6 +38,7 @@ FunctionPass *createAArch64LoadStoreOptimizationPass();
|
||||
ModulePass *createAArch64PromoteConstantPass();
|
||||
FunctionPass *createAArch64ConditionOptimizerPass();
|
||||
FunctionPass *createAArch64AddressTypePromotionPass();
|
||||
FunctionPass *createAArch64InterleavedAccessPass();
|
||||
FunctionPass *createAArch64A57FPLoadBalancing();
|
||||
FunctionPass *createAArch64A53Fix835769();
|
||||
|
||||
|
392
lib/Target/AArch64/AArch64InterleavedAccess.cpp
Normal file
392
lib/Target/AArch64/AArch64InterleavedAccess.cpp
Normal file
@ -0,0 +1,392 @@
|
||||
//=--------------------- AArch64InterleavedAccess.cpp ----------------------==//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the AArch64InterleavedAccess pass, which identifies
|
||||
// interleaved memory accesses and transforms them into AArch64 ldN/stN
|
||||
// intrinsics (N = 2, 3, 4).
|
||||
//
|
||||
// An interleaved load reads data from memory into several vectors, with
|
||||
// DE-interleaving the data on factor. An interleaved store writes several
|
||||
// vectors to memory with RE-interleaving the data on factor. The interleave
|
||||
// factor is equal to the number of vectors. AArch64 backend supports interleave
|
||||
// factor of 2, 3 and 4.
|
||||
//
|
||||
// E.g. Transform an interleaved load (Factor = 2):
|
||||
// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
|
||||
// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
|
||||
// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
|
||||
// Into:
|
||||
// %ld2 = { <4 x i32>, <4 x i32> } call aarch64.neon.ld2(%ptr)
|
||||
// %v0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
|
||||
// %v1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
|
||||
//
|
||||
// E.g. Transform an interleaved store (Factor = 2):
|
||||
// %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> ; Interleaved vec
|
||||
// store <8 x i32> %i.vec, <8 x i32>* %ptr
|
||||
// Into:
|
||||
// %sub.v0 = shuffle %v0, %v1, <0, 1, 2, 3>
|
||||
// %sub.v1 = shuffle %v0, %v1, <4, 5, 6, 7>
|
||||
// call void aarch64.neon.st2(%sub.v0, %sub.v1, %ptr)
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AArch64.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/IR/InstIterator.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "aarch64-interleaved-access"
|
||||
|
||||
static const unsigned MIN_FACTOR = 2;
|
||||
static const unsigned MAX_FACTOR = 4;
|
||||
|
||||
namespace llvm {
|
||||
static void initializeAArch64InterleavedAccessPass(PassRegistry &);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
class AArch64InterleavedAccess : public FunctionPass {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
AArch64InterleavedAccess() : FunctionPass(ID) {
|
||||
initializeAArch64InterleavedAccessPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
const char *getPassName() const override {
|
||||
return "AArch64 Interleaved Access Pass";
|
||||
}
|
||||
|
||||
bool runOnFunction(Function &F) override;
|
||||
|
||||
private:
|
||||
const DataLayout *DL;
|
||||
Module *M;
|
||||
|
||||
/// \brief Transform an interleaved load into ldN intrinsic.
|
||||
bool matchInterleavedLoad(ShuffleVectorInst *SVI,
|
||||
SmallSetVector<Instruction *, 32> &DeadInsts);
|
||||
|
||||
/// \brief Transform an interleaved store into stN intrinsic.
|
||||
bool matchInterleavedStore(ShuffleVectorInst *SVI,
|
||||
SmallSetVector<Instruction *, 32> &DeadInsts);
|
||||
};
|
||||
} // end anonymous namespace.
|
||||
|
||||
char AArch64InterleavedAccess::ID = 0;
|
||||
|
||||
INITIALIZE_PASS_BEGIN(AArch64InterleavedAccess, DEBUG_TYPE,
|
||||
"AArch64 interleaved access Pass", false, false)
|
||||
INITIALIZE_PASS_END(AArch64InterleavedAccess, DEBUG_TYPE,
|
||||
"AArch64 interleaved access Pass", false, false)
|
||||
|
||||
FunctionPass *llvm::createAArch64InterleavedAccessPass() {
|
||||
return new AArch64InterleavedAccess();
|
||||
}
|
||||
|
||||
/// \brief Get a ldN/stN intrinsic according to the Factor (2, 3, or 4).
|
||||
static Intrinsic::ID getLdNStNIntrinsic(unsigned Factor, bool IsLoad) {
|
||||
static const Intrinsic::ID LoadInt[3] = {Intrinsic::aarch64_neon_ld2,
|
||||
Intrinsic::aarch64_neon_ld3,
|
||||
Intrinsic::aarch64_neon_ld4};
|
||||
static const Intrinsic::ID StoreInt[3] = {Intrinsic::aarch64_neon_st2,
|
||||
Intrinsic::aarch64_neon_st3,
|
||||
Intrinsic::aarch64_neon_st4};
|
||||
|
||||
assert(Factor >= MIN_FACTOR && Factor <= MAX_FACTOR &&
|
||||
"Invalid interleave factor");
|
||||
|
||||
if (IsLoad)
|
||||
return LoadInt[Factor - 2];
|
||||
else
|
||||
return StoreInt[Factor - 2];
|
||||
}
|
||||
|
||||
/// \brief Check if the mask is a DE-interleave mask of the given factor
|
||||
/// \p Factor like:
|
||||
/// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
|
||||
static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
|
||||
unsigned &Index) {
|
||||
// Check all potential start indices from 0 to (Factor - 1).
|
||||
for (Index = 0; Index < Factor; Index++) {
|
||||
unsigned i = 0;
|
||||
|
||||
// Check that elements are in ascending order by Factor.
|
||||
for (; i < Mask.size(); i++)
|
||||
if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
|
||||
break;
|
||||
|
||||
if (i == Mask.size())
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Check if the mask is a DE-interleave mask for an interleaved load.
|
||||
///
|
||||
/// E.g. DE-interleave masks (Factor = 2) could be:
|
||||
/// <0, 2, 4, 6> (mask of index 0 to extract even elements)
|
||||
/// <1, 3, 5, 7> (mask of index 1 to extract odd elements)
|
||||
static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
|
||||
unsigned &Index) {
|
||||
unsigned NumElts = Mask.size();
|
||||
if (NumElts < 2)
|
||||
return false;
|
||||
|
||||
for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
|
||||
if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Check if the given mask \p Mask is RE-interleaved mask of the given
|
||||
/// factor \p Factor.
|
||||
///
|
||||
/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
|
||||
static bool isReInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor) {
|
||||
unsigned NumElts = Mask.size();
|
||||
if (NumElts % Factor)
|
||||
return false;
|
||||
|
||||
unsigned NumSubElts = NumElts / Factor;
|
||||
if (!isPowerOf2_32(NumSubElts))
|
||||
return false;
|
||||
|
||||
for (unsigned i = 0; i < NumSubElts; i++)
|
||||
for (unsigned j = 0; j < Factor; j++)
|
||||
if (Mask[i * Factor + j] >= 0 &&
|
||||
static_cast<unsigned>(Mask[i * Factor + j]) != j * NumSubElts + i)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Check if the mask is RE-interleave mask for an interleaved store.
|
||||
///
|
||||
/// E.g. The RE-interleave mask (Factor = 2) could be:
|
||||
/// <0, 4, 1, 5, 2, 6, 3, 7>
|
||||
static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
|
||||
if (Mask.size() < 4)
|
||||
return false;
|
||||
|
||||
// Check potential Factors and return true if find a factor for the mask.
|
||||
for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
|
||||
if (isReInterleaveMaskOfFactor(Mask, Factor))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Get a mask consisting of sequential integers starting from \p Start.
|
||||
///
|
||||
/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
|
||||
static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
|
||||
unsigned NumElts) {
|
||||
SmallVector<Constant *, 16> Mask;
|
||||
for (unsigned i = 0; i < NumElts; i++)
|
||||
Mask.push_back(Builder.getInt32(Start + i));
|
||||
|
||||
return ConstantVector::get(Mask);
|
||||
}
|
||||
|
||||
/// \brief Try to rewrite the load feeding \p SVI, together with all shuffles
/// that use it, into a single ldN intrinsic call.
///
/// The rewrite happens only when:
///   - \p SVI is not already scheduled for deletion, its first operand is a
///     simple load and its second operand is undef;
///   - every user of that load is a shufflevector whose second operand is
///     undef (so each mask index refers to the loaded vector);
///   - the first shuffle's result type is 64 or 128 bits wide and all shuffles
///     share that type;
///   - all shuffle masks are DE-interleave masks of one common factor;
///   - Factor sub vectors fit inside the originally loaded vector, so the ldN
///     never reads more memory than the original load did.
///
/// On success every shuffle is replaced by the corresponding member of the
/// ldN result, the shuffles and the load are added to \p DeadInsts, and true
/// is returned. Otherwise the IR is left untouched and false is returned.
bool AArch64InterleavedAccess::matchInterleavedLoad(
    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (DeadInsts.count(SVI))
    return false;

  LoadInst *LI = dyn_cast<LoadInst>(SVI->getOperand(0));
  if (!LI || !LI->isSimple() || !isa<UndefValue>(SVI->getOperand(1)))
    return false;

  SmallVector<ShuffleVectorInst *, 4> Shuffles;

  // Check if all users of this load are shufflevectors of the load alone.
  // Requiring an undef second operand guarantees that the load is operand 0,
  // which is what the mask analysis below assumes.
  for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; ++UI) {
    ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(*UI);
    if (!SV || !isa<UndefValue>(SV->getOperand(1)))
      return false;

    Shuffles.push_back(SV);
  }

  // Check if the type of the first shuffle is legal.
  VectorType *VecTy = Shuffles[0]->getType();
  unsigned TypeSize = DL->getTypeAllocSizeInBits(VecTy);
  if (TypeSize != 64 && TypeSize != 128)
    return false;

  // Check if the mask of the first shuffle is strided and get the start index.
  unsigned Factor, Index;
  if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
    return false;

  // The ldN reads Factor sub vectors. With undef mask lanes a mask can match
  // a factor for which that is more than the original load covered; bail out
  // rather than read past the loaded memory.
  if (VecTy->getVectorNumElements() * Factor >
      LI->getType()->getVectorNumElements())
    return false;

  // Holds the corresponding index for each strided shuffle.
  SmallVector<unsigned, 4> Indices;
  Indices.push_back(Index);

  // Check if other shufflevectors are of the same type and factor
  for (unsigned i = 1; i < Shuffles.size(); i++) {
    if (Shuffles[i]->getType() != VecTy)
      return false;

    unsigned Index;
    if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
                                    Index))
      return false;

    Indices.push_back(Index);
  }

  DEBUG(dbgs() << "Found an interleaved load:" << *LI << "\n");

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  Type *EltTy = VecTy->getVectorElementType();
  if (EltTy->isPointerTy())
    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
                            VecTy->getVectorNumElements());

  Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
  Type *Tys[2] = {VecTy, PtrTy};
  Function *LdNFunc =
      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, true), Tys);

  // New instructions are inserted right before the original load.
  IRBuilder<> Builder(LI);
  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);

  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
  DEBUG(dbgs() << " Created:" << *LdN << "\n");

  // Replace each strided shufflevector with the corresponding vector loaded
  // by ldN.
  for (unsigned i = 0; i < Shuffles.size(); i++) {
    ShuffleVectorInst *SV = Shuffles[i];
    unsigned Index = Indices[i];

    Value *SubVec = Builder.CreateExtractValue(LdN, Index);

    // Convert the integer vector to pointer vector if the element is pointer.
    if (EltTy->isPointerTy())
      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());

    SV->replaceAllUsesWith(SubVec);

    DEBUG(dbgs() << " Replaced:" << *SV << "\n"
                 << " With:" << *SubVec << "\n");

    // Avoid analyzing it twice.
    DeadInsts.insert(SV);
  }

  // Mark this load as dead.
  DeadInsts.insert(LI);
  return true;
}
|
||||
|
||||
/// \brief Try to rewrite the store fed by \p SVI into a single stN intrinsic
/// call.
///
/// The rewrite happens only when:
///   - \p SVI is not already scheduled for deletion and its single user is a
///     simple store;
///   - the shuffle mask is a RE-interleave mask of a supported factor;
///   - the interleaved vector is no wider than the two shuffle operands
///     together, so every sub vector can be extracted from them;
///   - each sub vector is 64 or 128 bits wide.
///
/// On success the shuffle operands are split into Factor sub vectors that are
/// passed to the stN call, the store and \p SVI are added to \p DeadInsts, and
/// true is returned. Otherwise the IR is left untouched and false is returned.
bool AArch64InterleavedAccess::matchInterleavedStore(
    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (DeadInsts.count(SVI) || !SVI->hasOneUse())
    return false;

  StoreInst *SI = dyn_cast<StoreInst>(SVI->user_back());
  if (!SI || !SI->isSimple())
    return false;

  // Check if the mask is interleaved and get the interleave factor.
  unsigned Factor;
  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
    return false;

  VectorType *VecTy = SVI->getType();
  unsigned NumElts = VecTy->getVectorNumElements();
  unsigned NumSubElts = NumElts / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);

  // Shuffle operands are always vectors, so cast<> (which asserts) is the
  // right tool here rather than an unchecked dyn_cast<>.
  unsigned NumOpElts = cast<VectorType>(Op0->getType())->getNumElements();

  // The sub vectors are extracted below with sequential masks that cover the
  // lanes [0, NumElts) of the concatenated operands. When the mask has undef
  // lanes it can match even though the operands hold fewer lanes than that;
  // such masks would index out of range, so skip them.
  if (NumElts > 2 * NumOpElts)
    return false;

  // Skip illegal vector types.
  unsigned TypeSize = DL->getTypeAllocSizeInBits(SubVecTy);
  if (TypeSize != 64 && TypeSize != 128)
    return false;

  DEBUG(dbgs() << "Found an interleaved store:" << *SI << "\n");

  // New instructions are inserted right before the original store.
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL->getIntPtrType(EltTy);

    // The corresponding integer vector type of the same element size.
    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);

    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
    SubVecTy = VectorType::get(IntTy, NumSubElts);
  }

  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
  Type *Tys[2] = {SubVecTy, PtrTy};
  Function *StNFunc =
      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, false), Tys);

  // Factor sub vectors followed by the destination pointer.
  SmallVector<Value *, 5> Ops;

  // Split the shufflevector operands into sub vectors for the new stN call.
  for (unsigned i = 0; i < Factor; i++)
    Ops.push_back(Builder.CreateShuffleVector(
        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));

  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
  CallInst *StN = Builder.CreateCall(StNFunc, Ops);

  (void)StN; // silence warning.
  DEBUG(dbgs() << " Replaced:" << *SI << "'\n");
  DEBUG(dbgs() << " with:" << *StN << "\n");

  // Mark this shufflevector and store as dead.
  DeadInsts.insert(SI);
  DeadInsts.insert(SVI);
  return true;
}
|
||||
|
||||
/// \brief Visit every shufflevector in \p F and try to match it first as part
/// of an interleaved load, then as the source of an interleaved store.
///
/// Matched instructions are only collected while iterating and are erased
/// once the walk over the function is finished. Returns true if any rewrite
/// took place.
bool AArch64InterleavedAccess::runOnFunction(Function &F) {
  DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");

  M = F.getParent();
  DL = &M->getDataLayout();

  // Instructions made redundant by a rewrite; erased after the walk.
  SmallSetVector<Instruction *, 32> DeadInsts;
  bool Changed = false;

  for (auto &Inst : inst_range(F)) {
    ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(&Inst);
    if (!Shuffle)
      continue;

    // Both matchers always run, in this order.
    if (matchInterleavedLoad(Shuffle, DeadInsts))
      Changed = true;
    if (matchInterleavedStore(Shuffle, DeadInsts))
      Changed = true;
  }

  for (Instruction *Dead : DeadInsts)
    Dead->eraseFromParent();

  return Changed;
}
|
@ -67,6 +67,11 @@ EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
|
||||
" to make use of cmpxchg flow-based information"),
|
||||
cl::init(true));
|
||||
|
||||
static cl::opt<bool> AArch64InterleavedAccessOpt(
|
||||
"aarch64-interleaved-access-opt",
|
||||
cl::desc("Optimize interleaved memory accesses in the AArch64 backend"),
|
||||
cl::init(false), cl::Hidden);
|
||||
|
||||
static cl::opt<bool>
|
||||
EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
|
||||
cl::desc("Run early if-conversion"),
|
||||
@ -223,6 +228,9 @@ void AArch64PassConfig::addIRPasses() {
|
||||
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
|
||||
addPass(createCFGSimplificationPass());
|
||||
|
||||
if (TM->getOptLevel() != CodeGenOpt::None && AArch64InterleavedAccessOpt)
|
||||
addPass(createAArch64InterleavedAccessPass());
|
||||
|
||||
TargetPassConfig::addIRPasses();
|
||||
|
||||
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
|
||||
|
@ -407,6 +407,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
|
||||
return LT.first;
|
||||
}
|
||||
|
||||
unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
|
||||
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
|
||||
unsigned Alignment, unsigned AddressSpace) {
|
||||
assert(isa<VectorType>(VecTy) && "Expect vector types");
|
||||
|
||||
if (Factor > 1 && Factor < 5 && isTypeLegal(VecTy))
|
||||
return Factor;
|
||||
|
||||
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
|
||||
Alignment, AddressSpace);
|
||||
}
|
||||
|
||||
unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
|
||||
unsigned Cost = 0;
|
||||
for (auto *I : Tys) {
|
||||
|
@ -139,6 +139,11 @@ public:
|
||||
|
||||
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
|
||||
|
||||
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||
unsigned Factor,
|
||||
ArrayRef<unsigned> Indices,
|
||||
unsigned Alignment,
|
||||
unsigned AddressSpace);
|
||||
/// @}
|
||||
};
|
||||
|
||||
|
@ -38,6 +38,7 @@ add_llvm_target(AArch64CodeGen
|
||||
AArch64PBQPRegAlloc.cpp
|
||||
AArch64RegisterInfo.cpp
|
||||
AArch64SelectionDAGInfo.cpp
|
||||
AArch64InterleavedAccess.cpp
|
||||
AArch64StorePairSuppress.cpp
|
||||
AArch64Subtarget.cpp
|
||||
AArch64TargetMachine.cpp
|
||||
|
197
test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
Normal file
197
test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
Normal file
@ -0,0 +1,197 @@
|
||||
; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -aarch64-interleaved-access-opt=true < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: load_factor2:
|
||||
; CHECK: ld2 { v0.8b, v1.8b }, [x0]
|
||||
define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
|
||||
%wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
|
||||
%strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
%strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
%add = add nsw <8 x i8> %strided.v0, %strided.v1
|
||||
ret <8 x i8> %add
|
||||
}
|
||||
|
||||
; CHECK-LABEL: load_delat3:
|
||||
; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
|
||||
define <4 x i32> @load_delat3(i32* %ptr) {
|
||||
%base = bitcast i32* %ptr to <12 x i32>*
|
||||
%wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
|
||||
%strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
|
||||
%strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
|
||||
%add = add nsw <4 x i32> %strided.v2, %strided.v1
|
||||
ret <4 x i32> %add
|
||||
}
|
||||
|
||||
; CHECK-LABEL: load_factor4:
|
||||
; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
|
||||
define <4 x i32> @load_factor4(i32* %ptr) {
|
||||
%base = bitcast i32* %ptr to <16 x i32>*
|
||||
%wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
|
||||
%strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
|
||||
%strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
|
||||
%add = add nsw <4 x i32> %strided.v0, %strided.v2
|
||||
ret <4 x i32> %add
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_factor2:
|
||||
; CHECK: st2 { v0.8b, v1.8b }, [x0]
|
||||
define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
|
||||
%interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
|
||||
store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_factor3:
|
||||
; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
|
||||
define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
|
||||
%base = bitcast i32* %ptr to <12 x i32>*
|
||||
%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
|
||||
store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_factor4:
|
||||
; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
|
||||
define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
|
||||
%base = bitcast i32* %ptr to <16 x i32>*
|
||||
%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
|
||||
store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; The following cases test that interleaved access of pointer vectors can be
|
||||
; matched to ldN/stN instruction.
|
||||
|
||||
; CHECK-LABEL: load_ptrvec_factor2:
|
||||
; CHECK: ld2 { v0.2d, v1.2d }, [x0]
|
||||
define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
|
||||
%base = bitcast i32** %ptr to <4 x i32*>*
|
||||
%wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
|
||||
%strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
|
||||
ret <2 x i32*> %strided.v0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: load_ptrvec_factor3:
|
||||
; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
|
||||
define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
|
||||
%base = bitcast i32** %ptr to <6 x i32*>*
|
||||
%wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
|
||||
%strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
|
||||
store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
|
||||
%strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
|
||||
store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: load_ptrvec_factor4:
|
||||
; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
|
||||
define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
|
||||
%base = bitcast i32** %ptr to <8 x i32*>*
|
||||
%wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
|
||||
%strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
|
||||
%strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
|
||||
store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
|
||||
store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_ptrvec_factor2:
|
||||
; CHECK: st2 { v0.2d, v1.2d }, [x0]
|
||||
define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
|
||||
%base = bitcast i32** %ptr to <4 x i32*>*
|
||||
%interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
|
||||
store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_ptrvec_factor3:
|
||||
; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
|
||||
define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
|
||||
%base = bitcast i32** %ptr to <6 x i32*>*
|
||||
%v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
%interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
|
||||
store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_ptrvec_factor4:
|
||||
; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
|
||||
define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
|
||||
%base = bitcast i32* %ptr to <8 x i32*>*
|
||||
%v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
|
||||
store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; The following cases check that shuffle masks with undef indices can be matched
|
||||
; into ldN/stN instruction.
|
||||
|
||||
; CHECK-LABEL: load_undef_mask_factor2:
|
||||
; CHECK: ld2 { v0.4s, v1.4s }, [x0]
|
||||
define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
|
||||
%base = bitcast i32* %ptr to <8 x i32>*
|
||||
%wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
|
||||
%strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
|
||||
%strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
|
||||
%add = add nsw <4 x i32> %strided.v0, %strided.v1
|
||||
ret <4 x i32> %add
|
||||
}
|
||||
|
||||
; CHECK-LABEL: load_undef_mask_factor3:
|
||||
; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
|
||||
define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
|
||||
%base = bitcast i32* %ptr to <12 x i32>*
|
||||
%wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
|
||||
%strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
|
||||
%strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
|
||||
%add = add nsw <4 x i32> %strided.v2, %strided.v1
|
||||
ret <4 x i32> %add
|
||||
}
|
||||
|
||||
; CHECK-LABEL: load_undef_mask_factor4:
|
||||
; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
|
||||
define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
|
||||
%base = bitcast i32* %ptr to <16 x i32>*
|
||||
%wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
|
||||
%strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
|
||||
%strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
|
||||
%add = add nsw <4 x i32> %strided.v0, %strided.v2
|
||||
ret <4 x i32> %add
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_undef_mask_factor2:
|
||||
; CHECK: st2 { v0.4s, v1.4s }, [x0]
|
||||
define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
|
||||
%base = bitcast i32* %ptr to <8 x i32>*
|
||||
%interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
|
||||
store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_undef_mask_factor3:
|
||||
; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
|
||||
define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
|
||||
%base = bitcast i32* %ptr to <12 x i32>*
|
||||
%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
|
||||
store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: store_undef_mask_factor4:
|
||||
; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
|
||||
define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
|
||||
%base = bitcast i32* %ptr to <16 x i32>*
|
||||
%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
|
||||
store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
|
||||
ret void
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user