mirror of
				https://github.com/c64scene-ar/llvm-6502.git
				synced 2025-11-04 05:17:07 +00:00 
			
		
		
		
	R600: Use LDS and vectors for private memory
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211110 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
		@@ -17,6 +17,7 @@
 | 
				
			|||||||
namespace llvm {
 | 
					namespace llvm {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class AMDGPUInstrPrinter;
 | 
					class AMDGPUInstrPrinter;
 | 
				
			||||||
 | 
					class AMDGPUSubtarget;
 | 
				
			||||||
class AMDGPUTargetMachine;
 | 
					class AMDGPUTargetMachine;
 | 
				
			||||||
class FunctionPass;
 | 
					class FunctionPass;
 | 
				
			||||||
class MCAsmInfo;
 | 
					class MCAsmInfo;
 | 
				
			||||||
@@ -47,6 +48,7 @@ void initializeSILowerI1CopiesPass(PassRegistry &);
 | 
				
			|||||||
extern char &SILowerI1CopiesID;
 | 
					extern char &SILowerI1CopiesID;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Passes common to R600 and SI
 | 
					// Passes common to R600 and SI
 | 
				
			||||||
 | 
					FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
 | 
				
			||||||
Pass *createAMDGPUStructurizeCFGPass();
 | 
					Pass *createAMDGPUStructurizeCFGPass();
 | 
				
			||||||
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 | 
					FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -86,28 +86,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
 | 
				
			|||||||
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
 | 
					def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
 | 
				
			||||||
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
 | 
					def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
 | 
				
			||||||
 | 
					        "localmemorysize"#Value,
 | 
				
			||||||
 | 
					        "LocalMemorySize",
 | 
				
			||||||
 | 
					        !cast<string>(Value),
 | 
				
			||||||
 | 
					        "The size of local memory in bytes">;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class SubtargetFeatureGeneration <string Value,
 | 
					class SubtargetFeatureGeneration <string Value,
 | 
				
			||||||
                                  list<SubtargetFeature> Implies> :
 | 
					                                  list<SubtargetFeature> Implies> :
 | 
				
			||||||
        SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
 | 
					        SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
 | 
				
			||||||
                          Value#" GPU generation", Implies>;
 | 
					                          Value#" GPU generation", Implies>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
 | 
				
			||||||
 | 
					def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
 | 
				
			||||||
 | 
					def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def FeatureR600 : SubtargetFeatureGeneration<"R600",
 | 
					def FeatureR600 : SubtargetFeatureGeneration<"R600",
 | 
				
			||||||
        [FeatureR600ALUInst, FeatureFetchLimit8]>;
 | 
					        [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def FeatureR700 : SubtargetFeatureGeneration<"R700",
 | 
					def FeatureR700 : SubtargetFeatureGeneration<"R700",
 | 
				
			||||||
        [FeatureFetchLimit16]>;
 | 
					        [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
 | 
					def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
 | 
				
			||||||
        [FeatureFetchLimit16]>;
 | 
					        [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
 | 
					def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
 | 
				
			||||||
        [FeatureFetchLimit16, FeatureWavefrontSize64]>;
 | 
					        [FeatureFetchLimit16, FeatureWavefrontSize64,
 | 
				
			||||||
 | 
					         FeatureLocalMemorySize32768]
 | 
				
			||||||
 | 
					>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
 | 
					def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
 | 
				
			||||||
        [Feature64BitPtr, FeatureFP64]>;
 | 
					        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
 | 
					def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
 | 
				
			||||||
        [Feature64BitPtr, FeatureFP64]>;
 | 
					        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>;
 | 
				
			||||||
//===----------------------------------------------------------------------===//
 | 
					//===----------------------------------------------------------------------===//
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def AMDGPUInstrInfo : InstrInfo {
 | 
					def AMDGPUInstrInfo : InstrInfo {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -258,6 +258,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 | 
				
			|||||||
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
 | 
					    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  case ISD::SCALAR_TO_VECTOR:
 | 
					  case ISD::SCALAR_TO_VECTOR:
 | 
				
			||||||
 | 
					  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
 | 
				
			||||||
  case ISD::BUILD_VECTOR: {
 | 
					  case ISD::BUILD_VECTOR: {
 | 
				
			||||||
    unsigned RegClassID;
 | 
					    unsigned RegClassID;
 | 
				
			||||||
    const AMDGPURegisterInfo *TRI =
 | 
					    const AMDGPURegisterInfo *TRI =
 | 
				
			||||||
@@ -308,7 +309,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 | 
				
			|||||||
      // can't be bundled by our scheduler.
 | 
					      // can't be bundled by our scheduler.
 | 
				
			||||||
      switch(NumVectorElts) {
 | 
					      switch(NumVectorElts) {
 | 
				
			||||||
      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
 | 
					      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
 | 
				
			||||||
      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
 | 
					      case 4:
 | 
				
			||||||
 | 
					        if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 | 
				
			||||||
 | 
					          RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
 | 
				
			||||||
 | 
					        else
 | 
				
			||||||
 | 
					          RegClassID = AMDGPU::R600_Reg128RegClassID;
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
 | 
					      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1911,6 +1911,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
 | 
				
			|||||||
  NODE_NAME_CASE(CVT_F32_UBYTE1)
 | 
					  NODE_NAME_CASE(CVT_F32_UBYTE1)
 | 
				
			||||||
  NODE_NAME_CASE(CVT_F32_UBYTE2)
 | 
					  NODE_NAME_CASE(CVT_F32_UBYTE2)
 | 
				
			||||||
  NODE_NAME_CASE(CVT_F32_UBYTE3)
 | 
					  NODE_NAME_CASE(CVT_F32_UBYTE3)
 | 
				
			||||||
 | 
					  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
 | 
				
			||||||
  NODE_NAME_CASE(STORE_MSKOR)
 | 
					  NODE_NAME_CASE(STORE_MSKOR)
 | 
				
			||||||
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
 | 
					  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -203,6 +203,15 @@ enum {
 | 
				
			|||||||
  CVT_F32_UBYTE1,
 | 
					  CVT_F32_UBYTE1,
 | 
				
			||||||
  CVT_F32_UBYTE2,
 | 
					  CVT_F32_UBYTE2,
 | 
				
			||||||
  CVT_F32_UBYTE3,
 | 
					  CVT_F32_UBYTE3,
 | 
				
			||||||
 | 
					  /// This node is for VLIW targets and it is used to represent a vector
 | 
				
			||||||
 | 
					  /// that is stored in consecutive registers with the same channel.
 | 
				
			||||||
 | 
					  /// For example:
 | 
				
			||||||
 | 
					  ///   |X  |Y|Z|W|
 | 
				
			||||||
 | 
					  /// T0|v.x| | | |
 | 
				
			||||||
 | 
					  /// T1|v.y| | | |
 | 
				
			||||||
 | 
					  /// T2|v.z| | | |
 | 
				
			||||||
 | 
					  /// T3|v.w| | | |
 | 
				
			||||||
 | 
					  BUILD_VERTICAL_VECTOR,
 | 
				
			||||||
  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
 | 
					  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
 | 
				
			||||||
  STORE_MSKOR,
 | 
					  STORE_MSKOR,
 | 
				
			||||||
  LOAD_CONSTANT,
 | 
					  LOAD_CONSTANT,
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										365
									
								
								lib/Target/R600/AMDGPUPromoteAlloca.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										365
									
								
								lib/Target/R600/AMDGPUPromoteAlloca.cpp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,365 @@
 | 
				
			|||||||
 | 
					//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					//                     The LLVM Compiler Infrastructure
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					// This file is distributed under the University of Illinois Open Source
 | 
				
			||||||
 | 
					// License. See LICENSE.TXT for details.
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					//===----------------------------------------------------------------------===//
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					// This pass eliminates allocas by either converting them into vectors or
 | 
				
			||||||
 | 
					// by migrating them to local address space.
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					//===----------------------------------------------------------------------===//
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "AMDGPU.h"
 | 
				
			||||||
 | 
					#include "AMDGPUSubtarget.h"
 | 
				
			||||||
 | 
					#include "llvm/Analysis/ValueTracking.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/IRBuilder.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/InstVisitor.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/Debug.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define DEBUG_TYPE "amdgpu-promote-alloca"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace llvm;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class AMDGPUPromoteAlloca : public FunctionPass,
 | 
				
			||||||
 | 
					                       public InstVisitor<AMDGPUPromoteAlloca> {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  static char ID;
 | 
				
			||||||
 | 
					  Module *Mod;
 | 
				
			||||||
 | 
					  const AMDGPUSubtarget &ST;
 | 
				
			||||||
 | 
					  int LocalMemAvailable;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					public:
 | 
				
			||||||
 | 
					  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
 | 
				
			||||||
 | 
					                                                   LocalMemAvailable(0) { }
 | 
				
			||||||
 | 
					  virtual bool doInitialization(Module &M);
 | 
				
			||||||
 | 
					  virtual bool runOnFunction(Function &F);
 | 
				
			||||||
 | 
					  virtual const char *getPassName() const {
 | 
				
			||||||
 | 
					    return "AMDGPU Promote Alloca";
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void visitAlloca(AllocaInst &I);
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} // End anonymous namespace
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					char AMDGPUPromoteAlloca::ID = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Records the module so that the per-function code can reach its globals
					/// and data layout.  Always returns false.
					bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
					  this->Mod = &M;
					  return false;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Works out how much local memory is still free for \p F and then visits
					/// every instruction so that visitAlloca() can try to promote each alloca.
					///
					/// The budget starts at the subtarget's local memory size.  It is forced to
					/// zero when any parameter is a pointer into the local address space, and is
					/// otherwise reduced by the allocation size of every local-address-space
					/// global that is used by an instruction of \p F.
					bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {

					  const FunctionType *FTy = F.getFunctionType();

					  LocalMemAvailable = ST.getLocalMemorySize();

					  // If the function has any arguments in the local address space, then it's
					  // possible these arguments require the entire local memory space, so
					  // we cannot use local memory in the pass.
					  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
					    const Type *ParamTy = FTy->getParamType(i);
					    if (ParamTy->isPointerTy() &&
					        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
					      LocalMemAvailable = 0;
					      DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
					                      "local memory disabled.\n");
					      break;
					    }
					  }

					  if (LocalMemAvailable > 0) {
					    // Check how much local memory is being used by global objects
					    for (Module::global_iterator I = Mod->global_begin(),
					                                 E = Mod->global_end(); I != E; ++I) {
					      GlobalVariable *GV = I;
					      PointerType *GVTy = GV->getType();
					      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
					        continue;

					      // Iterate the *users* of the global.  Dereferencing a use_iterator
					      // yields a Use, which casts to the used value (the global itself) and
					      // therefore never to an Instruction, so no global would be charged.
					      for (User *U : GV->users()) {
					        Instruction *UserInst = dyn_cast<Instruction>(U);
					        if (!UserInst || UserInst->getParent()->getParent() != &F)
					          continue;

					        // Charge each global once, however many times F refers to it.
					        LocalMemAvailable -=
					            Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
					        break;
					      }
					    }
					  }

					  LocalMemAvailable = std::max(0, LocalMemAvailable);
					  DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");

					  visit(F);

					  // NOTE(review): visitAlloca() can rewrite the IR, yet this reports "not
					  // modified".  Reporting it accurately needs a flag shared with
					  // visitAlloca(), which is outside this function.
					  return false;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Builds the vector type that has the same element type and the same
					/// element count as the array type \p ArrayTy.
					static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
					  Type *EltTy = ArrayTy->getArrayElementType();
					  const unsigned NumElts = ArrayTy->getArrayNumElements();
					  return VectorType::get(EltTy, NumElts);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Returns the vector element index addressed by \p Ptr.
					///
					/// \param Ptr     Either the alloca itself (element 0) or a GEP instruction.
					/// \param GEPIdx  Index previously recorded for each GEP.
					/// \returns A zero i32 constant for an alloca, the recorded index for a GEP,
					///          or null when the GEP has no recorded index.
					///
					/// The map is taken by const reference so that it is not copied on every
					/// call, and find() is used so that a missing key inserts nothing.
					static Value* calculateVectorIndex(Value *Ptr,
					                         const std::map<GetElementPtrInst*, Value*> &GEPIdx) {
					  if (isa<AllocaInst>(Ptr))
					    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

					  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

					  std::map<GetElementPtrInst*, Value*>::const_iterator Entry =
					      GEPIdx.find(GEP);
					  return Entry == GEPIdx.end() ? nullptr : Entry->second;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Returns the operand of \p GEP that selects the array element, or NULL
					/// when the GEP is not of the form "base, constant 0, index".
					static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
					  // FIXME we only support simple cases
					  const bool HasBaseAndTwoIndices = GEP->getNumOperands() == 3;
					  if (HasBaseAndTwoIndices) {
					    ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
					    if (FirstIdx && FirstIdx->isZero())
					      return GEP->getOperand(2);
					  }

					  return NULL;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
 | 
				
			||||||
 | 
					  Type *AllocaTy = Alloca->getAllocatedType();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // FIXME: There is no reason why we can't support larger arrays, we
 | 
				
			||||||
 | 
					  // are just being conservative for now.
 | 
				
			||||||
 | 
					  if (!AllocaTy->isArrayTy() ||
 | 
				
			||||||
 | 
					      AllocaTy->getArrayElementType()->isVectorTy() ||
 | 
				
			||||||
 | 
					      AllocaTy->getArrayNumElements() > 4) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DEBUG(dbgs() << "  Cannot convert type to vector");
 | 
				
			||||||
 | 
					    return false;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
 | 
				
			||||||
 | 
					  std::vector<Value*> WorkList;
 | 
				
			||||||
 | 
					  for (User *AllocaUser : Alloca->users()) {
 | 
				
			||||||
 | 
					    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
 | 
				
			||||||
 | 
					    if (!GEP) {
 | 
				
			||||||
 | 
					      WorkList.push_back(AllocaUser);
 | 
				
			||||||
 | 
					      continue;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Value *Index = GEPToVectorIndex(GEP);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // If we can't compute a vector index from this GEP, then we can't
 | 
				
			||||||
 | 
					    // promote this alloca to vector.
 | 
				
			||||||
 | 
					    if (!Index) {
 | 
				
			||||||
 | 
					      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << "\n");
 | 
				
			||||||
 | 
					      return false;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GEPVectorIdx[GEP] = Index;
 | 
				
			||||||
 | 
					    for (User *GEPUser : AllocaUser->users()) {
 | 
				
			||||||
 | 
					      WorkList.push_back(GEPUser);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  DEBUG(dbgs() << "  Converting alloca to vector "; AllocaTy->dump();
 | 
				
			||||||
 | 
					        dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for (std::vector<Value*>::iterator I = WorkList.begin(),
 | 
				
			||||||
 | 
					                                     E = WorkList.end(); I != E; ++I) {
 | 
				
			||||||
 | 
					    Instruction *Inst = cast<Instruction>(*I);
 | 
				
			||||||
 | 
					    IRBuilder<> Builder(Inst);
 | 
				
			||||||
 | 
					    switch (Inst->getOpcode()) {
 | 
				
			||||||
 | 
					    case Instruction::Load: {
 | 
				
			||||||
 | 
					      Value *Ptr = Inst->getOperand(0);
 | 
				
			||||||
 | 
					      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 | 
				
			||||||
 | 
					      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
 | 
				
			||||||
 | 
					      Value *VecValue = Builder.CreateLoad(BitCast);
 | 
				
			||||||
 | 
					      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
 | 
				
			||||||
 | 
					      Inst->replaceAllUsesWith(ExtractElement);
 | 
				
			||||||
 | 
					      Inst->eraseFromParent();
 | 
				
			||||||
 | 
					      break;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    case Instruction::Store: {
 | 
				
			||||||
 | 
					      Value *Ptr = Inst->getOperand(1);
 | 
				
			||||||
 | 
					      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 | 
				
			||||||
 | 
					      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
 | 
				
			||||||
 | 
					      Value *VecValue = Builder.CreateLoad(BitCast);
 | 
				
			||||||
 | 
					      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
 | 
				
			||||||
 | 
					                                                       Inst->getOperand(0),
 | 
				
			||||||
 | 
					                                                       Index);
 | 
				
			||||||
 | 
					      Builder.CreateStore(NewVecValue, BitCast);
 | 
				
			||||||
 | 
					      Inst->eraseFromParent();
 | 
				
			||||||
 | 
					      break;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    case Instruction::BitCast:
 | 
				
			||||||
 | 
					      break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    default:
 | 
				
			||||||
 | 
					      Inst->dump();
 | 
				
			||||||
 | 
					      llvm_unreachable("Do not know how to replace this instruction "
 | 
				
			||||||
 | 
					                              "with vector op");
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return true;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Appends to \p WorkList every not-yet-listed user of \p Val that is a call
					/// or that has pointer type, recursing into the pointer-typed non-call users.
					static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
					  for (User *U : Val->users()) {
					    const bool AlreadyListed =
					        std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end();
					    if (AlreadyListed)
					      continue;

					    const bool IsCall = isa<CallInst>(U);
					    if (!IsCall && !U->getType()->isPointerTy())
					      continue;

					    WorkList.push_back(U);

					    // Calls are recorded but their own users are not followed.
					    if (!IsCall)
					      collectUsesWithPtrTypes(U, WorkList);
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Tries to eliminate the alloca \p I.
					///
					/// Vector promotion is attempted first.  If that fails and enough local
					/// memory is left, the alloca is replaced by a slot of a new
					/// local-address-space global array with one element per work item, indexed
					/// by an id computed from the r600 work-item id and local-size intrinsics.
					/// Pointer-typed users then get their types moved to the local address
					/// space, non-intrinsic calls are redirected to a ".local" variant of the
					/// callee, and the handled intrinsics are erased or recreated.
					void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
					  IRBuilder<> Builder(&I);

					  // First try to replace the alloca with a vector
					  Type *AllocaTy = I.getAllocatedType();

					  DEBUG(dbgs() << "Trying to promote " << I);

					  if (tryPromoteAllocaToVector(&I))
					    return;

					  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

					  // The replacement below reserves exactly one AllocaTy object per work
					  // item, which is too small for "alloca T, N" with N != 1.
					  if (I.isArrayAllocation()) {
					    DEBUG(dbgs() << " Cannot promote an array allocation to local memory.\n");
					    return;
					  }

					  // FIXME: This is the maximum work group size.  We should try to get
					  // value from the reqd_work_group_size function attribute if it is
					  // available.
					  const unsigned WorkGroupSize = 256;

					  // Keep the size in 64 bits: narrowing it to int before the comparison
					  // could make a huge alloca look as if it fits.
					  const uint64_t AllocaSize = static_cast<uint64_t>(WorkGroupSize) *
					      Mod->getDataLayout()->getTypeAllocSize(AllocaTy);

					  if (LocalMemAvailable < 0 ||
					      AllocaSize > static_cast<uint64_t>(LocalMemAvailable)) {
					    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
					    return;
					  }

					  DEBUG(dbgs() << "Promoting alloca to local memory\n");
					  // AllocaSize <= LocalMemAvailable here, so it fits in an int.
					  LocalMemAvailable -= static_cast<int>(AllocaSize);

					  GlobalVariable *GV = new GlobalVariable(
					      *Mod, ArrayType::get(I.getAllocatedType(), WorkGroupSize), false,
					      GlobalValue::ExternalLinkage, nullptr, I.getName(), nullptr,
					      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

					  FunctionType *FTy = FunctionType::get(
					      Type::getInt32Ty(Mod->getContext()), false);

					  // addAttribute() returns a new set instead of modifying the receiver, so
					  // the result has to be kept.  ReadNone is a function attribute, hence
					  // FunctionIndex rather than index 0 (the return value).
					  AttributeSet AttrSet;
					  AttrSet = AttrSet.addAttribute(Mod->getContext(),
					                                 AttributeSet::FunctionIndex,
					                                 Attribute::ReadNone);

					  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
					      "llvm.r600.read.local.size.y", FTy, AttrSet);
					  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
					      "llvm.r600.read.local.size.z", FTy, AttrSet);
					  Value *ReadTIDIGX = Mod->getOrInsertFunction(
					      "llvm.r600.read.tidig.x", FTy, AttrSet);
					  Value *ReadTIDIGY = Mod->getOrInsertFunction(
					      "llvm.r600.read.tidig.y", FTy, AttrSet);
					  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
					      "llvm.r600.read.tidig.z", FTy, AttrSet);


					  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
					  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
					  Value *TIdX  = Builder.CreateCall(ReadTIDIGX);
					  Value *TIdY  = Builder.CreateCall(ReadTIDIGY);
					  Value *TIdZ  = Builder.CreateCall(ReadTIDIGZ);

					  // TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ
					  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
					  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
					  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
					  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
					  TID = Builder.CreateAdd(TID, TIdZ);

					  std::vector<Value*> Indices;
					  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
					  Indices.push_back(TID);

					  // The alloca's type is changed first so that replaceAllUsesWith() sees
					  // matching types; the users' own types are fixed up below.
					  Value *Offset = Builder.CreateGEP(GV, Indices);
					  I.mutateType(Offset->getType());
					  I.replaceAllUsesWith(Offset);
					  I.eraseFromParent();

					  std::vector<Value*> WorkList;

					  collectUsesWithPtrTypes(Offset, WorkList);

					  for (std::vector<Value*>::iterator i = WorkList.begin(),
					                                     e = WorkList.end(); i != e; ++i) {
					    Value *V = *i;
					    CallInst *Call = dyn_cast<CallInst>(V);
					    if (!Call) {
					      // Plain pointer-typed user: same pointee, local address space.
					      Type *EltTy = V->getType()->getPointerElementType();
					      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
					      V->mutateType(NewTy);
					      continue;
					    }

					    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
					    if (!Intr) {
					      // Redirect the call to "<callee>.local", declared with the argument
					      // types the call has now.
					      std::vector<Type*> ArgTypes;
					      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
					                                ArgIdx != ArgEnd; ++ArgIdx) {
					        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
					      }
					      // NOTE(review): getCalledFunction() is used without a null check, so
					      // this presumably expects only direct calls here - confirm.
					      Function *F = Call->getCalledFunction();
					      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
					                                                F->isVarArg());
					      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
					                                             F->getAttributes());
					      Function *NewF = cast<Function>(C);
					      Call->setCalledFunction(NewF);
					      continue;
					    }

					    Builder.SetInsertPoint(Intr);
					    switch (Intr->getIntrinsicID()) {
					    case Intrinsic::lifetime_start:
					    case Intrinsic::lifetime_end:
					      // These intrinsics are for address space 0 only
					      Intr->eraseFromParent();
					      continue;
					    case Intrinsic::memcpy: {
					      // Recreate the call from its (now retyped) operands.
					      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
					      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
					                           MemCpy->getLength(), MemCpy->getAlignment(),
					                           MemCpy->isVolatile());
					      Intr->eraseFromParent();
					      continue;
					    }
					    case Intrinsic::memset: {
					      MemSetInst *MemSet = cast<MemSetInst>(Intr);
					      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
					                           MemSet->getLength(), MemSet->getAlignment(),
					                           MemSet->isVolatile());
					      Intr->eraseFromParent();
					      continue;
					    }
					    default:
					      Intr->dump();
					      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Creates a new alloca promotion pass bound to the subtarget \p ST.
					FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
					  FunctionPass *PromoteAlloca = new AMDGPUPromoteAlloca(ST);
					  return PromoteAlloca;
					}
 | 
				
			||||||
@@ -41,6 +41,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
 | 
				
			|||||||
  EnableIfCvt = true;
 | 
					  EnableIfCvt = true;
 | 
				
			||||||
  WavefrontSize = 0;
 | 
					  WavefrontSize = 0;
 | 
				
			||||||
  CFALUBug = false;
 | 
					  CFALUBug = false;
 | 
				
			||||||
 | 
					  LocalMemorySize = 0;
 | 
				
			||||||
  ParseSubtargetFeatures(GPU, FS);
 | 
					  ParseSubtargetFeatures(GPU, FS);
 | 
				
			||||||
  DevName = GPU;
 | 
					  DevName = GPU;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -109,6 +110,10 @@ AMDGPUSubtarget::hasCFAluBug() const {
 | 
				
			|||||||
  assert(getGeneration() <= NORTHERN_ISLANDS);
 | 
					  assert(getGeneration() <= NORTHERN_ISLANDS);
 | 
				
			||||||
  return CFALUBug;
 | 
					  return CFALUBug;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					int
 | 
				
			||||||
 | 
					AMDGPUSubtarget::getLocalMemorySize() const {
 | 
				
			||||||
 | 
					  return LocalMemorySize;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
bool
 | 
					bool
 | 
				
			||||||
AMDGPUSubtarget::isTargetELF() const {
 | 
					AMDGPUSubtarget::isTargetELF() const {
 | 
				
			||||||
  return false;
 | 
					  return false;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -56,6 +56,7 @@ private:
 | 
				
			|||||||
  bool EnableIfCvt;
 | 
					  bool EnableIfCvt;
 | 
				
			||||||
  unsigned WavefrontSize;
 | 
					  unsigned WavefrontSize;
 | 
				
			||||||
  bool CFALUBug;
 | 
					  bool CFALUBug;
 | 
				
			||||||
 | 
					  int LocalMemorySize;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  InstrItineraryData InstrItins;
 | 
					  InstrItineraryData InstrItins;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -109,6 +110,7 @@ public:
 | 
				
			|||||||
  unsigned getWavefrontSize() const;
 | 
					  unsigned getWavefrontSize() const;
 | 
				
			||||||
  unsigned getStackEntrySize() const;
 | 
					  unsigned getStackEntrySize() const;
 | 
				
			||||||
  bool hasCFAluBug() const;
 | 
					  bool hasCFAluBug() const;
 | 
				
			||||||
 | 
					  int getLocalMemorySize() const;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  bool enableMachineScheduler() const override {
 | 
					  bool enableMachineScheduler() const override {
 | 
				
			||||||
    return getGeneration() <= NORTHERN_ISLANDS;
 | 
					    return getGeneration() <= NORTHERN_ISLANDS;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -109,6 +109,7 @@ public:
 | 
				
			|||||||
    return nullptr;
 | 
					    return nullptr;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  virtual void addCodeGenPrepare();
 | 
				
			||||||
  bool addPreISel() override;
 | 
					  bool addPreISel() override;
 | 
				
			||||||
  bool addInstSelector() override;
 | 
					  bool addInstSelector() override;
 | 
				
			||||||
  bool addPreRegAlloc() override;
 | 
					  bool addPreRegAlloc() override;
 | 
				
			||||||
@@ -134,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
 | 
				
			|||||||
  PM.add(createAMDGPUTargetTransformInfoPass(this));
 | 
					  PM.add(createAMDGPUTargetTransformInfoPass(this));
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void AMDGPUPassConfig::addCodeGenPrepare() {
 | 
				
			||||||
 | 
					  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
 | 
				
			||||||
 | 
					  addPass(createAMDGPUPromoteAlloca(ST));
 | 
				
			||||||
 | 
					  addPass(createSROAPass());
 | 
				
			||||||
 | 
					  TargetPassConfig::addCodeGenPrepare();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
bool
 | 
					bool
 | 
				
			||||||
AMDGPUPassConfig::addPreISel() {
 | 
					AMDGPUPassConfig::addPreISel() {
 | 
				
			||||||
  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
 | 
					  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -25,6 +25,7 @@ add_llvm_target(R600CodeGen
 | 
				
			|||||||
  AMDGPUTargetTransformInfo.cpp
 | 
					  AMDGPUTargetTransformInfo.cpp
 | 
				
			||||||
  AMDGPUISelLowering.cpp
 | 
					  AMDGPUISelLowering.cpp
 | 
				
			||||||
  AMDGPUInstrInfo.cpp
 | 
					  AMDGPUInstrInfo.cpp
 | 
				
			||||||
 | 
					  AMDGPUPromoteAlloca.cpp
 | 
				
			||||||
  AMDGPURegisterInfo.cpp
 | 
					  AMDGPURegisterInfo.cpp
 | 
				
			||||||
  R600ClauseMergePass.cpp
 | 
					  R600ClauseMergePass.cpp
 | 
				
			||||||
  R600ControlFlowFinalizer.cpp
 | 
					  R600ControlFlowFinalizer.cpp
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -136,6 +136,16 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 | 
				
			|||||||
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 | 
					  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 | 
				
			||||||
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 | 
					  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 | 
				
			||||||
 | 
					  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 | 
				
			||||||
 | 
					  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 | 
				
			||||||
 | 
					  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 | 
				
			||||||
 | 
					  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 | 
				
			||||||
 | 
					  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 | 
				
			||||||
 | 
					  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  setTargetDAGCombine(ISD::FP_ROUND);
 | 
					  setTargetDAGCombine(ISD::FP_ROUND);
 | 
				
			||||||
  setTargetDAGCombine(ISD::FP_TO_SINT);
 | 
					  setTargetDAGCombine(ISD::FP_TO_SINT);
 | 
				
			||||||
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 | 
					  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 | 
				
			||||||
@@ -540,6 +550,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
 | 
				
			|||||||
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 | 
					  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 | 
				
			||||||
  switch (Op.getOpcode()) {
 | 
					  switch (Op.getOpcode()) {
 | 
				
			||||||
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 | 
					  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 | 
				
			||||||
 | 
					  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 | 
				
			||||||
 | 
					  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 | 
				
			||||||
  case ISD::FCOS:
 | 
					  case ISD::FCOS:
 | 
				
			||||||
  case ISD::FSIN: return LowerTrig(Op, DAG);
 | 
					  case ISD::FSIN: return LowerTrig(Op, DAG);
 | 
				
			||||||
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 | 
					  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 | 
				
			||||||
@@ -812,6 +824,56 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Rebuilds \p Vector as an AMDGPUISD::BUILD_VERTICAL_VECTOR node.
					///
					/// Every element of \p Vector is extracted with a constant-index
					/// ISD::EXTRACT_VECTOR_ELT, and the extracted scalars become, in order, the
					/// operands of a BUILD_VERTICAL_VECTOR of the same vector type.
					SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
					                                                   SDValue Vector) const {
					  SDLoc DL(Vector);
					  EVT VecVT = Vector.getValueType();
					  EVT EltVT = VecVT.getVectorElementType();
					  const unsigned NumElts = VecVT.getVectorNumElements();

					  SmallVector<SDValue, 8> Elts;
					  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
					    // Constant lane index for this element.
					    SDValue Lane = DAG.getConstant(Idx, getVectorIdxTy());
					    SDValue Elt =
					        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, Lane);
					    Elts.push_back(Elt);
					  }

					  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Elts);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Custom lowering for ISD::EXTRACT_VECTOR_ELT.
					///
					/// A node with a constant index, or whose vector operand is already an
					/// AMDGPUISD::BUILD_VERTICAL_VECTOR, is returned unchanged.  Otherwise the
					/// vector operand is rebuilt with vectorToVerticalVector() and a fresh
					/// extract on that vertical vector is returned.
					SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
					                                                    SelectionDAG &DAG) const {
					  SDValue Vec = Op.getOperand(0);
					  SDValue Idx = Op.getOperand(1);

					  // Nothing to do for constant indices or already-vertical vectors.
					  const bool HasConstIdx = isa<ConstantSDNode>(Idx);
					  const bool IsVertical =
					      Vec.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR;
					  if (HasConstIdx || IsVertical)
					    return Op;

					  SDLoc DL(Op);
					  SDValue VerticalVec = vectorToVerticalVector(DAG, Vec);
					  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
					                     VerticalVec, Idx);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Custom lowering for ISD::INSERT_VECTOR_ELT.
					///
					/// A node with a constant index, or whose vector operand is already an
					/// AMDGPUISD::BUILD_VERTICAL_VECTOR, is returned unchanged.  Otherwise the
					/// vector operand is rebuilt with vectorToVerticalVector(), the insert is
					/// re-created on that vertical vector, and the insert's result is itself
					/// converted to a vertical vector before being returned.
					SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
					                                                   SelectionDAG &DAG) const {
					  SDValue Vec = Op.getOperand(0);
					  SDValue Elt = Op.getOperand(1);
					  SDValue Idx = Op.getOperand(2);

					  // Nothing to do for constant indices or already-vertical vectors.
					  const bool HasConstIdx = isa<ConstantSDNode>(Idx);
					  const bool IsVertical =
					      Vec.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR;
					  if (HasConstIdx || IsVertical)
					    return Op;

					  SDLoc DL(Op);
					  SDValue VerticalVec = vectorToVerticalVector(DAG, Vec);
					  SDValue NewInsert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
					                                  Op.getValueType(), VerticalVec, Elt, Idx);
					  return vectorToVerticalVector(DAG, NewInsert);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 | 
					SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 | 
				
			||||||
  // On hw >= R700, COS/SIN input must be between -1. and 1.
 | 
					  // On hw >= R700, COS/SIN input must be between -1. and 1.
 | 
				
			||||||
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 | 
					  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -51,7 +51,10 @@ private:
 | 
				
			|||||||
  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
 | 
					  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
 | 
				
			||||||
      MachineRegisterInfo & MRI, unsigned dword_offset) const;
 | 
					      MachineRegisterInfo & MRI, unsigned dword_offset) const;
 | 
				
			||||||
  SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
 | 
					  SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
 | 
				
			||||||
 | 
					  SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
 | 
				
			||||||
 | 
					  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
 | 
				
			||||||
  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
 | 
					  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
 | 
				
			||||||
  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 | 
					  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 | 
				
			||||||
  SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
 | 
					  SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -51,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 | 
				
			|||||||
                           unsigned DestReg, unsigned SrcReg,
 | 
					                           unsigned DestReg, unsigned SrcReg,
 | 
				
			||||||
                           bool KillSrc) const {
 | 
					                           bool KillSrc) const {
 | 
				
			||||||
  unsigned VectorComponents = 0;
 | 
					  unsigned VectorComponents = 0;
 | 
				
			||||||
  if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
 | 
					  if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
 | 
				
			||||||
      AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
 | 
					      AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
 | 
				
			||||||
 | 
					      (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
 | 
				
			||||||
 | 
					       AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
 | 
				
			||||||
    VectorComponents = 4;
 | 
					    VectorComponents = 4;
 | 
				
			||||||
  } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
 | 
					  } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
 | 
				
			||||||
            AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
 | 
					            AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
 | 
				
			||||||
 | 
					            (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
 | 
				
			||||||
 | 
					             AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
 | 
				
			||||||
    VectorComponents = 2;
 | 
					    VectorComponents = 2;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -1053,6 +1057,29 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
 | 
				
			|||||||
  return 2;
 | 
					  return 2;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Expands the R600 vertical-vector element pseudos.
					///
					/// R600_EXTRACT_ELT_V2/V4 become an indirect read and R600_INSERT_ELT_V2/V4
					/// become an indirect write; the pseudo is then erased and true is returned.
					/// Any other opcode is forwarded to AMDGPUInstrInfo::expandPostRAPseudo().
					bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
					  switch (MI->getOpcode()) {
					  case AMDGPU::R600_EXTRACT_ELT_V2:
					  case AMDGPU::R600_EXTRACT_ELT_V4: {
					    // Operand 1 is the vector register; its HW index and channel form the
					    // base address of the indirect access.
					    const unsigned VecReg = MI->getOperand(1).getReg();
					    const unsigned DstReg = MI->getOperand(0).getReg();
					    const unsigned OffsetReg = MI->getOperand(2).getReg();
					    buildIndirectRead(MI->getParent(), MI, DstReg,
					                      RI.getHWRegIndex(VecReg), // Address
					                      OffsetReg,
					                      RI.getHWRegChan(VecReg));
					    break;
					  }
					  case AMDGPU::R600_INSERT_ELT_V2:
					  case AMDGPU::R600_INSERT_ELT_V4: {
					    const unsigned VecReg = MI->getOperand(1).getReg();
					    const unsigned ValueReg = MI->getOperand(2).getReg();
					    const unsigned OffsetReg = MI->getOperand(3).getReg();
					    buildIndirectWrite(MI->getParent(), MI, ValueReg, // Value
					                       RI.getHWRegIndex(VecReg),      // Address
					                       OffsetReg,                     // Offset
					                       RI.getHWRegChan(VecReg));      // Channel
					    break;
					  }
					  default:
					    // Not one of ours: let the common AMDGPU implementation decide.
					    return AMDGPUInstrInfo::expandPostRAPseudo(MI);
					  }

					  // The pseudo has been replaced by real instructions.
					  MI->eraseFromParent();
					  return true;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
 | 
					void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
 | 
				
			||||||
                                             const MachineFunction &MF) const {
 | 
					                                             const MachineFunction &MF) const {
 | 
				
			||||||
  const AMDGPUFrameLowering *TFL =
 | 
					  const AMDGPUFrameLowering *TFL =
 | 
				
			||||||
@@ -1090,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
 | 
				
			|||||||
                                       MachineBasicBlock::iterator I,
 | 
					                                       MachineBasicBlock::iterator I,
 | 
				
			||||||
                                       unsigned ValueReg, unsigned Address,
 | 
					                                       unsigned ValueReg, unsigned Address,
 | 
				
			||||||
                                       unsigned OffsetReg) const {
 | 
					                                       unsigned OffsetReg) const {
 | 
				
			||||||
  unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
 | 
					  return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
 | 
				
			||||||
 | 
					                                       MachineBasicBlock::iterator I,
 | 
				
			||||||
 | 
					                                       unsigned ValueReg, unsigned Address,
 | 
				
			||||||
 | 
					                                       unsigned OffsetReg,
 | 
				
			||||||
 | 
					                                       unsigned AddrChan) const {
 | 
				
			||||||
 | 
					  unsigned AddrReg;
 | 
				
			||||||
 | 
					  switch (AddrChan) {
 | 
				
			||||||
 | 
					    default: llvm_unreachable("Invalid Channel");
 | 
				
			||||||
 | 
					    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
 | 
					  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
 | 
				
			||||||
                                               AMDGPU::AR_X, OffsetReg);
 | 
					                                               AMDGPU::AR_X, OffsetReg);
 | 
				
			||||||
  setImmOperand(MOVA, AMDGPU::OpName::write, 0);
 | 
					  setImmOperand(MOVA, AMDGPU::OpName::write, 0);
 | 
				
			||||||
@@ -1107,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
 | 
				
			|||||||
                                       MachineBasicBlock::iterator I,
 | 
					                                       MachineBasicBlock::iterator I,
 | 
				
			||||||
                                       unsigned ValueReg, unsigned Address,
 | 
					                                       unsigned ValueReg, unsigned Address,
 | 
				
			||||||
                                       unsigned OffsetReg) const {
 | 
					                                       unsigned OffsetReg) const {
 | 
				
			||||||
  unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
 | 
					  return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
 | 
				
			||||||
 | 
					                                       MachineBasicBlock::iterator I,
 | 
				
			||||||
 | 
					                                       unsigned ValueReg, unsigned Address,
 | 
				
			||||||
 | 
					                                       unsigned OffsetReg,
 | 
				
			||||||
 | 
					                                       unsigned AddrChan) const {
 | 
				
			||||||
 | 
					  unsigned AddrReg;
 | 
				
			||||||
 | 
					  switch (AddrChan) {
 | 
				
			||||||
 | 
					    default: llvm_unreachable("Invalid Channel");
 | 
				
			||||||
 | 
					    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
 | 
					  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
 | 
				
			||||||
                                                       AMDGPU::AR_X,
 | 
					                                                       AMDGPU::AR_X,
 | 
				
			||||||
                                                       OffsetReg);
 | 
					                                                       OffsetReg);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -36,6 +36,18 @@ namespace llvm {
 | 
				
			|||||||
  std::vector<std::pair<int, unsigned> >
 | 
					  std::vector<std::pair<int, unsigned> >
 | 
				
			||||||
  ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
 | 
					  ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
 | 
				
			||||||
 | 
					                                        MachineBasicBlock::iterator I,
 | 
				
			||||||
 | 
					                                        unsigned ValueReg, unsigned Address,
 | 
				
			||||||
 | 
					                                        unsigned OffsetReg,
 | 
				
			||||||
 | 
					                                        unsigned AddrChan) const;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
 | 
				
			||||||
 | 
					                                        MachineBasicBlock::iterator I,
 | 
				
			||||||
 | 
					                                        unsigned ValueReg, unsigned Address,
 | 
				
			||||||
 | 
					                                        unsigned OffsetReg,
 | 
				
			||||||
 | 
					                                        unsigned AddrChan) const;
 | 
				
			||||||
  public:
 | 
					  public:
 | 
				
			||||||
  enum BankSwizzle {
 | 
					  enum BankSwizzle {
 | 
				
			||||||
    ALU_VEC_012_SCL_210 = 0,
 | 
					    ALU_VEC_012_SCL_210 = 0,
 | 
				
			||||||
@@ -195,6 +207,8 @@ namespace llvm {
 | 
				
			|||||||
  int getInstrLatency(const InstrItineraryData *ItinData,
 | 
					  int getInstrLatency(const InstrItineraryData *ItinData,
 | 
				
			||||||
                      SDNode *Node) const override { return 1;}
 | 
					                      SDNode *Node) const override { return 1;}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /// Expands the R600 indirect-addressing pseudo instructions after register
					  /// allocation; opcodes it does not handle are forwarded to the
					  /// AMDGPUInstrInfo implementation.  Marked `override` like the neighbouring
					  /// hooks so a base-class signature change is caught at compile time.
					  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /// \brief Reserve the registers that may be accesed using indirect addressing.
 | 
					  /// \brief Reserve the registers that may be accesed using indirect addressing.
 | 
				
			||||||
  void reserveIndirectRegisters(BitVector &Reserved,
 | 
					  void reserveIndirectRegisters(BitVector &Reserved,
 | 
				
			||||||
                                const MachineFunction &MF) const;
 | 
					                                const MachineFunction &MF) const;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1581,6 +1581,60 @@ let isTerminator=1 in {
 | 
				
			|||||||
  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
 | 
					  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//===----------------------------------------------------------------------===//
 | 
				
			||||||
 | 
					// Indirect addressing pseudo instructions
 | 
				
			||||||
 | 
					//===----------------------------------------------------------------------===//
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					let isPseudo = 1 in {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class ExtractVertical <RegisterClass vec_rc> : InstR600 <
 | 
				
			||||||
 | 
					  (outs R600_Reg32:$dst),
 | 
				
			||||||
 | 
					  (ins vec_rc:$vec, R600_Reg32:$index), "",
 | 
				
			||||||
 | 
					  [],
 | 
				
			||||||
 | 
					  AnyALU
 | 
				
			||||||
 | 
					>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					let Constraints = "$dst = $vec" in {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class InsertVertical <RegisterClass vec_rc> : InstR600 <
 | 
				
			||||||
 | 
					  (outs vec_rc:$dst),
 | 
				
			||||||
 | 
					  (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
 | 
				
			||||||
 | 
					  [],
 | 
				
			||||||
 | 
					  AnyALU
 | 
				
			||||||
 | 
					>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} // End Constraints = "$dst = $vec"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} // End isPseudo = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
 | 
				
			||||||
 | 
					def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
 | 
				
			||||||
 | 
					def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
 | 
				
			||||||
 | 
					                          ValueType scalar_ty> : Pat <
 | 
				
			||||||
 | 
					  (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
 | 
				
			||||||
 | 
					  (inst $vec, $index)
 | 
				
			||||||
 | 
					>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
 | 
				
			||||||
 | 
					def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
 | 
				
			||||||
 | 
					def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
 | 
				
			||||||
 | 
					def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class InsertVerticalPat <Instruction inst, ValueType vec_ty,
 | 
				
			||||||
 | 
					                         ValueType scalar_ty> : Pat <
 | 
				
			||||||
 | 
					  (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
 | 
				
			||||||
 | 
					  (inst $vec, $value, $index)
 | 
				
			||||||
 | 
					>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
 | 
				
			||||||
 | 
					def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
 | 
				
			||||||
 | 
					def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
 | 
				
			||||||
 | 
					def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//===----------------------------------------------------------------------===//
 | 
					//===----------------------------------------------------------------------===//
 | 
				
			||||||
// ISel Patterns
 | 
					// ISel Patterns
 | 
				
			||||||
//===----------------------------------------------------------------------===//
 | 
					//===----------------------------------------------------------------------===//
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> :
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
 | 
					class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
 | 
				
			||||||
    RegisterWithSubRegs<n, subregs> {
 | 
					    RegisterWithSubRegs<n, subregs> {
 | 
				
			||||||
 | 
					  field bits<2> chan_encoding = 0;
 | 
				
			||||||
  let Namespace = "AMDGPU";
 | 
					  let Namespace = "AMDGPU";
 | 
				
			||||||
  let SubRegIndices = [sub0, sub1, sub2, sub3];
 | 
					  let SubRegIndices = [sub0, sub1, sub2, sub3];
 | 
				
			||||||
  let HWEncoding = encoding;
 | 
					  let HWEncoding{8-0} = encoding{8-0};
 | 
				
			||||||
 | 
					  let HWEncoding{10-9} = chan_encoding;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
 | 
					class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
 | 
				
			||||||
    RegisterWithSubRegs<n, subregs> {
 | 
					    RegisterWithSubRegs<n, subregs> {
 | 
				
			||||||
 | 
					  field bits<2> chan_encoding = 0;
 | 
				
			||||||
  let Namespace = "AMDGPU";
 | 
					  let Namespace = "AMDGPU";
 | 
				
			||||||
  let SubRegIndices = [sub0, sub1];
 | 
					  let SubRegIndices = [sub0, sub1];
 | 
				
			||||||
  let HWEncoding = encoding;
 | 
					  let HWEncoding = encoding;
 | 
				
			||||||
 | 
					  let HWEncoding{8-0} = encoding{8-0};
 | 
				
			||||||
 | 
					  let HWEncoding{10-9} = chan_encoding;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
 | 
				
			||||||
 | 
					  "V"#lo#hi#"_"#chan,
 | 
				
			||||||
 | 
					  [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
 | 
				
			||||||
 | 
					  lo
 | 
				
			||||||
 | 
					>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
foreach Index = 0-127 in {
 | 
					foreach Index = 0-127 in {
 | 
				
			||||||
  foreach Chan = [ "X", "Y", "Z", "W" ] in {
 | 
					  foreach Chan = [ "X", "Y", "Z", "W" ] in {
 | 
				
			||||||
@@ -54,6 +64,24 @@ foreach Index = 0-127 in {
 | 
				
			|||||||
                                   Index>;
 | 
					                                   Index>;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					foreach Chan = [ "X", "Y", "Z", "W"] in {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  let chan_encoding = !if(!eq(Chan, "X"), 0,
 | 
				
			||||||
 | 
					                      !if(!eq(Chan, "Y"), 1,
 | 
				
			||||||
 | 
					                      !if(!eq(Chan, "Z"), 2,
 | 
				
			||||||
 | 
					                      !if(!eq(Chan, "W"), 3, 0)))) in {
 | 
				
			||||||
 | 
					    def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
 | 
				
			||||||
 | 
					                                   [!cast<Register>("T0_"#Chan),
 | 
				
			||||||
 | 
					                                    !cast<Register>("T1_"#Chan),
 | 
				
			||||||
 | 
					                                    !cast<Register>("T2_"#Chan),
 | 
				
			||||||
 | 
					                                    !cast<Register>("T3_"#Chan)],
 | 
				
			||||||
 | 
					                                    0>;
 | 
				
			||||||
 | 
					    def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
 | 
				
			||||||
 | 
					    def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// KCACHE_BANK0
 | 
					// KCACHE_BANK0
 | 
				
			||||||
foreach Index = 159-128 in {
 | 
					foreach Index = 159-128 in {
 | 
				
			||||||
  foreach Chan = [ "X", "Y", "Z", "W" ] in {
 | 
					  foreach Chan = [ "X", "Y", "Z", "W" ] in {
 | 
				
			||||||
@@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
let isAllocatable = 0 in {
 | 
					let isAllocatable = 0 in {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// XXX: Only use the X channel, until we support wider stack widths
 | 
					def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
 | 
				
			||||||
def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
 | 
					
 | 
				
			||||||
 | 
					// We only use Addr_[YZW] for vertical vectors.
 | 
				
			||||||
 | 
					// FIXME if we add more vertical vector registers we will need to add more
 | 
				
			||||||
 | 
					// registers to these classes.
 | 
				
			||||||
 | 
					def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
 | 
				
			||||||
 | 
					def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
 | 
				
			||||||
 | 
					def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
 | 
					def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
 | 
				
			||||||
  (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
 | 
					  (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
 | 
				
			||||||
@@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
 | 
				
			|||||||
  let CopyCost = -1;
 | 
					  let CopyCost = -1;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
 | 
				
			||||||
 | 
					  (add V0123_W, V0123_Z, V0123_Y, V0123_X)
 | 
				
			||||||
 | 
					>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
 | 
					def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
 | 
				
			||||||
                                (add (sequence "T%u_XY", 0, 63))>;
 | 
					                                (add (sequence "T%u_XY", 0, 63))>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
 | 
				
			||||||
 | 
					                                      (add V01_X, V01_Y, V01_Z, V01_W,
 | 
				
			||||||
 | 
					                                           V23_X, V23_Y, V23_Z, V23_W)>;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -2560,13 +2560,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I
 | 
				
			|||||||
  // 1. Extract with offset
 | 
					  // 1. Extract with offset
 | 
				
			||||||
  def : Pat<
 | 
					  def : Pat<
 | 
				
			||||||
    (vector_extract vt:$vec, (add i32:$idx, imm:$off)),
 | 
					    (vector_extract vt:$vec, (add i32:$idx, imm:$off)),
 | 
				
			||||||
    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
 | 
					    (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
 | 
				
			||||||
  >;
 | 
					  >;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // 2. Extract without offset
 | 
					  // 2. Extract without offset
 | 
				
			||||||
  def : Pat<
 | 
					  def : Pat<
 | 
				
			||||||
    (vector_extract vt:$vec, i32:$idx),
 | 
					    (vector_extract vt:$vec, i32:$idx),
 | 
				
			||||||
    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
 | 
					    (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
 | 
				
			||||||
  >;
 | 
					  >;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // 3. Insert with offset
 | 
					  // 3. Insert with offset
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -10,7 +10,12 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
; SI-LABEL: @test_private_array_ptr_calc:
 | 
					; SI-LABEL: @test_private_array_ptr_calc:
 | 
				
			||||||
; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
 | 
					; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
 | 
					;
 | 
				
			||||||
 | 
					; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
 | 
				
			||||||
 | 
					; alloca to a vector.  It currently fails because it does not know how
 | 
				
			||||||
 | 
					; to interpret:
 | 
				
			||||||
 | 
					; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
 | 
				
			||||||
 | 
					; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
 | 
				
			||||||
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
 | 
					define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
 | 
				
			||||||
  %alloca = alloca [4 x i32], i32 4, align 16
 | 
					  %alloca = alloca [4 x i32], i32 4, align 16
 | 
				
			||||||
  %tid = call i32 @llvm.SI.tid() readnone
 | 
					  %tid = call i32 @llvm.SI.tid() readnone
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,10 +3,8 @@
 | 
				
			|||||||
declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
 | 
					declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; SI-LABEL: @private_access_f64_alloca:
 | 
					; SI-LABEL: @private_access_f64_alloca:
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_WRITE_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_READ_B64
 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
					define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
				
			||||||
  %val = load double addrspace(1)* %in, align 8
 | 
					  %val = load double addrspace(1)* %in, align 8
 | 
				
			||||||
  %array = alloca double, i32 16, align 8
 | 
					  %array = alloca double, i32 16, align 8
 | 
				
			||||||
@@ -19,14 +17,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; SI-LABEL: @private_access_v2f64_alloca:
 | 
					; SI-LABEL: @private_access_v2f64_alloca:
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_WRITE_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_WRITE_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_READ_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_READ_B64
 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
					define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
				
			||||||
  %val = load <2 x double> addrspace(1)* %in, align 16
 | 
					  %val = load <2 x double> addrspace(1)* %in, align 16
 | 
				
			||||||
  %array = alloca <2 x double>, i32 16, align 16
 | 
					  %array = alloca <2 x double>, i32 16, align 16
 | 
				
			||||||
@@ -39,10 +33,8 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; SI-LABEL: @private_access_i64_alloca:
 | 
					; SI-LABEL: @private_access_i64_alloca:
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_WRITE_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_READ_B64
 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
					define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
				
			||||||
  %val = load i64 addrspace(1)* %in, align 8
 | 
					  %val = load i64 addrspace(1)* %in, align 8
 | 
				
			||||||
  %array = alloca i64, i32 16, align 8
 | 
					  %array = alloca i64, i32 16, align 8
 | 
				
			||||||
@@ -55,14 +47,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; SI-LABEL: @private_access_v2i64_alloca:
 | 
					; SI-LABEL: @private_access_v2i64_alloca:
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_WRITE_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_WRITE_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_READ_B64
 | 
				
			||||||
; SI: V_MOVRELD_B32_e32
 | 
					; SI: DS_READ_B64
 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
; SI: V_MOVRELS_B32_e32
 | 
					 | 
				
			||||||
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
					define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
 | 
				
			||||||
  %val = load <2 x i64> addrspace(1)* %in, align 16
 | 
					  %val = load <2 x i64> addrspace(1)* %in, align 16
 | 
				
			||||||
  %array = alloca <2 x i64>, i32 16, align 16
 | 
					  %array = alloca <2 x i64>, i32 16, align 16
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -2,10 +2,13 @@
 | 
				
			|||||||
; REQUIRES: asserts
 | 
					; REQUIRES: asserts
 | 
				
			||||||
; RUN: llc -march=r600 -mcpu=SI < %s
 | 
					; RUN: llc -march=r600 -mcpu=SI < %s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
define void @large_alloca(i32 addrspace(1)* %out, i32 %x) nounwind {
 | 
					define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind {
 | 
				
			||||||
  %large = alloca [256 x i32], align 4
 | 
					  %large = alloca [8192 x i32], align 4
 | 
				
			||||||
  %gep = getelementptr [256 x i32]* %large, i32 0, i32 255
 | 
					  %gep = getelementptr [8192 x i32]* %large, i32 0, i32 8191
 | 
				
			||||||
  store i32 %x, i32* %gep
 | 
					  store i32 %x, i32* %gep
 | 
				
			||||||
 | 
					  %gep1 = getelementptr [8192 x i32]* %large, i32 0, i32 %y
 | 
				
			||||||
 | 
					  %0 = load i32* %gep1
 | 
				
			||||||
 | 
					  store i32 %0, i32 addrspace(1)* %out
 | 
				
			||||||
  ret void
 | 
					  ret void
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -7,6 +7,12 @@
 | 
				
			|||||||
; CHECK: AND_INT
 | 
					; CHECK: AND_INT
 | 
				
			||||||
; CHECK-NEXT: AND_INT
 | 
					; CHECK-NEXT: AND_INT
 | 
				
			||||||
; CHECK-NEXT: OR_INT
 | 
					; CHECK-NEXT: OR_INT
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; FIXME: For some reason having the allocas here allowed the flatten cfg pass
 | 
				
			||||||
 | 
					; to do its transformation, however now that we are using local memory for
 | 
				
			||||||
 | 
					; allocas, the transformation isn't happening.
 | 
				
			||||||
 | 
					; XFAIL: *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
define void @_Z9chk1D_512v() #0 {
 | 
					define void @_Z9chk1D_512v() #0 {
 | 
				
			||||||
entry:
 | 
					entry:
 | 
				
			||||||
  %a0 = alloca i32, align 4
 | 
					  %a0 = alloca i32, align 4
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,6 +3,11 @@
 | 
				
			|||||||
;
 | 
					;
 | 
				
			||||||
; CFG flattening should use parallel-or to generate branch conditions and
 | 
					; CFG flattening should use parallel-or to generate branch conditions and
 | 
				
			||||||
; then merge if-regions with the same bodies.
 | 
					; then merge if-regions with the same bodies.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; FIXME: For some reason having the allocas here allowed the flatten cfg pass
 | 
				
			||||||
 | 
					; to do its transformation, however now that we are using local memory for
 | 
				
			||||||
 | 
					; allocas, the transformation isn't happening.
 | 
				
			||||||
 | 
					; XFAIL: *
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; CHECK: OR_INT
 | 
					; CHECK: OR_INT
 | 
				
			||||||
; CHECK-NEXT: OR_INT
 | 
					; CHECK-NEXT: OR_INT
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,24 +1,17 @@
 | 
				
			|||||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
 | 
					; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
 | 
				
			||||||
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
 | 
					; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; This test checks that uses and defs of the AR register happen in the same
 | 
					 | 
				
			||||||
; instruction clause.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
; FUNC-LABEL: @mova_same_clause
 | 
					; FUNC-LABEL: @mova_same_clause
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; R600-CHECK: MOVA_INT
 | 
					; R600-CHECK: LDS_WRITE
 | 
				
			||||||
; R600-CHECK-NOT: ALU clause
 | 
					; R600-CHECK: LDS_WRITE
 | 
				
			||||||
; R600-CHECK: 0 + AR.x
 | 
					; R600-CHECK: LDS_READ
 | 
				
			||||||
; R600-CHECK: MOVA_INT
 | 
					; R600-CHECK: LDS_READ
 | 
				
			||||||
; R600-CHECK-NOT: ALU clause
 | 
					 | 
				
			||||||
; R600-CHECK: 0 + AR.x
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
 | 
					; SI-CHECK: DS_WRITE_B32
 | 
				
			||||||
; SI-CHECK: V_MOVRELD
 | 
					; SI-CHECK: DS_WRITE_B32
 | 
				
			||||||
; SI-CHECK: S_CBRANCH
 | 
					; SI-CHECK: DS_READ_B32
 | 
				
			||||||
; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
 | 
					; SI-CHECK: DS_READ_B32
 | 
				
			||||||
; SI-CHECK: V_MOVRELD
 | 
					 | 
				
			||||||
; SI-CHECK: S_CBRANCH
 | 
					 | 
				
			||||||
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 | 
					define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 | 
				
			||||||
entry:
 | 
					entry:
 | 
				
			||||||
  %stack = alloca [5 x i32], align 4
 | 
					  %stack = alloca [5 x i32], align 4
 | 
				
			||||||
@@ -114,12 +107,8 @@ for.end:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
; FUNC-LABEL: @short_array
 | 
					; FUNC-LABEL: @short_array
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
 | 
					 | 
				
			||||||
; R600-CHECK: 65536
 | 
					 | 
				
			||||||
; R600-CHECK: *
 | 
					 | 
				
			||||||
; R600-CHECK: MOVA_INT
 | 
					; R600-CHECK: MOVA_INT
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 0x10000
 | 
					 | 
				
			||||||
; SI-CHECK: V_MOVRELS_B32_e32
 | 
					; SI-CHECK: V_MOVRELS_B32_e32
 | 
				
			||||||
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
 | 
					define void @short_array(i32 addrspace(1)* %out, i32 %index) {
 | 
				
			||||||
entry:
 | 
					entry:
 | 
				
			||||||
@@ -137,10 +126,7 @@ entry:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
; FUNC-LABEL: @char_array
 | 
					; FUNC-LABEL: @char_array
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
 | 
					; R600-CHECK: MOVA_INT
 | 
				
			||||||
; R600-CHECK: 256
 | 
					 | 
				
			||||||
; R600-CHECK: *
 | 
					 | 
				
			||||||
; R600-CHECK-NEXT: MOVA_INT
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
 | 
					; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
 | 
				
			||||||
; SI-CHECK: V_MOVRELS_B32_e32
 | 
					; SI-CHECK: V_MOVRELS_B32_e32
 | 
				
			||||||
@@ -185,7 +171,9 @@ entry:
 | 
				
			|||||||
; Test that two stack objects are not stored in the same register
 | 
					; Test that two stack objects are not stored in the same register
 | 
				
			||||||
; The second stack object should be in T3.X
 | 
					; The second stack object should be in T3.X
 | 
				
			||||||
; FUNC-LABEL: @no_overlap
 | 
					; FUNC-LABEL: @no_overlap
 | 
				
			||||||
; R600-CHECK: MOV {{\** *}}T3.X
 | 
					; R600_CHECK: MOV
 | 
				
			||||||
 | 
					; R600_CHECK: [[CHAN:[XYZW]]]+
 | 
				
			||||||
 | 
					; R600-CHECK-NOT: [[CHAN]]+
 | 
				
			||||||
; SI-CHECK: V_MOV_B32_e32 v3
 | 
					; SI-CHECK: V_MOV_B32_e32 v3
 | 
				
			||||||
define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
 | 
					define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
 | 
				
			||||||
entry:
 | 
					entry:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,5 +1,7 @@
 | 
				
			|||||||
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
 | 
					; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; XFAIL: *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
; 64-bit select was originally lowered with a build_pair, and this
 | 
					; 64-bit select was originally lowered with a build_pair, and this
 | 
				
			||||||
; could be simplified to 1 cndmask instead of 2, but that broken when
 | 
					; could be simplified to 1 cndmask instead of 2, but that broken when
 | 
				
			||||||
; it started being implemented with a v2i32 build_vector and
 | 
					; it started being implemented with a v2i32 build_vector and
 | 
				
			||||||
@@ -12,9 +14,10 @@ define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) {
 | 
				
			|||||||
  ret void
 | 
					  ret void
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; FIXME: Fix truncating store for local memory
 | 
				
			||||||
; SI-LABEL: @trunc_load_alloca_i64:
 | 
					; SI-LABEL: @trunc_load_alloca_i64:
 | 
				
			||||||
; SI: V_MOVRELS_B32
 | 
					; SI: DS_READ_B32
 | 
				
			||||||
; SI-NOT: V_MOVRELS_B32
 | 
					; SI-NOT: DS_READ_B64
 | 
				
			||||||
; SI: S_ENDPGM
 | 
					; SI: S_ENDPGM
 | 
				
			||||||
define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
 | 
					define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
 | 
				
			||||||
  %idx = add i32 %a, %b
 | 
					  %idx = add i32 %a, %b
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										74
									
								
								test/CodeGen/R600/vector-alloca.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								test/CodeGen/R600/vector-alloca.ll
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,74 @@
 | 
				
			|||||||
 | 
					; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 | 
				
			||||||
 | 
					; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; FUNC-LABEL: @vector_read
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOVA_INT
 | 
				
			||||||
 | 
					define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
 | 
				
			||||||
 | 
					entry:
 | 
				
			||||||
 | 
					  %0 = alloca [4 x i32]
 | 
				
			||||||
 | 
					  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
 | 
				
			||||||
 | 
					  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
 | 
				
			||||||
 | 
					  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
 | 
				
			||||||
 | 
					  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
 | 
				
			||||||
 | 
					  store i32 0, i32* %x
 | 
				
			||||||
 | 
					  store i32 1, i32* %y
 | 
				
			||||||
 | 
					  store i32 2, i32* %z
 | 
				
			||||||
 | 
					  store i32 3, i32* %w
 | 
				
			||||||
 | 
					  %1 = getelementptr [4 x i32]* %0, i32 0, i32 %index
 | 
				
			||||||
 | 
					  %2 = load i32* %1
 | 
				
			||||||
 | 
					  store i32 %2, i32 addrspace(1)* %out
 | 
				
			||||||
 | 
					  ret void
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; FUNC-LABEL: @vector_write
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOV
 | 
				
			||||||
 | 
					; EG: MOVA_INT
 | 
				
			||||||
 | 
					; EG: MOVA_INT
 | 
				
			||||||
 | 
					define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 | 
				
			||||||
 | 
					entry:
 | 
				
			||||||
 | 
					  %0 = alloca [4 x i32]
 | 
				
			||||||
 | 
					  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
 | 
				
			||||||
 | 
					  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
 | 
				
			||||||
 | 
					  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
 | 
				
			||||||
 | 
					  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
 | 
				
			||||||
 | 
					  store i32 0, i32* %x
 | 
				
			||||||
 | 
					  store i32 0, i32* %y
 | 
				
			||||||
 | 
					  store i32 0, i32* %z
 | 
				
			||||||
 | 
					  store i32 0, i32* %w
 | 
				
			||||||
 | 
					  %1 = getelementptr [4 x i32]* %0, i32 0, i32 %w_index
 | 
				
			||||||
 | 
					  store i32 1, i32* %1
 | 
				
			||||||
 | 
					  %2 = getelementptr [4 x i32]* %0, i32 0, i32 %r_index
 | 
				
			||||||
 | 
					  %3 = load i32* %2
 | 
				
			||||||
 | 
					  store i32 %3, i32 addrspace(1)* %out
 | 
				
			||||||
 | 
					  ret void
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; This test should be optimized to:
 | 
				
			||||||
 | 
					; store i32 0, i32 addrspace(1)* %out
 | 
				
			||||||
 | 
					; FUNC-LABEL: @bitcast_gep
 | 
				
			||||||
 | 
					; CHECK: STORE_RAW
 | 
				
			||||||
 | 
					define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 | 
				
			||||||
 | 
					entry:
 | 
				
			||||||
 | 
					  %0 = alloca [4 x i32]
 | 
				
			||||||
 | 
					  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
 | 
				
			||||||
 | 
					  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
 | 
				
			||||||
 | 
					  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
 | 
				
			||||||
 | 
					  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
 | 
				
			||||||
 | 
					  store i32 0, i32* %x
 | 
				
			||||||
 | 
					  store i32 0, i32* %y
 | 
				
			||||||
 | 
					  store i32 0, i32* %z
 | 
				
			||||||
 | 
					  store i32 0, i32* %w
 | 
				
			||||||
 | 
					  %1 = getelementptr [4 x i32]* %0, i32 0, i32 1
 | 
				
			||||||
 | 
					  %2 = bitcast i32* %1 to [4 x i32]*
 | 
				
			||||||
 | 
					  %3 = getelementptr [4 x i32]* %2, i32 0, i32 0
 | 
				
			||||||
 | 
					  %4 = load i32* %3
 | 
				
			||||||
 | 
					  store i32 %4, i32 addrspace(1)* %out
 | 
				
			||||||
 | 
					  ret void
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
		Reference in New Issue
	
	Block a user