	[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD.
The _Int instructions are special, in that they operate on the full
VR128 instead of FR32.  The load folding logic then looks at the MOVSS and
at its user, and bails out when it sees a size mismatch between them.
What we really know is that the rm_Int instructions don't load the
higher lanes, so folding is fine.
This happens for the straightforward intrinsic code, e.g.:
    _mm_add_ss(a, _mm_load_ss(p));
Fixes PR23349.
Differential Revision: http://reviews.llvm.org/D10554
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240326 91177308-0d34-0410-b5e6-96231b3b80d8
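For context, a minimal C++ reproduction of the intrinsic pattern mentioned above (the function name and build flags are illustrative, not part of this commit). Compiled with optimizations for an SSE target, the movss load is now expected to fold into a single scalar add with a memory operand, as the new test below verifies:

    #include <xmmintrin.h>

    // Scalar add of the lowest lane: the _mm_load_ss feeds a scalar _Int
    // user, so the load can fold into the add (e.g. "addss (%rdi), %xmm0").
    __m128 add_low_lane(__m128 a, const float *p) {
      return _mm_add_ss(a, _mm_load_ss(p));
    }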
			
			
lib/Target/X86/X86InstrInfo.cpp
@@ -5295,21 +5295,57 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
                                Size, Alignment, /*AllowCommute=*/true);
 }
 
-static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
-                                  const MachineFunction &MF) {
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version.  For instance, this transformation isn't legal:
+///   movss (%rdi), %xmm0
+///   addps %xmm0, %xmm0
+/// ->
+///   addps (%rdi), %xmm0
+///
+/// But this one is:
+///   movss (%rdi), %xmm0
+///   addss %xmm0, %xmm0
+/// ->
+///   addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+                                             const MachineInstr &UserMI,
+                                             const MachineFunction &MF) {
   unsigned Opc = LoadMI.getOpcode();
+  unsigned UserOpc = UserMI.getOpcode();
   unsigned RegSize =
       MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
 
-  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
     // These instructions only load 32 bits, we can't fold them if the
-    // destination register is wider than 32 bits (4 bytes).
+    // destination register is wider than 32 bits (4 bytes), and its user
+    // instruction isn't scalar (SS).
+    switch (UserOpc) {
+    case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+    case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+    case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+    case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+      return false;
+    default:
       return true;
+    }
+  }
 
-  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
     // These instructions only load 64 bits, we can't fold them if the
-    // destination register is wider than 64 bits (8 bytes).
+    // destination register is wider than 64 bits (8 bytes), and its user
+    // instruction isn't scalar (SD).
+    switch (UserOpc) {
+    case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+    case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+    case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+    case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+      return false;
+    default:
       return true;
+    }
+  }
 
   return false;
 }
@@ -5321,7 +5357,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
   int FrameIndex;
   if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
       return nullptr;
     return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
   }
@@ -5434,7 +5470,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     break;
   }
   default: {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
       return nullptr;
 
     // Folding a normal load. Just copy the load's address operands.
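To complement the doc comment above from the other direction: when the user of the partial load is a full-width (packed) instruction, the fold is still rejected, because the user reads lanes that the 32-bit load would not define from memory. A minimal C++ sketch of that illegal case (the function name is illustrative, not from the commit):

    #include <xmmintrin.h>

    // Not foldable: addps uses all four lanes, but movss only loads lane 0,
    // so a folded "addps (%rdi), %xmm0" would read 12 extra bytes and add
    // unrelated data into the upper lanes instead of the zeros _mm_load_ss
    // provides.
    __m128 add_all_lanes(__m128 a, const float *p) {
      return _mm_add_ps(a, _mm_load_ss(p));
    }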
							
								
								
									
test/CodeGen/X86/fold-load-binops.ll (new file, 142 lines)
@@ -0,0 +1,142 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+
+; Verify that we're folding the load into the math instruction.
+; This pattern is generated out of the simplest intrinsics usage:
+;  _mm_add_ss(a, _mm_load_ss(b));
+
+define <4 x float> @addss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: addss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: addss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fadd float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @addsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: addsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: addsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fadd double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}
+
+define <4 x float> @subss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: subss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: subss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fsub float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @subsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: subsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: subsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fsub double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}
+
+define <4 x float> @mulss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: mulss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mulss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fmul float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: mulsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mulsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fmul double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}
+
+define <4 x float> @divss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: divss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: divss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fdiv float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @divsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: divsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: divsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fdiv double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}