mirror of
				https://github.com/c64scene-ar/llvm-6502.git
				synced 2025-11-04 05:17:07 +00:00 
			
		
		
		
	X86: optimized i64 vector multiply with constant
When we multiply two vectors of 64-bit elements, we split each operand into its lower and upper 32-bit parts and use the PMULUDQ instruction. When one of the operands is a constant, its upper part may be zero, and we know this at compile time. Example: %a = mul <4 x i64> %b, <4 x i64> < i64 5, i64 5, i64 5, i64 5>. This change checks the value of the upper part and avoids the redundant "multiply", "shift" and "add" operations. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239802 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
		@@ -16530,6 +16530,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
 | 
				
			|||||||
  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
 | 
					  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
 | 
				
			||||||
  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
 | 
					  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  SDValue AhiBlo = Ahi;
 | 
				
			||||||
 | 
					  SDValue AloBhi = Bhi;
 | 
				
			||||||
  // Bit cast to 32-bit vectors for MULUDQ
 | 
					  // Bit cast to 32-bit vectors for MULUDQ
 | 
				
			||||||
  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
 | 
					  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
 | 
				
			||||||
                                  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
 | 
					                                  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
 | 
				
			||||||
@@ -16539,11 +16541,15 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
 | 
				
			|||||||
  Bhi = DAG.getBitcast(MulVT, Bhi);
 | 
					  Bhi = DAG.getBitcast(MulVT, Bhi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
 | 
					  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
 | 
				
			||||||
  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
 | 
					  // After shifting right const values the result may be all-zero.
 | 
				
			||||||
  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
 | 
					  if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
 | 
				
			||||||
 | 
					    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
 | 
				
			||||||
  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
 | 
					    AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
 | 
				
			||||||
  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
 | 
					  }
 | 
				
			||||||
 | 
					  if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
 | 
				
			||||||
 | 
					    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
 | 
				
			||||||
 | 
					    AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
 | 
					  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
 | 
				
			||||||
  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
 | 
					  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,5 +1,6 @@
 | 
				
			|||||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
 | 
					; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
 | 
				
			||||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
 | 
					; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
 | 
				
			||||||
 | 
					; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s --check-prefix=AVX2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
define <16 x i8> @mul8c(<16 x i8> %i) nounwind  {
 | 
					define <16 x i8> @mul8c(<16 x i8> %i) nounwind  {
 | 
				
			||||||
; SSE2-LABEL: mul8c:
 | 
					; SSE2-LABEL: mul8c:
 | 
				
			||||||
@@ -75,10 +76,6 @@ define <2 x i64> @b(<2 x i64> %i) nounwind  {
 | 
				
			|||||||
; ALL-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
 | 
					; ALL-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
 | 
				
			||||||
; ALL-NEXT:    movdqa %xmm0, %xmm2
 | 
					; ALL-NEXT:    movdqa %xmm0, %xmm2
 | 
				
			||||||
; ALL-NEXT:    pmuludq %xmm1, %xmm2
 | 
					; ALL-NEXT:    pmuludq %xmm1, %xmm2
 | 
				
			||||||
; ALL-NEXT:    pxor %xmm3, %xmm3
 | 
					 | 
				
			||||||
; ALL-NEXT:    pmuludq %xmm0, %xmm3
 | 
					 | 
				
			||||||
; ALL-NEXT:    psllq $32, %xmm3
 | 
					 | 
				
			||||||
; ALL-NEXT:    paddq %xmm3, %xmm2
 | 
					 | 
				
			||||||
; ALL-NEXT:    psrlq $32, %xmm0
 | 
					; ALL-NEXT:    psrlq $32, %xmm0
 | 
				
			||||||
; ALL-NEXT:    pmuludq %xmm1, %xmm0
 | 
					; ALL-NEXT:    pmuludq %xmm1, %xmm0
 | 
				
			||||||
; ALL-NEXT:    psllq $32, %xmm0
 | 
					; ALL-NEXT:    psllq $32, %xmm0
 | 
				
			||||||
@@ -248,3 +245,35 @@ entry:
 | 
				
			|||||||
  %A = mul <2 x i64> %i, %j
 | 
					  %A = mul <2 x i64> %i, %j
 | 
				
			||||||
  ret <2 x i64> %A
 | 
					  ret <2 x i64> %A
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					define <4 x i64> @b1(<4 x i64> %i) nounwind  {
 | 
				
			||||||
 | 
					; AVX2-LABEL: @b1
 | 
				
			||||||
 | 
					; AVX2: vpbroadcastq
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpmuludq
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpsrlq  $32 
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpmuludq
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpsllq  $32
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpaddq
 | 
				
			||||||
 | 
					; AVX2-NEXT: retq
 | 
				
			||||||
 | 
					entry:
 | 
				
			||||||
 | 
					  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
 | 
				
			||||||
 | 
					  ret <4 x i64> %A
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind  {
 | 
				
			||||||
 | 
					; AVX2-LABEL: @b2
 | 
				
			||||||
 | 
					; AVX2:  vpmuludq
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpsrlq  $32
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpmuludq
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpsllq  $32
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpaddq
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpsrlq  $32
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpmuludq
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpsllq  $32
 | 
				
			||||||
 | 
					; AVX2-NEXT: vpaddq
 | 
				
			||||||
 | 
					; AVX2-NEXT: retq
 | 
				
			||||||
 | 
					entry:
 | 
				
			||||||
 | 
					  %A = mul <4 x i64> %i, %j
 | 
				
			||||||
 | 
					  ret <4 x i64> %A
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user