From 128b2c383cbe99bde4099374e094bdc5a528fecf Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 16 Dec 2014 21:57:18 +0000
Subject: [PATCH] merge consecutive loads that are offset from a base address

SelectionDAG::isConsecutiveLoad() was not detecting consecutive loads
when the first load was offset from a base address. This patch
recognizes that pattern and subtracts the base load's offset before
checking whether the second load lies at the expected distance.

The codegen change in the new test case improves from:

vmovsd 32(%rdi), %xmm0
vmovsd 48(%rdi), %xmm1
vmovhpd 56(%rdi), %xmm1, %xmm1
vmovhpd 40(%rdi), %xmm0, %xmm0
vinsertf128 $1, %xmm1, %ymm0, %ymm0

to:

vmovups 32(%rdi), %ymm0

An existing test case is also improved from:

vmovsd (%rdi), %xmm0
vmovsd 16(%rdi), %xmm1
vmovsd 24(%rdi), %xmm2
vunpcklpd %xmm2, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm2[0]
vmovhpd 8(%rdi), %xmm1, %xmm3

to:

vmovsd (%rdi), %xmm0
vmovsd 16(%rdi), %xmm1
vmovhpd 24(%rdi), %xmm0, %xmm0
vmovhpd 8(%rdi), %xmm1, %xmm1

This patch fixes PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 ).
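For illustration, the new comparison can be modeled outside of LLVM with
a minimal standalone sketch. The names here (Addr, isConsecutive) are
hypothetical and this is not the patched code: an address is modeled as
a base identifier plus a constant byte offset, the decomposition that
isBaseWithConstantOffset() provides, and the zero-offset base case that
the patch handles separately folds into the same subtraction.

#include <cassert>
#include <cstdint>

struct Addr {
  int Base;        // stands in for operand 0 of the address add
  int64_t Offset;  // constant byte offset, operand 1 of the add
};

// True if Loc is Dist elements of Bytes bytes past BaseLoc: subtract
// the base load's own offset before comparing to Dist * Bytes.
static bool isConsecutive(const Addr &BaseLoc, const Addr &Loc,
                          int64_t Dist, int64_t Bytes) {
  if (Loc.Base != BaseLoc.Base)
    return false; // different base pointers; offsets are incomparable
  return (Loc.Offset - BaseLoc.Offset) == Dist * Bytes;
}

int main() {
  // PR21771 pattern: first load at base+32, second at base+40, one
  // 8-byte element apart, so they are consecutive.
  assert(isConsecutive({0, 32}, {0, 40}, /*Dist=*/1, /*Bytes=*/8));
  // The old code handled only a zero-offset base load.
  assert(isConsecutive({0, 0}, {0, 8}, 1, 8));
  // Not consecutive: a 16-byte gap at Dist 1 with 8-byte elements.
  assert(!isConsecutive({0, 32}, {0, 48}, 1, 8));
  return 0;
}

The real code additionally requires both addresses to share the same
first add operand before the subtraction is meaningful, which the Base
field models here.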
Differential Revision: http://reviews.llvm.org/D6642

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224379 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 24 ++++++++++++++----
 test/CodeGen/X86/chain_order.ll           | 16 ++++++------
 test/CodeGen/X86/vec_loadsingles.ll       | 31 +++++++++++++++++++++---
 3 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f9131e7f2b2..20eaa2965e2 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6553,11 +6553,25 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
     return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
   }
 
-  // Handle X+C
-  if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
-      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
-    return true;
-
+  // Handle X + C.
+  if (isBaseWithConstantOffset(Loc)) {
+    int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+    if (Loc.getOperand(0) == BaseLoc) {
+      // If the base location is a simple address with no offset itself, then
+      // the second load's first add operand should be the base address.
+      if (LocOffset == Dist * (int)Bytes)
+        return true;
+    } else if (isBaseWithConstantOffset(BaseLoc)) {
+      // The base location itself has an offset, so subtract that value from the
+      // second load's offset before comparing to distance * size.
+      int64_t BOffset =
+        cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+      if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+        if ((LocOffset - BOffset) == Dist * (int)Bytes)
+          return true;
+      }
+    }
+  }
   const GlobalValue *GV1 = nullptr;
   const GlobalValue *GV2 = nullptr;
   int64_t Offset1 = 0;
diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll
index c88726e75a8..72e6f78bdef 100644
--- a/test/CodeGen/X86/chain_order.ll
+++ b/test/CodeGen/X86/chain_order.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-linux | FileCheck %s
 
-;CHECK-LABEL: cftx020:
-;CHECK: vmovsd (%rdi), %xmm{{.*}}
-;CHECK: vmovsd 16(%rdi), %xmm{{.*}}
-;CHECK: vmovsd 24(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd 8(%rdi), %xmm{{.*}}
-;CHECK: vmovupd %xmm{{.*}}, (%rdi)
-;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
-;CHECK: ret
+; CHECK-LABEL: cftx020:
+; CHECK: vmovsd (%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 24(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 8(%rdi), %xmm{{.*}}
+; CHECK: vmovupd %xmm{{.*}}, (%rdi)
+; CHECK-NEXT: vmovupd %xmm{{.*}}, 16(%rdi)
+; CHECK: ret
 
 ; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads.
 define void @cftx020(double* nocapture %a) {
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll
index af4d6fa61fd..fd132a52b8f 100644
--- a/test/CodeGen/X86/vec_loadsingles.ll
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -89,7 +89,7 @@ define <8 x float> @merge_8_floats(float* %ptr) {
 ; FAST32-NEXT: retq
 
 ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
 ; SLOW32-NEXT: retq
 }
 
@@ -112,7 +112,34 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
 ; FAST32-NEXT: retq
 
 ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
+}
+
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
+; Recognize and combine consecutive loads even when the
+; first of the combined loads is offset from the base address.
+define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+  %arrayidx4 = getelementptr inbounds double* %ptr, i64 4
+  %arrayidx5 = getelementptr inbounds double* %ptr, i64 5
+  %arrayidx6 = getelementptr inbounds double* %ptr, i64 6
+  %arrayidx7 = getelementptr inbounds double* %ptr, i64 7
+  %e = load double* %arrayidx4, align 8
+  %f = load double* %arrayidx5, align 8
+  %g = load double* %arrayidx6, align 8
+  %h = load double* %arrayidx7, align 8
+  %vecinit4 = insertelement <4 x double> undef, double %e, i32 0
+  %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1
+  %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
+  %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
+  ret <4 x double> %vecinit7
+
+; ALL-LABEL: merge_4_doubles_offset
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
 ; SLOW32-NEXT: retq
 }
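
As a sanity check of the arithmetic in the new merge_4_doubles_offset
test (an illustrative snippet, not part of the patch): the four loads
sit at byte offsets 32, 40, 48, and 56 from %ptr (GEP indices 4 through
7 times 8 bytes), so each one is exactly Dist * 8 bytes past the first,
which is why a single 32-byte vmovups from 32(%rdi) covers all four.

#include <cassert>

int main() {
  // Byte offsets of the four double loads: indices 4..7 times 8 bytes.
  const long long First = 32, Bytes = 8, Offs[] = {32, 40, 48, 56};
  for (long long Dist = 0; Dist < 4; ++Dist)
    assert(Offs[Dist] - First == Dist * Bytes); // consecutive at each Dist
  return 0;
}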