combine consecutive subvector 16-byte loads into one 32-byte load

This is a fix for PR21709 ( http://llvm.org/bugs/show_bug.cgi?id=21709 ).
When two consecutive 16-byte loads are merged into one 32-byte vector,
we can use a single 32-byte load instead.
We don't do this for SandyBridge / IvyBridge, however, because 32-byte memops are slower on those CPUs.
We also don't bother using 32-byte *integer* loads on a machine that only has AVX1 (btver2),
because those operands would have to be split in half anyway: there is no support for
32-byte integer math ops on such a machine.

Differential Revision: http://reviews.llvm.org/D6492



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224344 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Sanjay Patel
2014-12-16 16:30:01 +00:00
parent d69e4e2945
commit 8fe9488a40
3 changed files with 281 additions and 4 deletions

View File

@@ -773,6 +773,7 @@ def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
// Each predicate below wraps a C++ boolean expression evaluated against the
// current Subtarget; a pattern or instruction guarded by the predicate is
// only available when that expression is true.

// True when the subtarget reports that calling an immediate address is legal.
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
// True when Subtarget->callRegIndirect() is false.
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
// True when Subtarget->slowIncDec() is false.
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
// True when Subtarget->isUnalignedMem32Slow() is false.
// NOTE(review): intended to gate patterns that form 32-byte memory operations
// so they are skipped on CPUs where such accesses are slow — confirm against
// the subtarget feature definition.
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.

View File

@@ -8158,6 +8158,49 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
(INSERT_get_vinsert128_imm VR256:$ins))>;
}
// Combine two consecutive 16-byte loads with a common destination register into
// one 32-byte load to that register.
//
// All six patterns share one shape:
//   - the inner insert_subvector places a 16-byte load from addr:$src at
//     element index 0 of an undef 256-bit vector (the low half);
//   - the outer insert_subvector places a 16-byte load from addr:$src + 16
//     at the element index where the high half begins (element count / 2);
//   - the result is a single 256-bit load from addr:$src.
//
// The patterns are only enabled when both HasAVX and HasFastMem32 hold.
// NOTE(review): HasFastMem32 is expected to exclude subtargets with slow
// 32-byte memory accesses — confirm against the predicate definition.
let Predicates = [HasAVX, HasFastMem32] in {
// v8f32: two v4f32 loads; high half starts at element 4.
def : Pat<(insert_subvector
(v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
(loadv4f32 (add addr:$src, (iPTR 16))),
(iPTR 4)),
(VMOVUPSYrm addr:$src)>;
// v4f64: two v2f64 loads; high half starts at element 2.
def : Pat<(insert_subvector
(v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
(loadv2f64 (add addr:$src, (iPTR 16))),
(iPTR 2)),
(VMOVUPDYrm addr:$src)>;
// Integer cases. The 128-bit loads are written as loadv2i64 wrapped in a
// bc_* bitcast to the desired element type.
// NOTE(review): presumably because 128-bit integer vector loads are
// canonicalized to v2i64 before instruction selection — confirm.
//
// v32i8: two v16i8 halves; high half starts at element 16.
def : Pat<(insert_subvector
(v32i8 (insert_subvector
undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
(bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
(iPTR 16)),
(VMOVDQUYrm addr:$src)>;
// v16i16: two v8i16 halves; high half starts at element 8.
def : Pat<(insert_subvector
(v16i16 (insert_subvector
undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
(bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
(iPTR 8)),
(VMOVDQUYrm addr:$src)>;
// v8i32: two v4i32 halves; high half starts at element 4.
def : Pat<(insert_subvector
(v8i32 (insert_subvector
undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
(bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
(iPTR 4)),
(VMOVDQUYrm addr:$src)>;
// v4i64: two v2i64 loads (no bitcast needed); high half starts at element 2.
def : Pat<(insert_subvector
(v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
(loadv2i64 (add addr:$src, (iPTR 16))),
(iPTR 2)),
(VMOVDQUYrm addr:$src)>;
}
let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
(iPTR imm)),