diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1a0b62970cf..19bfb9f2659 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1314,23 +1314,19 @@ void X86TargetLowering::resetOperationActions() {
          i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
       MVT VT = (MVT::SimpleValueType)i;
+      if (VT.getScalarSizeInBits() >= 32) {
+        setOperationAction(ISD::MLOAD,  VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
+      }
       // Extract subvector is special because the value type
       // (result) is 128-bit but the source is 256-bit wide.
       if (VT.is128BitVector()) {
-        if (VT.getScalarSizeInBits() >= 32) {
-          setOperationAction(ISD::MLOAD,  VT, Custom);
-          setOperationAction(ISD::MSTORE, VT, Custom);
-        }
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
       }
       // Do not attempt to custom lower other non-256-bit vectors
       if (!VT.is256BitVector())
         continue;
-      if (VT.getScalarSizeInBits() >= 32) {
-        setOperationAction(ISD::MLOAD,  VT, Legal);
-        setOperationAction(ISD::MSTORE, VT, Legal);
-      }
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
@@ -1499,10 +1495,6 @@ void X86TargetLowering::resetOperationActions() {
       // (result) is 256/128-bit but the source is 512-bit wide.
       if (VT.is128BitVector() || VT.is256BitVector()) {
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-        if ( EltSize >= 32) {
-          setOperationAction(ISD::MLOAD,  VT, Legal);
-          setOperationAction(ISD::MSTORE, VT, Legal);
-        }
       }
       if (VT.getVectorElementType() == MVT::i1)
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 080694a75c8..312c6800cdf 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -2211,6 +2211,11 @@ def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask,
 def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask,
                              (v8f64 VR512:$src0))),
          (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
+def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))),
+        (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk
+         (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm),
+         (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
 defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32",
                                 "16", "8", "4", SSEPackedInt, HasAVX512>,
                  avx512_store_vl<0x7F, "vmovdqa32", "alignedstore",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index e66813fa151..971cef12dab 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8966,20 +8966,26 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                 int_x86_avx2_maskstore_q_256>, VEX_W;
 
 def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
-         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
 def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
          (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
+def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
+         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
+         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
-         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
                              (bc_v8f32 (v8i32 immAllZerosV)))),
-         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
-         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
+         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
 def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
@@ -8992,21 +8998,42 @@ def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0)
          (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
+                             (bc_v4f32 (v4i32 immAllZerosV)))),
+         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
+         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
+         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
+         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
+
 def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
-         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
 def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
          (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
-         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                              (v4f64 immAllZerosV))),
-         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
-         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
+         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
 def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
@@ -9020,6 +9047,33 @@ def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0)
          (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
+def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
+         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
+         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+                             (v2f64 immAllZerosV))),
+         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
+         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+                             (bc_v2i64 (v4i32 immAllZerosV)))),
+         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
+         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
 //===----------------------------------------------------------------------===//
 // Variable Bit Shifts
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 8cb2d63d5f6..0b88ec6fe8d 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -41,8 +41,8 @@ define void @test3(<16 x i32> %trigger, i8* %addr, <16 x i32> %val) {
 ; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}
 
 ; AVX2-LABEL: test4
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: vpmaskmovd {{.*}}(%rdi)
+; AVX2: vmaskmovps {{.*}}(%rdi)
+; AVX2: vmaskmovps {{.*}}(%rdi)
 ; AVX2: blend
 define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) {
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -54,9 +54,9 @@ define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) {
 ; AVX512: vmovupd (%rdi), %zmm1 {%k1}
 
 ; AVX2-LABEL: test5
-; AVX2: vpmaskmovq
+; AVX2: vmaskmovpd
 ; AVX2: vblendvpd
-; AVX2: vpmaskmovq
+; AVX2: vmaskmovpd
 ; AVX2: vblendvpd
 define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) {
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -64,10 +64,80 @@ define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) {
   ret <8 x double> %res
 }
 
-declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
-declare void @llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
-declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>)
-declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>)
-declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>)
-declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>)
+; AVX2-LABEL: test6
+; AVX2: vmaskmovpd
+; AVX2: vblendvpd
+define <2 x double> @test6(<2 x i64> %trigger, i8* %addr, <2 x double> %dst) {
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64(i8* %addr, <2 x double>%dst, i32 4, <2 x i1>%mask)
+  ret <2 x double> %res
+}
+
+; AVX2-LABEL: test7
+; AVX2: vmaskmovps {{.*}}(%rdi)
+; AVX2: blend
+define <4 x float> @test7(<4 x i32> %trigger, i8* %addr, <4 x float> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.load.v4f32(i8* %addr, <4 x float>%dst, i32 4, <4 x i1>%mask)
+  ret <4 x float> %res
+}
+
+; AVX2-LABEL: test8
+; AVX2: vpmaskmovd {{.*}}(%rdi)
+; AVX2: blend
+define <4 x i32> @test8(<4 x i32> %trigger, i8* %addr, <4 x i32> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32(i8* %addr, <4 x i32>%dst, i32 4, <4 x i1>%mask)
+  ret <4 x i32> %res
+}
+
+; AVX2-LABEL: test9
+; AVX2: vpmaskmovd %xmm
+define void @test9(<4 x i32> %trigger, i8* %addr, <4 x i32> %val) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32(i8* %addr, <4 x i32>%val, i32 4, <4 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test10
+; AVX2: vmaskmovpd (%rdi), %ymm
+; AVX2: blend
+define <4 x double> @test10(<4 x i32> %trigger, i8* %addr, <4 x double> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x double> @llvm.masked.load.v4f64(i8* %addr, <4 x double>%dst, i32 4, <4 x i1>%mask)
+  ret <4 x double> %res
+}
+
+; AVX2-LABEL: test11
+; AVX2: vmaskmovps
+; AVX2: vblendvps
+define <8 x float> @test11(<8 x i32> %trigger, i8* %addr, <8 x float> %dst) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32(i8* %addr, <8 x float>%dst, i32 4, <8 x i1>%mask)
+  ret <8 x float> %res
+}
+
+; AVX2-LABEL: test12
+; AVX2: vpmaskmovd %ymm
+define void @test12(<8 x i32> %trigger, i8* %addr, <8 x i32> %val) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v8i32(i8* %addr, <8 x i32>%val, i32 4, <8 x i1>%mask)
+  ret void
+}
+
+declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32(i8*, <4 x i32>, i32, <4 x i1>)
+declare void @llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32(i8*, <8 x i32>, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32(i8*, <4 x i32>, i32, <4 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>)
+declare <8 x float> @llvm.masked.load.v8f32(i8*, <8 x float>, i32, <8 x i1>)
+declare <4 x float> @llvm.masked.load.v4f32(i8*, <4 x float>, i32, <4 x i1>)
+declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>)
+declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>)
+declare <4 x double> @llvm.masked.load.v4f64(i8*, <4 x double>, i32, <4 x i1>)
+declare <2 x double> @llvm.masked.load.v2f64(i8*, <2 x double>, i32, <2 x i1>)
+declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>)
+declare void @llvm.masked.store.v2f64(i8*, <2 x double>, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64(i8*, <2 x i64>, i32, <2 x i1>)
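
For reference, below is a minimal IR-level sketch of the kind of 128-bit masked store the new VMASKMOVPDmr pattern above is meant to cover. It is not part of the patch: the function name is made up, and the call simply mirrors the @llvm.masked.store.v2f64 declaration added in the test file. On an AVX2 target this would be expected to select vmaskmovpd.

; Illustrative sketch only (not part of the patch); @store_v2f64_sketch is a
; hypothetical name, and the intrinsic matches the declaration in the test above.
define void @store_v2f64_sketch(<2 x i64> %trigger, i8* %addr, <2 x double> %val) {
  ; Build a <2 x i1> mask from the trigger vector, as the tests above do.
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  ; Masked store: lanes whose mask bit is false leave memory untouched.
  call void @llvm.masked.store.v2f64(i8* %addr, <2 x double> %val, i32 4, <2 x i1> %mask)
  ret void
}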