[X86] Remove AVX1 vbroadcast intrinsics

The corresponding CFE patch replaces these intrinsics with vector initializers
in avxintrin.h.  This patch removes the LLVM intrinsics from the backend.

We now stop lowering at X86ISD::VBROADCAST custom node rather than lowering
that further to the intrinsics.

The patch only changes VBROADCASTS* and leaves VBROADCAST[FI]128 to continue
to use intrinsics.  As explained in the CFE patch, the reason is that we
currently don't generate as good code for them without the intrinsics.

CodeGen/X86/avx-vbroadcast.ll already provides coverage for this change.  It
checks that for a series of insertelements we generate the appropriate
vbroadcast instruction.

Also verified that there was no assembly change in the test-suite before and
after this patch.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209864 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Adam Nemet 2014-05-29 23:35:36 +00:00
parent 23356d4d64
commit 7cd893a985
3 changed files with 17 additions and 48 deletions

View File

@ -1304,15 +1304,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Vector load with broadcast
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_vbroadcast_ss :
GCCBuiltin<"__builtin_ia32_vbroadcastss">,
Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcast_sd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastsd256">,
Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcast_ss_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastss256">,
Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcastf128_pd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastf128_pd256">,
Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;

View File

@ -7969,6 +7969,16 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType VT,
PatFrag ld_frag, SchedWrite Sched> :
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
Sched<[Sched]>, VEX {
let mayLoad = 1;
}
// AVX2 adds register forms
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
Intrinsic Int, SchedWrite Sched> :
@ -7977,16 +7987,15 @@ class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
let ExeDomain = SSEPackedSingle in {
def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
int_x86_avx_vbroadcast_ss, WriteLoad>;
def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
int_x86_avx_vbroadcast_ss_256,
WriteFShuffleLd>, VEX_L;
def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
f32mem, v4f32, loadf32, WriteLoad>;
def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
f32mem, v8f32, loadf32,
WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
int_x86_avx_vbroadcast_sd_256,
WriteFShuffleLd>, VEX_L;
def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
v4f64, loadf64, WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256,
WriteFShuffleLd>, VEX_L;
@ -8543,13 +8552,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
}
let Predicates = [HasAVX] in {
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
(VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let AddedComplexity = 20 in {

View File

@ -2219,14 +2219,6 @@ define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
define <4 x double> @test_x86_avx_vbroadcast_sd_256(i8* %a0) {
; CHECK: vbroadcastsd
%res = call <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8* %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*) nounwind readonly
define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
; CHECK: vbroadcastf128
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
@ -2243,22 +2235,6 @@ define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
define <4 x float> @test_x86_avx_vbroadcast_ss(i8* %a0) {
; CHECK: vbroadcastss
%res = call <4 x float> @llvm.x86.avx.vbroadcast.ss(i8* %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vbroadcast.ss(i8*) nounwind readonly
define <8 x float> @test_x86_avx_vbroadcast_ss_256(i8* %a0) {
; CHECK: vbroadcastss
%res = call <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8* %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*) nounwind readonly
define <2 x double> @test_x86_avx_vextractf128_pd_256(<4 x double> %a0) {
; CHECK: vextractf128
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]