AVX-512: changes in intel_ocl_bi calling conventions

- added mask types v8i1 and v16i1 to possible function parameters
- enabled passing 512-bit vectors in standard CC
- added a test for KNL intel_ocl_bi conventions


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229482 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2015-02-17 09:20:12 +00:00
parent 2c388f7653
commit 199f58a198
3 changed files with 138 additions and 14 deletions

View File

@ -461,6 +461,10 @@ def CC_X86_32_Common : CallingConv<[
CCIfSubtarget<"hasFp256()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
// The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
// Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
@ -468,6 +472,10 @@ def CC_X86_32_Common : CallingConv<[
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToStack<32, 32>>,
// 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToStack<64, 64>>,
// __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
// passed in the parameter area.
CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
@ -626,6 +634,9 @@ def CC_Intel_OCL_BI : CallingConv<[
CCIfType<[v16f32, v8f64, v16i32, v8i64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
// Pass masks in mask registers
CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
CCDelegateTo<CC_X86_32_C>

View File

@ -1604,14 +1604,14 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
//
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vvt, ValueType ivt, X86MemOperand x86memop> {
ValueType vvt, X86MemOperand x86memop> {
let hasSideEffects = 0 in {
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
let mayLoad = 1 in
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
[(set KRC:$dst, (vvt (load addr:$src)))]>;
let mayStore = 1 in
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@ -1631,27 +1631,25 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
}
let Predicates = [HasDQI] in
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8,
i8mem>,
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
VEX, PD;
let Predicates = [HasAVX512] in
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16,
i16mem>,
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
VEX, PS;
let Predicates = [HasBWI] in {
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32,
i32mem>, VEX, PD, VEX_W;
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
VEX, PD, VEX_W;
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
VEX, XD;
}
let Predicates = [HasBWI] in {
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64,
i64mem>, VEX, PS, VEX_W;
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
VEX, PS, VEX_W;
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
VEX, XD, VEX_W;
}
@ -1682,24 +1680,34 @@ let Predicates = [HasBWI] in {
let Predicates = [HasDQI] in {
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVBmk addr:$dst, VK8:$src)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(KMOVBkm addr:$src)>;
}
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
}
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
def : Pat<(i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
(KMOVWkm addr:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
(KMOVDmk addr:$dst, VK32:$src)>;
def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
(KMOVDkm addr:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
(KMOVQmk addr:$dst, VK64:$src)>;
def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
(KMOVQkm addr:$src)>;
}
let Predicates = [HasAVX512] in {

View File

@ -0,0 +1,105 @@
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck -check-prefix=X64 %s
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)
; WIN64-LABEL: testf16_inp
; WIN64: vaddps {{.*}}, {{%zmm[0-1]}}
; WIN64: leaq {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret
; X32-LABEL: testf16_inp
; X32: vaddps {{.*}}, {{%zmm[0-1]}}
; X32: movl %eax, (%esp)
; X32: call
; X32: ret
; X64-LABEL: testf16_inp
; X64: vaddps {{.*}}, {{%zmm[0-1]}}
; X64: leaq {{.*}}(%rsp), %rdi
; X64: call
; X64: ret
;test calling conventions - input parameters
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %2, %1
ret <16 x float> %3
}
;test calling conventions - preserved registers
; preserved zmm16-
; WIN64-LABEL: testf16_regs
; WIN64: call
; WIN64: vaddps %zmm16, %zmm0, %zmm0
; WIN64: ret
; preserved zmm16-
; X64-LABEL: testf16_regs
; X64: call
; X64: vaddps %zmm16, %zmm0, %zmm0
; X64: ret
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %1, %b
%4 = fadd <16 x float> %2, %3
ret <16 x float> %4
}
; test calling conventions - prolog and epilog
; WIN64-LABEL: test_prolog_epilog
; WIN64: vmovups %zmm21, {{.*(%rbp).*}} # 64-byte Spill
; WIN64: vmovups %zmm6, {{.*(%rbp).*}} # 64-byte Spill
; WIN64: call
; WIN64: vmovups {{.*(%rbp).*}}, %zmm6 # 64-byte Reload
; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
; X64-LABEL: test_prolog_epilog
; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
; X64: call
; X64: vmovups {{.*}}(%rsp), %zmm16 ## 64-byte Reload
; X64: vmovups {{.*}}(%rsp), %zmm31 ## 64-byte Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
ret <16 x float> %c
}
declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
; X64-LABEL: testf16_inp_mask
; X64: kmovw %edi, %k1
; X64: call
define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
%imask = bitcast i16 %mask to <16 x i1>
%1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
ret <16 x float> %1
}
; X64-LABEL: test_prolog_epilog_with_mask
; X64: kxorw %k{{.*}}, %k{{.*}}, %k1
; X64: call
define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
%cmp_res = icmp eq <16 x i32>%x1, %x2
%mask1 = xor <16 x i1> %cmp_res, %mask
%c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
ret <16 x float> %c
}