mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-23 17:32:49 +00:00
AVX-512: changes in intel_ocl_bi calling conventions
- added mask types v8i1 and v16i1 to possible function parameters
- enabled passing 512-bit vectors in standard CC
- added a test for KNL intel_ocl_bi conventions
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229482 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
2c388f7653
commit
199f58a198
@ -461,6 +461,10 @@ def CC_X86_32_Common : CallingConv<[
|
||||
CCIfSubtarget<"hasFp256()",
|
||||
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
|
||||
|
||||
// The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
|
||||
CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
|
||||
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
|
||||
|
||||
// Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
|
||||
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
|
||||
|
||||
@ -468,6 +472,10 @@ def CC_X86_32_Common : CallingConv<[
|
||||
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
|
||||
CCAssignToStack<32, 32>>,
|
||||
|
||||
// AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
|
||||
CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
|
||||
CCAssignToStack<64, 64>>,
|
||||
|
||||
// __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
|
||||
// passed in the parameter area.
|
||||
CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
|
||||
@ -626,6 +634,9 @@ def CC_Intel_OCL_BI : CallingConv<[
|
||||
CCIfType<[v16f32, v8f64, v16i32, v8i64],
|
||||
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
|
||||
|
||||
// Pass masks in mask registers
|
||||
CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
|
||||
|
||||
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
|
||||
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
|
||||
CCDelegateTo<CC_X86_32_C>
|
||||
|
@ -1604,14 +1604,14 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
|
||||
//
|
||||
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
|
||||
string OpcodeStr, RegisterClass KRC,
|
||||
ValueType vvt, ValueType ivt, X86MemOperand x86memop> {
|
||||
ValueType vvt, X86MemOperand x86memop> {
|
||||
let hasSideEffects = 0 in {
|
||||
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
|
||||
let mayLoad = 1 in
|
||||
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
|
||||
[(set KRC:$dst, (vvt (load addr:$src)))]>;
|
||||
let mayStore = 1 in
|
||||
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
@ -1631,27 +1631,25 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
|
||||
}
|
||||
|
||||
let Predicates = [HasDQI] in
|
||||
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8,
|
||||
i8mem>,
|
||||
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
|
||||
avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
|
||||
VEX, PD;
|
||||
|
||||
let Predicates = [HasAVX512] in
|
||||
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16,
|
||||
i16mem>,
|
||||
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
|
||||
avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
|
||||
VEX, PS;
|
||||
|
||||
let Predicates = [HasBWI] in {
|
||||
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32,
|
||||
i32mem>, VEX, PD, VEX_W;
|
||||
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
|
||||
VEX, PD, VEX_W;
|
||||
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
|
||||
VEX, XD;
|
||||
}
|
||||
|
||||
let Predicates = [HasBWI] in {
|
||||
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64,
|
||||
i64mem>, VEX, PS, VEX_W;
|
||||
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
|
||||
VEX, PS, VEX_W;
|
||||
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
|
||||
VEX, XD, VEX_W;
|
||||
}
|
||||
@ -1682,24 +1680,34 @@ let Predicates = [HasBWI] in {
|
||||
let Predicates = [HasDQI] in {
|
||||
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
|
||||
(KMOVBmk addr:$dst, VK8:$src)>;
|
||||
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
|
||||
(KMOVBkm addr:$src)>;
|
||||
}
|
||||
let Predicates = [HasAVX512, NoDQI] in {
|
||||
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
|
||||
(KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
|
||||
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
|
||||
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
|
||||
}
|
||||
let Predicates = [HasAVX512] in {
|
||||
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
|
||||
(KMOVWmk addr:$dst, VK16:$src)>;
|
||||
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
|
||||
(KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
|
||||
def : Pat<(i1 (load addr:$src)),
|
||||
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
|
||||
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
|
||||
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
|
||||
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
|
||||
(KMOVWkm addr:$src)>;
|
||||
}
|
||||
let Predicates = [HasBWI] in {
|
||||
def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
|
||||
(KMOVDmk addr:$dst, VK32:$src)>;
|
||||
def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
|
||||
(KMOVDkm addr:$src)>;
|
||||
}
|
||||
let Predicates = [HasBWI] in {
|
||||
def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
|
||||
(KMOVQmk addr:$dst, VK64:$src)>;
|
||||
def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
|
||||
(KMOVQkm addr:$src)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX512] in {
|
||||
|
105
test/CodeGen/X86/avx512-intel-ocl.ll
Normal file
105
test/CodeGen/X86/avx512-intel-ocl.ll
Normal file
@ -0,0 +1,105 @@
|
||||
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck -check-prefix=X32 %s
|
||||
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck -check-prefix=X32 %s
|
||||
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck -check-prefix=WIN64 %s
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck -check-prefix=X64 %s
|
||||
|
||||
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
|
||||
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
|
||||
declare i32 @func_int(i32, i32)
|
||||
|
||||
; WIN64-LABEL: testf16_inp
|
||||
; WIN64: vaddps {{.*}}, {{%zmm[0-1]}}
|
||||
; WIN64: leaq {{.*}}(%rsp), %rcx
|
||||
; WIN64: call
|
||||
; WIN64: ret
|
||||
|
||||
; X32-LABEL: testf16_inp
|
||||
; X32: vaddps {{.*}}, {{%zmm[0-1]}}
|
||||
; X32: movl %eax, (%esp)
|
||||
; X32: call
|
||||
; X32: ret
|
||||
|
||||
; X64-LABEL: testf16_inp
|
||||
; X64: vaddps {{.*}}, {{%zmm[0-1]}}
|
||||
; X64: leaq {{.*}}(%rsp), %rdi
|
||||
; X64: call
|
||||
; X64: ret
|
||||
|
||||
; test calling conventions - input parameters
|
||||
; Adds %a and %b, then calls @func_float16_ptr with the intel_ocl_bicc
; convention, passing the sum and a pointer to the local slot %y. The returned
; value is the contents loaded back from %y added to the call's result.
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
|
||||
%y = alloca <16 x float>, align 16
|
||||
%x = fadd <16 x float> %a, %b
|
||||
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
|
||||
%2 = load <16 x float>* %y, align 16
|
||||
%3 = fadd <16 x float> %2, %1
|
||||
ret <16 x float> %3
|
||||
}
|
||||
|
||||
; test calling conventions - preserved registers
|
||||
|
||||
; preserved zmm16-
|
||||
; WIN64-LABEL: testf16_regs
|
||||
; WIN64: call
|
||||
; WIN64: vaddps %zmm16, %zmm0, %zmm0
|
||||
; WIN64: ret
|
||||
|
||||
; preserved zmm16-
|
||||
; X64-LABEL: testf16_regs
|
||||
; X64: call
|
||||
; X64: vaddps %zmm16, %zmm0, %zmm0
|
||||
; X64: ret
|
||||
|
||||
; Same call sequence as @testf16_inp, but %b is used again after the
; intel_ocl_bicc call (in the fadd producing %3), so its value has to be
; available after the call returns. The WIN64/X64 CHECK lines for this
; function expect the post-call vaddps to read %zmm16.
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
|
||||
%y = alloca <16 x float>, align 16
|
||||
%x = fadd <16 x float> %a, %b
|
||||
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
|
||||
%2 = load <16 x float>* %y, align 16
|
||||
%3 = fadd <16 x float> %1, %b
|
||||
%4 = fadd <16 x float> %2, %3
|
||||
ret <16 x float> %4
|
||||
}
|
||||
|
||||
; test calling conventions - prolog and epilog
|
||||
; WIN64-LABEL: test_prolog_epilog
|
||||
; WIN64: vmovups %zmm21, {{.*(%rbp).*}} # 64-byte Spill
|
||||
; WIN64: vmovups %zmm6, {{.*(%rbp).*}} # 64-byte Spill
|
||||
; WIN64: call
|
||||
; WIN64: vmovups {{.*(%rbp).*}}, %zmm6 # 64-byte Reload
|
||||
; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
|
||||
|
||||
; X64-LABEL: test_prolog_epilog
|
||||
; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
|
||||
; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
|
||||
; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
|
||||
; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
|
||||
; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
|
||||
; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
|
||||
; X64: call
|
||||
; X64: vmovups {{.*}}(%rsp), %zmm16 ## 64-byte Reload
|
||||
; X64: vmovups {{.*}}(%rsp), %zmm31 ## 64-byte Reload
|
||||
; This function is itself declared intel_ocl_bicc, while the call to
; @func_float16 carries no calling-convention keyword. It forwards %a and %b
; and returns the callee's result unchanged; the CHECK lines for this function
; match the register spills before the call and the reloads after it.
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
|
||||
%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
|
||||
ret <16 x float> %c
|
||||
}
|
||||
|
||||
|
||||
declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
|
||||
|
||||
; X64-LABEL: testf16_inp_mask
|
||||
; X64: kmovw %edi, %k1
|
||||
; X64: call
|
||||
; Reinterprets the scalar i16 %mask as a <16 x i1> vector and passes it as the
; second argument of an intel_ocl_bicc call to @func_float16_mask. The X64
; CHECK line for this function expects the mask to be moved into %k1 with kmovw.
define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
|
||||
%imask = bitcast i16 %mask to <16 x i1>
|
||||
|
||||
%1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
|
||||
ret <16 x float> %1
|
||||
}
|
||||
|
||||
; X64-LABEL: test_prolog_epilog_with_mask
|
||||
; X64: kxorw %k{{.*}}, %k{{.*}}, %k1
|
||||
; X64: call
|
||||
; intel_ocl_bicc function that receives a <16 x i1> mask argument, XORs it with
; the element-wise equality compare of %x1 and %x2, and passes the combined
; mask on to @func_float16_mask (also called with intel_ocl_bicc). The X64
; CHECK line for this function expects the kxorw result to land in %k1.
define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
|
||||
%cmp_res = icmp eq <16 x i32>%x1, %x2
|
||||
%mask1 = xor <16 x i1> %cmp_res, %mask
|
||||
|
||||
%c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
|
||||
ret <16 x float> %c
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user