From 5440f2198dce5f02031d09390edf62a593af5370 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <michael.m.kuperstein@intel.com>
Date: Tue, 19 May 2015 11:06:56 +0000
Subject: [PATCH] [X86] ABI change for x86-32: pass 3 vector arguments
 in-register instead of 4, except on Darwin.

This changes the ABI used on 32-bit x86 for passing vector arguments.
Historically, clang passes the first 4 vector arguments in-register, and additional vector arguments on the stack, regardless of platform. That is different from the behavior of gcc, icc, and msvc, all of which pass only the first 3 arguments in-register.
The 3-register convention is documented, unofficially, in Agner Fog's calling conventions guide, and, officially, in the recently released version 1.0 of the i386 psABI.

Darwin is kept as is because the OS X ABI Function Call Guide explicitly documents the current (4-register) behavior.

This fixes PR21510.

Differential revision: http://reviews.llvm.org/D9644

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237682 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86CallingConv.td              | 89 +++++++++++++------
 test/CodeGen/X86/fp-trunc.ll                  |  8 +-
 .../CodeGen/X86/illegal-vector-args-return.ll |  2 +
 .../CodeGen/X86/x86-32-vector-calling-conv.ll | 44 +++++++++
 4 files changed, 114 insertions(+), 29 deletions(-)
 create mode 100644 test/CodeGen/X86/x86-32-vector-calling-conv.ll

diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 790160662ef..8f88888f5ce 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -445,9 +445,61 @@ def CC_X86_64_AnyReg : CallingConv<[
 // X86 C Calling Convention
 //===----------------------------------------------------------------------===//
 
+/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, vector values
+/// that were not assigned to a register are spilled on the stack.
+def CC_X86_32_Vector_Common : CallingConv<[
+  // 128-bit SSE vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>,
+
+  // 512-bit AVX-512 vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>
+]>;
+
+/// CC_X86_32_Vector_Standard - The first 3 vector arguments of a non-vararg
+/// call are passed in vector registers; the rest are passed on the stack.
+def CC_X86_32_Vector_Standard : CallingConv<[
+  // 128-bit SSE vector arguments are passed in XMM0-XMM2.
+  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                CCAssignToReg<[XMM0, XMM1, XMM2]>>>,
+
+  // 256-bit AVX vector arguments are passed in YMM0-YMM2 if hasFp256().
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                CCIfSubtarget<"hasFp256()",
+                CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
+
+  // 512-bit AVX-512 vector arguments are passed in ZMM0-ZMM2.
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+                CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
+
+  CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+/// CC_X86_32_Vector_Darwin - The first 4 vector arguments of a non-vararg
+/// call are passed in vector registers; the rest are passed on the stack.
+def CC_X86_32_Vector_Darwin : CallingConv<[
+  // 128-bit SSE vector arguments are passed in XMM0-XMM3.
+  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+  // 256-bit AVX vector arguments are passed in YMM0-YMM3 if hasFp256().
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                CCIfSubtarget<"hasFp256()",
+                CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+  // 512-bit AVX-512 vector arguments are passed in ZMM0-ZMM3.
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+                CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
+
+  CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
 /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
-/// values are spilled on the stack, and the first 4 vector values go in XMM
-/// regs.
+/// values are spilled on the stack.
 def CC_X86_32_Common : CallingConv<[
   // Handles byval parameters.
   CCIfByVal<CCPassByVal<4, 4>>,
@@ -483,33 +535,16 @@ def CC_X86_32_Common : CallingConv<[
   CCIfType<[v32i1], CCPromoteToType<v32i8>>,
   CCIfType<[v64i1], CCPromoteToType<v64i8>>,
 
-  // The first 4 SSE vector arguments are passed in XMM registers.
-  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-                CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
-
-  // The first 4 AVX 256-bit vector arguments are passed in YMM registers.
-  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-                CCIfSubtarget<"hasFp256()",
-                CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
-
-  // The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
-  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-                CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
-
-  // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
-
-  // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToStack<32, 32>>,
-
-  // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToStack<64, 64>>,
-
   // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
   // passed in the parameter area.
-  CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
+  CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+  // Darwin passes vectors in a form that differs from the i386 psABI
+  CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
+
+  // Otherwise, drop to 'normal' X86-32 CC
+  CCDelegateTo<CC_X86_32_Vector_Standard>
+]>;
 
 def CC_X86_32_C : CallingConv<[
   // Promote i1/i8/i16 arguments to i32.
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 6424bfc9c21..807a8c8fe5e 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
 ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
 
+target triple = "i686-pc-linux-gnu"
+
 define <1 x float> @test1(<1 x double> %x) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       # BB#0:
@@ -59,12 +61,14 @@ define <4 x float> @test3(<4 x double> %x) nounwind {
 define <8 x float> @test4(<8 x double> %x) nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK:       # BB#0:
+; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
 ; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
 ; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    cvtpd2ps %xmm3, %xmm3
 ; CHECK-NEXT:    cvtpd2ps %xmm2, %xmm1
-; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT:    cvtpd2ps 16(%esp), %xmm2
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
 ;
 ; AVX-LABEL: test4:
diff --git a/test/CodeGen/X86/illegal-vector-args-return.ll b/test/CodeGen/X86/illegal-vector-args-return.ll
index 62a21f4c5aa..d783d4fa1b4 100644
--- a/test/CodeGen/X86/illegal-vector-args-return.ll
+++ b/test/CodeGen/X86/illegal-vector-args-return.ll
@@ -3,6 +3,8 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps	%xmm3, %xmm1"
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps	%xmm2, %xmm0"
 
+target triple = "i686-apple-darwin8"
+
 define <4 x double> @foo(<4 x double> %x, <4 x double> %z) {
   %y = fmul <4 x double> %x, %z
   ret <4 x double> %y
diff --git a/test/CodeGen/X86/x86-32-vector-calling-conv.ll b/test/CodeGen/X86/x86-32-vector-calling-conv.ll
new file mode 100644
index 00000000000..b2bda7ab8d0
--- /dev/null
+++ b/test/CodeGen/X86/x86-32-vector-calling-conv.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+
+; CHECK-LABEL: test_sse:
+; DARWIN-DAG: vpaddd  %xmm1, %xmm0, %xmm0
+; DARWIN-DAG: vpaddd  %xmm3, %xmm2, %xmm1
+; DARWIN: vpaddd  %xmm1, %xmm0, %xmm0
+; LINUX-DAG:  vpaddd  %xmm1, %xmm0, %xmm0
+; LINUX-DAG:  vpaddd  {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm1
+; LINUX:  vpaddd  %xmm1, %xmm0, %xmm0
+define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
+  %ab = add <4 x i32> %a, %b
+  %cd = add <4 x i32> %c, %d
+  %sum = add <4 x i32> %ab, %cd
+  ret <4 x i32> %sum
+}
+
+; CHECK-LABEL: test_avx:
+; DARWIN-DAG: vpaddd  %ymm1, %ymm0, %ymm0
+; DARWIN-DAG: vpaddd  %ymm3, %ymm2, %ymm1
+; DARWIN: vpaddd  %ymm1, %ymm0, %ymm0
+; LINUX-DAG:  vpaddd  %ymm1, %ymm0, %ymm0
+; LINUX-DAG:  vpaddd  {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm1
+; LINUX:  vpaddd  %ymm1, %ymm0, %ymm0
+define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind {
+  %ab = add <8 x i32> %a, %b
+  %cd = add <8 x i32> %c, %d
+  %sum = add <8 x i32> %ab, %cd
+  ret <8 x i32> %sum
+}
+
+; CHECK-LABEL: test_avx512:
+; DARWIN-DAG: vpaddd  %zmm1, %zmm0, %zmm0
+; DARWIN-DAG: vpaddd  %zmm3, %zmm2, %zmm1
+; DARWIN: vpaddd  %zmm1, %zmm0, %zmm0
+; LINUX-DAG:  vpaddd  %zmm1, %zmm0, %zmm0
+; LINUX-DAG:  vpaddd  {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm1
+; LINUX:  vpaddd  %zmm1, %zmm0, %zmm0
+define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind {
+  %ab = add <16 x i32> %a, %b
+  %cd = add <16 x i32> %c, %d
+  %sum = add <16 x i32> %ab, %cd
+  ret <16 x i32> %sum
+}