From 6991623dd7b8a096ccddbaedd9de3f515bacb1e2 Mon Sep 17 00:00:00 2001
From: Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Date: Tue, 13 Jul 2010 01:53:31 +0000
Subject: [PATCH] Add AVX 256-bit unop arithmetic instructions

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@108223 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrSSE.td           | 141 +++++++++++++++---------
 test/MC/AsmParser/X86/x86_32-encoding.s |  32 ++++++
 test/MC/AsmParser/X86/x86_64-encoding.s |  32 ++++++
 3 files changed, 151 insertions(+), 54 deletions(-)

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 61723db7a2a..96a82f32526 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1584,23 +1584,6 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
                     [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
 }
 
-/// sse1_fp_unop_p - SSE1 unops in scalar form.
-multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr,
-                          SDNode OpNode, Intrinsic V4F32Int> {
-  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>;
-  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
-  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (V4F32Int VR128:$src))]>;
-  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
-}
-
 /// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form.
 multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
                               SDNode OpNode, Intrinsic F32Int> {
@@ -1621,6 +1604,38 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
 }
 
+/// sse1_fp_unop_p - SSE1 packed unops on VR128: reg (PSr) and mem (PSm) forms.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>;
+  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
+}
+
+/// sse1_fp_unop_p_y - AVX 256-bit packed SSE1 unops on VR256 (PSYr/PSYm).
+multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+              [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>;
+  def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))]>;
+}
+
+/// sse1_fp_unop_p_int - SSE1 packed unops selected via the V4F32Int intrinsic.
+multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic V4F32Int> {
+  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src))]>;
+  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
+}
+
+
 /// sse2_fp_unop_s - SSE2 unops in scalar form.
 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                           SDNode OpNode, Intrinsic F64Int> {
@@ -1640,23 +1655,6 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                     [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
 }
 
-/// sse2_fp_unop_p - SSE2 unops in vector forms.
-multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
-                          SDNode OpNode, Intrinsic V2F64Int> {
-  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>;
-  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
-  def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (V2F64Int VR128:$src))]>;
-  def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
-}
-
 /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
 multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
                               SDNode OpNode, Intrinsic F64Int> {
@@ -1677,44 +1675,79 @@ multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
                     []>;
 }
 
-let isAsmParserOnly = 1 in {
+/// sse2_fp_unop_p - SSE2 packed unops on VR128: reg (PDr) and mem (PDm) forms.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode> {
+  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>;
+  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
+}
+
+/// sse2_fp_unop_p_y - AVX 256-bit packed SSE2 unops on VR256 (PDYr/PDYm).
+multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+              [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>;
+  def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))]>;
+}
+
+/// sse2_fp_unop_p_int - SSE2 packed unops selected via the V2F64Int intrinsic.
+multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic V2F64Int> {
+  def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int VR128:$src))]>;
+  def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
+}
+
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
   // Square root.
-  let Predicates = [HasAVX] in {
-  defm VSQRT  : sse2_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>,
-                  VEX_4V;
-
-  defm VSQRT  : sse2_fp_unop_p<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_pd>, VEX;
-  }
-
-  let Predicates = [HasAVX] in {
   defm VSQRT  : sse1_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>,
-                  VEX_4V;
-  defm VSQRT  : sse1_fp_unop_p<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ps>, VEX;
+                sse2_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>,
+                VEX_4V;
+
+  defm VSQRT  : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>,
+                sse2_fp_unop_p<0x51, "vsqrt", fsqrt>,
+                sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>,
+                sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>,
+                VEX;
+
   // Reciprocal approximations. Note that these typically require refinement
   // in order to obtain suitable precision.
   defm VRSQRT : sse1_fp_unop_s_avx<0x52, "rsqrt", X86frsqrt,
                                    int_x86_sse_rsqrt_ss>, VEX_4V;
-  defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt, int_x86_sse_rsqrt_ps>,
-                                   VEX;
+  defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>,
+                sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>, VEX;
+
   defm VRCP   : sse1_fp_unop_s_avx<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>,
                                    VEX_4V;
-  defm VRCP   : sse1_fp_unop_p<0x53, "vrcp", X86frcp, int_x86_sse_rcp_ps>,
-                                   VEX;
-  }
+  defm VRCP   : sse1_fp_unop_p<0x53, "vrcp", X86frcp>,
+                sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>, VEX;
 }
 
 // Square root.
 defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss>,
-             sse1_fp_unop_p<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ps>,
+             sse1_fp_unop_p<0x51, "sqrt",  fsqrt>,
+             sse1_fp_unop_p_int<0x51, "sqrt",  int_x86_sse_sqrt_ps>,
              sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd>,
-             sse2_fp_unop_p<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_pd>;
+             sse2_fp_unop_p<0x51, "sqrt",  fsqrt>,
+             sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd>;
 
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>,
-             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ps>;
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt>,
+             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps>;
 defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>,
-             sse1_fp_unop_p<0x53, "rcp", X86frcp, int_x86_sse_rcp_ps>;
+             sse1_fp_unop_p<0x53, "rcp", X86frcp>,
+             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps>;
 
 // There is no f64 version of the reciprocal approximation instructions.
 
diff --git a/test/MC/AsmParser/X86/x86_32-encoding.s b/test/MC/AsmParser/X86/x86_32-encoding.s
index 43438deb618..177b2a37d0f 100644
--- a/test/MC/AsmParser/X86/x86_32-encoding.s
+++ b/test/MC/AsmParser/X86/x86_32-encoding.s
@@ -12502,3 +12502,35 @@
 // CHECK: encoding: [0xc5,0xdd,0x59,0x30]
           vmulpd  (%eax), %ymm4, %ymm6
 
+// CHECK: vsqrtpd  %ymm1, %ymm2
+// CHECK: encoding: [0xc5,0xfd,0x51,0xd1]
+          vsqrtpd  %ymm1, %ymm2
+
+// CHECK: vsqrtpd  (%eax), %ymm2
+// CHECK: encoding: [0xc5,0xfd,0x51,0x10]
+          vsqrtpd  (%eax), %ymm2
+
+// CHECK: vsqrtps  %ymm1, %ymm2
+// CHECK: encoding: [0xc5,0xfc,0x51,0xd1]
+          vsqrtps  %ymm1, %ymm2
+
+// CHECK: vsqrtps  (%eax), %ymm2
+// CHECK: encoding: [0xc5,0xfc,0x51,0x10]
+          vsqrtps  (%eax), %ymm2
+
+// CHECK: vrsqrtps  %ymm1, %ymm2
+// CHECK: encoding: [0xc5,0xfc,0x52,0xd1]
+          vrsqrtps  %ymm1, %ymm2
+
+// CHECK: vrsqrtps  (%eax), %ymm2
+// CHECK: encoding: [0xc5,0xfc,0x52,0x10]
+          vrsqrtps  (%eax), %ymm2
+
+// CHECK: vrcpps  %ymm1, %ymm2
+// CHECK: encoding: [0xc5,0xfc,0x53,0xd1]
+          vrcpps  %ymm1, %ymm2
+
+// CHECK: vrcpps  (%eax), %ymm2
+// CHECK: encoding: [0xc5,0xfc,0x53,0x10]
+          vrcpps  (%eax), %ymm2
+
diff --git a/test/MC/AsmParser/X86/x86_64-encoding.s b/test/MC/AsmParser/X86/x86_64-encoding.s
index 74a4d675c49..1504f6b0abf 100644
--- a/test/MC/AsmParser/X86/x86_64-encoding.s
+++ b/test/MC/AsmParser/X86/x86_64-encoding.s
@@ -2576,3 +2576,35 @@ pshufb	CPI1_0(%rip), %xmm1
 // CHECK: encoding: [0xc5,0xdd,0x59,0x30]
           vmulpd  (%rax), %ymm4, %ymm6
 
+// CHECK: vsqrtpd  %ymm11, %ymm12
+// CHECK: encoding: [0xc4,0x41,0x7d,0x51,0xe3]
+          vsqrtpd  %ymm11, %ymm12
+
+// CHECK: vsqrtpd  (%rax), %ymm12
+// CHECK: encoding: [0xc5,0x7d,0x51,0x20]
+          vsqrtpd  (%rax), %ymm12
+
+// CHECK: vsqrtps  %ymm11, %ymm12
+// CHECK: encoding: [0xc4,0x41,0x7c,0x51,0xe3]
+          vsqrtps  %ymm11, %ymm12
+
+// CHECK: vsqrtps  (%rax), %ymm12
+// CHECK: encoding: [0xc5,0x7c,0x51,0x20]
+          vsqrtps  (%rax), %ymm12
+
+// CHECK: vrsqrtps  %ymm11, %ymm12
+// CHECK: encoding: [0xc4,0x41,0x7c,0x52,0xe3]
+          vrsqrtps  %ymm11, %ymm12
+
+// CHECK: vrsqrtps  (%rax), %ymm12
+// CHECK: encoding: [0xc5,0x7c,0x52,0x20]
+          vrsqrtps  (%rax), %ymm12
+
+// CHECK: vrcpps  %ymm11, %ymm12
+// CHECK: encoding: [0xc4,0x41,0x7c,0x53,0xe3]
+          vrcpps  %ymm11, %ymm12
+
+// CHECK: vrcpps  (%rax), %ymm12
+// CHECK: encoding: [0xc5,0x7c,0x53,0x20]
+          vrcpps  (%rax), %ymm12
+