From 8bb9e48752b4a88e512ceb8fb802e2cdf8150e7b Mon Sep 17 00:00:00 2001
From: Bob Wilson
Date: Sun, 26 Jul 2009 00:39:34 +0000
Subject: [PATCH] Add support for ARM Neon VREV instructions.

Patch by Anton Korzh, with some modifications from me.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@77101 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp |  24 ++++++
 lib/Target/ARM/ARMISelLowering.h   |   5 ++
 lib/Target/ARM/ARMInstrNEON.td     |  72 ++++++++++++++++++
 test/CodeGen/ARM/vrev.ll           | 113 +++++++++++++++++++++++++++++
 4 files changed, 214 insertions(+)
 create mode 100644 test/CodeGen/ARM/vrev.ll

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 7e2bbcdc974..226f1618c9a 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2188,6 +2188,30 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
                          SplatBitSize, DAG);
 }
 
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+bool ARM::isVREVMask(ShuffleVectorSDNode *N, unsigned BlockSize) {
+  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+         "Only possible block sizes for VREV are: 16, 32, 64");
+
+  MVT VT = N->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  unsigned BlockElts = N->getMaskElt(0) + 1;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned) N->getMaskElt(i) !=
+        (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
 static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
   // Canonicalize all-zeros and all-ones vectors.
   ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode());
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 10f9cea1a9d..d0806fb9c1d 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -125,6 +125,11 @@ namespace llvm {
     /// return the constant being splatted.  The ByteSize field indicates the
     /// number of bytes of each element [1248].
     SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+
+    /// isVREVMask - Check if a vector shuffle corresponds to a VREV
+    /// instruction with the specified blocksize.  (The order of the elements
+    /// within each block of the vector is reversed.)
+    bool isVREVMask(ShuffleVectorSDNode *N, unsigned blocksize);
   }
 
 //===--------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 2b7192321dd..9415b40e768 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1662,6 +1662,78 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
 def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
 
+// VREV : Vector Reverse
+
+def vrev64_shuffle : PatFrag<(ops node:$in),
+                             (vector_shuffle node:$in, undef), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 64);
+}]>;
+
+def vrev32_shuffle : PatFrag<(ops node:$in),
+                             (vector_shuffle node:$in, undef), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 32);
+}]>;
+
+def vrev16_shuffle : PatFrag<(ops node:$in),
+                             (vector_shuffle node:$in, undef), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 16);
+}]>;
+
+// VREV64 : Vector Reverse elements within 64-bit doublewords
+
+class VREV64D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (vrev64_shuffle (Ty DPR:$src))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (vrev64_shuffle (Ty QPR:$src))))]>;
+
+def VREV64d8  : VREV64D<0b00, "vrev64.8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64.16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64.32", v2i32>;
+def VREV64df  : VREV64D<0b10, "vrev64.32", v2f32>;
+
+def VREV64q8  : VREV64Q<0b00, "vrev64.8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64.16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64.32", v4i32>;
+def VREV64qf  : VREV64Q<0b10, "vrev64.32", v4f32>;
+
+// VREV32 : Vector Reverse elements within 32-bit words
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (vrev32_shuffle (Ty DPR:$src))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (vrev32_shuffle (Ty QPR:$src))))]>;
+
+def VREV32d8  : VREV32D<0b00, "vrev32.8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32.16", v4i16>;
+
+def VREV32q8  : VREV32Q<0b00, "vrev32.8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32.16", v8i16>;
+
+// VREV16 : Vector Reverse elements within 16-bit halfwords
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (vrev16_shuffle (Ty DPR:$src))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (vrev16_shuffle (Ty QPR:$src))))]>;
+
+def VREV16d8 : VREV16D<0b00, "vrev16.8", v8i8>;
+def VREV16q8 : VREV16Q<0b00, "vrev16.8", v16i8>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
new file mode 100644
index 00000000000..994d89d31c7
--- /dev/null
+++ b/test/CodeGen/ARM/vrev.ll
@@ -0,0 +1,113 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
+
+define arm_apcscc <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
+;CHECK: test_vrev64D8:
+;CHECK: vrev64.8
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
+;CHECK: test_vrev64D16:
+;CHECK: vrev64.16
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i16> %tmp2
+}
+
+define arm_apcscc <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
+;CHECK: test_vrev64D32:
+;CHECK: vrev64.32
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x i32> %tmp2
+}
+
+define arm_apcscc <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
+;CHECK: test_vrev64Df:
+;CHECK: vrev64.32
+  %tmp1 = load <2 x float>* %A
+  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x float> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
+;CHECK: test_vrev64Q8:
+;CHECK: vrev64.8
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+  ret <16 x i8> %tmp2
+}
+
+define arm_apcscc <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
+;CHECK: test_vrev64Q16:
+;CHECK: vrev64.16
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i16> %tmp2
+}
+
+define arm_apcscc <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
+;CHECK: test_vrev64Q32:
+;CHECK: vrev64.32
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x i32> %tmp2
+}
+
+define arm_apcscc <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
+;CHECK: test_vrev64Qf:
+;CHECK: vrev64.32
+  %tmp1 = load <4 x float>* %A
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x float> %tmp2
+}
+
+define arm_apcscc <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
+;CHECK: test_vrev32D8:
+;CHECK: vrev32.8
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
+;CHECK: test_vrev32D16:
+;CHECK: vrev32.16
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x i16> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
+;CHECK: test_vrev32Q8:
+;CHECK: vrev32.8
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x i8> %tmp2
+}
+
+define arm_apcscc <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
+;CHECK: test_vrev32Q16:
+;CHECK: vrev32.16
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i16> %tmp2
+}
+
+define arm_apcscc <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
+;CHECK: test_vrev16D8:
+;CHECK: vrev16.8
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
+;CHECK: test_vrev16Q8:
+;CHECK: vrev16.8
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  ret <16 x i8> %tmp2
+}
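
Below is a minimal standalone sketch, not part of the patch and not an LLVM API, that reproduces the index formula isVREVMask() checks, (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts), so the shufflevector masks in the test file above can be sanity-checked by hand. The helper name expectedVREVMask and the derivation of BlockElts from BlockSize/EltSz are assumptions made for illustration; the patch itself infers BlockElts from the first mask element rather than from the block size.

// vrev_mask_sketch.cpp -- illustration of the VREV shuffle-mask pattern.
// Build: g++ -std=c++11 vrev_mask_sketch.cpp && ./a.out
#include <cassert>
#include <iostream>
#include <vector>

// expectedVREVMask is a hypothetical helper (not an LLVM function). It builds
// the shuffle mask that isVREVMask() accepts for a vector with NumElts
// elements of EltSz bits each, reversed within blocks of BlockSize bits,
// using the same index formula as the patch.
static std::vector<unsigned> expectedVREVMask(unsigned NumElts, unsigned EltSz,
                                              unsigned BlockSize) {
  assert(BlockSize == 16 || BlockSize == 32 || BlockSize == 64);
  assert(BlockSize > EltSz && BlockSize % EltSz == 0);
  unsigned BlockElts = BlockSize / EltSz;   // elements per reversed block
  std::vector<unsigned> Mask(NumElts);
  for (unsigned i = 0; i < NumElts; ++i)
    Mask[i] = (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts);
  return Mask;
}

int main() {
  // VREV64.8 on a <8 x i8> D register: 8-bit elements, 64-bit blocks.
  // Prints 7 6 5 4 3 2 1 0, matching the mask in @test_vrev64D8 above.
  for (unsigned m : expectedVREVMask(8, 8, 64))
    std::cout << m << ' ';
  std::cout << '\n';

  // VREV32.16 on a <8 x i16> Q register: 16-bit elements, 32-bit blocks.
  // Prints 1 0 3 2 5 4 7 6, matching the mask in @test_vrev32Q16 above.
  for (unsigned m : expectedVREVMask(8, 16, 32))
    std::cout << m << ' ';
  std::cout << '\n';
  return 0;
}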