Add support for ARM Neon VREV instructions.

Patch by Anton Korzh, with some modifications from me. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@77101 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-25 13:24:46 +00:00 · 2009-07-26 00:39:34 +00:00
parent 3f53fa9a51
commit 8bb9e48752
4 changed files with 214 additions and 0 deletions
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2188,6 +2188,30 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
                     SplatBitSize, DAG);
 }

+/// isVREVMask - Return true when the mask of shuffle N reverses the order of
+/// the elements inside every BlockSize-bit block of the vector, which is the
+/// permutation performed by a VREV instruction of that block size.
+bool ARM::isVREVMask(ShuffleVectorSDNode *N, unsigned BlockSize) {
+  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+         "Only possible block sizes for VREV are: 16, 32, 64");
+
+  MVT VecTy = N->getValueType(0);
+  unsigned Lanes = VecTy.getVectorNumElements();
+  unsigned LaneBits = VecTy.getVectorElementType().getSizeInBits();
+  // A reversed block starts with its last element, so mask element 0 gives
+  // the elements per block (a mask entry of -1 yields 0, which never matches).
+  unsigned PerBlock = N->getMaskElt(0) + 1;
+  if (!(BlockSize > LaneBits && BlockSize == PerBlock * LaneBits))
+    return false;
+
+  for (unsigned Idx = 0; Idx != Lanes; ++Idx) {
+    unsigned Pos = Idx % PerBlock;  // position of Idx inside its block
+    if ((unsigned) N->getMaskElt(Idx) != (Idx - Pos) + (PerBlock - 1 - Pos))
+      return false;
+  }
+  return true;
+}
+
 static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
  // Canonicalize all-zeros and all-ones vectors.
  ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode());
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -125,6 +125,11 @@ namespace llvm {
    /// return the constant being splatted.  The ByteSize field indicates the
    /// number of bytes of each element [1248].
    SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+
+    /// isVREVMask - Return true if the mask of shuffle N reverses the order
+    /// of the elements within each BlockSize-bit block of the vector, i.e.
+    /// it matches a VREV instruction.  BlockSize must be 16, 32 or 64.
+    bool isVREVMask(ShuffleVectorSDNode *N, unsigned BlockSize);
  }

  //===--------------------------------------------------------------------===//
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1662,6 +1662,78 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
 def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
                        v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;

+//   VREV     : Vector Reverse
+
+def vrev64_shuffle : PatFrag<(ops node:$in),  // one-input shuffle (2nd is undef)
+                             (vector_shuffle node:$in, undef), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 64);
+}]>;  // predicate: mask must reverse the elements within each 64-bit block
+
+def vrev32_shuffle : PatFrag<(ops node:$in),  // one-input shuffle (2nd is undef)
+                             (vector_shuffle node:$in, undef), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 32);
+}]>;  // predicate: mask must reverse the elements within each 32-bit block
+
+def vrev16_shuffle : PatFrag<(ops node:$in),  // one-input shuffle (2nd is undef)
+                             (vector_shuffle node:$in, undef), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 16);
+}]>;  // predicate: mask must reverse the elements within each 16-bit block
+
+//   VREV64   : Vector Reverse elements within 64-bit doublewords
+//   The D and Q classes differ only in the 6th N2V argument and register class.
+class VREV64D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (vrev64_shuffle (Ty DPR:$src))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (vrev64_shuffle (Ty QPR:$src))))]>;
+
+def VREV64d8  : VREV64D<0b00, "vrev64.8", v8i8>;   // op19_18 0b00 pairs with .8
+def VREV64d16 : VREV64D<0b01, "vrev64.16", v4i16>; // op19_18 0b01 pairs with .16
+def VREV64d32 : VREV64D<0b10, "vrev64.32", v2i32>; // op19_18 0b10 pairs with .32
+def VREV64df  : VREV64D<0b10, "vrev64.32", v2f32>; // float: same as VREV64d32
+
+def VREV64q8  : VREV64Q<0b00, "vrev64.8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64.16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64.32", v4i32>;
+def VREV64qf  : VREV64Q<0b10, "vrev64.32", v4f32>; // float: same as VREV64q32
+
+//   VREV32   : Vector Reverse elements within 32-bit words
+//   Same shape as the VREV64 classes; 5th N2V argument is 0b00001 here.
+class VREV32D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (vrev32_shuffle (Ty DPR:$src))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (vrev32_shuffle (Ty QPR:$src))))]>;
+
+def VREV32d8  : VREV32D<0b00, "vrev32.8", v8i8>;   // only 8- and 16-bit element
+def VREV32d16 : VREV32D<0b01, "vrev32.16", v4i16>; // variants are instantiated
+
+def VREV32q8  : VREV32Q<0b00, "vrev32.8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32.16", v8i16>;
+
+//   VREV16   : Vector Reverse elements within 16-bit halfwords
+//   Same shape as the VREV64 classes; 5th N2V argument is 0b00010 here.
+class VREV16D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set DPR:$dst, (Ty (vrev16_shuffle (Ty DPR:$src))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+        [(set QPR:$dst, (Ty (vrev16_shuffle (Ty QPR:$src))))]>;
+
+def VREV16d8  : VREV16D<0b00, "vrev16.8", v8i8>;   // only the 8-bit element
+def VREV16q8  : VREV16Q<0b00, "vrev16.8", v16i8>;  // variants are instantiated
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //===----------------------------------------------------------------------===//
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -0,0 +1,113 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
+
+define arm_apcscc <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind { ; mask reverses all 8 x i8 (a single 64-bit block)
+;CHECK: test_vrev64D8:
+;CHECK: vrev64.8
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+	ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind { ; mask reverses all 4 x i16 (a single 64-bit block)
+;CHECK: test_vrev64D16:
+;CHECK: vrev64.16
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+	ret <4 x i16> %tmp2
+}
+
+define arm_apcscc <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind { ; mask swaps the 2 x i32 (a single 64-bit block)
+;CHECK: test_vrev64D32:
+;CHECK: vrev64.32
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+	ret <2 x i32> %tmp2
+}
+
+define arm_apcscc <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind { ; mask swaps the 2 x float (a single 64-bit block)
+;CHECK: test_vrev64Df:
+;CHECK: vrev64.32
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+	ret <2 x float> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind { ; mask reverses the 8 x i8 in each 64-bit half
+;CHECK: test_vrev64Q8:
+;CHECK: vrev64.8
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+	ret <16 x i8> %tmp2
+}
+
+define arm_apcscc <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind { ; mask reverses the 4 x i16 in each 64-bit half
+;CHECK: test_vrev64Q16:
+;CHECK: vrev64.16
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+	ret <8 x i16> %tmp2
+}
+
+define arm_apcscc <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind { ; mask swaps the 2 x i32 in each 64-bit half
+;CHECK: test_vrev64Q32:
+;CHECK: vrev64.32
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+	ret <4 x i32> %tmp2
+}
+
+define arm_apcscc <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind { ; mask swaps the 2 x float in each 64-bit half
+;CHECK: test_vrev64Qf:
+;CHECK: vrev64.32
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+	ret <4 x float> %tmp2
+}
+
+define arm_apcscc <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind { ; mask reverses the 4 x i8 in each of two 32-bit words
+;CHECK: test_vrev32D8:
+;CHECK: vrev32.8
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+	ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind { ; mask swaps the 2 x i16 in each of two 32-bit words
+;CHECK: test_vrev32D16:
+;CHECK: vrev32.16
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+	ret <4 x i16> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind { ; mask reverses the 4 x i8 in each of four 32-bit words
+;CHECK: test_vrev32Q8:
+;CHECK: vrev32.8
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+	ret <16 x i8> %tmp2
+}
+
+define arm_apcscc <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind { ; mask swaps the 2 x i16 in each of four 32-bit words
+;CHECK: test_vrev32Q16:
+;CHECK: vrev32.16
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+	ret <8 x i16> %tmp2
+}
+
+define arm_apcscc <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind { ; mask swaps the 2 x i8 in each of four 16-bit halfwords
+;CHECK: test_vrev16D8:
+;CHECK: vrev16.8
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+	ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind { ; mask swaps the 2 x i8 in each of eight 16-bit halfwords
+;CHECK: test_vrev16Q8:
+;CHECK: vrev16.8
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+	ret <16 x i8> %tmp2
+}