This patch adds ABI support for the v1i128 data type.

It adds v1i128 to the appropriate register classes and checks parameter passing and return values. This is related to http://reviews.llvm.org/D9081, which will add instructions that exploit the v1i128 data type. Phabricator review: http://reviews.llvm.org/D9475 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236503 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-23 20:29:30 +00:00 · 2015-05-05 16:10:44 +00:00 · 2015-05-05 16:10:44 +00:00 · c3c0de39db
commit c3c0de39db
parent a5f2faff5c
6 changed files with 330 additions and 13 deletions
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@ -62,7 +62,8 @@ def RetCC_PPC : CallingConv<[
 
  // Vector types returned as "direct" go into V2 .. V9; note that only the
  // ELFv2 ABI fully utilizes all these registers.
-  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCIfSubtarget<"hasAltivec()",
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32], 
+           CCIfSubtarget<"hasAltivec()",
           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>>
@ -114,7 +115,8 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
  CCIfType<[v4f64, v4f32, v4i1],
           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
-  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCIfSubtarget<"hasAltivec()",
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32], 
+           CCIfSubtarget<"hasAltivec()",
           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>>
@ -172,9 +174,9 @@ def CC_PPC32_SVR4 : CallingConv<[
    CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>,

  // The first 12 Vector arguments are passed in AltiVec registers.
-  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCIfSubtarget<"hasAltivec()",
-           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9,
-                          V10, V11, V12, V13]>>>,
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32], 
+           CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
+                          V8, V9, V10, V11, V12, V13]>>>,
  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9,
                          VSH10, VSH11, VSH12, VSH13]>>>,
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@ -39,6 +39,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
+
 using namespace llvm;

 // FIXME: Remove this once soft-float is supported.
@ -402,11 +403,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
    // will selectively turn on ones that can be effectively codegen'd.
    for (MVT VT : MVT::vector_valuetypes()) {
      // add/sub are legal for all supported vector VT's.
-      setOperationAction(ISD::ADD , VT, Legal);
-      setOperationAction(ISD::SUB , VT, Legal);
-
+      // This check is temporary until support for quadword add/sub is added
+      if (VT.SimpleTy != MVT::v1i128) {
+        setOperationAction(ISD::ADD , VT, Legal);
+        setOperationAction(ISD::SUB , VT, Legal);
+      }
+      else {
+        setOperationAction(ISD::ADD , VT, Expand);
+        setOperationAction(ISD::SUB , VT, Expand);
+      }
+      
      // Vector instructions introduced in P8
-      if (Subtarget.hasP8Altivec()) {
+      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
        setOperationAction(ISD::CTPOP, VT, Legal);
        setOperationAction(ISD::CTLZ, VT, Legal);
      }
@ -620,8 +628,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
    }

-    if (Subtarget.hasP8Altivec()) 
+    if (Subtarget.hasP8Altivec()) {
      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
+      addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
+    }
  }

  if (Subtarget.hasQPX()) {
@ -2473,7 +2483,8 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
  // Altivec parameters are padded to a 16 byte boundary.
  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
-      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
+      ArgVT == MVT::v1i128)
    Align = 16;
  // QPX vector types stored in double-precision are padded to a 32 byte
  // boundary.
@ -2552,7 +2563,8 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
      }
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
-        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
+        ArgVT == MVT::v1i128)
      if (AvailableVRs > 0) {
        --AvailableVRs;
        return false;
@ -3131,6 +3143,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
+    case MVT::v1i128:
      if (!Subtarget.hasQPX()) {
      // These can be scalar arguments or elements of a vector array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
@ -4605,6 +4618,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
        case MVT::v16i8:
        case MVT::v2f64:
        case MVT::v2i64:
+        case MVT::v1i128:
          if (++NumVRsUsed <= NumVRs)
            continue;
          break;
@ -4967,6 +4981,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
+    case MVT::v1i128:
      if (!Subtarget.hasQPX()) {
      // These can be scalar arguments or elements of a vector array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@ -817,26 +817,37 @@ def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
 def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
 def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
 def : Pat<(v16i8 (bitconvert (v2i64 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v1i128 VRRC:$src))), (v16i8 VRRC:$src)>;

 def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
 def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
 def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
 def : Pat<(v8i16 (bitconvert (v2i64 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v1i128 VRRC:$src))), (v8i16 VRRC:$src)>;

 def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
 def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
 def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
 def : Pat<(v4i32 (bitconvert (v2i64 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v1i128 VRRC:$src))), (v4i32 VRRC:$src)>;

 def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
 def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
 def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
 def : Pat<(v4f32 (bitconvert (v2i64 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v1i128 VRRC:$src))), (v4f32 VRRC:$src)>;

 def : Pat<(v2i64 (bitconvert (v16i8 VRRC:$src))), (v2i64 VRRC:$src)>;
 def : Pat<(v2i64 (bitconvert (v8i16 VRRC:$src))), (v2i64 VRRC:$src)>;
 def : Pat<(v2i64 (bitconvert (v4i32 VRRC:$src))), (v2i64 VRRC:$src)>;
 def : Pat<(v2i64 (bitconvert (v4f32 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v1i128 VRRC:$src))), (v2i64 VRRC:$src)>;
+
+def : Pat<(v1i128 (bitconvert (v16i8 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v8i16 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>;

 // Shuffles.

--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@ -890,6 +890,11 @@ def : Pat<(v2f64 (bitconvert v2i64:$A)),
 def : Pat<(v2i64 (bitconvert v2f64:$A)),
          (COPY_TO_REGCLASS $A, VRRC)>;

+def : Pat<(v2f64 (bitconvert v1i128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v1i128 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
 // sign extension patterns
 // To extend "in place" from v2i32 to v2i64, we have input data like:
 // | undef | i32 | undef | i32 |
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@ -288,7 +288,7 @@ def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
                                                (sequence "F%u", 31, 14))>;
 def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;

-def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v4f32], 128,
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32], 128,
                         (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
                             V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
                             V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
--- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll
+++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll
@ -0,0 +1,284 @@
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-BE
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-BE-NOVSX
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-LE-NOVSX
+
+@x = common global <1 x i128> zeroinitializer, align 16
+@y = common global <1 x i128> zeroinitializer, align 16
+@a = common global i128 zeroinitializer, align 16
+@b = common global i128 zeroinitializer, align 16
+
+; VSX:
+;   %a is passed in register 34
+;   On LE, ensure %a is swapped before being used (using xxswapd)
+;   Similarly, on LE ensure the results are swapped before being returned in 
+;   register 34
+; VMX (no VSX): 
+;   %a is passed in register 2
+;   No swaps are necessary on LE
+define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind {
+       %tmp = add <1 x i128> %a, <i128 1>
+       ret <1 x i128> %tmp  
+
+; CHECK-LE-LABEL: @v1i128_increment_by_one
+; CHECK-LE: xxswapd [[PARAM1:[0-9]+]], 34
+; CHECK-LE: stxvd2x [[PARAM1]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE: lxvd2x [[RESULT:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE: xxswapd 34, [[RESULT]]
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @v1i128_increment_by_one
+; CHECK-BE-NOT: xxswapd {{[0-9]+}}, 34
+; CHECK-BE: stxvd2x 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-BE: lxvd2x 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-BE-NOT: xxswapd 34, {{[0-9]+}}
+; CHECK-BE: blr
+
+; CHECK-NOVSX-LABEL: @v1i128_increment_by_one
+; CHECK-NOVSX-NOT: xxswapd {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-NOT: stxvd2x {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-NOT: lxvd2x {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-NOT: xxswapd {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX: blr
+}
+
+; VSX:
+;   %a is passed in register 34
+;   %b is passed in register 35
+;   On LE, ensure the contents of 34 and 35 are swapped before being used
+;   Similarly, on LE ensure the results are swapped before being returned in
+;   register 34
+; VMX (no VSX):
+;   %a is passed in register 2
+;   %b is passed in register 3
+;   On LE, the contents of 2 and 3 do not need to be swapped because the
+;   lvx/stvx instructions do not swap elements
+define <1 x i128> @v1i128_increment_by_val(<1 x i128> %a, <1 x i128> %b) nounwind {
+       %tmp = add <1 x i128> %a, %b
+       ret <1 x i128> %tmp
+
+; CHECK-LE-LABEL: @v1i128_increment_by_val
+; CHECK-LE-DAG: xxswapd [[PARAM1:[0-9]+]], 34
+; CHECK-LE-DAG: xxswapd [[PARAM2:[0-9]+]], 35
+; CHECK-LE-DAG: stxvd2x [[PARAM1]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE-DAG: stxvd2x [[PARAM2]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE: lxvd2x [[RESULT:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE: xxswapd 34, [[RESULT]]
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @v1i128_increment_by_val
+; CHECK-BE-NOT: xxswapd {{[0-9]+}}, 34
+; CHECK-BE-NOT: xxswapd {{[0-9]+}}, 35
+; CHECK-BE-DAG: stxvd2x 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-BE-DAG: stxvd2x 35, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-BE: lxvd2x [[RESULT:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-BE-NOT: xxswapd 34, [[RESULT]]
+; CHECK-BE: blr
+
+; CHECK-NOVSX-LABEL: @v1i128_increment_by_val
+; CHECK-NOVSX-NOT: xxswapd {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-NOT: xxswapd {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-DAG: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-DAG: stvx 3, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX: lvx [[RESULT:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-NOT: xxswapd 34, [[RESULT]]
+; CHECK-NOVSX: blr
+}
+
+; Little Endian (VSX and VMX):
+;   Lower 64-bits of %a are passed in register 3
+;   Upper 64-bits of %a are passed in register 4
+;   Increment lower 64-bits using addic (immediate value of 1)
+;   Increment upper 64-bits using addze (add to zero extended)
+;   Results are placed in registers 3 and 4
+; Big Endian (VSX and VMX)
+;   Lower 64-bits of %a are passed in register 4
+;   Upper 64-bits of %a are passed in register 3
+;   Increment lower 64-bits using addic (immediate value of 1)
+;   Increment upper 64-bits using addze (add to zero extended)
+;   Results are placed in registers 3 and 4
+define i128 @i128_increment_by_one(i128 %a) nounwind {
+       %tmp =  add i128 %a,  1
+       ret i128 %tmp
+; CHECK-LE-LABEL: @i128_increment_by_one
+; CHECK-LE: addic 3, 3, 1
+; CHECK-LE-NEXT: addze 4, 4
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @i128_increment_by_one
+; CHECK-BE: addic 4, 4, 1
+; CHECK-BE-NEXT: addze 3, 3
+; CHECK-BE: blr
+
+; CHECK-LE-NOVSX-LABEL: @i128_increment_by_one
+; CHECK-LE-NOVSX: addic 3, 3, 1
+; CHECK-LE-NOVSX-NEXT: addze 4, 4
+; CHECK-LE-NOVSX: blr
+
+; CHECK-BE-NOVSX-LABEL: @i128_increment_by_one
+; CHECK-BE-NOVSX: addic 4, 4, 1
+; CHECK-BE-NOVSX-NEXT: addze 3, 3
+; CHECK-BE-NOVSX: blr
+}
+
+; Little Endian (VSX and VMX):
+;   Lower 64-bits of %a are passed in register 3
+;   Upper 64-bits of %a are passed in register 4
+;   Lower 64-bits of %b are passed in register 5
+;   Upper 64-bits of %b are passed in register 6
+;   Add the lower 64-bits using addc on registers 3 and 5
+;   Add the upper 64-bits using adde on registers 4 and 6
+;   Registers 3 and 4 should hold the result
+; Big Endian (VSX and VMX):
+;   Upper 64-bits of %a are passed in register 3
+;   Lower 64-bits of %a are passed in register 4
+;   Upper 64-bits of %b are passed in register 5
+;   Lower 64-bits of %b are passed in register 6
+;   Add the lower 64-bits using addc on registers 4 and 6
+;   Add the upper 64-bits using adde on registers 3 and 5
+;   Registers 3 and 4 should hold the result
+define i128 @i128_increment_by_val(i128 %a, i128 %b) nounwind {
+       %tmp =  add i128 %a, %b
+       ret i128 %tmp
+; CHECK-LE-LABEL: @i128_increment_by_val
+; CHECK-LE: addc 3, 3, 5
+; CHECK-LE-NEXT: adde 4, 4, 6
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @i128_increment_by_val
+; CHECK-BE: addc 4, 4, 6
+; CHECK-BE-NEXT: adde 3, 3, 5
+; CHECK-BE: blr
+
+; CHECK-LE-NOVSX-LABEL: @i128_increment_by_val
+; CHECK-LE-NOVSX: addc 3, 3, 5
+; CHECK-LE-NOVSX-NEXT: adde 4, 4, 6
+; CHECK-LE-NOVSX: blr
+
+; CHECK-BE-NOVSX-LABEL: @i128_increment_by_val
+; CHECK-BE-NOVSX: addc 4, 4, 6
+; CHECK-BE-NOVSX-NEXT: adde 3, 3, 5
+; CHECK-BE-NOVSX: blr
+}
+
+
+; Callsites for the routines defined above. 
+; Ensure the parameters are loaded in the same order that is expected by the 
+; callee. See comments for individual functions above for details on registers
+; used for parameters.
+define <1 x i128> @call_v1i128_increment_by_one() nounwind {
+       %tmp = load <1 x i128>, <1 x i128>* @x, align 16
+       %ret = call <1 x i128> @v1i128_increment_by_one(<1 x i128> %tmp)
+       ret <1 x i128> %ret
+
+; CHECK-LE-LABEL: @call_v1i128_increment_by_one
+; CHECK-LE: lxvd2x [[PARAM:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE: xxswapd 34, [[PARAM]]
+; CHECK-LE: bl v1i128_increment_by_one
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @call_v1i128_increment_by_one
+; CHECK-BE: lxvw4x 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-BE-NOT: xxswapd 34, {{[0-9]+}}
+; CHECK-BE: bl v1i128_increment_by_one
+; CHECK-BE: blr
+
+; CHECK-NOVSX-LABEL: @call_v1i128_increment_by_one
+; CHECK-NOVSX: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-NOT: xxswapd {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX: bl v1i128_increment_by_one
+; CHECK-NOVSX: blr
+}
+
+define <1 x i128> @call_v1i128_increment_by_val() nounwind {
+       %tmp = load <1 x i128>, <1 x i128>* @x, align 16
+       %tmp2 = load <1 x i128>, <1 x i128>* @y, align 16
+       %ret = call <1 x i128> @v1i128_increment_by_val(<1 x i128> %tmp, <1 x i128> %tmp2)
+       ret <1 x i128> %ret
+
+; CHECK-LE-LABEL: @call_v1i128_increment_by_val
+; CHECK-LE: lxvd2x [[PARAM1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE: lxvd2x [[PARAM2:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE-DAG: xxswapd 34, [[PARAM1]]
+; CHECK-LE-DAG: xxswapd 35, [[PARAM2]]
+; CHECK-LE: bl v1i128_increment_by_val
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @call_v1i128_increment_by_val
+
+
+; CHECK-BE-DAG: lxvw4x 35, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-BE-NOT: xxswapd 34, {{[0-9]+}}
+; CHECK-BE-NOT: xxswapd 35, {{[0-9]+}}
+; CHECK-BE: bl v1i128_increment_by_val
+; CHECK-BE: blr
+
+; CHECK-NOVSX-LABEL: @call_v1i128_increment_by_val
+; CHECK-NOVSX-DAG: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-DAG: lvx 3, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NOVSX-NOT: xxswapd 34, {{[0-9]+}}
+; CHECK-NOVSX-NOT: xxswapd 35, {{[0-9]+}}
+; CHECK-NOVSX: bl v1i128_increment_by_val
+; CHECK-NOVSX: blr
+
+}
+
+define i128 @call_i128_increment_by_one() nounwind {
+       %tmp = load i128, i128* @a, align 16
+       %ret = call i128 @i128_increment_by_one(i128 %tmp)
+       ret i128 %ret
+;       %ret4 = call i128 @i128_increment_by_val(i128 %tmp2, i128 %tmp2)
+; CHECK-LE-LABEL: @call_i128_increment_by_one
+; CHECK-LE-DAG: ld 3, 0([[BASEREG:[0-9]+]])
+; CHECK-LE-DAG: ld 4, 8([[BASEREG]])
+; CHECK-LE: bl i128_increment_by_one
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @call_i128_increment_by_one
+; CHECK-BE-DAG: ld 3, 0([[BASEREG:[0-9]+]])
+; CHECK-BE-DAG: ld 4, 8([[BASEREG]])
+; CHECK-BE: bl i128_increment_by_one
+; CHECK-BE: blr
+
+; CHECK-NOVSX-LABEL: @call_i128_increment_by_one
+; CHECK-NOVSX-DAG: ld 3, 0([[BASEREG:[0-9]+]])
+; CHECK-NOVSX-DAG: ld 4, 8([[BASEREG]])
+; CHECK-NOVSX: bl i128_increment_by_one
+; CHECK-NOVSX: blr
+}
+
+define i128 @call_i128_increment_by_val() nounwind {
+       %tmp = load i128, i128* @a, align 16
+       %tmp2 = load i128, i128* @b, align 16
+       %ret = call i128 @i128_increment_by_val(i128 %tmp, i128 %tmp2)
+       ret i128 %ret
+; CHECK-LE-LABEL: @call_i128_increment_by_val
+; CHECK-LE-DAG: ld 3, 0([[P1BASEREG:[0-9]+]])
+; CHECK-LE-DAG: ld 4, 8([[P1BASEREG]])
+; CHECK-LE-DAG: ld 5, 0([[P2BASEREG:[0-9]+]])
+; CHECK-LE-DAG: ld 6, 8([[P2BASEREG]])
+; CHECK-LE: bl i128_increment_by_val
+; CHECK-LE: blr
+
+; CHECK-BE-LABEL: @call_i128_increment_by_val
+; CHECK-BE-DAG: ld 3, 0([[P1BASEREG:[0-9]+]])
+; CHECK-BE-DAG: ld 4, 8([[P1BASEREG]])
+; CHECK-BE-DAG: ld 5, 0([[P2BASEREG:[0-9]+]])
+; CHECK-BE-DAG: ld 6, 8([[P2BASEREG]])
+; CHECK-BE: bl i128_increment_by_val
+; CHECK-BE: blr
+
+; CHECK-NOVSX-LABEL: @call_i128_increment_by_val
+; CHECK-NOVSX-DAG: ld 3, 0([[P1BASEREG:[0-9]+]])
+; CHECK-NOVSX-DAG: ld 4, 8([[P1BASEREG]])
+; CHECK-NOVSX-DAG: ld 5, 0([[P2BASEREG:[0-9]+]])
+; CHECK-NOVSX-DAG: ld 6, 8([[P2BASEREG]])
+; CHECK-NOVSX: bl i128_increment_by_val
+; CHECK-NOVSX: blr
+}
+
+