diff --git a/test/CodeGen/Mips/msa/arithmetic.ll b/test/CodeGen/Mips/msa/arithmetic.ll new file mode 100644 index 00000000000..303778f7a21 --- /dev/null +++ b/test/CodeGen/Mips/msa/arithmetic.ll @@ -0,0 +1,320 @@ +; RUN: llc -march=mips -mattr=+msa < %s | FileCheck %s + +define void @add_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: add_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = add <16 x i8> %1, %2 + ; CHECK-DAG: addv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size add_v16i8 +} + +define void @add_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: add_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = add <8 x i16> %1, %2 + ; CHECK-DAG: addv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size add_v8i16 +} + +define void @add_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: add_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = add <4 x i32> %1, %2 + ; CHECK-DAG: addv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size add_v4i32 +} + +define void @add_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: add_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = add <2 x i64> %1, %2 + ; CHECK-DAG: addv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size add_v2i64 +} +define void @sub_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: sub_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = sub <16 x i8> %1, %2 + ; CHECK-DAG: subv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size sub_v16i8 +} + +define void @sub_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: sub_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = sub <8 x i16> %1, %2 + ; CHECK-DAG: subv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size sub_v8i16 +} + +define void @sub_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: sub_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = sub <4 x i32> %1, %2 + ; CHECK-DAG: subv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size sub_v4i32 +} + +define void @sub_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: sub_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = sub <2 x i64> %1, %2 + ; CHECK-DAG: subv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size sub_v2i64 +} + +define void @mul_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: mul_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = mul <16 x i8> %1, %2 + ; CHECK-DAG: mulv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size mul_v16i8 +} + +define void @mul_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: mul_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = mul <8 x i16> %1, %2 + ; CHECK-DAG: mulv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size mul_v8i16 +} + +define void @mul_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: mul_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = mul <4 x i32> %1, %2 + ; CHECK-DAG: mulv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size mul_v4i32 +} + +define void @mul_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: mul_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = mul <2 x i64> %1, %2 + ; CHECK-DAG: mulv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size mul_v2i64 +} + +define void @div_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: div_s_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = sdiv <16 x i8> %1, %2 + ; CHECK-DAG: div_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size div_s_v16i8 +} + +define void @div_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: div_s_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = sdiv <8 x i16> %1, %2 + ; CHECK-DAG: div_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size div_s_v8i16 +} + +define void @div_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: div_s_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = sdiv <4 x i32> %1, %2 + ; CHECK-DAG: div_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size div_s_v4i32 +} + +define void @div_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: div_s_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = sdiv <2 x i64> %1, %2 + ; CHECK-DAG: div_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size div_s_v2i64 +} + +define void @div_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: div_u_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = udiv <16 x i8> %1, %2 + ; CHECK-DAG: div_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size div_u_v16i8 +} + +define void @div_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: div_u_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = udiv <8 x i16> %1, %2 + ; CHECK-DAG: div_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size div_u_v8i16 +} + +define void @div_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: div_u_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = udiv <4 x i32> %1, %2 + ; CHECK-DAG: div_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size div_u_v4i32 +} + +define void @div_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: div_u_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = udiv <2 x i64> %1, %2 + ; CHECK-DAG: div_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size div_u_v2i64 +} diff --git a/test/CodeGen/Mips/msa/arithmetic_float.ll b/test/CodeGen/Mips/msa/arithmetic_float.ll new file mode 100644 index 00000000000..3d26cc72a6d --- /dev/null +++ b/test/CodeGen/Mips/msa/arithmetic_float.ll @@ -0,0 +1,160 @@ +; RUN: llc -march=mips -mattr=+msa < %s | FileCheck %s + +define void @add_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind { + ; CHECK: add_v4f32: + + %1 = load <4 x float>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x float>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = fadd <4 x float> %1, %2 + ; CHECK-DAG: fadd.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x float> %3, <4 x float>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size add_v4f32 +} + +define void @add_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind { + ; CHECK: add_v2f64: + + %1 = load <2 x double>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x double>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = fadd <2 x double> %1, %2 + ; CHECK-DAG: fadd.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x double> %3, <2 x double>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size add_v2f64 +} + +define void @sub_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind { + ; CHECK: sub_v4f32: + + %1 = load <4 x float>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x float>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = fsub <4 x float> %1, %2 + ; CHECK-DAG: fsub.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x float> %3, <4 x float>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size sub_v4f32 +} + +define void @sub_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind { + ; CHECK: sub_v2f64: + + %1 = load <2 x double>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x double>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = fsub <2 x double> %1, %2 + ; CHECK-DAG: fsub.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x double> %3, <2 x double>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size sub_v2f64 +} + +define void @mul_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind { + ; CHECK: mul_v4f32: + + %1 = load <4 x float>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x float>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = fmul <4 x float> %1, %2 + ; CHECK-DAG: fmul.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x float> %3, <4 x float>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size mul_v4f32 +} + +define void @mul_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind { + ; CHECK: mul_v2f64: + + %1 = load <2 x double>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x double>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = fmul <2 x double> %1, %2 + ; CHECK-DAG: fmul.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x double> %3, <2 x double>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size mul_v2f64 +} + +define void @fdiv_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind { + ; CHECK: fdiv_v4f32: + + %1 = load <4 x float>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x float>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = fdiv <4 x float> %1, %2 + ; CHECK-DAG: fdiv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x float> %3, <4 x float>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size fdiv_v4f32 +} + +define void @fdiv_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind { + ; CHECK: fdiv_v2f64: + + %1 = load <2 x double>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x double>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = fdiv <2 x double> %1, %2 + ; CHECK-DAG: fdiv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x double> %3, <2 x double>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size fdiv_v2f64 +} + +define void @fsqrt_v4f32(<4 x float>* %c, <4 x float>* %a) nounwind { + ; CHECK: fsqrt_v4f32: + + %1 = load <4 x float>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = tail call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %1) + ; CHECK-DAG: fsqrt.w [[R3:\$w[0-9]+]], [[R1]] + store <4 x float> %2, <4 x float>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size fsqrt_v4f32 +} + +define void @fsqrt_v2f64(<2 x double>* %c, <2 x double>* %a) nounwind { + ; CHECK: fsqrt_v2f64: + + %1 = load <2 x double>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = tail call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %1) + ; CHECK-DAG: fsqrt.d [[R3:\$w[0-9]+]], [[R1]] + store <2 x double> %2, <2 x double>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size fsqrt_v2f64 +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %Val) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %Val) diff --git a/test/CodeGen/Mips/msa/bitwise.ll b/test/CodeGen/Mips/msa/bitwise.ll new file mode 100644 index 00000000000..61d2ccb6432 --- /dev/null +++ b/test/CodeGen/Mips/msa/bitwise.ll @@ -0,0 +1,254 @@ +; RUN: llc -march=mips -mattr=+msa < %s | FileCheck %s + +define void @sll_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: sll_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = shl <16 x i8> %1, %2 + ; CHECK-DAG: sll.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size sll_v16i8 +} + +define void @sll_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: sll_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = shl <8 x i16> %1, %2 + ; CHECK-DAG: sll.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size sll_v8i16 +} + +define void @sll_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: sll_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = shl <4 x i32> %1, %2 + ; CHECK-DAG: sll.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size sll_v4i32 +} + +define void @sll_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: sll_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = shl <2 x i64> %1, %2 + ; CHECK-DAG: sll.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size sll_v2i64 +} + +define void @sra_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: sra_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = ashr <16 x i8> %1, %2 + ; CHECK-DAG: sra.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size sra_v16i8 +} + +define void @sra_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: sra_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = ashr <8 x i16> %1, %2 + ; CHECK-DAG: sra.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size sra_v8i16 +} + +define void @sra_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: sra_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = ashr <4 x i32> %1, %2 + ; CHECK-DAG: sra.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size sra_v4i32 +} + +define void @sra_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: sra_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = ashr <2 x i64> %1, %2 + ; CHECK-DAG: sra.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size sra_v2i64 +} + +define void @srl_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: srl_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = lshr <16 x i8> %1, %2 + ; CHECK-DAG: srl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size srl_v16i8 +} + +define void @srl_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: srl_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = lshr <8 x i16> %1, %2 + ; CHECK-DAG: srl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size srl_v8i16 +} + +define void @srl_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: srl_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = lshr <4 x i32> %1, %2 + ; CHECK-DAG: srl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size srl_v4i32 +} + +define void @srl_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: srl_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = lshr <2 x i64> %1, %2 + ; CHECK-DAG: srl.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size srl_v2i64 +} + +define void @ctlz_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind { + ; CHECK: ctlz_v16i8: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = tail call <16 x i8> @llvm.ctlz.v16i8 (<16 x i8> %1) + ; CHECK-DAG: nlzc.b [[R3:\$w[0-9]+]], [[R1]] + store <16 x i8> %2, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size ctlz_v16i8 +} + +define void @ctlz_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind { + ; CHECK: ctlz_v8i16: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = tail call <8 x i16> @llvm.ctlz.v8i16 (<8 x i16> %1) + ; CHECK-DAG: nlzc.h [[R3:\$w[0-9]+]], [[R1]] + store <8 x i16> %2, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size ctlz_v8i16 +} + +define void @ctlz_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind { + ; CHECK: ctlz_v4i32: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = tail call <4 x i32> @llvm.ctlz.v4i32 (<4 x i32> %1) + ; CHECK-DAG: nlzc.w [[R3:\$w[0-9]+]], [[R1]] + store <4 x i32> %2, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size ctlz_v4i32 +} + +define void @ctlz_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind { + ; CHECK: ctlz_v2i64: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = tail call <2 x i64> @llvm.ctlz.v2i64 (<2 x i64> %1) + ; CHECK-DAG: nlzc.d [[R3:\$w[0-9]+]], [[R1]] + store <2 x i64> %2, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size ctlz_v2i64 +} + +declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %val) +declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %val) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val) +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %val)