//======--- SPUMathInst.td - Cell SPU math operations -*- tablegen -*---======//
//
//                     Cell SPU math operations
//
// This target description file contains instruction sequences for various
// math operations, such as vector multiplies, i32 multiply, etc., for the
// SPU's i32, i16 i8 and corresponding vector types.
//
// Any resemblance to libsimdmath or the Cell SDK simdmath library is
// purely and completely coincidental.
//
// Primary author: Scott Michel (scottm@aero.org)
//===----------------------------------------------------------------------===//

//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v16i8 multiply instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~

def : Pat<(mul (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)),
          (ORv4i32
           (ANDv4i32
            (SELBv4i32 (MPYv8i16 VECREG:$rA, VECREG:$rB),
                       (SHLHIv8i16 (MPYv8i16 (ROTMAHIv8i16 VECREG:$rA, 8),
                                             (ROTMAHIv8i16 VECREG:$rB, 8)), 8),
                       (FSMBIv8i16 0x2222)),
            (ILAv4i32 0x0000ffff)),
           (SHLIv4i32
            (SELBv4i32 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 16),
                                 (ROTMAIv4i32_i32 VECREG:$rB, 16)),
                       (SHLHIv8i16 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 8),
                                             (ROTMAIv4i32_i32 VECREG:$rB, 8)), 8),
                       (FSMBIv8i16 0x2222)), 16))>;
                        
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v8i16 multiply instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~

def : Pat<(mul (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
          (SELBv8i16 (MPYv8i16 VECREG:$rA, VECREG:$rB),
                     (SHLIv4i32 (MPYHHv8i16 VECREG:$rA, VECREG:$rB), 16),
                     (FSMBIv8i16 0xcccc))>;
                 
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v4i32, i32 multiply instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~

def MPYv4i32:
  Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)),
      (Av4i32
        (Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB),
                (MPYHv4i32 VECREG:$rB, VECREG:$rA)),
        (MPYUv4i32 VECREG:$rA, VECREG:$rB))>;

def MPYi32:
  Pat<(mul R32C:$rA, R32C:$rB),
      (Ar32
        (Ar32 (MPYHr32 R32C:$rA, R32C:$rB),
              (MPYHr32 R32C:$rB, R32C:$rA)),
        (MPYUr32 R32C:$rA, R32C:$rB))>;

//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// f32, v4f32 divide instruction sequence:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~

// Reciprocal estimate and interpolation
def Interpf32: CodeFrag<(FIf32 R32FP:$rB, (FRESTf32 R32FP:$rB))>;
// Division estimate
def DivEstf32: CodeFrag<(FMf32 R32FP:$rA, Interpf32.Fragment)>;
// Newton-Raphson iteration
def NRaphf32: CodeFrag<(FMAf32 (FNMSf32 DivEstf32.Fragment, R32FP:$rB, R32FP:$rA),
		  	       Interpf32.Fragment,
	  	  	       DivEstf32.Fragment)>;
// Epsilon addition
def Epsilonf32: CodeFrag<(AIf32 NRaphf32.Fragment, 1)>;

def : Pat<(fdiv R32FP:$rA, R32FP:$rB),
	  (SELBf32_cond NRaphf32.Fragment,
			Epsilonf32.Fragment,
			(CGTIf32 (FNMSf32 R32FP:$rB, Epsilonf32.Fragment, R32FP:$rA), -1))>;

// Reciprocal estimate and interpolation
def Interpv4f32: CodeFrag<(FIv4f32 (v4f32 VECREG:$rB), (FRESTv4f32 (v4f32 VECREG:$rB)))>;
// Division estimate
def DivEstv4f32: CodeFrag<(FMv4f32 (v4f32 VECREG:$rA), Interpv4f32.Fragment)>;
// Newton-Raphson iteration
def NRaphv4f32: CodeFrag<(FMAv4f32 (FNMSv4f32 DivEstv4f32.Fragment,
					      (v4f32 VECREG:$rB),
					      (v4f32 VECREG:$rA)),
		  	           Interpv4f32.Fragment,
	  	  	           DivEstv4f32.Fragment)>;
// Epsilon addition
def Epsilonv4f32: CodeFrag<(AIv4f32 NRaphv4f32.Fragment, 1)>;

def : Pat<(fdiv (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)),
	  (SELBv4f32_cond NRaphv4f32.Fragment,
			Epsilonv4f32.Fragment,
			(CGTIv4f32 (FNMSv4f32 (v4f32 VECREG:$rB),
					      Epsilonv4f32.Fragment,
					      (v4f32 VECREG:$rA)), -1))>;