###################################- 
# 
#  Copyright (C) 2009-2019 Free Software Foundation, Inc.
#
#  Contributed by Michael Eager <eager@eagercon.com>.
#
#  This file is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License as published by the
#  Free Software Foundation; either version 3, or (at your option) any
#  later version.
#
#  GCC is distributed in the hope that it will be useful, but WITHOUT
#  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
#  License for more details.
#
#  Under Section 7 of GPL version 3, you are granted additional
#  permissions described in the GCC Runtime Library Exception, version
#  3.1, as published by the Free Software Foundation.
#
#  You should have received a copy of the GNU General Public License and
#  a copy of the GCC Runtime Library Exception along with this program;
#  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
#  <http://www.gnu.org/licenses/>. 
# 
#  muldi3_hard.S
# 
#  Multiply operation for 64 bit integers, for devices with hard multiply
#	Input :	Operand1[H] in Reg r5
#		Operand1[L] in Reg r6		
#		Operand2[H] in Reg r7
#		Operand2[L] in Reg r8	
#	Output: Result[H] in Reg r3
#		Result[L] in Reg r4	
# 
#  Explanation:
#
# 	Both input numbers are split into 16 bit digits as follows
#		op1 = A B C D
# 		op2 = E F G H
#	result =    D * H 
#		 + (C * H + D * G) << 16
#		 + (B * H + C * G + D * F) << 32
#		 + (A * H + B * G + C * F + D * E) << 48 
#
# 	Only 64 bits of the output are considered
#
#######################################

/* An executable stack is *not* required for these functions.  The empty
   .note.GNU-stack section records that, and .previous then switches back
   to the section that was active before it.  */
#ifdef __linux__
.section .note.GNU-stack,"",%progbits
.previous
#endif

	.globl	muldi3_hardproc
	.ent	muldi3_hardproc
	.type	muldi3_hardproc,@function

# muldi3_hardproc: 64 bit x 64 bit multiply, low 64 bits of the product.
#
#	In :	r5 = Operand1[H], r6 = Operand1[L]
#		r7 = Operand2[H], r8 = Operand2[L]
#	Out:	r3 = Result[H],   r4 = Result[L]
#	Scratch registers written: r7-r12 (r20-r27 are saved and restored)
#
# Frame layout after the 40 byte adjustment of r1:
#	r1+0  .. r1+28	saved r20-r27
#	r1+32 .. r1+35	Pos0, Pos1 (upper result word)
#	r1+36 .. r1+39	Pos2, Pos3 (lower result word)
#	r1+44 .. r1+59	copies of r5-r8, placed above this frame
#
# The operands are split into 16 bit digits by storing them as words and
# reading them back with halfword loads.  The A..H labels below match the
# header comment only when the halfword at the lower address is the more
# significant one, that is, on a big-endian target.
# NOTE(review): confirm this routine is not built for little-endian targets.
muldi3_hardproc:
	addi	r1,r1,-40

# Spill the operands to r1+44..r1+59 so each 16 bit digit can be read back
	swi	r5,r1,44
	swi	r6,r1,48
	swi	r7,r1,52
	swi	r8,r1,56

# Save the callee saved registers that will hold the digits
	swi	r20,r1,0
	swi	r21,r1,4
	swi	r22,r1,8
	swi	r23,r1,12
	swi	r24,r1,16
	swi	r25,r1,20
	swi	r26,r1,24
	swi	r27,r1,28

# Load the 16 bit digits A through H (lhui zero extends)
	lhui	r20,r1,44   # A
	lhui	r21,r1,46   # B
	lhui	r22,r1,48   # C
	lhui	r23,r1,50   # D
	lhui	r24,r1,52   # E
	lhui	r25,r1,54   # F
	lhui	r26,r1,56   # G
	lhui	r27,r1,58   # H

# Step 1: D * H supplies Pos2 (upper half) and Pos3 (lower half)
	mul	r9,r23,r27
	swi	r9,r1,36    # Pos2 and Pos3

# Step 2: Hi (Step 1) + C * H + D * G.  The sum can need 33 bits, so the
# carry out of each add is collected in r12.  That carry has the weight of
# bit 0 of Pos0 and is folded in during step 4.
	lhui	r11,r1,36   # Hi (Step 1)
	mul	r9,r22,r27  # C * H
	mul	r10,r23,r26 # D * G
	add	r9,r9,r10
	addc	r12,r0,r0   # r12 = carry of C * H + D * G
	add	r9,r9,r11
	addc	r12,r12,r0  # r12 += carry of adding Hi (Step 1)
	shi	r9,r1,36    # low half of the sum is the final Pos2
	swi	r9,r1,32    # park the sum so its upper half can be read back
	lhui	r11,r1,32   # r11 = Hi (Step 2)

# Step 3: Hi (Step 2) + B * H + C * G + D * F gives Pos1 in its lower half
# and a partial Pos0 in its upper half.  Carries out of bit 31 lie beyond
# bit 63 of the product and are dropped.
	mul	r9,r21,r27  # B * H
	mul	r10,r22,r26 # C * G
	mul	r7,r23,r25  # D * F
	add	r9,r9,r11
	add	r9,r9,r10
	add	r9,r9,r7
	swi	r9,r1,32    # Pos0 (partial) and Pos1

# Step 4: Hi (Step 3) + A * H + B * G + C * F + D * E + carry from step 2.
# Only the low 16 bits matter; shi stores just that halfword as Pos0, so no
# masking or extension of r9 is needed first.
	lhui	r11,r1,32   # Hi (Step 3)
	mul	r9,r20,r27  # A * H
	mul	r10,r21,r26 # B * G
	mul	r7,r22,r25  # C * F
	mul	r8,r23,r24  # D * E
	add	r9,r9,r11
	add	r9,r9,r10
	add	r9,r9,r7
	add	r9,r9,r8
	add	r9,r9,r12   # carry collected in step 2
	shi	r9,r1,32    # final Pos0

# Fetch the assembled result words
	lwi	r3,r1,32    # Hi Part
	lwi	r4,r1,36    # Lo Part

# Restore the callee saved registers
	lwi	r20,r1,0
	lwi	r21,r1,4
	lwi	r22,r1,8
	lwi	r23,r1,12
	lwi	r24,r1,16
	lwi	r25,r1,20
	lwi	r26,r1,24
	lwi	r27,r1,28

# Return; the addi in the delay slot of rtsd releases the frame
	rtsd	r15,8
	addi	r1,r1,40

.end muldi3_hardproc
	.size	muldi3_hardproc, . - muldi3_hardproc