diff --git a/lib1funcs.S b/lib1funcs.S
new file mode 100644
index 0000000..ac101b4
--- /dev/null
+++ b/lib1funcs.S
@@ -0,0 +1,3315 @@
+/* -*- Mode: Asm -*- */
+/* Copyright (C) 1998-2021 Free Software Foundation, Inc.
+   Contributed by Denis Chertykov
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#if defined (__AVR_TINY__)
+#define __zero_reg__ r17
+#define __tmp_reg__ r16
+#else
+#define __zero_reg__ r1
+#define __tmp_reg__ r0
+#endif
+#define __SREG__ 0x3f
+#if defined (__AVR_HAVE_SPH__)
+#define __SP_H__ 0x3e
+#endif
+#define __SP_L__ 0x3d
+#define __RAMPZ__ 0x3B
+#define __EIND__ 0x3C
+
+/* Most of the functions here are called directly from avr.md
+   patterns, instead of using the standard libcall mechanisms.
+   This can make better code because GCC knows exactly which
+   of the call-used registers (not all of them) are clobbered.  */
+
+/* FIXME:  At present, there is no SORT directive in the linker
+   script so that we must not assume that different modules
+   in the same input section like .libgcc.text.mul will be
+   located close together.  Therefore, we cannot use
+   RCALL/RJMP to call a function like __udivmodhi4 from
+   __divmodhi4 and have to use lengthy XCALL/XJMP even
+   though they are in the same input section and all same
+   input sections together are small enough to reach every
+   location with a RCALL/RJMP instruction.
*/ + +#if defined (__AVR_HAVE_EIJMP_EICALL__) && !defined (__AVR_HAVE_ELPMX__) +#error device not supported +#endif + + .macro mov_l r_dest, r_src +#if defined (__AVR_HAVE_MOVW__) + movw \r_dest, \r_src +#else + mov \r_dest, \r_src +#endif + .endm + + .macro mov_h r_dest, r_src +#if defined (__AVR_HAVE_MOVW__) + ; empty +#else + mov \r_dest, \r_src +#endif + .endm + +.macro wmov r_dest, r_src +#if defined (__AVR_HAVE_MOVW__) + movw \r_dest, \r_src +#else + mov \r_dest, \r_src + mov \r_dest+1, \r_src+1 +#endif +.endm + +#if defined (__AVR_HAVE_JMP_CALL__) +#define XCALL call +#define XJMP jmp +#else +#define XCALL rcall +#define XJMP rjmp +#endif + +#if defined (__AVR_HAVE_EIJMP_EICALL__) +#define XICALL eicall +#define XIJMP eijmp +#else +#define XICALL icall +#define XIJMP ijmp +#endif + +;; Prologue stuff + +.macro do_prologue_saves n_pushed n_frame=0 + ldi r26, lo8(\n_frame) + ldi r27, hi8(\n_frame) + ldi r30, lo8(gs(.L_prologue_saves.\@)) + ldi r31, hi8(gs(.L_prologue_saves.\@)) + XJMP __prologue_saves__ + ((18 - (\n_pushed)) * 2) +.L_prologue_saves.\@: +.endm + +;; Epilogue stuff + +.macro do_epilogue_restores n_pushed n_frame=0 + in r28, __SP_L__ +#ifdef __AVR_HAVE_SPH__ + in r29, __SP_H__ +.if \n_frame > 63 + subi r28, lo8(-\n_frame) + sbci r29, hi8(-\n_frame) +.elseif \n_frame > 0 + adiw r28, \n_frame +.endif +#else + clr r29 +.if \n_frame > 0 + subi r28, lo8(-\n_frame) +.endif +#endif /* HAVE SPH */ + ldi r30, \n_pushed + XJMP __epilogue_restores__ + ((18 - (\n_pushed)) * 2) +.endm + +;; Support function entry and exit for convenience + +.macro wsubi r_arg1, i_arg2 +#if defined (__AVR_TINY__) + subi \r_arg1, lo8(\i_arg2) + sbci \r_arg1+1, hi8(\i_arg2) +#else + sbiw \r_arg1, \i_arg2 +#endif +.endm + +.macro waddi r_arg1, i_arg2 +#if defined (__AVR_TINY__) + subi \r_arg1, lo8(-\i_arg2) + sbci \r_arg1+1, hi8(-\i_arg2) +#else + adiw \r_arg1, \i_arg2 +#endif +.endm + +.macro DEFUN name +.global \name +.func \name +\name: +.endm + +.macro ENDF name +.size \name, .-\name +.endfunc +.endm + +.macro FALIAS name +.global \name +.func \name +\name: +.size \name, .-\name +.endfunc +.endm + +;; Skip next instruction, typically a jump target +#define skip cpse 16,16 + +;; Negate a 2-byte value held in consecutive registers +.macro NEG2 reg + com \reg+1 + neg \reg + sbci \reg+1, -1 +.endm + +;; Negate a 4-byte value held in consecutive registers +;; Sets the V flag for signed overflow tests if REG >= 16 +.macro NEG4 reg + com \reg+3 + com \reg+2 + com \reg+1 +.if \reg >= 16 + neg \reg + sbci \reg+1, -1 + sbci \reg+2, -1 + sbci \reg+3, -1 +.else + com \reg + adc \reg, __zero_reg__ + adc \reg+1, __zero_reg__ + adc \reg+2, __zero_reg__ + adc \reg+3, __zero_reg__ +.endif +.endm + +#define exp_lo(N) hlo8 ((N) << 23) +#define exp_hi(N) hhi8 ((N) << 23) + + +.section .text.libgcc.mul, "ax", @progbits + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/* Note: mulqi3, mulhi3 are open-coded on the enhanced core. 
*/ +#if !defined (__AVR_HAVE_MUL__) +/******************************************************* + Multiplication 8 x 8 without MUL +*******************************************************/ +#if defined (L_mulqi3) + +#define r_arg2 r22 /* multiplicand */ +#define r_arg1 r24 /* multiplier */ +#define r_res __tmp_reg__ /* result */ + +DEFUN __mulqi3 + clr r_res ; clear result +__mulqi3_loop: + sbrc r_arg1,0 + add r_res,r_arg2 + add r_arg2,r_arg2 ; shift multiplicand + breq __mulqi3_exit ; while multiplicand != 0 + lsr r_arg1 ; + brne __mulqi3_loop ; exit if multiplier = 0 +__mulqi3_exit: + mov r_arg1,r_res ; result to return register + ret +ENDF __mulqi3 + +#undef r_arg2 +#undef r_arg1 +#undef r_res + +#endif /* defined (L_mulqi3) */ + + +/******************************************************* + Widening Multiplication 16 = 8 x 8 without MUL + Multiplication 16 x 16 without MUL +*******************************************************/ + +#define A0 22 +#define A1 23 +#define B0 24 +#define BB0 20 +#define B1 25 +;; Output overlaps input, thus expand result in CC0/1 +#define C0 24 +#define C1 25 +#define CC0 __tmp_reg__ +#define CC1 21 + +#if defined (L_umulqihi3) +;;; R25:R24 = (unsigned int) R22 * (unsigned int) R24 +;;; (C1:C0) = (unsigned int) A0 * (unsigned int) B0 +;;; Clobbers: __tmp_reg__, R21..R23 +DEFUN __umulqihi3 + clr A1 + clr B1 + XJMP __mulhi3 +ENDF __umulqihi3 +#endif /* L_umulqihi3 */ + +#if defined (L_mulqihi3) +;;; R25:R24 = (signed int) R22 * (signed int) R24 +;;; (C1:C0) = (signed int) A0 * (signed int) B0 +;;; Clobbers: __tmp_reg__, R20..R23 +DEFUN __mulqihi3 + ;; Sign-extend B0 + clr B1 + sbrc B0, 7 + com B1 + ;; The multiplication runs twice as fast if A1 is zero, thus: + ;; Zero-extend A0 + clr A1 +#ifdef __AVR_HAVE_JMP_CALL__ + ;; Store B0 * sign of A + clr BB0 + sbrc A0, 7 + mov BB0, B0 + call __mulhi3 +#else /* have no CALL */ + ;; Skip sign-extension of A if A >= 0 + ;; Same size as with the first alternative but avoids errata skip + ;; and is faster if A >= 0 + sbrs A0, 7 + rjmp __mulhi3 + ;; If A < 0 store B + mov BB0, B0 + rcall __mulhi3 +#endif /* HAVE_JMP_CALL */ + ;; 1-extend A after the multiplication + sub C1, BB0 + ret +ENDF __mulqihi3 +#endif /* L_mulqihi3 */ + +#if defined (L_mulhi3) +;;; R25:R24 = R23:R22 * R25:R24 +;;; (C1:C0) = (A1:A0) * (B1:B0) +;;; Clobbers: __tmp_reg__, R21..R23 +DEFUN __mulhi3 + + ;; Clear result + clr CC0 + clr CC1 + rjmp 3f +1: + ;; Bit n of A is 1 --> C += B << n + add CC0, B0 + adc CC1, B1 +2: + lsl B0 + rol B1 +3: + ;; If B == 0 we are ready + wsubi B0, 0 + breq 9f + + ;; Carry = n-th bit of A + lsr A1 + ror A0 + ;; If bit n of A is set, then go add B * 2^n to C + brcs 1b + + ;; Carry = 0 --> The ROR above acts like CP A0, 0 + ;; Thus, it is sufficient to CPC the high part to test A against 0 + cpc A1, __zero_reg__ + ;; Only proceed if A != 0 + brne 2b +9: + ;; Move Result into place + mov C0, CC0 + mov C1, CC1 + ret +ENDF __mulhi3 +#endif /* L_mulhi3 */ + +#undef A0 +#undef A1 +#undef B0 +#undef BB0 +#undef B1 +#undef C0 +#undef C1 +#undef CC0 +#undef CC1 + + +#define A0 22 +#define A1 A0+1 +#define A2 A0+2 +#define A3 A0+3 + +#define B0 18 +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 + +#define CC0 26 +#define CC1 CC0+1 +#define CC2 30 +#define CC3 CC2+1 + +#define C0 22 +#define C1 C0+1 +#define C2 C0+2 +#define C3 C0+3 + +/******************************************************* + Widening Multiplication 32 = 16 x 16 without MUL +*******************************************************/ + +#if defined (L_umulhisi3) 
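+;; Without a hardware multiplier, the widening 16 x 16 -> 32 multiply just
+;; zero-extends both operands and reuses the 32-bit shift-and-add loop of
+;; __mulsi3 below.  Illustrative C sketch of that scheme (a model only,
+;; assuming C99 <stdint.h> types; not the code itself):
+;;
+;;     static uint32_t mulsi3_model (uint32_t a, uint32_t b)
+;;     {
+;;         uint32_t c = 0;
+;;         while (a)
+;;         {
+;;             if (a & 1)      /* bit n of A is set...           */
+;;                 c += b;     /* ...so add B << n to the result */
+;;             a >>= 1;
+;;             b <<= 1;        /* keep B aligned with bit n of A */
+;;         }
+;;         return c;
+;;     }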
+DEFUN __umulhisi3 + wmov B0, 24 + ;; Zero-extend B + clr B2 + clr B3 + ;; Zero-extend A + wmov A2, B2 + XJMP __mulsi3 +ENDF __umulhisi3 +#endif /* L_umulhisi3 */ + +#if defined (L_mulhisi3) +DEFUN __mulhisi3 + wmov B0, 24 + ;; Sign-extend B + lsl r25 + sbc B2, B2 + mov B3, B2 +#ifdef __AVR_ERRATA_SKIP_JMP_CALL__ + ;; Sign-extend A + clr A2 + sbrc A1, 7 + com A2 + mov A3, A2 + XJMP __mulsi3 +#else /* no __AVR_ERRATA_SKIP_JMP_CALL__ */ + ;; Zero-extend A and __mulsi3 will run at least twice as fast + ;; compared to a sign-extended A. + clr A2 + clr A3 + sbrs A1, 7 + XJMP __mulsi3 + ;; If A < 0 then perform the B * 0xffff.... before the + ;; very multiplication by initializing the high part of the + ;; result CC with -B. + wmov CC2, A2 + sub CC2, B0 + sbc CC3, B1 + XJMP __mulsi3_helper +#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */ +ENDF __mulhisi3 +#endif /* L_mulhisi3 */ + + +/******************************************************* + Multiplication 32 x 32 without MUL +*******************************************************/ + +#if defined (L_mulsi3) +DEFUN __mulsi3 +#if defined (__AVR_TINY__) + in r26, __SP_L__ ; safe to use X, as it is CC0/CC1 + in r27, __SP_H__ + subi r26, lo8(-3) ; Add 3 to point past return address + sbci r27, hi8(-3) + push B0 ; save callee saved regs + push B1 + ld B0, X+ ; load from caller stack + ld B1, X+ + ld B2, X+ + ld B3, X +#endif + ;; Clear result + clr CC2 + clr CC3 + ;; FALLTHRU +ENDF __mulsi3 + +DEFUN __mulsi3_helper + clr CC0 + clr CC1 + rjmp 3f + +1: ;; If bit n of A is set, then add B * 2^n to the result in CC + ;; CC += B + add CC0,B0 $ adc CC1,B1 $ adc CC2,B2 $ adc CC3,B3 + +2: ;; B <<= 1 + lsl B0 $ rol B1 $ rol B2 $ rol B3 + +3: ;; A >>= 1: Carry = n-th bit of A + lsr A3 $ ror A2 $ ror A1 $ ror A0 + + brcs 1b + ;; Only continue if A != 0 + sbci A1, 0 + brne 2b + wsubi A2, 0 + brne 2b + + ;; All bits of A are consumed: Copy result to return register C + wmov C0, CC0 + wmov C2, CC2 +#if defined (__AVR_TINY__) + pop B1 ; restore callee saved regs + pop B0 +#endif /* defined (__AVR_TINY__) */ + + ret +ENDF __mulsi3_helper +#endif /* L_mulsi3 */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef CC0 +#undef CC1 +#undef CC2 +#undef CC3 + +#endif /* !defined (__AVR_HAVE_MUL__) */ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +#if defined (__AVR_HAVE_MUL__) +#define A0 26 +#define B0 18 +#define C0 22 + +#define A1 A0+1 + +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 + +#define C1 C0+1 +#define C2 C0+2 +#define C3 C0+3 + +/******************************************************* + Widening Multiplication 32 = 16 x 16 with MUL +*******************************************************/ + +#if defined (L_mulhisi3) +;;; R25:R22 = (signed long) R27:R26 * (signed long) R19:R18 +;;; C3:C0 = (signed long) A1:A0 * (signed long) B1:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __mulhisi3 + XCALL __umulhisi3 + ;; Sign-extend B + tst B1 + brpl 1f + sub C2, A0 + sbc C3, A1 +1: ;; Sign-extend A + XJMP __usmulhisi3_tail +ENDF __mulhisi3 +#endif /* L_mulhisi3 */ + +#if defined (L_usmulhisi3) +;;; R25:R22 = (signed long) R27:R26 * (unsigned long) R19:R18 +;;; C3:C0 = (signed long) A1:A0 * (unsigned long) B1:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __usmulhisi3 + XCALL __umulhisi3 + ;; FALLTHRU +ENDF __usmulhisi3 + +DEFUN __usmulhisi3_tail + ;; Sign-extend A + sbrs A1, 7 + ret + sub C2, B0 + sbc C3, B1 + ret +ENDF __usmulhisi3_tail 
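+;; The sign fix-ups above rest on this identity (illustrative C sketch,
+;; assuming C99 <stdint.h> types; a model of the arithmetic, not the code):
+;;
+;;     static uint32_t mulhisi3_model (int16_t a, int16_t b)
+;;     {
+;;         uint32_t c = (uint32_t) (uint16_t) a * (uint16_t) b;
+;;         if (b < 0)                  /* correction done by __mulhisi3 */
+;;             c -= (uint32_t) (uint16_t) a << 16;
+;;         if (a < 0)                  /* correction in __usmulhisi3_tail */
+;;             c -= (uint32_t) (uint16_t) b << 16;
+;;         return c;                   /* == (uint32_t) ((int32_t) a * b) */
+;;     }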
+#endif /* L_usmulhisi3 */ + +#if defined (L_umulhisi3) +;;; R25:R22 = (unsigned long) R27:R26 * (unsigned long) R19:R18 +;;; C3:C0 = (unsigned long) A1:A0 * (unsigned long) B1:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __umulhisi3 + mul A0, B0 + movw C0, r0 + mul A1, B1 + movw C2, r0 + mul A0, B1 +#ifdef __AVR_HAVE_JMP_CALL__ + ;; This function is used by many other routines, often multiple times. + ;; Therefore, if the flash size is not too limited, avoid the RCALL + ;; and inverst 6 Bytes to speed things up. + add C1, r0 + adc C2, r1 + clr __zero_reg__ + adc C3, __zero_reg__ +#else + rcall 1f +#endif + mul A1, B0 +1: add C1, r0 + adc C2, r1 + clr __zero_reg__ + adc C3, __zero_reg__ + ret +ENDF __umulhisi3 +#endif /* L_umulhisi3 */ + +/******************************************************* + Widening Multiplication 32 = 16 x 32 with MUL +*******************************************************/ + +#if defined (L_mulshisi3) +;;; R25:R22 = (signed long) R27:R26 * R21:R18 +;;; (C3:C0) = (signed long) A1:A0 * B3:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __mulshisi3 +#ifdef __AVR_ERRATA_SKIP_JMP_CALL__ + ;; Some cores have problem skipping 2-word instruction + tst A1 + brmi __mulohisi3 +#else + sbrs A1, 7 +#endif /* __AVR_HAVE_JMP_CALL__ */ + XJMP __muluhisi3 + ;; FALLTHRU +ENDF __mulshisi3 + +;;; R25:R22 = (one-extended long) R27:R26 * R21:R18 +;;; (C3:C0) = (one-extended long) A1:A0 * B3:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __mulohisi3 + XCALL __muluhisi3 + ;; One-extend R27:R26 (A1:A0) + sub C2, B0 + sbc C3, B1 + ret +ENDF __mulohisi3 +#endif /* L_mulshisi3 */ + +#if defined (L_muluhisi3) +;;; R25:R22 = (unsigned long) R27:R26 * R21:R18 +;;; (C3:C0) = (unsigned long) A1:A0 * B3:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __muluhisi3 + XCALL __umulhisi3 + mul A0, B3 + add C3, r0 + mul A1, B2 + add C3, r0 + mul A0, B2 + add C2, r0 + adc C3, r1 + clr __zero_reg__ + ret +ENDF __muluhisi3 +#endif /* L_muluhisi3 */ + +/******************************************************* + Multiplication 32 x 32 with MUL +*******************************************************/ + +#if defined (L_mulsi3) +;;; R25:R22 = R25:R22 * R21:R18 +;;; (C3:C0) = C3:C0 * B3:B0 +;;; Clobbers: R26, R27, __tmp_reg__ +DEFUN __mulsi3 + movw A0, C0 + push C2 + push C3 + XCALL __muluhisi3 + pop A1 + pop A0 + ;; A1:A0 now contains the high word of A + mul A0, B0 + add C2, r0 + adc C3, r1 + mul A0, B1 + add C3, r0 + mul A1, B0 + add C3, r0 + clr __zero_reg__ + ret +ENDF __mulsi3 +#endif /* L_mulsi3 */ + +#undef A0 +#undef A1 + +#undef B0 +#undef B1 +#undef B2 +#undef B3 + +#undef C0 +#undef C1 +#undef C2 +#undef C3 + +#endif /* __AVR_HAVE_MUL__ */ + +/******************************************************* + Multiplication 24 x 24 with MUL +*******************************************************/ + +#if defined (L_mulpsi3) + +;; A[0..2]: In: Multiplicand; Out: Product +#define A0 22 +#define A1 A0+1 +#define A2 A0+2 + +;; B[0..2]: In: Multiplier +#define B0 18 +#define B1 B0+1 +#define B2 B0+2 + +#if defined (__AVR_HAVE_MUL__) + +;; C[0..2]: Expand Result +#define C0 22 +#define C1 C0+1 +#define C2 C0+2 + +;; R24:R22 *= R20:R18 +;; Clobbers: r21, r25, r26, r27, __tmp_reg__ + +#define AA0 26 +#define AA2 21 + +DEFUN __mulpsi3 + wmov AA0, A0 + mov AA2, A2 + XCALL __umulhisi3 + mul AA2, B0 $ add C2, r0 + mul AA0, B2 $ add C2, r0 + clr __zero_reg__ + ret +ENDF __mulpsi3 + +#undef AA2 +#undef AA0 + +#undef C2 +#undef C1 +#undef C0 + +#else /* !HAVE_MUL */ +;; C[0..2]: Expand Result +#if defined (__AVR_TINY__) +#define C0 16 +#else +#define C0 0 +#endif 
/* defined (__AVR_TINY__) */ +#define C1 C0+1 +#define C2 21 + +;; R24:R22 *= R20:R18 +;; Clobbers: __tmp_reg__, R18, R19, R20, R21 + +DEFUN __mulpsi3 +#if defined (__AVR_TINY__) + in r26,__SP_L__ + in r27,__SP_H__ + subi r26, lo8(-3) ; Add 3 to point past return address + sbci r27, hi8(-3) + push B0 ; save callee saved regs + push B1 + ld B0,X+ ; load from caller stack + ld B1,X+ + ld B2,X+ +#endif /* defined (__AVR_TINY__) */ + + ;; C[] = 0 + clr __tmp_reg__ + clr C2 + +0: ;; Shift N-th Bit of B[] into Carry. N = 24 - Loop + LSR B2 $ ror B1 $ ror B0 + + ;; If the N-th Bit of B[] was set... + brcc 1f + + ;; ...then add A[] * 2^N to the Result C[] + ADD C0,A0 $ adc C1,A1 $ adc C2,A2 + +1: ;; Multiply A[] by 2 + LSL A0 $ rol A1 $ rol A2 + + ;; Loop until B[] is 0 + subi B0,0 $ sbci B1,0 $ sbci B2,0 + brne 0b + + ;; Copy C[] to the return Register A[] + wmov A0, C0 + mov A2, C2 + + clr __zero_reg__ +#if defined (__AVR_TINY__) + pop B1 + pop B0 +#endif /* (__AVR_TINY__) */ + ret +ENDF __mulpsi3 + +#undef C2 +#undef C1 +#undef C0 + +#endif /* HAVE_MUL */ + +#undef B2 +#undef B1 +#undef B0 + +#undef A2 +#undef A1 +#undef A0 + +#endif /* L_mulpsi3 */ + +#if defined (L_mulsqipsi3) && defined (__AVR_HAVE_MUL__) + +;; A[0..2]: In: Multiplicand +#define A0 22 +#define A1 A0+1 +#define A2 A0+2 + +;; BB: In: Multiplier +#define BB 25 + +;; C[0..2]: Result +#define C0 18 +#define C1 C0+1 +#define C2 C0+2 + +;; C[] = A[] * sign_extend (BB) +DEFUN __mulsqipsi3 + mul A0, BB + movw C0, r0 + mul A2, BB + mov C2, r0 + mul A1, BB + add C1, r0 + adc C2, r1 + clr __zero_reg__ + sbrs BB, 7 + ret + ;; One-extend BB + sub C1, A0 + sbc C2, A1 + ret +ENDF __mulsqipsi3 + +#undef C2 +#undef C1 +#undef C0 + +#undef BB + +#undef A2 +#undef A1 +#undef A0 + +#endif /* L_mulsqipsi3 && HAVE_MUL */ + +/******************************************************* + Multiplication 64 x 64 +*******************************************************/ + +;; A[] = A[] * B[] + +;; A[0..7]: In: Multiplicand +;; Out: Product +#define A0 18 +#define A1 A0+1 +#define A2 A0+2 +#define A3 A0+3 +#define A4 A0+4 +#define A5 A0+5 +#define A6 A0+6 +#define A7 A0+7 + +;; B[0..7]: In: Multiplier +#define B0 10 +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 +#define B4 B0+4 +#define B5 B0+5 +#define B6 B0+6 +#define B7 B0+7 + +#ifndef __AVR_TINY__ +#if defined (__AVR_HAVE_MUL__) +;; Define C[] for convenience +;; Notice that parts of C[] overlap A[] respective B[] +#define C0 16 +#define C1 C0+1 +#define C2 20 +#define C3 C2+1 +#define C4 28 +#define C5 C4+1 +#define C6 C4+2 +#define C7 C4+3 + +#if defined (L_muldi3) + +;; A[] *= B[] +;; R25:R18 *= R17:R10 +;; Ordinary ABI-Function + +DEFUN __muldi3 + push r29 + push r28 + push r17 + push r16 + + ;; Counting in Words, we have to perform a 4 * 4 Multiplication + + ;; 3 * 0 + 0 * 3 + mul A7,B0 $ $ mov C7,r0 + mul A0,B7 $ $ add C7,r0 + mul A6,B1 $ $ add C7,r0 + mul A6,B0 $ mov C6,r0 $ add C7,r1 + mul B6,A1 $ $ add C7,r0 + mul B6,A0 $ add C6,r0 $ adc C7,r1 + + ;; 1 * 2 + mul A2,B4 $ add C6,r0 $ adc C7,r1 + mul A3,B4 $ $ add C7,r0 + mul A2,B5 $ $ add C7,r0 + + push A5 + push A4 + push B1 + push B0 + push A3 + push A2 + + ;; 0 * 0 + wmov 26, B0 + XCALL __umulhisi3 + wmov C0, 22 + wmov C2, 24 + + ;; 0 * 2 + wmov 26, B4 + XCALL __umulhisi3 $ wmov C4,22 $ add C6,24 $ adc C7,25 + + wmov 26, B2 + ;; 0 * 1 + XCALL __muldi3_6 + + pop A0 + pop A1 + ;; 1 * 1 + wmov 26, B2 + XCALL __umulhisi3 $ add C4,22 $ adc C5,23 $ adc C6,24 $ adc C7,25 + + pop r26 + pop r27 + ;; 1 * 0 + XCALL __muldi3_6 + + pop A0 + pop A1 + 
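;; A1:A0 now hold the original A5:A4, i.e. word 2 of A;
+ ;; X still holds B1:B0 (word 0 of B) from the pops above +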
;; 2 * 0 + XCALL __umulhisi3 $ add C4,22 $ adc C5,23 $ adc C6,24 $ adc C7,25 + + ;; 2 * 1 + wmov 26, B2 + XCALL __umulhisi3 $ $ $ add C6,22 $ adc C7,23 + + ;; A[] = C[] + wmov A0, C0 + ;; A2 = C2 already + wmov A4, C4 + wmov A6, C6 + + pop r16 + pop r17 + pop r28 + pop r29 + ret +ENDF __muldi3 +#endif /* L_muldi3 */ + +#if defined (L_muldi3_6) +;; A helper for some 64-bit multiplications with MUL available +DEFUN __muldi3_6 +__muldi3_6: + XCALL __umulhisi3 + add C2, 22 + adc C3, 23 + adc C4, 24 + adc C5, 25 + brcc 0f + adiw C6, 1 +0: ret +ENDF __muldi3_6 +#endif /* L_muldi3_6 */ + +#undef C7 +#undef C6 +#undef C5 +#undef C4 +#undef C3 +#undef C2 +#undef C1 +#undef C0 + +#else /* !HAVE_MUL */ + +#if defined (L_muldi3) + +#define C0 26 +#define C1 C0+1 +#define C2 C0+2 +#define C3 C0+3 +#define C4 C0+4 +#define C5 C0+5 +#define C6 0 +#define C7 C6+1 + +#define Loop 9 + +;; A[] *= B[] +;; R25:R18 *= R17:R10 +;; Ordinary ABI-Function + +DEFUN __muldi3 + push r29 + push r28 + push Loop + + ldi C0, 64 + mov Loop, C0 + + ;; C[] = 0 + clr __tmp_reg__ + wmov C0, 0 + wmov C2, 0 + wmov C4, 0 + +0: ;; Rotate B[] right by 1 and set Carry to the N-th Bit of B[] + ;; where N = 64 - Loop. + ;; Notice that B[] = B[] >>> 64 so after this Routine has finished, + ;; B[] will have its initial Value again. + LSR B7 $ ror B6 $ ror B5 $ ror B4 + ror B3 $ ror B2 $ ror B1 $ ror B0 + + ;; If the N-th Bit of B[] was set then... + brcc 1f + ;; ...finish Rotation... + ori B7, 1 << 7 + + ;; ...and add A[] * 2^N to the Result C[] + ADD C0,A0 $ adc C1,A1 $ adc C2,A2 $ adc C3,A3 + adc C4,A4 $ adc C5,A5 $ adc C6,A6 $ adc C7,A7 + +1: ;; Multiply A[] by 2 + LSL A0 $ rol A1 $ rol A2 $ rol A3 + rol A4 $ rol A5 $ rol A6 $ rol A7 + + dec Loop + brne 0b + + ;; We expanded the Result in C[] + ;; Copy Result to the Return Register A[] + wmov A0, C0 + wmov A2, C2 + wmov A4, C4 + wmov A6, C6 + + clr __zero_reg__ + pop Loop + pop r28 + pop r29 + ret +ENDF __muldi3 + +#undef Loop + +#undef C7 +#undef C6 +#undef C5 +#undef C4 +#undef C3 +#undef C2 +#undef C1 +#undef C0 + +#endif /* L_muldi3 */ +#endif /* HAVE_MUL */ +#endif /* if not __AVR_TINY__ */ + +#undef B7 +#undef B6 +#undef B5 +#undef B4 +#undef B3 +#undef B2 +#undef B1 +#undef B0 + +#undef A7 +#undef A6 +#undef A5 +#undef A4 +#undef A3 +#undef A2 +#undef A1 +#undef A0 + +/******************************************************* + Widening Multiplication 64 = 32 x 32 with MUL +*******************************************************/ + +#if defined (__AVR_HAVE_MUL__) +#define A0 r22 +#define A1 r23 +#define A2 r24 +#define A3 r25 + +#define B0 r18 +#define B1 r19 +#define B2 r20 +#define B3 r21 + +#define C0 18 +#define C1 C0+1 +#define C2 20 +#define C3 C2+1 +#define C4 28 +#define C5 C4+1 +#define C6 C4+2 +#define C7 C4+3 + +#if defined (L_umulsidi3) + +;; Unsigned widening 64 = 32 * 32 Multiplication with MUL + +;; R18[8] = R22[4] * R18[4] +;; +;; Ordinary ABI Function, but additionally sets +;; X = R20[2] = B2[2] +;; Z = R22[2] = A0[2] +DEFUN __umulsidi3 + clt + ;; FALLTHRU +ENDF __umulsidi3 + ;; T = sign (A) +DEFUN __umulsidi3_helper + push 29 $ push 28 ; Y + wmov 30, A2 + ;; Counting in Words, we have to perform 4 Multiplications + ;; 0 * 0 + wmov 26, A0 + XCALL __umulhisi3 + push 23 $ push 22 ; C0 + wmov 28, B0 + wmov 18, B2 + wmov C2, 24 + push 27 $ push 26 ; A0 + push 19 $ push 18 ; B2 + ;; + ;; 18 20 22 24 26 28 30 | B2, B3, A0, A1, C0, C1, Y + ;; B2 C2 -- -- -- B0 A2 + ;; 1 * 1 + wmov 26, 30 ; A2 + XCALL __umulhisi3 + ;; Sign-extend A. 
T holds the sign of A + brtc 0f + ;; Subtract B from the high part of the result + sub 22, 28 + sbc 23, 29 + sbc 24, 18 + sbc 25, 19 +0: wmov 18, 28 ;; B0 + wmov C4, 22 + wmov C6, 24 + ;; + ;; 18 20 22 24 26 28 30 | B2, B3, A0, A1, C0, C1, Y + ;; B0 C2 -- -- A2 C4 C6 + ;; + ;; 1 * 0 + XCALL __muldi3_6 + ;; 0 * 1 + pop 26 $ pop 27 ;; B2 + pop 18 $ pop 19 ;; A0 + XCALL __muldi3_6 + + ;; Move result C into place and save A0 in Z + wmov 22, C4 + wmov 24, C6 + wmov 30, 18 ; A0 + pop C0 $ pop C1 + + ;; Epilogue + pop 28 $ pop 29 ;; Y + ret +ENDF __umulsidi3_helper +#endif /* L_umulsidi3 */ + + +#if defined (L_mulsidi3) + +;; Signed widening 64 = 32 * 32 Multiplication +;; +;; R18[8] = R22[4] * R18[4] +;; Ordinary ABI Function +DEFUN __mulsidi3 + bst A3, 7 + sbrs B3, 7 ; Enhanced core has no skip bug + XJMP __umulsidi3_helper + + ;; B needs sign-extension + push A3 + push A2 + XCALL __umulsidi3_helper + ;; A0 survived in Z + sub r22, r30 + sbc r23, r31 + pop r26 + pop r27 + sbc r24, r26 + sbc r25, r27 + ret +ENDF __mulsidi3 +#endif /* L_mulsidi3 */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#endif /* HAVE_MUL */ + +/********************************************************** + Widening Multiplication 64 = 32 x 32 without MUL +**********************************************************/ +#ifndef __AVR_TINY__ /* if not __AVR_TINY__ */ +#if defined (L_mulsidi3) && !defined (__AVR_HAVE_MUL__) +#define A0 18 +#define A1 A0+1 +#define A2 A0+2 +#define A3 A0+3 +#define A4 A0+4 +#define A5 A0+5 +#define A6 A0+6 +#define A7 A0+7 + +#define B0 10 +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 +#define B4 B0+4 +#define B5 B0+5 +#define B6 B0+6 +#define B7 B0+7 + +#define AA0 22 +#define AA1 AA0+1 +#define AA2 AA0+2 +#define AA3 AA0+3 + +#define BB0 18 +#define BB1 BB0+1 +#define BB2 BB0+2 +#define BB3 BB0+3 + +#define Mask r30 + +;; Signed / Unsigned widening 64 = 32 * 32 Multiplication without MUL +;; +;; R18[8] = R22[4] * R18[4] +;; Ordinary ABI Function +DEFUN __mulsidi3 + set + skip + ;; FALLTHRU +ENDF __mulsidi3 + +DEFUN __umulsidi3 + clt ; skipped + ;; Save 10 Registers: R10..R17, R28, R29 + do_prologue_saves 10 + ldi Mask, 0xff + bld Mask, 7 + ;; Move B into place... + wmov B0, BB0 + wmov B2, BB2 + ;; ...and extend it + and BB3, Mask + lsl BB3 + sbc B4, B4 + mov B5, B4 + wmov B6, B4 + ;; Move A into place... 
+ wmov A0, AA0 + wmov A2, AA2 + ;; ...and extend it + and AA3, Mask + lsl AA3 + sbc A4, A4 + mov A5, A4 + wmov A6, A4 + XCALL __muldi3 + do_epilogue_restores 10 +ENDF __umulsidi3 + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef AA0 +#undef AA1 +#undef AA2 +#undef AA3 +#undef BB0 +#undef BB1 +#undef BB2 +#undef BB3 +#undef Mask +#endif /* L_mulsidi3 && !HAVE_MUL */ +#endif /* if not __AVR_TINY__ */ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +.section .text.libgcc.div, "ax", @progbits + +/******************************************************* + Division 8 / 8 => (result + remainder) +*******************************************************/ +#define r_rem r25 /* remainder */ +#define r_arg1 r24 /* dividend, quotient */ +#define r_arg2 r22 /* divisor */ +#define r_cnt r23 /* loop count */ + +#if defined (L_udivmodqi4) +DEFUN __udivmodqi4 + sub r_rem,r_rem ; clear remainder and carry + ldi r_cnt,9 ; init loop counter + rjmp __udivmodqi4_ep ; jump to entry point +__udivmodqi4_loop: + rol r_rem ; shift dividend into remainder + cp r_rem,r_arg2 ; compare remainder & divisor + brcs __udivmodqi4_ep ; remainder <= divisor + sub r_rem,r_arg2 ; restore remainder +__udivmodqi4_ep: + rol r_arg1 ; shift dividend (with CARRY) + dec r_cnt ; decrement loop counter + brne __udivmodqi4_loop + com r_arg1 ; complement result + ; because C flag was complemented in loop + ret +ENDF __udivmodqi4 +#endif /* defined (L_udivmodqi4) */ + +#if defined (L_divmodqi4) +DEFUN __divmodqi4 + bst r_arg1,7 ; store sign of dividend + mov __tmp_reg__,r_arg1 + eor __tmp_reg__,r_arg2; r0.7 is sign of result + sbrc r_arg1,7 + neg r_arg1 ; dividend negative : negate + sbrc r_arg2,7 + neg r_arg2 ; divisor negative : negate + XCALL __udivmodqi4 ; do the unsigned div/mod + brtc __divmodqi4_1 + neg r_rem ; correct remainder sign +__divmodqi4_1: + sbrc __tmp_reg__,7 + neg r_arg1 ; correct result sign +__divmodqi4_exit: + ret +ENDF __divmodqi4 +#endif /* defined (L_divmodqi4) */ + +#undef r_rem +#undef r_arg1 +#undef r_arg2 +#undef r_cnt + + +/******************************************************* + Division 16 / 16 => (result + remainder) +*******************************************************/ +#define r_remL r26 /* remainder Low */ +#define r_remH r27 /* remainder High */ + +/* return: remainder */ +#define r_arg1L r24 /* dividend Low */ +#define r_arg1H r25 /* dividend High */ + +/* return: quotient */ +#define r_arg2L r22 /* divisor Low */ +#define r_arg2H r23 /* divisor High */ + +#define r_cnt r21 /* loop count */ + +#if defined (L_udivmodhi4) +DEFUN __udivmodhi4 + sub r_remL,r_remL + sub r_remH,r_remH ; clear remainder and carry + ldi r_cnt,17 ; init loop counter + rjmp __udivmodhi4_ep ; jump to entry point +__udivmodhi4_loop: + rol r_remL ; shift dividend into remainder + rol r_remH + cp r_remL,r_arg2L ; compare remainder & divisor + cpc r_remH,r_arg2H + brcs __udivmodhi4_ep ; remainder < divisor + sub r_remL,r_arg2L ; restore remainder + sbc r_remH,r_arg2H +__udivmodhi4_ep: + rol r_arg1L ; shift dividend (with CARRY) + rol r_arg1H + dec r_cnt ; decrement loop counter + brne __udivmodhi4_loop + com r_arg1L + com r_arg1H +; div/mod results to return registers, as for the div() function + mov_l r_arg2L, r_arg1L ; quotient + mov_h r_arg2H, r_arg1H + mov_l r_arg1L, r_remL ; remainder + mov_h r_arg1H, r_remH + ret +ENDF __udivmodhi4 +#endif /* defined (L_udivmodhi4) */ + +#if 
defined (L_divmodhi4) +DEFUN __divmodhi4 + .global _div +_div: + bst r_arg1H,7 ; store sign of dividend + mov __tmp_reg__,r_arg2H + brtc 0f + com __tmp_reg__ ; r0.7 is sign of result + rcall __divmodhi4_neg1 ; dividend negative: negate +0: + sbrc r_arg2H,7 + rcall __divmodhi4_neg2 ; divisor negative: negate + XCALL __udivmodhi4 ; do the unsigned div/mod + sbrc __tmp_reg__,7 + rcall __divmodhi4_neg2 ; correct remainder sign + brtc __divmodhi4_exit +__divmodhi4_neg1: + ;; correct dividend/remainder sign + com r_arg1H + neg r_arg1L + sbci r_arg1H,0xff + ret +__divmodhi4_neg2: + ;; correct divisor/result sign + com r_arg2H + neg r_arg2L + sbci r_arg2H,0xff +__divmodhi4_exit: + ret +ENDF __divmodhi4 +#endif /* defined (L_divmodhi4) */ + +#undef r_remH +#undef r_remL + +#undef r_arg1H +#undef r_arg1L + +#undef r_arg2H +#undef r_arg2L + +#undef r_cnt + +/******************************************************* + Division 24 / 24 => (result + remainder) +*******************************************************/ + +;; A[0..2]: In: Dividend; Out: Quotient +#define A0 22 +#define A1 A0+1 +#define A2 A0+2 + +;; B[0..2]: In: Divisor; Out: Remainder +#define B0 18 +#define B1 B0+1 +#define B2 B0+2 + +;; C[0..2]: Expand remainder +#define C0 __zero_reg__ +#define C1 26 +#define C2 25 + +;; Loop counter +#define r_cnt 21 + +#if defined (L_udivmodpsi4) +;; R24:R22 = R24:R24 udiv R20:R18 +;; R20:R18 = R24:R22 umod R20:R18 +;; Clobbers: R21, R25, R26 + +DEFUN __udivmodpsi4 + ; init loop counter + ldi r_cnt, 24+1 + ; Clear remainder and carry. C0 is already 0 + clr C1 + sub C2, C2 + ; jump to entry point + rjmp __udivmodpsi4_start +__udivmodpsi4_loop: + ; shift dividend into remainder + rol C0 + rol C1 + rol C2 + ; compare remainder & divisor + cp C0, B0 + cpc C1, B1 + cpc C2, B2 + brcs __udivmodpsi4_start ; remainder <= divisor + sub C0, B0 ; restore remainder + sbc C1, B1 + sbc C2, B2 +__udivmodpsi4_start: + ; shift dividend (with CARRY) + rol A0 + rol A1 + rol A2 + ; decrement loop counter + dec r_cnt + brne __udivmodpsi4_loop + com A0 + com A1 + com A2 + ; div/mod results to return registers + ; remainder + mov B0, C0 + mov B1, C1 + mov B2, C2 + clr __zero_reg__ ; C0 + ret +ENDF __udivmodpsi4 +#endif /* defined (L_udivmodpsi4) */ + +#if defined (L_divmodpsi4) +;; R24:R22 = R24:R22 div R20:R18 +;; R20:R18 = R24:R22 mod R20:R18 +;; Clobbers: T, __tmp_reg__, R21, R25, R26 + +DEFUN __divmodpsi4 + ; R0.7 will contain the sign of the result: + ; R0.7 = A.sign ^ B.sign + mov __tmp_reg__, B2 + ; T-flag = sign of dividend + bst A2, 7 + brtc 0f + com __tmp_reg__ + ; Adjust dividend's sign + rcall __divmodpsi4_negA +0: + ; Adjust divisor's sign + sbrc B2, 7 + rcall __divmodpsi4_negB + + ; Do the unsigned div/mod + XCALL __udivmodpsi4 + + ; Adjust quotient's sign + sbrc __tmp_reg__, 7 + rcall __divmodpsi4_negA + + ; Adjust remainder's sign + brtc __divmodpsi4_end + +__divmodpsi4_negB: + ; Correct divisor/remainder sign + com B2 + com B1 + neg B0 + sbci B1, -1 + sbci B2, -1 + ret + + ; Correct dividend/quotient sign +__divmodpsi4_negA: + com A2 + com A1 + neg A0 + sbci A1, -1 + sbci A2, -1 +__divmodpsi4_end: + ret + +ENDF __divmodpsi4 +#endif /* defined (L_divmodpsi4) */ + +#undef A0 +#undef A1 +#undef A2 + +#undef B0 +#undef B1 +#undef B2 + +#undef C0 +#undef C1 +#undef C2 + +#undef r_cnt + +/******************************************************* + Division 32 / 32 => (result + remainder) +*******************************************************/ +#define r_remHH r31 /* remainder High */ +#define r_remHL r30 +#define 
r_remH r27 +#define r_remL r26 /* remainder Low */ + +/* return: remainder */ +#define r_arg1HH r25 /* dividend High */ +#define r_arg1HL r24 +#define r_arg1H r23 +#define r_arg1L r22 /* dividend Low */ + +/* return: quotient */ +#define r_arg2HH r21 /* divisor High */ +#define r_arg2HL r20 +#define r_arg2H r19 +#define r_arg2L r18 /* divisor Low */ + +#define r_cnt __zero_reg__ /* loop count (0 after the loop!) */ + +#if defined (L_udivmodsi4) +DEFUN __udivmodsi4 + ldi r_remL, 33 ; init loop counter + mov r_cnt, r_remL + sub r_remL,r_remL + sub r_remH,r_remH ; clear remainder and carry + mov_l r_remHL, r_remL + mov_h r_remHH, r_remH + rjmp __udivmodsi4_ep ; jump to entry point +__udivmodsi4_loop: + rol r_remL ; shift dividend into remainder + rol r_remH + rol r_remHL + rol r_remHH + cp r_remL,r_arg2L ; compare remainder & divisor + cpc r_remH,r_arg2H + cpc r_remHL,r_arg2HL + cpc r_remHH,r_arg2HH + brcs __udivmodsi4_ep ; remainder <= divisor + sub r_remL,r_arg2L ; restore remainder + sbc r_remH,r_arg2H + sbc r_remHL,r_arg2HL + sbc r_remHH,r_arg2HH +__udivmodsi4_ep: + rol r_arg1L ; shift dividend (with CARRY) + rol r_arg1H + rol r_arg1HL + rol r_arg1HH + dec r_cnt ; decrement loop counter + brne __udivmodsi4_loop + ; __zero_reg__ now restored (r_cnt == 0) + com r_arg1L + com r_arg1H + com r_arg1HL + com r_arg1HH +; div/mod results to return registers, as for the ldiv() function + mov_l r_arg2L, r_arg1L ; quotient + mov_h r_arg2H, r_arg1H + mov_l r_arg2HL, r_arg1HL + mov_h r_arg2HH, r_arg1HH + mov_l r_arg1L, r_remL ; remainder + mov_h r_arg1H, r_remH + mov_l r_arg1HL, r_remHL + mov_h r_arg1HH, r_remHH + ret +ENDF __udivmodsi4 +#endif /* defined (L_udivmodsi4) */ + +#if defined (L_divmodsi4) +DEFUN __divmodsi4 + mov __tmp_reg__,r_arg2HH + bst r_arg1HH,7 ; store sign of dividend + brtc 0f + com __tmp_reg__ ; r0.7 is sign of result + XCALL __negsi2 ; dividend negative: negate +0: + sbrc r_arg2HH,7 + rcall __divmodsi4_neg2 ; divisor negative: negate + XCALL __udivmodsi4 ; do the unsigned div/mod + sbrc __tmp_reg__, 7 ; correct quotient sign + rcall __divmodsi4_neg2 + brtc __divmodsi4_exit ; correct remainder sign + XJMP __negsi2 +__divmodsi4_neg2: + ;; correct divisor/quotient sign + com r_arg2HH + com r_arg2HL + com r_arg2H + neg r_arg2L + sbci r_arg2H,0xff + sbci r_arg2HL,0xff + sbci r_arg2HH,0xff +__divmodsi4_exit: + ret +ENDF __divmodsi4 +#endif /* defined (L_divmodsi4) */ + +#if defined (L_negsi2) +;; (set (reg:SI 22) +;; (neg:SI (reg:SI 22))) +;; Sets the V flag for signed overflow tests +DEFUN __negsi2 + NEG4 22 + ret +ENDF __negsi2 +#endif /* L_negsi2 */ + +#undef r_remHH +#undef r_remHL +#undef r_remH +#undef r_remL +#undef r_arg1HH +#undef r_arg1HL +#undef r_arg1H +#undef r_arg1L +#undef r_arg2HH +#undef r_arg2HL +#undef r_arg2H +#undef r_arg2L +#undef r_cnt + +/* *di routines use registers below R19 and won't work with tiny arch + right now. */ + +#if !defined (__AVR_TINY__) +/******************************************************* + Division 64 / 64 + Modulo 64 % 64 +*******************************************************/ + +;; Use Speed-optimized Version on "big" Devices, i.e. Devices with +;; at least 16k of Program Memory. For smaller Devices, depend +;; on MOVW and SP Size. There is a Connexion between SP Size and +;; Flash Size so that SP Size can be used to test for Flash Size. 
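+;; Whichever variant is selected below, __udivmod64 performs the same
+;; restoring shift-subtract division; the speed-optimized paths merely
+;; pre-shift the Dividend in byte or 32-bit chunks (SPEED_DIV = 8 or 16)
+;; to cut down the number of single-bit iterations.  Illustrative C sketch
+;; of the plain bit loop (a model only, assuming C99 <stdint.h> types;
+;; not the code itself):
+;;
+;;     static uint64_t udivmod64_model (uint64_t a, uint64_t b, uint64_t *rem)
+;;     {
+;;         uint64_t r = 0;
+;;         for (int i = 0; i < 64; i++)
+;;         {
+;;             r = (r << 1) | (a >> 63);  /* shift dividend into remainder */
+;;             a <<= 1;
+;;             if (r >= b)
+;;             {
+;;                 r -= b;                /* divisor fits: subtract it...  */
+;;                 a |= 1;                /* ...and record a quotient bit  */
+;;             }
+;;         }
+;;         *rem = r;
+;;         return a;                      /* A ends up holding the quotient */
+;;     }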
+ +#if defined (__AVR_HAVE_JMP_CALL__) +# define SPEED_DIV 8 +#elif defined (__AVR_HAVE_MOVW__) && defined (__AVR_HAVE_SPH__) +# define SPEED_DIV 16 +#else +# define SPEED_DIV 0 +#endif + +;; A[0..7]: In: Dividend; +;; Out: Quotient (T = 0) +;; Out: Remainder (T = 1) +#define A0 18 +#define A1 A0+1 +#define A2 A0+2 +#define A3 A0+3 +#define A4 A0+4 +#define A5 A0+5 +#define A6 A0+6 +#define A7 A0+7 + +;; B[0..7]: In: Divisor; Out: Clobber +#define B0 10 +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 +#define B4 B0+4 +#define B5 B0+5 +#define B6 B0+6 +#define B7 B0+7 + +;; C[0..7]: Expand remainder; Out: Remainder (unused) +#define C0 8 +#define C1 C0+1 +#define C2 30 +#define C3 C2+1 +#define C4 28 +#define C5 C4+1 +#define C6 26 +#define C7 C6+1 + +;; Holds Signs during Division Routine +#define SS __tmp_reg__ + +;; Bit-Counter in Division Routine +#define R_cnt __zero_reg__ + +;; Scratch Register for Negation +#define NN r31 + +#if defined (L_udivdi3) + +;; R25:R18 = R24:R18 umod R17:R10 +;; Ordinary ABI-Function + +DEFUN __umoddi3 + set + rjmp __udivdi3_umoddi3 +ENDF __umoddi3 + +;; R25:R18 = R24:R18 udiv R17:R10 +;; Ordinary ABI-Function + +DEFUN __udivdi3 + clt +ENDF __udivdi3 + +DEFUN __udivdi3_umoddi3 + push C0 + push C1 + push C4 + push C5 + XCALL __udivmod64 + pop C5 + pop C4 + pop C1 + pop C0 + ret +ENDF __udivdi3_umoddi3 +#endif /* L_udivdi3 */ + +#if defined (L_udivmod64) + +;; Worker Routine for 64-Bit unsigned Quotient and Remainder Computation +;; No Registers saved/restored; the Callers will take Care. +;; Preserves B[] and T-flag +;; T = 0: Compute Quotient in A[] +;; T = 1: Compute Remainder in A[] and shift SS one Bit left + +DEFUN __udivmod64 + + ;; Clear Remainder (C6, C7 will follow) + clr C0 + clr C1 + wmov C2, C0 + wmov C4, C0 + ldi C7, 64 + +#if SPEED_DIV == 0 || SPEED_DIV == 16 + ;; Initialize Loop-Counter + mov R_cnt, C7 + wmov C6, C0 +#endif /* SPEED_DIV */ + +#if SPEED_DIV == 8 + + push A7 + clr C6 + +1: ;; Compare shifted Devidend against Divisor + ;; If -- even after Shifting -- it is smaller... + CP A7,B0 $ cpc C0,B1 $ cpc C1,B2 $ cpc C2,B3 + cpc C3,B4 $ cpc C4,B5 $ cpc C5,B6 $ cpc C6,B7 + brcc 2f + + ;; ...then we can subtract it. Thus, it is legal to shift left + $ mov C6,C5 $ mov C5,C4 $ mov C4,C3 + mov C3,C2 $ mov C2,C1 $ mov C1,C0 $ mov C0,A7 + mov A7,A6 $ mov A6,A5 $ mov A5,A4 $ mov A4,A3 + mov A3,A2 $ mov A2,A1 $ mov A1,A0 $ clr A0 + + ;; 8 Bits are done + subi C7, 8 + brne 1b + + ;; Shifted 64 Bits: A7 has traveled to C7 + pop C7 + ;; Divisor is greater than Dividend. We have: + ;; A[] % B[] = A[] + ;; A[] / B[] = 0 + ;; Thus, we can return immediately + rjmp 5f + +2: ;; Initialze Bit-Counter with Number of Bits still to be performed + mov R_cnt, C7 + + ;; Push of A7 is not needed because C7 is still 0 + pop C7 + clr C7 + +#elif SPEED_DIV == 16 + + ;; Compare shifted Dividend against Divisor + cp A7, B3 + cpc C0, B4 + cpc C1, B5 + cpc C2, B6 + cpc C3, B7 + brcc 2f + + ;; Divisor is greater than shifted Dividen: We can shift the Dividend + ;; and it is still smaller than the Divisor --> Shift one 32-Bit Chunk + wmov C2,A6 $ wmov C0,A4 + wmov A6,A2 $ wmov A4,A0 + wmov A2,C6 $ wmov A0,C4 + + ;; Set Bit Counter to 32 + lsr R_cnt +2: +#elif SPEED_DIV +#error SPEED_DIV = ? +#endif /* SPEED_DIV */ + +;; The very Division + Remainder Routine + +3: ;; Left-shift Dividend... 
+ lsl A0 $ rol A1 $ rol A2 $ rol A3 + rol A4 $ rol A5 $ rol A6 $ rol A7 + + ;; ...into Remainder + rol C0 $ rol C1 $ rol C2 $ rol C3 + rol C4 $ rol C5 $ rol C6 $ rol C7 + + ;; Compare Remainder and Divisor + CP C0,B0 $ cpc C1,B1 $ cpc C2,B2 $ cpc C3,B3 + cpc C4,B4 $ cpc C5,B5 $ cpc C6,B6 $ cpc C7,B7 + + brcs 4f + + ;; Divisor fits into Remainder: Subtract it from Remainder... + SUB C0,B0 $ sbc C1,B1 $ sbc C2,B2 $ sbc C3,B3 + sbc C4,B4 $ sbc C5,B5 $ sbc C6,B6 $ sbc C7,B7 + + ;; ...and set according Bit in the upcoming Quotient + ;; The Bit will travel to its final Position + ori A0, 1 + +4: ;; This Bit is done + dec R_cnt + brne 3b + ;; __zero_reg__ is 0 again + + ;; T = 0: We are fine with the Quotient in A[] + ;; T = 1: Copy Remainder to A[] +5: brtc 6f + wmov A0, C0 + wmov A2, C2 + wmov A4, C4 + wmov A6, C6 + ;; Move the Sign of the Result to SS.7 + lsl SS + +6: ret + +ENDF __udivmod64 +#endif /* L_udivmod64 */ + + +#if defined (L_divdi3) + +;; R25:R18 = R24:R18 mod R17:R10 +;; Ordinary ABI-Function + +DEFUN __moddi3 + set + rjmp __divdi3_moddi3 +ENDF __moddi3 + +;; R25:R18 = R24:R18 div R17:R10 +;; Ordinary ABI-Function + +DEFUN __divdi3 + clt +ENDF __divdi3 + +DEFUN __divdi3_moddi3 +#if SPEED_DIV + mov r31, A7 + or r31, B7 + brmi 0f + ;; Both Signs are 0: the following Complexitiy is not needed + XJMP __udivdi3_umoddi3 +#endif /* SPEED_DIV */ + +0: ;; The Prologue + ;; Save 12 Registers: Y, 17...8 + ;; No Frame needed + do_prologue_saves 12 + + ;; SS.7 will contain the Sign of the Quotient (A.sign * B.sign) + ;; SS.6 will contain the Sign of the Remainder (A.sign) + mov SS, A7 + asr SS + ;; Adjust Dividend's Sign as needed +#if SPEED_DIV + ;; Compiling for Speed we know that at least one Sign must be < 0 + ;; Thus, if A[] >= 0 then we know B[] < 0 + brpl 22f +#else + brpl 21f +#endif /* SPEED_DIV */ + + XCALL __negdi2 + + ;; Adjust Divisor's Sign and SS.7 as needed +21: tst B7 + brpl 3f +22: ldi NN, 1 << 7 + eor SS, NN + + ldi NN, -1 + com B4 $ com B5 $ com B6 $ com B7 + $ com B1 $ com B2 $ com B3 + NEG B0 + $ sbc B1,NN $ sbc B2,NN $ sbc B3,NN + sbc B4,NN $ sbc B5,NN $ sbc B6,NN $ sbc B7,NN + +3: ;; Do the unsigned 64-Bit Division/Modulo (depending on T-flag) + XCALL __udivmod64 + + ;; Adjust Result's Sign +#ifdef __AVR_ERRATA_SKIP_JMP_CALL__ + tst SS + brpl 4f +#else + sbrc SS, 7 +#endif /* __AVR_HAVE_JMP_CALL__ */ + XCALL __negdi2 + +4: ;; Epilogue: Restore 12 Registers and return + do_epilogue_restores 12 + +ENDF __divdi3_moddi3 + +#endif /* L_divdi3 */ + +#undef R_cnt +#undef SS +#undef NN + +.section .text.libgcc, "ax", @progbits + +#define TT __tmp_reg__ + +#if defined (L_adddi3) +;; (set (reg:DI 18) +;; (plus:DI (reg:DI 18) +;; (reg:DI 10))) +;; Sets the V flag for signed overflow tests +;; Sets the C flag for unsigned overflow tests +DEFUN __adddi3 + ADD A0,B0 $ adc A1,B1 $ adc A2,B2 $ adc A3,B3 + adc A4,B4 $ adc A5,B5 $ adc A6,B6 $ adc A7,B7 + ret +ENDF __adddi3 +#endif /* L_adddi3 */ + +#if defined (L_adddi3_s8) +;; (set (reg:DI 18) +;; (plus:DI (reg:DI 18) +;; (sign_extend:SI (reg:QI 26)))) +;; Sets the V flag for signed overflow tests +;; Sets the C flag for unsigned overflow tests provided 0 <= R26 < 128 +DEFUN __adddi3_s8 + clr TT + sbrc r26, 7 + com TT + ADD A0,r26 $ adc A1,TT $ adc A2,TT $ adc A3,TT + adc A4,TT $ adc A5,TT $ adc A6,TT $ adc A7,TT + ret +ENDF __adddi3_s8 +#endif /* L_adddi3_s8 */ + +#if defined (L_subdi3) +;; (set (reg:DI 18) +;; (minus:DI (reg:DI 18) +;; (reg:DI 10))) +;; Sets the V flag for signed overflow tests +;; Sets the C flag for unsigned 
overflow tests +DEFUN __subdi3 + SUB A0,B0 $ sbc A1,B1 $ sbc A2,B2 $ sbc A3,B3 + sbc A4,B4 $ sbc A5,B5 $ sbc A6,B6 $ sbc A7,B7 + ret +ENDF __subdi3 +#endif /* L_subdi3 */ + +#if defined (L_cmpdi2) +;; (set (cc0) +;; (compare (reg:DI 18) +;; (reg:DI 10))) +DEFUN __cmpdi2 + CP A0,B0 $ cpc A1,B1 $ cpc A2,B2 $ cpc A3,B3 + cpc A4,B4 $ cpc A5,B5 $ cpc A6,B6 $ cpc A7,B7 + ret +ENDF __cmpdi2 +#endif /* L_cmpdi2 */ + +#if defined (L_cmpdi2_s8) +;; (set (cc0) +;; (compare (reg:DI 18) +;; (sign_extend:SI (reg:QI 26)))) +DEFUN __cmpdi2_s8 + clr TT + sbrc r26, 7 + com TT + CP A0,r26 $ cpc A1,TT $ cpc A2,TT $ cpc A3,TT + cpc A4,TT $ cpc A5,TT $ cpc A6,TT $ cpc A7,TT + ret +ENDF __cmpdi2_s8 +#endif /* L_cmpdi2_s8 */ + +#if defined (L_negdi2) +;; (set (reg:DI 18) +;; (neg:DI (reg:DI 18))) +;; Sets the V flag for signed overflow tests +DEFUN __negdi2 + + com A4 $ com A5 $ com A6 $ com A7 + $ com A1 $ com A2 $ com A3 + NEG A0 + $ sbci A1,-1 $ sbci A2,-1 $ sbci A3,-1 + sbci A4,-1 $ sbci A5,-1 $ sbci A6,-1 $ sbci A7,-1 + ret + +ENDF __negdi2 +#endif /* L_negdi2 */ + +#undef TT + +#undef C7 +#undef C6 +#undef C5 +#undef C4 +#undef C3 +#undef C2 +#undef C1 +#undef C0 + +#undef B7 +#undef B6 +#undef B5 +#undef B4 +#undef B3 +#undef B2 +#undef B1 +#undef B0 + +#undef A7 +#undef A6 +#undef A5 +#undef A4 +#undef A3 +#undef A2 +#undef A1 +#undef A0 + +#endif /* !defined (__AVR_TINY__) */ + + +.section .text.libgcc.prologue, "ax", @progbits + +/********************************** + * This is a prologue subroutine + **********************************/ +#if !defined (__AVR_TINY__) +#if defined (L_prologue) + +;; This function does not clobber T-flag; 64-bit division relies on it +DEFUN __prologue_saves__ + push r2 + push r3 + push r4 + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 +#if !defined (__AVR_HAVE_SPH__) + in r28,__SP_L__ + sub r28,r26 + out __SP_L__,r28 + clr r29 +#elif defined (__AVR_XMEGA__) + in r28,__SP_L__ + in r29,__SP_H__ + sub r28,r26 + sbc r29,r27 + out __SP_L__,r28 + out __SP_H__,r29 +#else + in r28,__SP_L__ + in r29,__SP_H__ + sub r28,r26 + sbc r29,r27 + in __tmp_reg__,__SREG__ + cli + out __SP_H__,r29 + out __SREG__,__tmp_reg__ + out __SP_L__,r28 +#endif /* #SP = 8/16 */ + + XIJMP + +ENDF __prologue_saves__ +#endif /* defined (L_prologue) */ + +/* + * This is an epilogue subroutine + */ +#if defined (L_epilogue) + +DEFUN __epilogue_restores__ + ldd r2,Y+18 + ldd r3,Y+17 + ldd r4,Y+16 + ldd r5,Y+15 + ldd r6,Y+14 + ldd r7,Y+13 + ldd r8,Y+12 + ldd r9,Y+11 + ldd r10,Y+10 + ldd r11,Y+9 + ldd r12,Y+8 + ldd r13,Y+7 + ldd r14,Y+6 + ldd r15,Y+5 + ldd r16,Y+4 + ldd r17,Y+3 + ldd r26,Y+2 +#if !defined (__AVR_HAVE_SPH__) + ldd r29,Y+1 + add r28,r30 + out __SP_L__,r28 + mov r28, r26 +#elif defined (__AVR_XMEGA__) + ldd r27,Y+1 + add r28,r30 + adc r29,__zero_reg__ + out __SP_L__,r28 + out __SP_H__,r29 + wmov 28, 26 +#else + ldd r27,Y+1 + add r28,r30 + adc r29,__zero_reg__ + in __tmp_reg__,__SREG__ + cli + out __SP_H__,r29 + out __SREG__,__tmp_reg__ + out __SP_L__,r28 + mov_l r28, r26 + mov_h r29, r27 +#endif /* #SP = 8/16 */ + ret +ENDF __epilogue_restores__ +#endif /* defined (L_epilogue) */ +#endif /* !defined (__AVR_TINY__) */ + +#ifdef L_exit + .section .fini9,"ax",@progbits +DEFUN _exit + .weak exit +exit: +ENDF _exit + + /* Code from .fini8 ... .fini1 sections inserted by ld script. 
*/ + + .section .fini0,"ax",@progbits + cli +__stop_program: + rjmp __stop_program +#endif /* defined (L_exit) */ + +#ifdef L_cleanup + .weak _cleanup + .func _cleanup +_cleanup: + ret +.endfunc +#endif /* defined (L_cleanup) */ + + +.section .text.libgcc, "ax", @progbits + +#ifdef L_tablejump2 +DEFUN __tablejump2__ + lsl r30 + rol r31 +#if defined (__AVR_HAVE_EIJMP_EICALL__) + ;; Word address of gs() jumptable entry in R24:Z + rol r24 + out __RAMPZ__, r24 +#elif defined (__AVR_HAVE_ELPM__) + ;; Word address of jumptable entry in Z + clr __tmp_reg__ + rol __tmp_reg__ + out __RAMPZ__, __tmp_reg__ +#endif + + ;; Read word address from jumptable and jump + +#if defined (__AVR_HAVE_ELPMX__) + elpm __tmp_reg__, Z+ + elpm r31, Z + mov r30, __tmp_reg__ +#ifdef __AVR_HAVE_RAMPD__ + ;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM + out __RAMPZ__, __zero_reg__ +#endif /* RAMPD */ + XIJMP +#elif defined (__AVR_HAVE_ELPM__) + elpm + push r0 + adiw r30, 1 + elpm + push r0 + ret +#elif defined (__AVR_HAVE_LPMX__) + lpm __tmp_reg__, Z+ + lpm r31, Z + mov r30, __tmp_reg__ + ijmp +#elif defined (__AVR_TINY__) + wsubi 30, -(__AVR_TINY_PM_BASE_ADDRESS__) ; Add PM offset to Z + ld __tmp_reg__, Z+ + ld r31, Z ; Use ld instead of lpm to load Z + mov r30, __tmp_reg__ + ijmp +#else + lpm + push r0 + adiw r30, 1 + lpm + push r0 + ret +#endif +ENDF __tablejump2__ +#endif /* L_tablejump2 */ + +#if defined(__AVR_TINY__) +#ifdef L_copy_data + .section .init4,"ax",@progbits + .global __do_copy_data +__do_copy_data: + ldi r18, hi8(__data_end) + ldi r26, lo8(__data_start) + ldi r27, hi8(__data_start) + ldi r30, lo8(__data_load_start + __AVR_TINY_PM_BASE_ADDRESS__) + ldi r31, hi8(__data_load_start + __AVR_TINY_PM_BASE_ADDRESS__) + rjmp .L__do_copy_data_start +.L__do_copy_data_loop: + ld r19, z+ + st X+, r19 +.L__do_copy_data_start: + cpi r26, lo8(__data_end) + cpc r27, r18 + brne .L__do_copy_data_loop +#endif +#else +#ifdef L_copy_data + .section .init4,"ax",@progbits +DEFUN __do_copy_data +#if defined(__AVR_HAVE_ELPMX__) + ldi r17, hi8(__data_end) + ldi r26, lo8(__data_start) + ldi r27, hi8(__data_start) + ldi r30, lo8(__data_load_start) + ldi r31, hi8(__data_load_start) + ldi r16, hh8(__data_load_start) + out __RAMPZ__, r16 + rjmp .L__do_copy_data_start +.L__do_copy_data_loop: + elpm r0, Z+ + st X+, r0 +.L__do_copy_data_start: + cpi r26, lo8(__data_end) + cpc r27, r17 + brne .L__do_copy_data_loop +#elif !defined(__AVR_HAVE_ELPMX__) && defined(__AVR_HAVE_ELPM__) + ldi r17, hi8(__data_end) + ldi r26, lo8(__data_start) + ldi r27, hi8(__data_start) + ldi r30, lo8(__data_load_start) + ldi r31, hi8(__data_load_start) + ldi r16, hh8(__data_load_start - 0x10000) +.L__do_copy_data_carry: + inc r16 + out __RAMPZ__, r16 + rjmp .L__do_copy_data_start +.L__do_copy_data_loop: + elpm + st X+, r0 + adiw r30, 1 + brcs .L__do_copy_data_carry +.L__do_copy_data_start: + cpi r26, lo8(__data_end) + cpc r27, r17 + brne .L__do_copy_data_loop +#elif !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__) + ldi r17, hi8(__data_end) + ldi r26, lo8(__data_start) + ldi r27, hi8(__data_start) + ldi r30, lo8(__data_load_start) + ldi r31, hi8(__data_load_start) + rjmp .L__do_copy_data_start +.L__do_copy_data_loop: +#if defined (__AVR_HAVE_LPMX__) + lpm r0, Z+ +#else + lpm + adiw r30, 1 +#endif + st X+, r0 +.L__do_copy_data_start: + cpi r26, lo8(__data_end) + cpc r27, r17 + brne .L__do_copy_data_loop +#endif /* !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__) */ +#if defined (__AVR_HAVE_ELPM__) && defined 
(__AVR_HAVE_RAMPD__) + ;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM + out __RAMPZ__, __zero_reg__ +#endif /* ELPM && RAMPD */ +ENDF __do_copy_data +#endif /* L_copy_data */ +#endif /* !defined (__AVR_TINY__) */ + +/* __do_clear_bss is only necessary if there is anything in .bss section. */ + +#ifdef L_clear_bss + .section .init4,"ax",@progbits +DEFUN __do_clear_bss + ldi r18, hi8(__bss_end) + ldi r26, lo8(__bss_start) + ldi r27, hi8(__bss_start) + rjmp .do_clear_bss_start +.do_clear_bss_loop: + st X+, __zero_reg__ +.do_clear_bss_start: + cpi r26, lo8(__bss_end) + cpc r27, r18 + brne .do_clear_bss_loop +ENDF __do_clear_bss +#endif /* L_clear_bss */ + +/* __do_global_ctors and __do_global_dtors are only necessary + if there are any constructors/destructors. */ + +#if defined(__AVR_TINY__) +#define cdtors_tst_reg r18 +#else +#define cdtors_tst_reg r17 +#endif + +#ifdef L_ctors + .section .init6,"ax",@progbits +DEFUN __do_global_ctors + ldi cdtors_tst_reg, pm_hi8(__ctors_start) + ldi r28, pm_lo8(__ctors_end) + ldi r29, pm_hi8(__ctors_end) +#ifdef __AVR_HAVE_EIJMP_EICALL__ + ldi r16, pm_hh8(__ctors_end) +#endif /* HAVE_EIJMP */ + rjmp .L__do_global_ctors_start +.L__do_global_ctors_loop: + wsubi 28, 1 +#ifdef __AVR_HAVE_EIJMP_EICALL__ + sbc r16, __zero_reg__ + mov r24, r16 +#endif /* HAVE_EIJMP */ + mov_h r31, r29 + mov_l r30, r28 + XCALL __tablejump2__ +.L__do_global_ctors_start: + cpi r28, pm_lo8(__ctors_start) + cpc r29, cdtors_tst_reg +#ifdef __AVR_HAVE_EIJMP_EICALL__ + ldi r24, pm_hh8(__ctors_start) + cpc r16, r24 +#endif /* HAVE_EIJMP */ + brne .L__do_global_ctors_loop +ENDF __do_global_ctors +#endif /* L_ctors */ + +#ifdef L_dtors + .section .fini6,"ax",@progbits +DEFUN __do_global_dtors + ldi cdtors_tst_reg, pm_hi8(__dtors_end) + ldi r28, pm_lo8(__dtors_start) + ldi r29, pm_hi8(__dtors_start) +#ifdef __AVR_HAVE_EIJMP_EICALL__ + ldi r16, pm_hh8(__dtors_start) +#endif /* HAVE_EIJMP */ + rjmp .L__do_global_dtors_start +.L__do_global_dtors_loop: +#ifdef __AVR_HAVE_EIJMP_EICALL__ + mov r24, r16 +#endif /* HAVE_EIJMP */ + mov_h r31, r29 + mov_l r30, r28 + XCALL __tablejump2__ + waddi 28, 1 +#ifdef __AVR_HAVE_EIJMP_EICALL__ + adc r16, __zero_reg__ +#endif /* HAVE_EIJMP */ +.L__do_global_dtors_start: + cpi r28, pm_lo8(__dtors_end) + cpc r29, cdtors_tst_reg +#ifdef __AVR_HAVE_EIJMP_EICALL__ + ldi r24, pm_hh8(__dtors_end) + cpc r16, r24 +#endif /* HAVE_EIJMP */ + brne .L__do_global_dtors_loop +ENDF __do_global_dtors +#endif /* L_dtors */ + +#undef cdtors_tst_reg + +.section .text.libgcc, "ax", @progbits + +#if !defined (__AVR_TINY__) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Loading n bytes from Flash; n = 3,4 +;; R22... = Flash[Z] +;; Clobbers: __tmp_reg__ + +#if (defined (L_load_3) \ + || defined (L_load_4)) \ + && !defined (__AVR_HAVE_LPMX__) + +;; Destination +#define D0 22 +#define D1 D0+1 +#define D2 D0+2 +#define D3 D0+3 + +.macro .load dest, n + lpm + mov \dest, r0 +.if \dest != D0+\n-1 + adiw r30, 1 +.else + sbiw r30, \n-1 +.endif +.endm + +#if defined (L_load_3) +DEFUN __load_3 + push D3 + XCALL __load_4 + pop D3 + ret +ENDF __load_3 +#endif /* L_load_3 */ + +#if defined (L_load_4) +DEFUN __load_4 + .load D0, 4 + .load D1, 4 + .load D2, 4 + .load D3, 4 + ret +ENDF __load_4 +#endif /* L_load_4 */ + +#endif /* L_load_3 || L_load_3 */ +#endif /* !defined (__AVR_TINY__) */ + +#if !defined (__AVR_TINY__) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Loading n bytes from Flash or RAM; n = 1,2,3,4 +;; R22... 
= Flash[R21:Z] or RAM[Z] depending on R21.7 +;; Clobbers: __tmp_reg__, R21, R30, R31 + +#if (defined (L_xload_1) \ + || defined (L_xload_2) \ + || defined (L_xload_3) \ + || defined (L_xload_4)) + +;; Destination +#define D0 22 +#define D1 D0+1 +#define D2 D0+2 +#define D3 D0+3 + +;; Register containing bits 16+ of the address + +#define HHI8 21 + +.macro .xload dest, n +#if defined (__AVR_HAVE_ELPMX__) + elpm \dest, Z+ +#elif defined (__AVR_HAVE_ELPM__) + elpm + mov \dest, r0 +.if \dest != D0+\n-1 + adiw r30, 1 + adc HHI8, __zero_reg__ + out __RAMPZ__, HHI8 +.endif +#elif defined (__AVR_HAVE_LPMX__) + lpm \dest, Z+ +#else + lpm + mov \dest, r0 +.if \dest != D0+\n-1 + adiw r30, 1 +.endif +#endif +#if defined (__AVR_HAVE_ELPM__) && defined (__AVR_HAVE_RAMPD__) +.if \dest == D0+\n-1 + ;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM + out __RAMPZ__, __zero_reg__ +.endif +#endif +.endm ; .xload + +#if defined (L_xload_1) +DEFUN __xload_1 +#if defined (__AVR_HAVE_LPMX__) && !defined (__AVR_HAVE_ELPM__) + sbrc HHI8, 7 + ld D0, Z + sbrs HHI8, 7 + lpm D0, Z + ret +#else + sbrc HHI8, 7 + rjmp 1f +#if defined (__AVR_HAVE_ELPM__) + out __RAMPZ__, HHI8 +#endif /* __AVR_HAVE_ELPM__ */ + .xload D0, 1 + ret +1: ld D0, Z + ret +#endif /* LPMx && ! ELPM */ +ENDF __xload_1 +#endif /* L_xload_1 */ + +#if defined (L_xload_2) +DEFUN __xload_2 + sbrc HHI8, 7 + rjmp 1f +#if defined (__AVR_HAVE_ELPM__) + out __RAMPZ__, HHI8 +#endif /* __AVR_HAVE_ELPM__ */ + .xload D0, 2 + .xload D1, 2 + ret +1: ld D0, Z+ + ld D1, Z+ + ret +ENDF __xload_2 +#endif /* L_xload_2 */ + +#if defined (L_xload_3) +DEFUN __xload_3 + sbrc HHI8, 7 + rjmp 1f +#if defined (__AVR_HAVE_ELPM__) + out __RAMPZ__, HHI8 +#endif /* __AVR_HAVE_ELPM__ */ + .xload D0, 3 + .xload D1, 3 + .xload D2, 3 + ret +1: ld D0, Z+ + ld D1, Z+ + ld D2, Z+ + ret +ENDF __xload_3 +#endif /* L_xload_3 */ + +#if defined (L_xload_4) +DEFUN __xload_4 + sbrc HHI8, 7 + rjmp 1f +#if defined (__AVR_HAVE_ELPM__) + out __RAMPZ__, HHI8 +#endif /* __AVR_HAVE_ELPM__ */ + .xload D0, 4 + .xload D1, 4 + .xload D2, 4 + .xload D3, 4 + ret +1: ld D0, Z+ + ld D1, Z+ + ld D2, Z+ + ld D3, Z+ + ret +ENDF __xload_4 +#endif /* L_xload_4 */ + +#endif /* L_xload_{1|2|3|4} */ +#endif /* if !defined (__AVR_TINY__) */ + +#if !defined (__AVR_TINY__) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; memcopy from Address Space __pgmx to RAM +;; R23:Z = Source Address +;; X = Destination Address +;; Clobbers: __tmp_reg__, R23, R24, R25, X, Z + +#if defined (L_movmemx) + +#define HHI8 23 +#define LOOP 24 + +DEFUN __movmemx_qi + ;; #Bytes to copy fity in 8 Bits (1..255) + ;; Zero-extend Loop Counter + clr LOOP+1 + ;; FALLTHRU +ENDF __movmemx_qi + +DEFUN __movmemx_hi + +;; Read from where? + sbrc HHI8, 7 + rjmp 1f + +;; Read from Flash + +#if defined (__AVR_HAVE_ELPM__) + out __RAMPZ__, HHI8 +#endif + +0: ;; Load 1 Byte from Flash... + +#if defined (__AVR_HAVE_ELPMX__) + elpm r0, Z+ +#elif defined (__AVR_HAVE_ELPM__) + elpm + adiw r30, 1 + adc HHI8, __zero_reg__ + out __RAMPZ__, HHI8 +#elif defined (__AVR_HAVE_LPMX__) + lpm r0, Z+ +#else + lpm + adiw r30, 1 +#endif + + ;; ...and store that Byte to RAM Destination + st X+, r0 + sbiw LOOP, 1 + brne 0b +#if defined (__AVR_HAVE_ELPM__) && defined (__AVR_HAVE_RAMPD__) + ;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM + out __RAMPZ__, __zero_reg__ +#endif /* ELPM && RAMPD */ + ret + +;; Read from RAM + +1: ;; Read 1 Byte from RAM... 
+ ld r0, Z+ + ;; and store that Byte to RAM Destination + st X+, r0 + sbiw LOOP, 1 + brne 1b + ret +ENDF __movmemx_hi + +#undef HHI8 +#undef LOOP + +#endif /* L_movmemx */ +#endif /* !defined (__AVR_TINY__) */ + + +.section .text.libgcc.builtins, "ax", @progbits + +/********************************** + * Find first set Bit (ffs) + **********************************/ + +#if defined (L_ffssi2) +;; find first set bit +;; r25:r24 = ffs32 (r25:r22) +;; clobbers: r22, r26 +DEFUN __ffssi2 + clr r26 + tst r22 + brne 1f + subi r26, -8 + or r22, r23 + brne 1f + subi r26, -8 + or r22, r24 + brne 1f + subi r26, -8 + or r22, r25 + brne 1f + ret +1: mov r24, r22 + XJMP __loop_ffsqi2 +ENDF __ffssi2 +#endif /* defined (L_ffssi2) */ + +#if defined (L_ffshi2) +;; find first set bit +;; r25:r24 = ffs16 (r25:r24) +;; clobbers: r26 +DEFUN __ffshi2 + clr r26 +#ifdef __AVR_ERRATA_SKIP_JMP_CALL__ + ;; Some cores have problem skipping 2-word instruction + tst r24 + breq 2f +#else + cpse r24, __zero_reg__ +#endif /* __AVR_HAVE_JMP_CALL__ */ +1: XJMP __loop_ffsqi2 +2: ldi r26, 8 + or r24, r25 + brne 1b + ret +ENDF __ffshi2 +#endif /* defined (L_ffshi2) */ + +#if defined (L_loop_ffsqi2) +;; Helper for ffshi2, ffssi2 +;; r25:r24 = r26 + zero_extend16 (ffs8(r24)) +;; r24 must be != 0 +;; clobbers: r26 +DEFUN __loop_ffsqi2 + inc r26 + lsr r24 + brcc __loop_ffsqi2 + mov r24, r26 + clr r25 + ret +ENDF __loop_ffsqi2 +#endif /* defined (L_loop_ffsqi2) */ + + +/********************************** + * Count trailing Zeros (ctz) + **********************************/ + +#if defined (L_ctzsi2) +;; count trailing zeros +;; r25:r24 = ctz32 (r25:r22) +;; clobbers: r26, r22 +;; ctz(0) = 255 +;; Note that ctz(0) in undefined for GCC +DEFUN __ctzsi2 + XCALL __ffssi2 + dec r24 + ret +ENDF __ctzsi2 +#endif /* defined (L_ctzsi2) */ + +#if defined (L_ctzhi2) +;; count trailing zeros +;; r25:r24 = ctz16 (r25:r24) +;; clobbers: r26 +;; ctz(0) = 255 +;; Note that ctz(0) in undefined for GCC +DEFUN __ctzhi2 + XCALL __ffshi2 + dec r24 + ret +ENDF __ctzhi2 +#endif /* defined (L_ctzhi2) */ + + +/********************************** + * Count leading Zeros (clz) + **********************************/ + +#if defined (L_clzdi2) +;; count leading zeros +;; r25:r24 = clz64 (r25:r18) +;; clobbers: r22, r23, r26 +DEFUN __clzdi2 + XCALL __clzsi2 + sbrs r24, 5 + ret + mov_l r22, r18 + mov_h r23, r19 + mov_l r24, r20 + mov_h r25, r21 + XCALL __clzsi2 + subi r24, -32 + ret +ENDF __clzdi2 +#endif /* defined (L_clzdi2) */ + +#if defined (L_clzsi2) +;; count leading zeros +;; r25:r24 = clz32 (r25:r22) +;; clobbers: r26 +DEFUN __clzsi2 + XCALL __clzhi2 + sbrs r24, 4 + ret + mov_l r24, r22 + mov_h r25, r23 + XCALL __clzhi2 + subi r24, -16 + ret +ENDF __clzsi2 +#endif /* defined (L_clzsi2) */ + +#if defined (L_clzhi2) +;; count leading zeros +;; r25:r24 = clz16 (r25:r24) +;; clobbers: r26 +DEFUN __clzhi2 + clr r26 + tst r25 + brne 1f + subi r26, -8 + or r25, r24 + brne 1f + ldi r24, 16 + ret +1: cpi r25, 16 + brsh 3f + subi r26, -3 + swap r25 +2: inc r26 +3: lsl r25 + brcc 2b + mov r24, r26 + clr r25 + ret +ENDF __clzhi2 +#endif /* defined (L_clzhi2) */ + + +/********************************** + * Parity + **********************************/ + +#if defined (L_paritydi2) +;; r25:r24 = parity64 (r25:r18) +;; clobbers: __tmp_reg__ +DEFUN __paritydi2 + eor r24, r18 + eor r24, r19 + eor r24, r20 + eor r24, r21 + XJMP __paritysi2 +ENDF __paritydi2 +#endif /* defined (L_paritydi2) */ + +#if defined (L_paritysi2) +;; r25:r24 = parity32 (r25:r22) +;; clobbers: __tmp_reg__ 
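+;; C-level model of the byte folding performed by this chain (sketch only):
+;;     parity32 (x) == parity8 (x ^ (x >> 8) ^ (x >> 16) ^ (x >> 24))
+;; __paritysi2 XORs the two low bytes into r24 and defers to __parityhi2.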
+DEFUN __paritysi2
+    eor r24, r22
+    eor r24, r23
+    XJMP __parityhi2
+ENDF __paritysi2
+#endif /* defined (L_paritysi2) */
+
+#if defined (L_parityhi2)
+;; r25:r24 = parity16 (r25:r24)
+;; clobbers: __tmp_reg__
+DEFUN __parityhi2
+    eor r24, r25
+;; FALLTHRU
+ENDF __parityhi2
+
+;; r25:r24 = parity8 (r24)
+;; clobbers: __tmp_reg__
+DEFUN __parityqi2
+    ;; parity is in r24[0..7]
+    mov __tmp_reg__, r24
+    swap __tmp_reg__
+    eor r24, __tmp_reg__
+    ;; parity is in r24[0..3]
+    subi r24, -4
+    andi r24, -5
+    subi r24, -6
+    ;; parity is in r24[0,3]
+    sbrc r24, 3
+    inc r24
+    ;; parity is in r24[0]
+    andi r24, 1
+    clr r25
+    ret
+ENDF __parityqi2
+#endif /* defined (L_parityhi2) */
+
+
+/**********************************
+ * Population Count
+ **********************************/
+
+#if defined (L_popcounthi2)
+;; population count
+;; r25:r24 = popcount16 (r25:r24)
+;; clobbers: __tmp_reg__
+DEFUN __popcounthi2
+    XCALL __popcountqi2
+    push r24
+    mov r24, r25
+    XCALL __popcountqi2
+    clr r25
+    ;; FALLTHRU
+ENDF __popcounthi2
+
+DEFUN __popcounthi2_tail
+    pop __tmp_reg__
+    add r24, __tmp_reg__
+    ret
+ENDF __popcounthi2_tail
+#endif /* defined (L_popcounthi2) */
+
+#if defined (L_popcountsi2)
+;; population count
+;; r25:r24 = popcount32 (r25:r22)
+;; clobbers: __tmp_reg__
+DEFUN __popcountsi2
+    XCALL __popcounthi2
+    push r24
+    mov_l r24, r22
+    mov_h r25, r23
+    XCALL __popcounthi2
+    XJMP __popcounthi2_tail
+ENDF __popcountsi2
+#endif /* defined (L_popcountsi2) */
+
+#if defined (L_popcountdi2)
+;; population count
+;; r25:r24 = popcount64 (r25:r18)
+;; clobbers: r22, r23, __tmp_reg__
+DEFUN __popcountdi2
+    XCALL __popcountsi2
+    push r24
+    mov_l r22, r18
+    mov_h r23, r19
+    mov_l r24, r20
+    mov_h r25, r21
+    XCALL __popcountsi2
+    XJMP __popcounthi2_tail
+ENDF __popcountdi2
+#endif /* defined (L_popcountdi2) */
+
+#if defined (L_popcountqi2)
+;; population count
+;; r24 = popcount8 (r24)
+;; clobbers: __tmp_reg__
+DEFUN __popcountqi2
+    mov __tmp_reg__, r24
+    andi r24, 1
+    lsr __tmp_reg__
+    lsr __tmp_reg__
+    adc r24, __zero_reg__
+    lsr __tmp_reg__
+    adc r24, __zero_reg__
+    lsr __tmp_reg__
+    adc r24, __zero_reg__
+    lsr __tmp_reg__
+    adc r24, __zero_reg__
+    lsr __tmp_reg__
+    adc r24, __zero_reg__
+    lsr __tmp_reg__
+    adc r24, __tmp_reg__
+    ret
+ENDF __popcountqi2
+#endif /* defined (L_popcountqi2) */
+
+
+/**********************************
+ * Swap bytes
+ **********************************/
+
+;; Swap the contents of two different registers (XOR swap)
+.macro bswap a, b
+    eor \a, \b
+    eor \b, \a
+    eor \a, \b
+.endm
+
+#if defined (L_bswapsi2)
+;; swap bytes
+;; r25:r22 = bswap32 (r25:r22)
+DEFUN __bswapsi2
+    bswap r22, r25
+    bswap r23, r24
+    ret
+ENDF __bswapsi2
+#endif /* defined (L_bswapsi2) */
+
+#if defined (L_bswapdi2)
+;; swap bytes
+;; r25:r18 = bswap64 (r25:r18)
+DEFUN __bswapdi2
+    bswap r18, r25
+    bswap r19, r24
+    bswap r20, r23
+    bswap r21, r22
+    ret
+ENDF __bswapdi2
+#endif /* defined (L_bswapdi2) */
+
+
+/**********************************
+ * 64-bit shifts
+ **********************************/
+
+#if defined (L_ashrdi3)
+
+#define SS __zero_reg__
+
+;; Arithmetic shift right
+;; r25:r18 = ashr64 (r25:r18, r17:r16)
+DEFUN __ashrdi3
+    sbrc r25, 7
+    com SS
+    ;; FALLTHRU
+ENDF __ashrdi3
+
+;; Logical shift right
+;; r25:r18 = lshr64 (r25:r18, r17:r16)
+DEFUN __lshrdi3
+    ;; Signs are in SS (zero_reg)
+    mov __tmp_reg__, r16
+0:  cpi r16, 8
+    brlo 2f
+    subi r16, 8
+    mov r18, r19
+    mov r19, r20
+    mov r20, r21
+    mov r21, r22
+    mov r22, r23
+    mov r23, r24
+    mov r24, r25
+    mov r25, SS
+    rjmp 0b
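+    ;; Fewer than 8 shift positions remain: shift bit-by-bit.  ASR on SS
+    ;; keeps its value (0x00, or 0xFF for a negative __ashrdi3 operand)
+    ;; and copies the fill bit into carry for the RORs below.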
+1:  asr SS
+    ror r25
+    ror r24
+    ror r23
+    ror r22
+    ror r21
+    ror r20
+    ror r19
+    ror r18
+2:  dec r16
+    brpl 1b
+    clr __zero_reg__
+    mov r16, __tmp_reg__
+    ret
+ENDF __lshrdi3
+
+#undef SS
+
+#endif /* defined (L_ashrdi3) */
+
+#if defined (L_ashldi3)
+;; Shift left
+;; r25:r18 = ashl64 (r25:r18, r17:r16)
+;; This function does not clobber T.
+DEFUN __ashldi3
+    mov __tmp_reg__, r16
+0:  cpi r16, 8
+    brlo 2f
+    mov r25, r24
+    mov r24, r23
+    mov r23, r22
+    mov r22, r21
+    mov r21, r20
+    mov r20, r19
+    mov r19, r18
+    clr r18
+    subi r16, 8
+    rjmp 0b
+1:  lsl r18
+    rol r19
+    rol r20
+    rol r21
+    rol r22
+    rol r23
+    rol r24
+    rol r25
+2:  dec r16
+    brpl 1b
+    mov r16, __tmp_reg__
+    ret
+ENDF __ashldi3
+#endif /* defined (L_ashldi3) */
+
+#if defined (L_rotldi3)
+;; Rotate left
+;; r25:r18 = rotl64 (r25:r18, r17:r16)
+DEFUN __rotldi3
+    push r16
+0:  cpi r16, 8
+    brlo 2f
+    subi r16, 8
+    mov __tmp_reg__, r25
+    mov r25, r24
+    mov r24, r23
+    mov r23, r22
+    mov r22, r21
+    mov r21, r20
+    mov r20, r19
+    mov r19, r18
+    mov r18, __tmp_reg__
+    rjmp 0b
+1:  lsl r18
+    rol r19
+    rol r20
+    rol r21
+    rol r22
+    rol r23
+    rol r24
+    rol r25
+    adc r18, __zero_reg__
+2:  dec r16
+    brpl 1b
+    pop r16
+    ret
+ENDF __rotldi3
+#endif /* defined (L_rotldi3) */
+
+
+.section .text.libgcc.fmul, "ax", @progbits
+
+/***********************************************************/
+;;; Softmul versions of FMUL, FMULS and FMULSU to implement
+;;; __builtin_avr_fmul* if !AVR_HAVE_MUL
+/***********************************************************/
+
+#define A1 24
+#define B1 25
+#define C0 22
+#define C1 23
+#define A0 __tmp_reg__
+
+#ifdef L_fmuls
+;;; r23:r22 = fmuls (r24, r25) like in FMULS instruction
+;;; Clobbers: r24, r25, __tmp_reg__
+DEFUN __fmuls
+    ;; A0.7 = negate result?
+    mov A0, A1
+    eor A0, B1
+    ;; B1 = |B1|
+    sbrc B1, 7
+    neg B1
+    XJMP __fmulsu_exit
+ENDF __fmuls
+#endif /* L_fmuls */
+
+#ifdef L_fmulsu
+;;; r23:r22 = fmulsu (r24, r25) like in FMULSU instruction
+;;; Clobbers: r24, r25, __tmp_reg__
+DEFUN __fmulsu
+    ;; A0.7 = negate result?
+    mov A0, A1
+;; FALLTHRU
+ENDF __fmulsu
+
+;; Helper for __fmuls and __fmulsu
+DEFUN __fmulsu_exit
+    ;; A1 = |A1|
+    sbrc A1, 7
+    neg A1
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    ;; Some cores have a problem skipping 2-word instructions
+    tst A0
+    brmi 1f
+#else
+    sbrs A0, 7
+#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */
+    XJMP __fmul
+1:  XCALL __fmul
+    ;; C = -C iff A0.7 = 1
+    NEG2 C0
+    ret
+ENDF __fmulsu_exit
+#endif /* L_fmulsu */
+
+
+#ifdef L_fmul
+;;; r23:r22 = fmul (r24, r25) like in FMUL instruction
+;;; Clobbers: r24, r25, __tmp_reg__
+DEFUN __fmul
+    ; clear result
+    clr C0
+    clr C1
+    clr A0
+1:  tst B1
+    ;; 1.0 = 0x80, so test bit 7 of B to see if A must be added to C.
+2:  brpl 3f
+    ;; C += A
+    add C0, A0
+    adc C1, A1
+3:  ;; A >>= 1
+    lsr A1
+    ror A0
+    ;; B <<= 1
+    lsl B1
+    brne 2b
+    ret
+ENDF __fmul
+#endif /* L_fmul */
+
+#undef A0
+#undef A1
+#undef B1
+#undef C0
+#undef C1
+
+#include "lib1funcs-fixed.S"
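
For reference, the routines above back the corresponding GCC builtins and
64-bit operators on AVR.  A minimal C sketch of callers that should lower to
them (an assumption; the exact lowering depends on the device and on the
optimization options in effect):

    /* Sketch only; e.g. avr-gcc -Os -mmcu=atmega328p example.c */
    #include <stdint.h>

    int      ffs16 (int x)                 { return __builtin_ffs (x);       }  /* expected: __ffshi2      */
    int      pop32 (uint32_t x)            { return __builtin_popcountl (x); }  /* expected: __popcountsi2 */
    int      clz32 (uint32_t x)            { return __builtin_clzl (x);      }  /* expected: __clzsi2      */
    uint32_t bswap32 (uint32_t x)          { return __builtin_bswap32 (x);   }  /* expected: __bswapsi2    */
    uint64_t shl64 (uint64_t x, uint8_t n) { return x << n;                  }  /* expected: __ashldi3     */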