diff --git a/include/lib1funcs.hpp b/include/lib1funcs.hpp
new file mode 100644
index 0000000..a4c2e46
--- /dev/null
+++ b/include/lib1funcs.hpp
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <string_view>
+
+/// AVR assembly source for a software 16-bit multiply routine, kept as text.
+///
+/// Register contract, as stated in the listing itself:
+///   R25:R24 = R23:R22 * R25:R24
+///
+/// The routine is a shift-and-add loop: the multiplier (R25:R24 on entry) is
+/// copied to R21:__temp_reg__ and R25:R24 is cleared to receive the product.
+/// On each pass the multiplicand R23:R22 is added to the product when bit 0
+/// of the multiplier is set, then the multiplier is shifted right and the
+/// multiplicand shifted left, until the multiplier compares equal to
+/// __zero_reg__ (which therefore has to hold 0 when the routine runs).
+///
+/// NOTE(review): the listing's "Clobbers" line names __tmp_reg__ while the
+/// instructions use __temp_reg__; the consumer of this text has to accept the
+/// spelling used by the instructions -- confirm before changing either.
+///
+/// NOTE(review): the identifier contains a double underscore, which C++
+/// reserves for the implementation. It is left unchanged, with internal
+/// linkage, so existing references to this name keep working.
+static constexpr std::string_view __mulhi3 =
+R"(
+;;; based on protocol from gcc's calling conventions for AVR
+
+;;; R25:R24 = R23:R22 * R25:R24
+;;; Clobbers: __tmp_reg__, R21..R23
+
+__mulhi3:
+        mov __temp_reg__,r24
+        mov r21,r25
+        ldi r25,0
+        ldi r24,0
+        cp __temp_reg__,__zero_reg__
+        cpc r21,__zero_reg__
+        breq .__mulhi3_L5
+.__mulhi3_L4:
+        sbrs __temp_reg__,0
+        rjmp .__mulhi3_L3
+        add r24,r22
+        adc r25,r23
+.__mulhi3_L3:
+        lsr r21
+        ror __temp_reg__
+        lsl r22
+        rol r23
+        cp __temp_reg__,__zero_reg__
+        cpc r21,__zero_reg__
+        brne .__mulhi3_L4
+        ret
+.__mulhi3_L5:
+        ret
+)";
diff --git a/lib1funcs.S b/lib1funcs.S
deleted file mode 100644
index ac101b4..0000000
--- a/lib1funcs.S
+++ /dev/null
@@ -1,3315 +0,0 @@
-/*  -*- Mode: Asm -*-  */
-/* Copyright (C) 1998-2021 Free Software Foundation, Inc.
-   Contributed by Denis Chertykov <chertykov@gmail.com>
-
-This file is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 3, or (at your option) any
-later version.
-
-This file is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-General Public License for more details.
-
-Under Section 7 of GPL version 3, you are granted additional
-permissions described in the GCC Runtime Library Exception, version
-3.1, as published by the Free Software Foundation.
-
-You should have received a copy of the GNU General Public License and
-a copy of the GCC Runtime Library Exception along with this program;
-see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-<http://www.gnu.org/licenses/>.  */
-
-#if defined (__AVR_TINY__)
-#define __zero_reg__ r17
-#define __tmp_reg__ r16
-#else
-#define __zero_reg__ r1
-#define __tmp_reg__ r0
-#endif
-#define __SREG__ 0x3f
-#if defined (__AVR_HAVE_SPH__)
-#define __SP_H__ 0x3e
-#endif
-#define __SP_L__ 0x3d
-#define __RAMPZ__ 0x3B
-#define __EIND__  0x3C
-
-/* Most of the functions here are called directly from avr.md
-   patterns, instead of using the standard libcall mechanisms.
-   This can make better code because GCC knows exactly which
-   of the call-used registers (not all of them) are clobbered.  */
-
-/* FIXME:  At present, there is no SORT directive in the linker
-           script so that we must not assume that different modules
-           in the same input section like .libgcc.text.mul will be
-           located close together.  Therefore, we cannot use
-           RCALL/RJMP to call a function like __udivmodhi4 from
-           __divmodhi4 and have to use lengthy XCALL/XJMP even
-           though they are in the same input section and all same
-           input sections together are small enough to reach every
-           location with a RCALL/RJMP instruction.  */
-
-#if defined (__AVR_HAVE_EIJMP_EICALL__) && !defined (__AVR_HAVE_ELPMX__)
-#error device not supported
-#endif
-
-	.macro	mov_l  r_dest, r_src
-#if defined (__AVR_HAVE_MOVW__)
-	movw	\r_dest, \r_src
-#else
-	mov	\r_dest, \r_src
-#endif
-	.endm
-
-	.macro	mov_h  r_dest, r_src
-#if defined (__AVR_HAVE_MOVW__)
-	; empty
-#else
-	mov	\r_dest, \r_src
-#endif
-	.endm
-
-.macro	wmov  r_dest, r_src
-#if defined (__AVR_HAVE_MOVW__)
-    movw \r_dest,   \r_src
-#else
-    mov \r_dest,    \r_src
-    mov \r_dest+1,  \r_src+1
-#endif
-.endm
-
-#if defined (__AVR_HAVE_JMP_CALL__)
-#define XCALL call
-#define XJMP  jmp
-#else
-#define XCALL rcall
-#define XJMP  rjmp
-#endif
-
-#if defined (__AVR_HAVE_EIJMP_EICALL__)
-#define XICALL eicall
-#define XIJMP  eijmp
-#else
-#define XICALL icall
-#define XIJMP  ijmp
-#endif
-
-;; Prologue stuff
-
-.macro do_prologue_saves n_pushed n_frame=0
-    ldi r26, lo8(\n_frame)
-    ldi r27, hi8(\n_frame)
-    ldi r30, lo8(gs(.L_prologue_saves.\@))
-    ldi r31, hi8(gs(.L_prologue_saves.\@))
-    XJMP __prologue_saves__ + ((18 - (\n_pushed)) * 2)
-.L_prologue_saves.\@:
-.endm
-
-;; Epilogue stuff
-
-.macro do_epilogue_restores n_pushed n_frame=0
-    in      r28, __SP_L__
-#ifdef __AVR_HAVE_SPH__
-    in      r29, __SP_H__
-.if \n_frame > 63
-    subi    r28, lo8(-\n_frame)
-    sbci    r29, hi8(-\n_frame)
-.elseif \n_frame > 0
-    adiw    r28, \n_frame
-.endif
-#else
-    clr     r29
-.if \n_frame > 0
-    subi    r28, lo8(-\n_frame)
-.endif
-#endif /* HAVE SPH */
-    ldi     r30, \n_pushed
-    XJMP __epilogue_restores__ + ((18 - (\n_pushed)) * 2)
-.endm
-
-;; Support function entry and exit for convenience
-
-.macro wsubi r_arg1, i_arg2
-#if defined (__AVR_TINY__)
-    subi \r_arg1,   lo8(\i_arg2)
-    sbci \r_arg1+1, hi8(\i_arg2)
-#else
-    sbiw \r_arg1, \i_arg2
-#endif
-.endm
-
-.macro waddi r_arg1, i_arg2
-#if defined (__AVR_TINY__)
-    subi \r_arg1,   lo8(-\i_arg2)
-    sbci \r_arg1+1, hi8(-\i_arg2)
-#else
-    adiw \r_arg1, \i_arg2
-#endif
-.endm
-
-.macro DEFUN name
-.global \name
-.func \name
-\name:
-.endm
-
-.macro ENDF name
-.size \name, .-\name
-.endfunc
-.endm
-
-.macro FALIAS name
-.global \name
-.func \name
-\name:
-.size \name, .-\name
-.endfunc
-.endm
-
-;; Skip next instruction, typically a jump target
-#define skip cpse 16,16
-
-;; Negate a 2-byte value held in consecutive registers
-.macro NEG2  reg
-    com     \reg+1
-    neg     \reg
-    sbci    \reg+1, -1
-.endm
-
-;; Negate a 4-byte value held in consecutive registers
-;; Sets the V flag for signed overflow tests if REG >= 16
-.macro NEG4  reg
-    com     \reg+3
-    com     \reg+2
-    com     \reg+1
-.if \reg >= 16
-    neg     \reg
-    sbci    \reg+1, -1
-    sbci    \reg+2, -1
-    sbci    \reg+3, -1
-.else
-    com     \reg
-    adc     \reg,   __zero_reg__
-    adc     \reg+1, __zero_reg__
-    adc     \reg+2, __zero_reg__
-    adc     \reg+3, __zero_reg__
-.endif
-.endm
-
-#define exp_lo(N)  hlo8 ((N) << 23)
-#define exp_hi(N)  hhi8 ((N) << 23)
-
-
-.section .text.libgcc.mul, "ax", @progbits
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-/* Note: mulqi3, mulhi3 are open-coded on the enhanced core.  */
-#if !defined (__AVR_HAVE_MUL__)
-/*******************************************************
-    Multiplication  8 x 8  without MUL
-*******************************************************/
-#if defined (L_mulqi3)
-
-#define	r_arg2	r22		/* multiplicand */
-#define	r_arg1 	r24		/* multiplier */
-#define r_res	__tmp_reg__	/* result */
-
-DEFUN __mulqi3
-	clr	r_res		; clear result
-__mulqi3_loop:
-	sbrc	r_arg1,0
-	add	r_res,r_arg2
-	add	r_arg2,r_arg2	; shift multiplicand
-	breq	__mulqi3_exit	; while multiplicand != 0
-	lsr	r_arg1		;
-	brne	__mulqi3_loop	; exit if multiplier = 0
-__mulqi3_exit:	
-	mov	r_arg1,r_res	; result to return register
-	ret
-ENDF __mulqi3
-
-#undef r_arg2
-#undef r_arg1
-#undef r_res
-	
-#endif 	/* defined (L_mulqi3) */
-
-
-/*******************************************************
-    Widening Multiplication  16 = 8 x 8  without MUL
-    Multiplication  16 x 16  without MUL
-*******************************************************/
-
-#define A0  22
-#define A1  23
-#define B0  24
-#define BB0 20
-#define B1  25
-;; Output overlaps input, thus expand result in CC0/1
-#define C0  24
-#define C1  25
-#define CC0  __tmp_reg__
-#define CC1  21
-
-#if defined (L_umulqihi3)
-;;; R25:R24 = (unsigned int) R22 * (unsigned int) R24
-;;; (C1:C0) = (unsigned int) A0  * (unsigned int) B0
-;;; Clobbers: __tmp_reg__, R21..R23
-DEFUN __umulqihi3
-    clr     A1
-    clr     B1
-    XJMP    __mulhi3
-ENDF __umulqihi3
-#endif /* L_umulqihi3 */
-
-#if defined (L_mulqihi3)
-;;; R25:R24 = (signed int) R22 * (signed int) R24
-;;; (C1:C0) = (signed int) A0  * (signed int) B0
-;;; Clobbers: __tmp_reg__, R20..R23
-DEFUN __mulqihi3
-    ;; Sign-extend B0
-    clr     B1
-    sbrc    B0, 7
-    com     B1
-    ;; The multiplication runs twice as fast if A1 is zero, thus:
-    ;; Zero-extend A0
-    clr     A1
-#ifdef __AVR_HAVE_JMP_CALL__
-    ;; Store  B0 * sign of A
-    clr     BB0
-    sbrc    A0, 7
-    mov     BB0, B0
-    call    __mulhi3
-#else /* have no CALL */
-    ;; Skip sign-extension of A if A >= 0
-    ;; Same size as with the first alternative but avoids errata skip
-    ;; and is faster if A >= 0
-    sbrs    A0, 7
-    rjmp    __mulhi3
-    ;; If  A < 0  store B
-    mov     BB0, B0
-    rcall   __mulhi3
-#endif /* HAVE_JMP_CALL */
-    ;; 1-extend A after the multiplication
-    sub     C1, BB0
-    ret
-ENDF __mulqihi3
-#endif /* L_mulqihi3 */
-
-#if defined (L_mulhi3)
-;;; R25:R24 = R23:R22 * R25:R24
-;;; (C1:C0) = (A1:A0) * (B1:B0)
-;;; Clobbers: __tmp_reg__, R21..R23
-DEFUN __mulhi3
-
-    ;; Clear result
-    clr     CC0
-    clr     CC1
-    rjmp 3f
-1:
-    ;; Bit n of A is 1  -->  C += B << n
-    add     CC0, B0
-    adc     CC1, B1
-2:
-    lsl     B0
-    rol     B1
-3:
-    ;; If B == 0 we are ready
-    wsubi   B0, 0
-    breq 9f
-
-    ;; Carry = n-th bit of A
-    lsr     A1
-    ror     A0
-    ;; If bit n of A is set, then go add  B * 2^n  to  C
-    brcs 1b
-
-    ;; Carry = 0  -->  The ROR above acts like  CP A0, 0
-    ;; Thus, it is sufficient to CPC the high part to test A against 0
-    cpc     A1, __zero_reg__
-    ;; Only proceed if A != 0
-    brne    2b
-9:
-    ;; Move Result into place
-    mov     C0, CC0
-    mov     C1, CC1
-    ret
-ENDF  __mulhi3
-#endif /* L_mulhi3 */
-
-#undef A0
-#undef A1
-#undef B0
-#undef BB0
-#undef B1
-#undef C0
-#undef C1
-#undef CC0
-#undef CC1
-
-
-#define A0 22
-#define A1 A0+1
-#define A2 A0+2
-#define A3 A0+3
-
-#define B0 18
-#define B1 B0+1
-#define B2 B0+2
-#define B3 B0+3
-
-#define CC0 26
-#define CC1 CC0+1
-#define CC2 30
-#define CC3 CC2+1
-
-#define C0 22
-#define C1 C0+1
-#define C2 C0+2
-#define C3 C0+3
-
-/*******************************************************
-    Widening Multiplication  32 = 16 x 16  without MUL
-*******************************************************/
-
-#if defined (L_umulhisi3)
-DEFUN __umulhisi3
-    wmov    B0, 24
-    ;; Zero-extend B
-    clr     B2
-    clr     B3
-    ;; Zero-extend A
-    wmov    A2, B2
-    XJMP    __mulsi3
-ENDF __umulhisi3
-#endif /* L_umulhisi3 */
-
-#if defined (L_mulhisi3)
-DEFUN __mulhisi3
-    wmov    B0, 24
-    ;; Sign-extend B
-    lsl     r25
-    sbc     B2, B2
-    mov     B3, B2
-#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
-    ;; Sign-extend A
-    clr     A2
-    sbrc    A1, 7
-    com     A2
-    mov     A3, A2
-    XJMP __mulsi3
-#else /*  no __AVR_ERRATA_SKIP_JMP_CALL__ */
-    ;; Zero-extend A and __mulsi3 will run at least twice as fast
-    ;; compared to a sign-extended A.
-    clr     A2
-    clr     A3
-    sbrs    A1, 7
-    XJMP __mulsi3
-    ;; If  A < 0  then perform the  B * 0xffff.... before the
-    ;; very multiplication by initializing the high part of the
-    ;; result CC with -B.
-    wmov    CC2, A2
-    sub     CC2, B0
-    sbc     CC3, B1
-    XJMP __mulsi3_helper
-#endif /*  __AVR_ERRATA_SKIP_JMP_CALL__ */
-ENDF __mulhisi3
-#endif /* L_mulhisi3 */
-
-
-/*******************************************************
-    Multiplication  32 x 32  without MUL
-*******************************************************/
-
-#if defined (L_mulsi3)
-DEFUN __mulsi3
-#if defined (__AVR_TINY__)
-    in     r26, __SP_L__ ; safe to use X, as it is CC0/CC1
-    in     r27, __SP_H__
-    subi   r26, lo8(-3)   ; Add 3 to point past return address
-    sbci   r27, hi8(-3)
-    push   B0    ; save callee saved regs
-    push   B1
-    ld     B0, X+   ; load from caller stack
-    ld     B1, X+
-    ld     B2, X+
-    ld     B3, X
-#endif
-    ;; Clear result
-    clr     CC2
-    clr     CC3
-    ;; FALLTHRU
-ENDF  __mulsi3
-
-DEFUN __mulsi3_helper
-    clr     CC0
-    clr     CC1
-    rjmp 3f
-
-1:  ;; If bit n of A is set, then add  B * 2^n  to the result in CC
-    ;; CC += B
-    add  CC0,B0  $  adc  CC1,B1  $  adc  CC2,B2  $  adc  CC3,B3
-
-2:  ;; B <<= 1
-    lsl  B0      $  rol  B1      $  rol  B2      $  rol  B3
-
-3:  ;; A >>= 1:  Carry = n-th bit of A
-    lsr  A3      $  ror  A2      $  ror  A1      $  ror  A0
-
-    brcs 1b
-    ;; Only continue if  A != 0
-    sbci    A1, 0
-    brne 2b
-    wsubi   A2, 0
-    brne 2b
-
-    ;; All bits of A are consumed:  Copy result to return register C
-    wmov    C0, CC0
-    wmov    C2, CC2
-#if defined (__AVR_TINY__)
-    pop     B1      ; restore callee saved regs
-    pop     B0 
-#endif  /* defined (__AVR_TINY__) */
-
-    ret
-ENDF __mulsi3_helper
-#endif /* L_mulsi3 */
-
-#undef A0
-#undef A1
-#undef A2
-#undef A3
-#undef B0
-#undef B1
-#undef B2
-#undef B3
-#undef C0
-#undef C1
-#undef C2
-#undef C3
-#undef CC0
-#undef CC1
-#undef CC2
-#undef CC3
-
-#endif /* !defined (__AVR_HAVE_MUL__) */
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-#if defined (__AVR_HAVE_MUL__)
-#define A0 26
-#define B0 18
-#define C0 22
-
-#define A1 A0+1
-
-#define B1 B0+1
-#define B2 B0+2
-#define B3 B0+3
-
-#define C1 C0+1
-#define C2 C0+2
-#define C3 C0+3
-
-/*******************************************************
-    Widening Multiplication  32 = 16 x 16  with MUL
-*******************************************************/
-
-#if defined (L_mulhisi3)
-;;; R25:R22 = (signed long) R27:R26 * (signed long) R19:R18
-;;; C3:C0   = (signed long) A1:A0   * (signed long) B1:B0
-;;; Clobbers: __tmp_reg__
-DEFUN __mulhisi3
-    XCALL   __umulhisi3
-    ;; Sign-extend B
-    tst     B1
-    brpl    1f
-    sub     C2, A0
-    sbc     C3, A1
-1:  ;; Sign-extend A
-    XJMP __usmulhisi3_tail
-ENDF __mulhisi3
-#endif /* L_mulhisi3 */
-
-#if defined (L_usmulhisi3)
-;;; R25:R22 = (signed long) R27:R26 * (unsigned long) R19:R18
-;;; C3:C0   = (signed long) A1:A0   * (unsigned long) B1:B0
-;;; Clobbers: __tmp_reg__
-DEFUN __usmulhisi3
-    XCALL   __umulhisi3
-    ;; FALLTHRU
-ENDF __usmulhisi3
-
-DEFUN __usmulhisi3_tail
-    ;; Sign-extend A
-    sbrs    A1, 7
-    ret
-    sub     C2, B0
-    sbc     C3, B1
-    ret
-ENDF __usmulhisi3_tail
-#endif /* L_usmulhisi3 */
-
-#if defined (L_umulhisi3)
-;;; R25:R22 = (unsigned long) R27:R26 * (unsigned long) R19:R18
-;;; C3:C0   = (unsigned long) A1:A0   * (unsigned long) B1:B0
-;;; Clobbers: __tmp_reg__
-DEFUN __umulhisi3
-    mul     A0, B0
-    movw    C0, r0
-    mul     A1, B1
-    movw    C2, r0
-    mul     A0, B1
-#ifdef __AVR_HAVE_JMP_CALL__
-    ;; This function is used by many other routines, often multiple times.
-    ;; Therefore, if the flash size is not too limited, avoid the RCALL
-    ;; and inverst 6 Bytes to speed things up.
-    add     C1, r0
-    adc     C2, r1
-    clr     __zero_reg__
-    adc     C3, __zero_reg__
-#else
-    rcall   1f
-#endif
-    mul     A1, B0
-1:  add     C1, r0
-    adc     C2, r1
-    clr     __zero_reg__
-    adc     C3, __zero_reg__
-    ret
-ENDF __umulhisi3
-#endif /* L_umulhisi3 */
-
-/*******************************************************
-    Widening Multiplication  32 = 16 x 32  with MUL
-*******************************************************/
-
-#if defined (L_mulshisi3)
-;;; R25:R22 = (signed long) R27:R26 * R21:R18
-;;; (C3:C0) = (signed long) A1:A0   * B3:B0
-;;; Clobbers: __tmp_reg__
-DEFUN __mulshisi3
-#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
-    ;; Some cores have problem skipping 2-word instruction
-    tst     A1
-    brmi    __mulohisi3
-#else
-    sbrs    A1, 7
-#endif /* __AVR_HAVE_JMP_CALL__ */
-    XJMP    __muluhisi3
-    ;; FALLTHRU
-ENDF __mulshisi3
-
-;;; R25:R22 = (one-extended long) R27:R26 * R21:R18
-;;; (C3:C0) = (one-extended long) A1:A0   * B3:B0
-;;; Clobbers: __tmp_reg__
-DEFUN __mulohisi3
-    XCALL   __muluhisi3
-    ;; One-extend R27:R26 (A1:A0)
-    sub     C2, B0
-    sbc     C3, B1
-    ret
-ENDF __mulohisi3
-#endif /* L_mulshisi3 */
-
-#if defined (L_muluhisi3)
-;;; R25:R22 = (unsigned long) R27:R26 * R21:R18
-;;; (C3:C0) = (unsigned long) A1:A0   * B3:B0
-;;; Clobbers: __tmp_reg__
-DEFUN __muluhisi3
-    XCALL   __umulhisi3
-    mul     A0, B3
-    add     C3, r0
-    mul     A1, B2
-    add     C3, r0
-    mul     A0, B2
-    add     C2, r0
-    adc     C3, r1
-    clr     __zero_reg__
-    ret
-ENDF __muluhisi3
-#endif /* L_muluhisi3 */
-
-/*******************************************************
-    Multiplication  32 x 32  with MUL
-*******************************************************/
-
-#if defined (L_mulsi3)
-;;; R25:R22 = R25:R22 * R21:R18
-;;; (C3:C0) = C3:C0   * B3:B0
-;;; Clobbers: R26, R27, __tmp_reg__
-DEFUN __mulsi3
-    movw    A0, C0
-    push    C2
-    push    C3
-    XCALL   __muluhisi3
-    pop     A1
-    pop     A0
-    ;; A1:A0 now contains the high word of A
-    mul     A0, B0
-    add     C2, r0
-    adc     C3, r1
-    mul     A0, B1
-    add     C3, r0
-    mul     A1, B0
-    add     C3, r0
-    clr     __zero_reg__
-    ret
-ENDF __mulsi3
-#endif /* L_mulsi3 */
-
-#undef A0
-#undef A1
-
-#undef B0
-#undef B1
-#undef B2
-#undef B3
-
-#undef C0
-#undef C1
-#undef C2
-#undef C3
-
-#endif /* __AVR_HAVE_MUL__ */
-
-/*******************************************************
-       Multiplication 24 x 24 with MUL
-*******************************************************/
-
-#if defined (L_mulpsi3)
-
-;; A[0..2]: In: Multiplicand; Out: Product
-#define A0  22
-#define A1  A0+1
-#define A2  A0+2
-
-;; B[0..2]: In: Multiplier
-#define B0  18
-#define B1  B0+1
-#define B2  B0+2
-
-#if defined (__AVR_HAVE_MUL__)
-
-;; C[0..2]: Expand Result
-#define C0  22
-#define C1  C0+1
-#define C2  C0+2
-
-;; R24:R22 *= R20:R18
-;; Clobbers: r21, r25, r26, r27, __tmp_reg__
-
-#define AA0 26
-#define AA2 21
-
-DEFUN __mulpsi3
-    wmov    AA0, A0
-    mov     AA2, A2
-    XCALL   __umulhisi3
-    mul     AA2, B0     $  add  C2, r0
-    mul     AA0, B2     $  add  C2, r0
-    clr     __zero_reg__
-    ret
-ENDF __mulpsi3
-
-#undef AA2
-#undef AA0
-
-#undef C2
-#undef C1
-#undef C0
-
-#else /* !HAVE_MUL */
-;; C[0..2]: Expand Result
-#if defined (__AVR_TINY__)
-#define C0  16
-#else
-#define C0  0
-#endif /* defined (__AVR_TINY__) */
-#define C1  C0+1
-#define C2  21
-
-;; R24:R22 *= R20:R18
-;; Clobbers: __tmp_reg__, R18, R19, R20, R21
-
-DEFUN __mulpsi3
-#if defined (__AVR_TINY__)
-    in r26,__SP_L__ 
-    in r27,__SP_H__
-    subi r26, lo8(-3)   ; Add 3 to point past return address
-    sbci r27, hi8(-3)
-    push B0    ; save callee saved regs
-    push B1
-    ld B0,X+   ; load from caller stack 
-    ld B1,X+
-    ld B2,X+
-#endif /* defined (__AVR_TINY__) */
-
-    ;; C[] = 0
-    clr     __tmp_reg__
-    clr     C2
-
-0:  ;; Shift N-th Bit of B[] into Carry.  N = 24 - Loop
-    LSR  B2     $  ror  B1     $  ror  B0
-
-    ;; If the N-th Bit of B[] was set...
-    brcc    1f
-
-    ;; ...then add A[] * 2^N to the Result C[]
-    ADD  C0,A0  $  adc  C1,A1  $  adc  C2,A2
-
-1:  ;; Multiply A[] by 2
-    LSL  A0     $  rol  A1     $  rol  A2
-
-    ;; Loop until B[] is 0
-    subi B0,0   $  sbci B1,0   $  sbci B2,0
-    brne    0b
-
-    ;; Copy C[] to the return Register A[]
-    wmov    A0, C0
-    mov     A2, C2
-
-    clr     __zero_reg__
-#if defined (__AVR_TINY__)
-    pop B1
-    pop B0
-#endif /* (__AVR_TINY__) */
-    ret
-ENDF __mulpsi3
-
-#undef C2
-#undef C1
-#undef C0
-
-#endif /* HAVE_MUL */
-
-#undef B2
-#undef B1
-#undef B0
-
-#undef A2
-#undef A1
-#undef A0
-
-#endif /* L_mulpsi3 */
-
-#if defined (L_mulsqipsi3) && defined (__AVR_HAVE_MUL__)
-
-;; A[0..2]: In: Multiplicand
-#define A0  22
-#define A1  A0+1
-#define A2  A0+2
-
-;; BB: In: Multiplier
-#define BB  25
-
-;; C[0..2]: Result
-#define C0  18
-#define C1  C0+1
-#define C2  C0+2
-
-;; C[] = A[] * sign_extend (BB)
-DEFUN __mulsqipsi3
-    mul     A0, BB
-    movw    C0, r0
-    mul     A2, BB
-    mov     C2, r0
-    mul     A1, BB
-    add     C1, r0
-    adc     C2, r1
-    clr     __zero_reg__
-    sbrs    BB, 7
-    ret
-    ;; One-extend BB
-    sub     C1, A0
-    sbc     C2, A1
-    ret
-ENDF __mulsqipsi3
-
-#undef C2
-#undef C1
-#undef C0
-
-#undef BB
-
-#undef A2
-#undef A1
-#undef A0
-
-#endif /* L_mulsqipsi3  &&  HAVE_MUL */
-
-/*******************************************************
-       Multiplication 64 x 64
-*******************************************************/
-
-;; A[] = A[] * B[]
-
-;; A[0..7]: In: Multiplicand
-;; Out: Product
-#define A0  18
-#define A1  A0+1
-#define A2  A0+2
-#define A3  A0+3
-#define A4  A0+4
-#define A5  A0+5
-#define A6  A0+6
-#define A7  A0+7
-
-;; B[0..7]: In: Multiplier
-#define B0  10
-#define B1  B0+1
-#define B2  B0+2
-#define B3  B0+3
-#define B4  B0+4
-#define B5  B0+5
-#define B6  B0+6
-#define B7  B0+7
-
-#ifndef __AVR_TINY__
-#if defined (__AVR_HAVE_MUL__)
-;; Define C[] for convenience
-;; Notice that parts of C[] overlap A[] respective B[]
-#define C0  16
-#define C1  C0+1
-#define C2  20
-#define C3  C2+1
-#define C4  28
-#define C5  C4+1
-#define C6  C4+2
-#define C7  C4+3
-
-#if defined (L_muldi3)
-
-;; A[]     *= B[]
-;; R25:R18 *= R17:R10
-;; Ordinary ABI-Function
-
-DEFUN __muldi3
-    push    r29
-    push    r28
-    push    r17
-    push    r16
-
-    ;; Counting in Words, we have to perform a 4 * 4 Multiplication
-
-    ;; 3 * 0  +  0 * 3
-    mul  A7,B0  $             $  mov C7,r0
-    mul  A0,B7  $             $  add C7,r0
-    mul  A6,B1  $             $  add C7,r0
-    mul  A6,B0  $  mov C6,r0  $  add C7,r1
-    mul  B6,A1  $             $  add C7,r0
-    mul  B6,A0  $  add C6,r0  $  adc C7,r1
-
-    ;; 1 * 2
-    mul  A2,B4  $  add C6,r0  $  adc C7,r1
-    mul  A3,B4  $             $  add C7,r0
-    mul  A2,B5  $             $  add C7,r0
-
-    push    A5
-    push    A4
-    push    B1
-    push    B0
-    push    A3
-    push    A2
-
-    ;; 0 * 0
-    wmov    26, B0
-    XCALL   __umulhisi3
-    wmov    C0, 22
-    wmov    C2, 24
-
-    ;; 0 * 2
-    wmov    26, B4
-    XCALL   __umulhisi3  $  wmov C4,22            $ add C6,24 $ adc C7,25
-
-    wmov    26, B2
-    ;; 0 * 1
-    XCALL   __muldi3_6
-
-    pop     A0
-    pop     A1
-    ;; 1 * 1
-    wmov    26, B2
-    XCALL   __umulhisi3  $  add C4,22 $ adc C5,23 $ adc C6,24 $ adc C7,25
-
-    pop     r26
-    pop     r27
-    ;; 1 * 0
-    XCALL   __muldi3_6
-
-    pop     A0
-    pop     A1
-    ;; 2 * 0
-    XCALL   __umulhisi3  $  add C4,22 $ adc C5,23 $ adc C6,24 $ adc C7,25
-
-    ;; 2 * 1
-    wmov    26, B2
-    XCALL   __umulhisi3  $            $           $ add C6,22 $ adc C7,23
-
-    ;; A[] = C[]
-    wmov    A0, C0
-    ;; A2 = C2 already
-    wmov    A4, C4
-    wmov    A6, C6
-
-    pop     r16
-    pop     r17
-    pop     r28
-    pop     r29
-    ret
-ENDF __muldi3
-#endif /* L_muldi3 */
-
-#if defined (L_muldi3_6)
-;; A helper for some 64-bit multiplications with MUL available
-DEFUN __muldi3_6
-__muldi3_6:
-    XCALL   __umulhisi3
-    add     C2, 22
-    adc     C3, 23
-    adc     C4, 24
-    adc     C5, 25
-    brcc    0f
-    adiw    C6, 1
-0:  ret
-ENDF __muldi3_6
-#endif /* L_muldi3_6 */
-
-#undef C7
-#undef C6
-#undef C5
-#undef C4
-#undef C3
-#undef C2
-#undef C1
-#undef C0
-
-#else /* !HAVE_MUL */
-
-#if defined (L_muldi3)
-
-#define C0  26
-#define C1  C0+1
-#define C2  C0+2
-#define C3  C0+3
-#define C4  C0+4
-#define C5  C0+5
-#define C6  0
-#define C7  C6+1
-
-#define Loop 9
-
-;; A[]     *= B[]
-;; R25:R18 *= R17:R10
-;; Ordinary ABI-Function
-
-DEFUN __muldi3
-    push    r29
-    push    r28
-    push    Loop
-
-    ldi     C0, 64
-    mov     Loop, C0
-
-    ;; C[] = 0
-    clr     __tmp_reg__
-    wmov    C0, 0
-    wmov    C2, 0
-    wmov    C4, 0
-
-0:  ;; Rotate B[] right by 1 and set Carry to the N-th Bit of B[]
-    ;; where N = 64 - Loop.
-    ;; Notice that B[] = B[] >>> 64 so after this Routine has finished,
-    ;; B[] will have its initial Value again.
-    LSR  B7     $  ror  B6     $  ror  B5     $  ror  B4
-    ror  B3     $  ror  B2     $  ror  B1     $  ror  B0
-
-    ;; If the N-th Bit of B[] was set then...
-    brcc    1f
-    ;; ...finish Rotation...
-    ori     B7, 1 << 7
-
-    ;; ...and add A[] * 2^N to the Result C[]
-    ADD  C0,A0  $  adc  C1,A1  $  adc  C2,A2  $  adc  C3,A3
-    adc  C4,A4  $  adc  C5,A5  $  adc  C6,A6  $  adc  C7,A7
-
-1:  ;; Multiply A[] by 2
-    LSL  A0     $  rol  A1     $  rol  A2     $  rol  A3
-    rol  A4     $  rol  A5     $  rol  A6     $  rol  A7
-
-    dec     Loop
-    brne    0b
-
-    ;; We expanded the Result in C[]
-    ;; Copy Result to the Return Register A[]
-    wmov    A0, C0
-    wmov    A2, C2
-    wmov    A4, C4
-    wmov    A6, C6
-
-    clr     __zero_reg__
-    pop     Loop
-    pop     r28
-    pop     r29
-    ret
-ENDF __muldi3
-
-#undef Loop
-
-#undef C7
-#undef C6
-#undef C5
-#undef C4
-#undef C3
-#undef C2
-#undef C1
-#undef C0
-
-#endif /* L_muldi3 */
-#endif /* HAVE_MUL */
-#endif /* if not __AVR_TINY__ */
-
-#undef B7
-#undef B6
-#undef B5
-#undef B4
-#undef B3
-#undef B2
-#undef B1
-#undef B0
-
-#undef A7
-#undef A6
-#undef A5
-#undef A4
-#undef A3
-#undef A2
-#undef A1
-#undef A0
-
-/*******************************************************
-   Widening Multiplication 64 = 32 x 32  with  MUL
-*******************************************************/
-
-#if defined (__AVR_HAVE_MUL__)
-#define A0 r22
-#define A1 r23 
-#define A2 r24
-#define A3 r25
- 
-#define B0 r18
-#define B1 r19
-#define B2 r20
-#define B3 r21
- 
-#define C0  18
-#define C1  C0+1
-#define C2  20
-#define C3  C2+1
-#define C4  28
-#define C5  C4+1
-#define C6  C4+2
-#define C7  C4+3
-
-#if defined (L_umulsidi3)
-
-;; Unsigned widening 64 = 32 * 32 Multiplication with MUL
-
-;; R18[8] = R22[4] * R18[4]
-;;
-;; Ordinary ABI Function, but additionally sets
-;; X = R20[2] = B2[2]
-;; Z = R22[2] = A0[2]
-DEFUN __umulsidi3
-    clt
-    ;; FALLTHRU
-ENDF  __umulsidi3
-    ;; T = sign (A)
-DEFUN __umulsidi3_helper
-    push    29  $  push    28 ; Y
-    wmov    30, A2
-    ;; Counting in Words, we have to perform 4 Multiplications
-    ;; 0 * 0
-    wmov    26, A0
-    XCALL __umulhisi3
-    push    23  $  push    22 ; C0
-    wmov    28, B0
-    wmov    18, B2
-    wmov    C2, 24
-    push    27  $  push    26 ; A0
-    push    19  $  push    18 ; B2
-    ;;
-    ;;  18  20  22  24  26  28  30  |  B2, B3, A0, A1, C0, C1, Y
-    ;;  B2  C2  --  --  --  B0  A2
-    ;; 1 * 1
-    wmov    26, 30      ; A2
-    XCALL __umulhisi3
-    ;; Sign-extend A.  T holds the sign of A
-    brtc    0f
-    ;; Subtract B from the high part of the result
-    sub     22, 28
-    sbc     23, 29
-    sbc     24, 18
-    sbc     25, 19
-0:  wmov    18, 28      ;; B0
-    wmov    C4, 22
-    wmov    C6, 24
-    ;;
-    ;;  18  20  22  24  26  28  30  |  B2, B3, A0, A1, C0, C1, Y
-    ;;  B0  C2  --  --  A2  C4  C6
-    ;;
-    ;; 1 * 0
-    XCALL __muldi3_6
-    ;; 0 * 1
-    pop     26  $   pop 27  ;; B2
-    pop     18  $   pop 19  ;; A0
-    XCALL __muldi3_6
-
-    ;; Move result C into place and save A0 in Z
-    wmov    22, C4
-    wmov    24, C6
-    wmov    30, 18 ; A0
-    pop     C0  $   pop C1
-
-    ;; Epilogue
-    pop     28  $   pop 29  ;; Y
-    ret
-ENDF __umulsidi3_helper
-#endif /* L_umulsidi3 */
-
-
-#if defined (L_mulsidi3)
-
-;; Signed widening 64 = 32 * 32 Multiplication
-;;
-;; R18[8] = R22[4] * R18[4]
-;; Ordinary ABI Function
-DEFUN __mulsidi3
-    bst     A3, 7
-    sbrs    B3, 7           ; Enhanced core has no skip bug
-    XJMP __umulsidi3_helper
-
-    ;; B needs sign-extension
-    push    A3
-    push    A2
-    XCALL __umulsidi3_helper
-    ;; A0 survived in Z
-    sub     r22, r30
-    sbc     r23, r31
-    pop     r26
-    pop     r27
-    sbc     r24, r26
-    sbc     r25, r27
-    ret
-ENDF __mulsidi3
-#endif /* L_mulsidi3 */
-
-#undef A0
-#undef A1
-#undef A2
-#undef A3
-#undef B0
-#undef B1
-#undef B2
-#undef B3
-#undef C0
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#endif /* HAVE_MUL */
-
-/**********************************************************
-    Widening Multiplication 64 = 32 x 32  without  MUL
-**********************************************************/
-#ifndef __AVR_TINY__ /* if not __AVR_TINY__ */
-#if defined (L_mulsidi3) && !defined (__AVR_HAVE_MUL__)
-#define A0 18
-#define A1 A0+1
-#define A2 A0+2
-#define A3 A0+3
-#define A4 A0+4
-#define A5 A0+5
-#define A6 A0+6
-#define A7 A0+7
-
-#define B0 10
-#define B1 B0+1
-#define B2 B0+2
-#define B3 B0+3
-#define B4 B0+4
-#define B5 B0+5
-#define B6 B0+6
-#define B7 B0+7
-
-#define AA0 22
-#define AA1 AA0+1
-#define AA2 AA0+2
-#define AA3 AA0+3
-
-#define BB0 18
-#define BB1 BB0+1
-#define BB2 BB0+2
-#define BB3 BB0+3
-
-#define Mask r30
-
-;; Signed / Unsigned widening 64 = 32 * 32 Multiplication without MUL
-;;
-;; R18[8] = R22[4] * R18[4]
-;; Ordinary ABI Function
-DEFUN __mulsidi3
-    set
-    skip
-    ;; FALLTHRU
-ENDF  __mulsidi3
-
-DEFUN __umulsidi3
-    clt     ; skipped
-    ;; Save 10 Registers: R10..R17, R28, R29
-    do_prologue_saves 10
-    ldi     Mask, 0xff
-    bld     Mask, 7
-    ;; Move B into place...
-    wmov    B0, BB0
-    wmov    B2, BB2
-    ;; ...and extend it
-    and     BB3, Mask
-    lsl     BB3
-    sbc     B4, B4
-    mov     B5, B4
-    wmov    B6, B4
-    ;; Move A into place...
-    wmov    A0, AA0
-    wmov    A2, AA2
-    ;; ...and extend it
-    and     AA3, Mask
-    lsl     AA3
-    sbc     A4, A4
-    mov     A5, A4
-    wmov    A6, A4
-    XCALL   __muldi3
-    do_epilogue_restores 10
-ENDF __umulsidi3
-
-#undef A0
-#undef A1
-#undef A2
-#undef A3
-#undef A4
-#undef A5
-#undef A6
-#undef A7
-#undef B0
-#undef B1
-#undef B2
-#undef B3
-#undef B4
-#undef B5
-#undef B6
-#undef B7
-#undef AA0
-#undef AA1
-#undef AA2
-#undef AA3
-#undef BB0
-#undef BB1
-#undef BB2
-#undef BB3
-#undef Mask
-#endif /* L_mulsidi3 && !HAVE_MUL */
-#endif /* if not __AVR_TINY__ */
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	
-
-.section .text.libgcc.div, "ax", @progbits
-
-/*******************************************************
-       Division 8 / 8 => (result + remainder)
-*******************************************************/
-#define	r_rem	r25	/* remainder */
-#define	r_arg1	r24	/* dividend, quotient */
-#define	r_arg2	r22	/* divisor */
-#define	r_cnt	r23	/* loop count */
-
-#if defined (L_udivmodqi4)
-DEFUN __udivmodqi4
-	sub	r_rem,r_rem	; clear remainder and carry
-	ldi	r_cnt,9		; init loop counter
-	rjmp	__udivmodqi4_ep	; jump to entry point
-__udivmodqi4_loop:
-	rol	r_rem		; shift dividend into remainder
-	cp	r_rem,r_arg2	; compare remainder & divisor
-	brcs	__udivmodqi4_ep	; remainder <= divisor
-	sub	r_rem,r_arg2	; restore remainder
-__udivmodqi4_ep:
-	rol	r_arg1		; shift dividend (with CARRY)
-	dec	r_cnt		; decrement loop counter
-	brne	__udivmodqi4_loop
-	com	r_arg1		; complement result
-				; because C flag was complemented in loop
-	ret
-ENDF __udivmodqi4
-#endif /* defined (L_udivmodqi4) */
-
-#if defined (L_divmodqi4)
-DEFUN __divmodqi4
-        bst     r_arg1,7	; store sign of dividend
-        mov     __tmp_reg__,r_arg1
-        eor     __tmp_reg__,r_arg2; r0.7 is sign of result
-        sbrc	r_arg1,7
-	neg     r_arg1		; dividend negative : negate
-        sbrc	r_arg2,7
-	neg     r_arg2		; divisor negative : negate
-	XCALL	__udivmodqi4	; do the unsigned div/mod
-	brtc	__divmodqi4_1
-	neg	r_rem		; correct remainder sign
-__divmodqi4_1:
-	sbrc	__tmp_reg__,7
-	neg	r_arg1		; correct result sign
-__divmodqi4_exit:
-	ret
-ENDF __divmodqi4
-#endif /* defined (L_divmodqi4) */
-
-#undef r_rem
-#undef r_arg1
-#undef r_arg2
-#undef r_cnt
-	
-		
-/*******************************************************
-       Division 16 / 16 => (result + remainder)
-*******************************************************/
-#define	r_remL	r26	/* remainder Low */
-#define	r_remH	r27	/* remainder High */
-
-/* return: remainder */
-#define	r_arg1L	r24	/* dividend Low */
-#define	r_arg1H	r25	/* dividend High */
-
-/* return: quotient */
-#define	r_arg2L	r22	/* divisor Low */
-#define	r_arg2H	r23	/* divisor High */
-	
-#define	r_cnt	r21	/* loop count */
-
-#if defined (L_udivmodhi4)
-DEFUN __udivmodhi4
-	sub	r_remL,r_remL
-	sub	r_remH,r_remH	; clear remainder and carry
-	ldi	r_cnt,17	; init loop counter
-	rjmp	__udivmodhi4_ep	; jump to entry point
-__udivmodhi4_loop:
-        rol	r_remL		; shift dividend into remainder
-	rol	r_remH
-        cp	r_remL,r_arg2L	; compare remainder & divisor
-	cpc	r_remH,r_arg2H
-        brcs	__udivmodhi4_ep	; remainder < divisor
-        sub	r_remL,r_arg2L	; restore remainder
-        sbc	r_remH,r_arg2H
-__udivmodhi4_ep:
-        rol	r_arg1L		; shift dividend (with CARRY)
-        rol	r_arg1H
-        dec	r_cnt		; decrement loop counter
-        brne	__udivmodhi4_loop
-	com	r_arg1L
-	com	r_arg1H
-; div/mod results to return registers, as for the div() function
-	mov_l	r_arg2L, r_arg1L	; quotient
-	mov_h	r_arg2H, r_arg1H
-	mov_l	r_arg1L, r_remL		; remainder
-	mov_h	r_arg1H, r_remH
-	ret
-ENDF __udivmodhi4
-#endif /* defined (L_udivmodhi4) */
-
-#if defined (L_divmodhi4)
-DEFUN __divmodhi4
-    .global _div
-_div:
-    bst     r_arg1H,7           ; store sign of dividend
-    mov     __tmp_reg__,r_arg2H
-    brtc    0f
-    com     __tmp_reg__         ; r0.7 is sign of result
-    rcall   __divmodhi4_neg1    ; dividend negative: negate
-0:
-    sbrc    r_arg2H,7
-    rcall   __divmodhi4_neg2    ; divisor negative: negate
-    XCALL   __udivmodhi4        ; do the unsigned div/mod
-    sbrc    __tmp_reg__,7
-    rcall   __divmodhi4_neg2    ; correct remainder sign
-    brtc    __divmodhi4_exit
-__divmodhi4_neg1:
-    ;; correct dividend/remainder sign
-    com     r_arg1H
-    neg     r_arg1L
-    sbci    r_arg1H,0xff
-    ret
-__divmodhi4_neg2:
-    ;; correct divisor/result sign
-    com     r_arg2H
-    neg     r_arg2L
-    sbci    r_arg2H,0xff
-__divmodhi4_exit:
-    ret
-ENDF __divmodhi4
-#endif /* defined (L_divmodhi4) */
-
-#undef r_remH
-#undef r_remL
-
-#undef r_arg1H
-#undef r_arg1L
-
-#undef r_arg2H
-#undef r_arg2L
-             	
-#undef r_cnt   	
-
-/*******************************************************
-       Division 24 / 24 => (result + remainder)
-*******************************************************/
-
-;; A[0..2]: In: Dividend; Out: Quotient
-#define A0  22
-#define A1  A0+1
-#define A2  A0+2
-
-;; B[0..2]: In: Divisor;   Out: Remainder
-#define B0  18
-#define B1  B0+1
-#define B2  B0+2
-
-;; C[0..2]: Expand remainder
-#define C0  __zero_reg__
-#define C1  26
-#define C2  25
-
-;; Loop counter
-#define r_cnt   21
-
-#if defined (L_udivmodpsi4)
-;; R24:R22 = R24:R24  udiv  R20:R18
-;; R20:R18 = R24:R22  umod  R20:R18
-;; Clobbers: R21, R25, R26
-
-DEFUN __udivmodpsi4
-    ; init loop counter
-    ldi     r_cnt, 24+1
-    ; Clear remainder and carry.  C0 is already 0
-    clr     C1
-    sub     C2, C2
-    ; jump to entry point
-    rjmp    __udivmodpsi4_start
-__udivmodpsi4_loop:
-    ; shift dividend into remainder
-    rol     C0
-    rol     C1
-    rol     C2
-    ; compare remainder & divisor
-    cp      C0, B0
-    cpc     C1, B1
-    cpc     C2, B2
-    brcs    __udivmodpsi4_start ; remainder <= divisor
-    sub     C0, B0              ; restore remainder
-    sbc     C1, B1
-    sbc     C2, B2
-__udivmodpsi4_start:
-    ; shift dividend (with CARRY)
-    rol     A0
-    rol     A1
-    rol     A2
-    ; decrement loop counter
-    dec     r_cnt
-    brne    __udivmodpsi4_loop
-    com     A0
-    com     A1
-    com     A2
-    ; div/mod results to return registers
-    ; remainder
-    mov     B0, C0
-    mov     B1, C1
-    mov     B2, C2
-    clr     __zero_reg__ ; C0
-    ret
-ENDF __udivmodpsi4
-#endif /* defined (L_udivmodpsi4) */
-
-#if defined (L_divmodpsi4)
-;; R24:R22 = R24:R22  div  R20:R18
-;; R20:R18 = R24:R22  mod  R20:R18
-;; Clobbers: T, __tmp_reg__, R21, R25, R26
-
-DEFUN __divmodpsi4
-    ; R0.7 will contain the sign of the result:
-    ; R0.7 = A.sign ^ B.sign
-    mov __tmp_reg__, B2
-    ; T-flag = sign of dividend
-    bst     A2, 7
-    brtc    0f
-    com     __tmp_reg__
-    ; Adjust dividend's sign
-    rcall   __divmodpsi4_negA
-0:
-    ; Adjust divisor's sign
-    sbrc    B2, 7
-    rcall   __divmodpsi4_negB
-
-    ; Do the unsigned div/mod
-    XCALL   __udivmodpsi4
-
-    ; Adjust quotient's sign
-    sbrc    __tmp_reg__, 7
-    rcall   __divmodpsi4_negA
-
-    ; Adjust remainder's sign
-    brtc    __divmodpsi4_end
-
-__divmodpsi4_negB:
-    ; Correct divisor/remainder sign
-    com     B2
-    com     B1
-    neg     B0
-    sbci    B1, -1
-    sbci    B2, -1
-    ret
-
-    ; Correct dividend/quotient sign
-__divmodpsi4_negA:
-    com     A2
-    com     A1
-    neg     A0
-    sbci    A1, -1
-    sbci    A2, -1
-__divmodpsi4_end:
-    ret
-
-ENDF __divmodpsi4
-#endif /* defined (L_divmodpsi4) */
-
-#undef A0
-#undef A1
-#undef A2
-
-#undef B0
-#undef B1
-#undef B2
-
-#undef C0
-#undef C1
-#undef C2
-
-#undef r_cnt
-
-/*******************************************************
-       Division 32 / 32 => (result + remainder)
-*******************************************************/
-#define	r_remHH	r31	/* remainder High */
-#define	r_remHL	r30
-#define	r_remH	r27
-#define	r_remL	r26	/* remainder Low */
-
-/* return: remainder */
-#define	r_arg1HH r25	/* dividend High */
-#define	r_arg1HL r24
-#define	r_arg1H  r23
-#define	r_arg1L  r22	/* dividend Low */
-
-/* return: quotient */
-#define	r_arg2HH r21	/* divisor High */
-#define	r_arg2HL r20
-#define	r_arg2H  r19
-#define	r_arg2L  r18	/* divisor Low */
-	
-#define	r_cnt __zero_reg__  /* loop count (0 after the loop!) */
-
-#if defined (L_udivmodsi4)
-DEFUN __udivmodsi4
-	ldi	r_remL, 33	; init loop counter
-	mov	r_cnt, r_remL
-	sub	r_remL,r_remL
-	sub	r_remH,r_remH	; clear remainder and carry
-	mov_l	r_remHL, r_remL
-	mov_h	r_remHH, r_remH
-	rjmp	__udivmodsi4_ep	; jump to entry point
-__udivmodsi4_loop:
-        rol	r_remL		; shift dividend into remainder
-	rol	r_remH
-	rol	r_remHL
-	rol	r_remHH
-        cp	r_remL,r_arg2L	; compare remainder & divisor
-	cpc	r_remH,r_arg2H
-	cpc	r_remHL,r_arg2HL
-	cpc	r_remHH,r_arg2HH
-	brcs	__udivmodsi4_ep	; remainder <= divisor
-        sub	r_remL,r_arg2L	; restore remainder
-        sbc	r_remH,r_arg2H
-        sbc	r_remHL,r_arg2HL
-        sbc	r_remHH,r_arg2HH
-__udivmodsi4_ep:
-        rol	r_arg1L		; shift dividend (with CARRY)
-        rol	r_arg1H
-        rol	r_arg1HL
-        rol	r_arg1HH
-        dec	r_cnt		; decrement loop counter
-        brne	__udivmodsi4_loop
-				; __zero_reg__ now restored (r_cnt == 0)
-	com	r_arg1L
-	com	r_arg1H
-	com	r_arg1HL
-	com	r_arg1HH
-; div/mod results to return registers, as for the ldiv() function
-	mov_l	r_arg2L,  r_arg1L	; quotient
-	mov_h	r_arg2H,  r_arg1H
-	mov_l	r_arg2HL, r_arg1HL
-	mov_h	r_arg2HH, r_arg1HH
-	mov_l	r_arg1L,  r_remL	; remainder
-	mov_h	r_arg1H,  r_remH
-	mov_l	r_arg1HL, r_remHL
-	mov_h	r_arg1HH, r_remHH
-	ret
-ENDF __udivmodsi4
-#endif /* defined (L_udivmodsi4) */
-
-#if defined (L_divmodsi4)
-DEFUN __divmodsi4
-    mov     __tmp_reg__,r_arg2HH
-    bst     r_arg1HH,7          ; store sign of dividend
-    brtc    0f
-    com     __tmp_reg__         ; r0.7 is sign of result
-    XCALL   __negsi2            ; dividend negative: negate
-0:
-    sbrc    r_arg2HH,7
-    rcall   __divmodsi4_neg2    ; divisor negative: negate
-    XCALL   __udivmodsi4        ; do the unsigned div/mod
-    sbrc    __tmp_reg__, 7      ; correct quotient sign
-    rcall   __divmodsi4_neg2
-    brtc    __divmodsi4_exit    ; correct remainder sign
-    XJMP    __negsi2
-__divmodsi4_neg2:
-    ;; correct divisor/quotient sign
-    com     r_arg2HH
-    com     r_arg2HL
-    com     r_arg2H
-    neg     r_arg2L
-    sbci    r_arg2H,0xff
-    sbci    r_arg2HL,0xff
-    sbci    r_arg2HH,0xff
-__divmodsi4_exit:
-    ret
-ENDF __divmodsi4
-#endif /* defined (L_divmodsi4) */
-
-#if defined (L_negsi2)
-;; (set (reg:SI 22)
-;;      (neg:SI (reg:SI 22)))
-;; Sets the V flag for signed overflow tests
-DEFUN __negsi2
-    NEG4    22
-    ret
-ENDF __negsi2
-#endif /* L_negsi2 */
-
-#undef r_remHH
-#undef r_remHL
-#undef r_remH
-#undef r_remL
-#undef r_arg1HH
-#undef r_arg1HL
-#undef r_arg1H
-#undef r_arg1L
-#undef r_arg2HH
-#undef r_arg2HL
-#undef r_arg2H
-#undef r_arg2L
-#undef r_cnt
-
-/* *di routines use registers below R19 and won't work with tiny arch
-   right now. */
-
-#if !defined (__AVR_TINY__)
-/*******************************************************
-       Division 64 / 64
-       Modulo   64 % 64
-*******************************************************/
-
-;; Use Speed-optimized Version on "big" Devices, i.e. Devices with
-;; at least 16k of Program Memory.  For smaller Devices, depend
-;; on MOVW and SP Size.  There is a Connexion between SP Size and
-;; Flash Size so that SP Size can be used to test for Flash Size.
-
-#if defined (__AVR_HAVE_JMP_CALL__)
-#   define SPEED_DIV 8
-#elif defined (__AVR_HAVE_MOVW__) && defined (__AVR_HAVE_SPH__)
-#   define SPEED_DIV 16
-#else
-#   define SPEED_DIV 0
-#endif
-
-;; A[0..7]: In: Dividend;
-;; Out: Quotient  (T = 0)
-;; Out: Remainder (T = 1)
-#define A0  18
-#define A1  A0+1
-#define A2  A0+2
-#define A3  A0+3
-#define A4  A0+4
-#define A5  A0+5
-#define A6  A0+6
-#define A7  A0+7
-
-;; B[0..7]: In: Divisor;   Out: Clobber
-#define B0  10
-#define B1  B0+1
-#define B2  B0+2
-#define B3  B0+3
-#define B4  B0+4
-#define B5  B0+5
-#define B6  B0+6
-#define B7  B0+7
-
-;; C[0..7]: Expand remainder;  Out: Remainder (unused)
-#define C0  8
-#define C1  C0+1
-#define C2  30
-#define C3  C2+1
-#define C4  28
-#define C5  C4+1
-#define C6  26
-#define C7  C6+1
-
-;; Holds Signs during Division Routine
-#define SS      __tmp_reg__
-
-;; Bit-Counter in Division Routine
-#define R_cnt   __zero_reg__
-
-;; Scratch Register for Negation
-#define NN      r31
-
-#if defined (L_udivdi3)
-
-;; R25:R18 = R24:R18  umod  R17:R10
-;; Ordinary ABI-Function
-
-DEFUN __umoddi3
-    set
-    rjmp __udivdi3_umoddi3
-ENDF __umoddi3
-
-;; R25:R18 = R24:R18  udiv  R17:R10
-;; Ordinary ABI-Function
-
-DEFUN __udivdi3
-    clt
-ENDF __udivdi3
-
-DEFUN __udivdi3_umoddi3
-    push    C0
-    push    C1
-    push    C4
-    push    C5
-    XCALL   __udivmod64
-    pop     C5
-    pop     C4
-    pop     C1
-    pop     C0
-    ret
-ENDF __udivdi3_umoddi3
-#endif /* L_udivdi3 */
-
-#if defined (L_udivmod64)
-
-;; Worker Routine for 64-Bit unsigned Quotient and Remainder Computation
-;; No Registers saved/restored; the Callers will take Care.
-;; Preserves B[] and T-flag
-;; T = 0: Compute Quotient  in A[]
-;; T = 1: Compute Remainder in A[] and shift SS one Bit left
-
-DEFUN __udivmod64
-
-    ;; Clear Remainder (C6, C7 will follow)
-    clr     C0
-    clr     C1
-    wmov    C2, C0
-    wmov    C4, C0
-    ldi     C7, 64
-
-#if SPEED_DIV == 0 || SPEED_DIV == 16
-    ;; Initialize Loop-Counter
-    mov     R_cnt, C7
-    wmov    C6, C0
-#endif /* SPEED_DIV */
-
-#if SPEED_DIV == 8
-
-    push    A7
-    clr     C6
-
-1:  ;; Compare shifted Devidend against Divisor
-    ;; If -- even after Shifting -- it is smaller...
-    CP  A7,B0  $  cpc C0,B1  $  cpc C1,B2  $  cpc C2,B3
-    cpc C3,B4  $  cpc C4,B5  $  cpc C5,B6  $  cpc C6,B7
-    brcc    2f
-
-    ;; ...then we can subtract it.  Thus, it is legal to shift left
-               $  mov C6,C5  $  mov C5,C4  $  mov C4,C3
-    mov C3,C2  $  mov C2,C1  $  mov C1,C0  $  mov C0,A7
-    mov A7,A6  $  mov A6,A5  $  mov A5,A4  $  mov A4,A3
-    mov A3,A2  $  mov A2,A1  $  mov A1,A0  $  clr A0
-
-    ;; 8 Bits are done
-    subi    C7, 8
-    brne    1b
-
-    ;; Shifted 64 Bits:  A7 has traveled to C7
-    pop     C7
-    ;; Divisor is greater than Dividend. We have:
-    ;; A[] % B[] = A[]
-    ;; A[] / B[] = 0
-    ;; Thus, we can return immediately
-    rjmp    5f
-
-2:  ;; Initialze Bit-Counter with Number of Bits still to be performed
-    mov     R_cnt, C7
-
-    ;; Push of A7 is not needed because C7 is still 0
-    pop     C7
-    clr     C7
-
-#elif  SPEED_DIV == 16
-
-    ;; Compare shifted Dividend against Divisor
-    cp      A7, B3
-    cpc     C0, B4
-    cpc     C1, B5
-    cpc     C2, B6
-    cpc     C3, B7
-    brcc    2f
-
-    ;; Divisor is greater than shifted Dividen: We can shift the Dividend
-    ;; and it is still smaller than the Divisor --> Shift one 32-Bit Chunk
-    wmov  C2,A6  $  wmov C0,A4
-    wmov  A6,A2  $  wmov A4,A0
-    wmov  A2,C6  $  wmov A0,C4
-
-    ;; Set Bit Counter to 32
-    lsr     R_cnt
-2:
-#elif SPEED_DIV
-#error SPEED_DIV = ?
-#endif /* SPEED_DIV */
-
-;; The very Division + Remainder Routine
-
-3:  ;; Left-shift Dividend...
-    lsl A0     $  rol A1     $  rol A2     $  rol A3
-    rol A4     $  rol A5     $  rol A6     $  rol A7
-
-    ;; ...into Remainder
-    rol C0     $  rol C1     $  rol C2     $  rol C3
-    rol C4     $  rol C5     $  rol C6     $  rol C7
-
-    ;; Compare Remainder and Divisor
-    CP  C0,B0  $  cpc C1,B1  $  cpc C2,B2  $  cpc C3,B3
-    cpc C4,B4  $  cpc C5,B5  $  cpc C6,B6  $  cpc C7,B7
-
-    brcs 4f
-
-    ;; Divisor fits into Remainder:  Subtract it from Remainder...
-    SUB C0,B0  $  sbc C1,B1  $  sbc C2,B2  $  sbc C3,B3
-    sbc C4,B4  $  sbc C5,B5  $  sbc C6,B6  $  sbc C7,B7
-
-    ;; ...and set according Bit in the upcoming Quotient
-    ;; The Bit will travel to its final Position
-    ori A0, 1
-
-4:  ;; This Bit is done
-    dec     R_cnt
-    brne    3b
-    ;; __zero_reg__ is 0 again
-
-    ;; T = 0: We are fine with the Quotient in A[]
-    ;; T = 1: Copy Remainder to A[]
-5:  brtc    6f
-    wmov    A0, C0
-    wmov    A2, C2
-    wmov    A4, C4
-    wmov    A6, C6
-    ;; Move the Sign of the Result to SS.7
-    lsl     SS
-
-6:  ret
-
-ENDF __udivmod64
-#endif /* L_udivmod64 */
-
-
-#if defined (L_divdi3)
-
-;; R25:R18 = R24:R18  mod  R17:R10
-;; Ordinary ABI-Function
-
-DEFUN __moddi3
-    set
-    rjmp    __divdi3_moddi3
-ENDF __moddi3
-
-;; R25:R18 = R24:R18  div  R17:R10
-;; Ordinary ABI-Function
-
-DEFUN __divdi3
-    clt
-ENDF __divdi3
-
-DEFUN  __divdi3_moddi3
-#if SPEED_DIV
-    mov     r31, A7
-    or      r31, B7
-    brmi    0f
-    ;; Both Signs are 0:  the following Complexitiy is not needed
-    XJMP    __udivdi3_umoddi3
-#endif /* SPEED_DIV */
-
-0:  ;; The Prologue
-    ;; Save 12 Registers:  Y, 17...8
-    ;; No Frame needed
-    do_prologue_saves 12
-
-    ;; SS.7 will contain the Sign of the Quotient  (A.sign * B.sign)
-    ;; SS.6 will contain the Sign of the Remainder (A.sign)
-    mov     SS, A7
-    asr     SS
-    ;; Adjust Dividend's Sign as needed
-#if SPEED_DIV
-    ;; Compiling for Speed we know that at least one Sign must be < 0
-    ;; Thus, if A[] >= 0 then we know B[] < 0
-    brpl    22f
-#else
-    brpl    21f
-#endif /* SPEED_DIV */
-
-    XCALL   __negdi2
-
-    ;; Adjust Divisor's Sign and SS.7 as needed
-21: tst     B7
-    brpl    3f
-22: ldi     NN, 1 << 7
-    eor     SS, NN
-
-    ldi NN, -1
-    com B4     $  com B5     $  com B6     $  com B7
-               $  com B1     $  com B2     $  com B3
-    NEG B0
-               $  sbc B1,NN  $  sbc B2,NN  $  sbc B3,NN
-    sbc B4,NN  $  sbc B5,NN  $  sbc B6,NN  $  sbc B7,NN
-
-3:  ;; Do the unsigned 64-Bit Division/Modulo (depending on T-flag)
-    XCALL   __udivmod64
-
-    ;; Adjust Result's Sign
-#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
-    tst     SS
-    brpl    4f
-#else
-    sbrc    SS, 7
-#endif /* __AVR_HAVE_JMP_CALL__ */
-    XCALL   __negdi2
-
-4:  ;; Epilogue: Restore 12 Registers and return
-    do_epilogue_restores 12
-
-ENDF __divdi3_moddi3
-
-#endif /* L_divdi3 */
-
-#undef R_cnt
-#undef SS
-#undef NN
-
-.section .text.libgcc, "ax", @progbits
-
-#define TT __tmp_reg__
-
-#if defined (L_adddi3)
-;; (set (reg:DI 18)
-;;      (plus:DI (reg:DI 18)
-;;               (reg:DI 10)))
-;; Sets the V flag for signed overflow tests
-;; Sets the C flag for unsigned overflow tests
-DEFUN __adddi3
-    ADD A0,B0  $  adc A1,B1  $  adc A2,B2  $  adc A3,B3
-    adc A4,B4  $  adc A5,B5  $  adc A6,B6  $  adc A7,B7
-    ret
-ENDF __adddi3
-#endif /* L_adddi3 */
-
-#if defined (L_adddi3_s8)
-;; (set (reg:DI 18)
-;;      (plus:DI (reg:DI 18)
-;;               (sign_extend:SI (reg:QI 26))))
-;; Sets the V flag for signed overflow tests
-;; Sets the C flag for unsigned overflow tests provided 0 <= R26 < 128
-DEFUN __adddi3_s8
-    clr     TT
-    sbrc    r26, 7
-    com     TT
-    ADD A0,r26 $  adc A1,TT  $  adc A2,TT  $  adc A3,TT
-    adc A4,TT  $  adc A5,TT  $  adc A6,TT  $  adc A7,TT
-    ret
-ENDF __adddi3_s8
-#endif /* L_adddi3_s8 */
-
-#if defined (L_subdi3)
-;; (set (reg:DI 18)
-;;      (minus:DI (reg:DI 18)
-;;                (reg:DI 10)))
-;; Sets the V flag for signed overflow tests
-;; Sets the C flag for unsigned overflow tests
-DEFUN __subdi3
-    SUB A0,B0  $  sbc A1,B1  $  sbc A2,B2  $  sbc A3,B3
-    sbc A4,B4  $  sbc A5,B5  $  sbc A6,B6  $  sbc A7,B7
-    ret
-ENDF __subdi3
-#endif /* L_subdi3 */
-
-#if defined (L_cmpdi2)
-;; (set (cc0)
-;;      (compare (reg:DI 18)
-;;               (reg:DI 10)))
-DEFUN __cmpdi2
-    CP  A0,B0  $  cpc A1,B1  $  cpc A2,B2  $  cpc A3,B3
-    cpc A4,B4  $  cpc A5,B5  $  cpc A6,B6  $  cpc A7,B7
-    ret
-ENDF __cmpdi2
-#endif /* L_cmpdi2 */
-
-#if defined (L_cmpdi2_s8)
-;; (set (cc0)
-;;      (compare (reg:DI 18)
-;;               (sign_extend:SI (reg:QI 26))))
-DEFUN __cmpdi2_s8
-    clr     TT
-    sbrc    r26, 7
-    com     TT
-    CP  A0,r26 $  cpc A1,TT  $  cpc A2,TT  $  cpc A3,TT
-    cpc A4,TT  $  cpc A5,TT  $  cpc A6,TT  $  cpc A7,TT
-    ret
-ENDF __cmpdi2_s8
-#endif /* L_cmpdi2_s8 */
-
-#if defined (L_negdi2)
-;; (set (reg:DI 18)
-;;      (neg:DI (reg:DI 18)))
-;; Sets the V flag for signed overflow tests
-DEFUN __negdi2
-
-    com  A4    $  com  A5    $  com  A6    $  com  A7
-               $  com  A1    $  com  A2    $  com  A3
-    NEG  A0
-               $  sbci A1,-1 $  sbci A2,-1 $  sbci A3,-1
-    sbci A4,-1 $  sbci A5,-1 $  sbci A6,-1 $  sbci A7,-1
-    ret
-
-ENDF __negdi2
-#endif /* L_negdi2 */
-
-#undef TT
-
-#undef C7
-#undef C6
-#undef C5
-#undef C4
-#undef C3
-#undef C2
-#undef C1
-#undef C0
-
-#undef B7
-#undef B6
-#undef B5
-#undef B4
-#undef B3
-#undef B2
-#undef B1
-#undef B0
-
-#undef A7
-#undef A6
-#undef A5
-#undef A4
-#undef A3
-#undef A2
-#undef A1
-#undef A0
-
-#endif /* !defined (__AVR_TINY__) */
-
-
-.section .text.libgcc.prologue, "ax", @progbits
-
-/**********************************
- * This is a prologue subroutine
- **********************************/
-#if !defined (__AVR_TINY__)
-#if defined (L_prologue)
-
-;; This function does not clobber T-flag; 64-bit division relies on it
-DEFUN __prologue_saves__
-	push r2
-	push r3
-	push r4
-	push r5
-	push r6
-	push r7
-	push r8
-	push r9
-	push r10
-	push r11
-	push r12
-	push r13
-	push r14
-	push r15
-	push r16
-	push r17
-	push r28
-	push r29
-#if !defined (__AVR_HAVE_SPH__)
-	in	r28,__SP_L__
-	sub	r28,r26
-	out	__SP_L__,r28
-	clr	r29
-#elif defined (__AVR_XMEGA__)
-	in	r28,__SP_L__
-	in	r29,__SP_H__
-	sub	r28,r26
-	sbc	r29,r27
-	out	__SP_L__,r28
-	out	__SP_H__,r29
-#else
-	in	r28,__SP_L__
-	in	r29,__SP_H__
-	sub	r28,r26
-	sbc	r29,r27
-	in	__tmp_reg__,__SREG__
-	cli
-	out	__SP_H__,r29
-	out	__SREG__,__tmp_reg__
-	out	__SP_L__,r28
-#endif /* #SP = 8/16 */
-
-	XIJMP
-
-ENDF __prologue_saves__
-#endif /* defined (L_prologue) */
-
-/*
- * This is an epilogue subroutine
- */
-#if defined (L_epilogue)
-
-DEFUN __epilogue_restores__
-	ldd	r2,Y+18
-	ldd	r3,Y+17
-	ldd	r4,Y+16
-	ldd	r5,Y+15
-	ldd	r6,Y+14
-	ldd	r7,Y+13
-	ldd	r8,Y+12
-	ldd	r9,Y+11
-	ldd	r10,Y+10
-	ldd	r11,Y+9
-	ldd	r12,Y+8
-	ldd	r13,Y+7
-	ldd	r14,Y+6
-	ldd	r15,Y+5
-	ldd	r16,Y+4
-	ldd	r17,Y+3
-	ldd	r26,Y+2
-#if !defined (__AVR_HAVE_SPH__)
-	ldd	r29,Y+1
-	add	r28,r30
-	out	__SP_L__,r28
-	mov	r28, r26
-#elif defined (__AVR_XMEGA__)
-	ldd  r27,Y+1
-	add  r28,r30
-	adc  r29,__zero_reg__
-	out  __SP_L__,r28
-	out  __SP_H__,r29
-	wmov 28, 26
-#else
-	ldd	r27,Y+1
-	add	r28,r30
-	adc	r29,__zero_reg__
-	in	__tmp_reg__,__SREG__
-	cli
-	out	__SP_H__,r29
-	out	__SREG__,__tmp_reg__
-	out	__SP_L__,r28
-	mov_l	r28, r26
-	mov_h	r29, r27
-#endif /* #SP = 8/16 */
-	ret
-ENDF __epilogue_restores__
-#endif /* defined (L_epilogue) */
-#endif /* !defined (__AVR_TINY__) */
-
-#ifdef L_exit
-	.section .fini9,"ax",@progbits
-DEFUN _exit
-	.weak	exit
-exit:
-ENDF _exit
-
-	/* Code from .fini8 ... .fini1 sections inserted by ld script.  */
-
-	.section .fini0,"ax",@progbits
-	cli
-__stop_program:
-	rjmp	__stop_program
-#endif /* defined (L_exit) */
-
-#ifdef L_cleanup
-	.weak	_cleanup
-	.func	_cleanup
-_cleanup:
-	ret
-.endfunc
-#endif /* defined (L_cleanup) */
-
-
-.section .text.libgcc, "ax", @progbits
-
-#ifdef L_tablejump2
-DEFUN __tablejump2__
-    lsl     r30
-    rol     r31
-#if defined (__AVR_HAVE_EIJMP_EICALL__)
-    ;; Word address of gs() jumptable entry in R24:Z
-    rol     r24
-    out     __RAMPZ__, r24
-#elif defined (__AVR_HAVE_ELPM__)
-    ;; Word address of jumptable entry in Z
-    clr     __tmp_reg__
-    rol     __tmp_reg__
-    out     __RAMPZ__, __tmp_reg__
-#endif
-
-    ;; Read word address from jumptable and jump
-
-#if defined (__AVR_HAVE_ELPMX__)
-    elpm    __tmp_reg__, Z+
-    elpm    r31, Z
-    mov     r30, __tmp_reg__
-#ifdef __AVR_HAVE_RAMPD__
-    ;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM
-    out     __RAMPZ__, __zero_reg__
-#endif /* RAMPD */
-    XIJMP
-#elif defined (__AVR_HAVE_ELPM__)
-    elpm
-    push    r0
-    adiw    r30, 1
-    elpm
-    push    r0
-    ret
-#elif defined (__AVR_HAVE_LPMX__)
-    lpm     __tmp_reg__, Z+
-    lpm     r31, Z
-    mov     r30, __tmp_reg__
-    ijmp
-#elif defined (__AVR_TINY__)
-    wsubi 30, -(__AVR_TINY_PM_BASE_ADDRESS__) ; Add PM offset to Z
-    ld __tmp_reg__, Z+
-    ld r31, Z   ; Use ld instead of lpm to load Z
-    mov r30, __tmp_reg__    
-    ijmp
-#else
-    lpm
-    push    r0
-    adiw    r30, 1
-    lpm
-    push    r0
-    ret
-#endif
-ENDF __tablejump2__
-#endif /* L_tablejump2 */
-
-#if defined(__AVR_TINY__)
-#ifdef L_copy_data
-        .section .init4,"ax",@progbits
-        .global __do_copy_data
-__do_copy_data:
-        ldi     r18, hi8(__data_end)
-        ldi     r26, lo8(__data_start)
-        ldi     r27, hi8(__data_start)
-        ldi     r30, lo8(__data_load_start + __AVR_TINY_PM_BASE_ADDRESS__)
-        ldi     r31, hi8(__data_load_start + __AVR_TINY_PM_BASE_ADDRESS__)
-        rjmp    .L__do_copy_data_start
-.L__do_copy_data_loop:
-        ld      r19, z+
-        st      X+, r19
-.L__do_copy_data_start:
-        cpi     r26, lo8(__data_end)
-        cpc     r27, r18
-        brne    .L__do_copy_data_loop
-#endif
-#else
-#ifdef L_copy_data
-	.section .init4,"ax",@progbits
-DEFUN __do_copy_data
-#if defined(__AVR_HAVE_ELPMX__)
-	ldi	r17, hi8(__data_end)
-	ldi	r26, lo8(__data_start)
-	ldi	r27, hi8(__data_start)
-	ldi	r30, lo8(__data_load_start)
-	ldi	r31, hi8(__data_load_start)
-	ldi	r16, hh8(__data_load_start)
-	out	__RAMPZ__, r16
-	rjmp	.L__do_copy_data_start
-.L__do_copy_data_loop:
-	elpm	r0, Z+
-	st	X+, r0
-.L__do_copy_data_start:
-	cpi	r26, lo8(__data_end)
-	cpc	r27, r17
-	brne	.L__do_copy_data_loop
-#elif  !defined(__AVR_HAVE_ELPMX__) && defined(__AVR_HAVE_ELPM__)
-	ldi	r17, hi8(__data_end)
-	ldi	r26, lo8(__data_start)
-	ldi	r27, hi8(__data_start)
-	ldi	r30, lo8(__data_load_start)
-	ldi	r31, hi8(__data_load_start)
-	ldi	r16, hh8(__data_load_start - 0x10000)
-.L__do_copy_data_carry:
-	inc	r16
-	out	__RAMPZ__, r16
-	rjmp	.L__do_copy_data_start
-.L__do_copy_data_loop:
-	elpm
-	st	X+, r0
-	adiw	r30, 1
-	brcs	.L__do_copy_data_carry
-.L__do_copy_data_start:
-	cpi	r26, lo8(__data_end)
-	cpc	r27, r17
-	brne	.L__do_copy_data_loop
-#elif !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__)
-	ldi	r17, hi8(__data_end)
-	ldi	r26, lo8(__data_start)
-	ldi	r27, hi8(__data_start)
-	ldi	r30, lo8(__data_load_start)
-	ldi	r31, hi8(__data_load_start)
-	rjmp	.L__do_copy_data_start
-.L__do_copy_data_loop:
-#if defined (__AVR_HAVE_LPMX__)
-	lpm	r0, Z+
-#else
-	lpm
-	adiw	r30, 1
-#endif
-	st	X+, r0
-.L__do_copy_data_start:
-	cpi	r26, lo8(__data_end)
-	cpc	r27, r17
-	brne	.L__do_copy_data_loop
-#endif /* !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__) */
-#if defined (__AVR_HAVE_ELPM__) && defined (__AVR_HAVE_RAMPD__)
-	;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM
-	out	__RAMPZ__, __zero_reg__
-#endif /* ELPM && RAMPD */
-ENDF __do_copy_data
-#endif /* L_copy_data */
-#endif /* !defined (__AVR_TINY__) */
-
-/* __do_clear_bss is only necessary if there is anything in .bss section.  */
-
-#ifdef L_clear_bss
-	.section .init4,"ax",@progbits
-DEFUN __do_clear_bss
-	ldi	r18, hi8(__bss_end)
-	ldi	r26, lo8(__bss_start)
-	ldi	r27, hi8(__bss_start)
-	rjmp	.do_clear_bss_start
-.do_clear_bss_loop:
-	st	X+, __zero_reg__
-.do_clear_bss_start:
-	cpi	r26, lo8(__bss_end)
-	cpc	r27, r18
-	brne	.do_clear_bss_loop
-ENDF __do_clear_bss
-#endif /* L_clear_bss */
-
-/* __do_global_ctors and __do_global_dtors are only necessary
-   if there are any constructors/destructors.  */
-
-#if defined(__AVR_TINY__)
-#define cdtors_tst_reg r18
-#else
-#define cdtors_tst_reg r17
-#endif
-
-#ifdef L_ctors
-	.section .init6,"ax",@progbits
-DEFUN __do_global_ctors
-    ldi     cdtors_tst_reg, pm_hi8(__ctors_start)
-    ldi     r28, pm_lo8(__ctors_end)
-    ldi     r29, pm_hi8(__ctors_end)
-#ifdef __AVR_HAVE_EIJMP_EICALL__
-    ldi     r16, pm_hh8(__ctors_end)
-#endif /* HAVE_EIJMP */
-    rjmp    .L__do_global_ctors_start
-.L__do_global_ctors_loop:
-    wsubi   28, 1
-#ifdef __AVR_HAVE_EIJMP_EICALL__
-    sbc     r16, __zero_reg__
-    mov     r24, r16
-#endif /* HAVE_EIJMP */
-    mov_h   r31, r29
-    mov_l   r30, r28
-    XCALL   __tablejump2__
-.L__do_global_ctors_start:
-    cpi     r28, pm_lo8(__ctors_start)
-    cpc     r29, cdtors_tst_reg
-#ifdef __AVR_HAVE_EIJMP_EICALL__
-    ldi     r24, pm_hh8(__ctors_start)
-    cpc     r16, r24
-#endif /* HAVE_EIJMP */
-    brne    .L__do_global_ctors_loop
-ENDF __do_global_ctors
-#endif /* L_ctors */
-
-#ifdef L_dtors
-	.section .fini6,"ax",@progbits
-DEFUN __do_global_dtors
-    ldi     cdtors_tst_reg, pm_hi8(__dtors_end)
-    ldi     r28, pm_lo8(__dtors_start)
-    ldi     r29, pm_hi8(__dtors_start)
-#ifdef __AVR_HAVE_EIJMP_EICALL__
-    ldi     r16, pm_hh8(__dtors_start)
-#endif /* HAVE_EIJMP */
-    rjmp    .L__do_global_dtors_start
-.L__do_global_dtors_loop:
-#ifdef __AVR_HAVE_EIJMP_EICALL__
-    mov     r24, r16
-#endif /* HAVE_EIJMP */
-    mov_h   r31, r29
-    mov_l   r30, r28
-    XCALL   __tablejump2__
-    waddi   28, 1
-#ifdef __AVR_HAVE_EIJMP_EICALL__
-    adc     r16, __zero_reg__
-#endif /* HAVE_EIJMP */
-.L__do_global_dtors_start:
-    cpi     r28, pm_lo8(__dtors_end)
-    cpc     r29, cdtors_tst_reg
-#ifdef __AVR_HAVE_EIJMP_EICALL__
-    ldi     r24, pm_hh8(__dtors_end)
-    cpc     r16, r24
-#endif /* HAVE_EIJMP */
-    brne    .L__do_global_dtors_loop
-ENDF __do_global_dtors
-#endif /* L_dtors */
-
-#undef cdtors_tst_reg
-
-.section .text.libgcc, "ax", @progbits
-
-#if !defined (__AVR_TINY__)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; Loading n bytes from Flash; n = 3,4
-;; R22... = Flash[Z]
-;; Clobbers: __tmp_reg__
-
-#if (defined (L_load_3)        \
-     || defined (L_load_4))    \
-    && !defined (__AVR_HAVE_LPMX__)
-
-;; Destination
-#define D0  22
-#define D1  D0+1
-#define D2  D0+2
-#define D3  D0+3
-
-.macro  .load dest, n
-    lpm
-    mov     \dest, r0
-.if \dest != D0+\n-1
-    adiw    r30, 1
-.else
-    sbiw    r30, \n-1
-.endif
-.endm
-
-#if defined (L_load_3)
-DEFUN __load_3
-    push  D3
-    XCALL __load_4
-    pop   D3
-    ret
-ENDF __load_3
-#endif /* L_load_3 */
-
-#if defined (L_load_4)
-DEFUN __load_4
-    .load D0, 4
-    .load D1, 4
-    .load D2, 4
-    .load D3, 4
-    ret
-ENDF __load_4
-#endif /* L_load_4 */
-
-#endif /* L_load_3 || L_load_3 */
-#endif /* !defined (__AVR_TINY__) */
-
-#if !defined (__AVR_TINY__)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; Loading n bytes from Flash or RAM;  n = 1,2,3,4
-;; R22... = Flash[R21:Z] or RAM[Z] depending on R21.7
-;; Clobbers: __tmp_reg__, R21, R30, R31
-
-#if (defined (L_xload_1)            \
-     || defined (L_xload_2)         \
-     || defined (L_xload_3)         \
-     || defined (L_xload_4))
-
-;; Destination
-#define D0  22
-#define D1  D0+1
-#define D2  D0+2
-#define D3  D0+3
-
-;; Register containing bits 16+ of the address
-
-#define HHI8  21
-
-.macro  .xload dest, n
-#if defined (__AVR_HAVE_ELPMX__)
-    elpm    \dest, Z+
-#elif defined (__AVR_HAVE_ELPM__)
-    elpm
-    mov     \dest, r0
-.if \dest != D0+\n-1
-    adiw    r30, 1
-    adc     HHI8, __zero_reg__
-    out     __RAMPZ__, HHI8
-.endif
-#elif defined (__AVR_HAVE_LPMX__)
-    lpm     \dest, Z+
-#else
-    lpm
-    mov     \dest, r0
-.if \dest != D0+\n-1
-    adiw    r30, 1
-.endif
-#endif
-#if defined (__AVR_HAVE_ELPM__) && defined (__AVR_HAVE_RAMPD__)
-.if \dest == D0+\n-1
-    ;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM
-    out     __RAMPZ__, __zero_reg__
-.endif
-#endif
-.endm ; .xload
-
-#if defined (L_xload_1)
-DEFUN __xload_1
-#if defined (__AVR_HAVE_LPMX__) && !defined (__AVR_HAVE_ELPM__)
-    sbrc    HHI8, 7
-    ld      D0, Z
-    sbrs    HHI8, 7
-    lpm     D0, Z
-    ret
-#else
-    sbrc    HHI8, 7
-    rjmp    1f
-#if defined (__AVR_HAVE_ELPM__)
-    out     __RAMPZ__, HHI8
-#endif /* __AVR_HAVE_ELPM__ */
-    .xload  D0, 1
-    ret
-1:  ld      D0, Z
-    ret
-#endif /* LPMx && ! ELPM */
-ENDF __xload_1
-#endif /* L_xload_1 */
-
-#if defined (L_xload_2)
-DEFUN __xload_2
-    sbrc    HHI8, 7
-    rjmp    1f
-#if defined (__AVR_HAVE_ELPM__)
-    out     __RAMPZ__, HHI8
-#endif /* __AVR_HAVE_ELPM__ */
-    .xload  D0, 2
-    .xload  D1, 2
-    ret
-1:  ld      D0, Z+
-    ld      D1, Z+
-    ret
-ENDF __xload_2
-#endif /* L_xload_2 */
-
-#if defined (L_xload_3)
-DEFUN __xload_3
-    sbrc    HHI8, 7
-    rjmp    1f
-#if defined (__AVR_HAVE_ELPM__)
-    out     __RAMPZ__, HHI8
-#endif /* __AVR_HAVE_ELPM__ */
-    .xload  D0, 3
-    .xload  D1, 3
-    .xload  D2, 3
-    ret
-1:  ld      D0, Z+
-    ld      D1, Z+
-    ld      D2, Z+
-    ret
-ENDF __xload_3
-#endif /* L_xload_3 */
-
-#if defined (L_xload_4)
-DEFUN __xload_4
-    sbrc    HHI8, 7
-    rjmp    1f
-#if defined (__AVR_HAVE_ELPM__)
-    out     __RAMPZ__, HHI8
-#endif /* __AVR_HAVE_ELPM__ */
-    .xload  D0, 4
-    .xload  D1, 4
-    .xload  D2, 4
-    .xload  D3, 4
-    ret
-1:  ld      D0, Z+
-    ld      D1, Z+
-    ld      D2, Z+
-    ld      D3, Z+
-    ret
-ENDF __xload_4
-#endif /* L_xload_4 */
-
-#endif /* L_xload_{1|2|3|4} */
-#endif /* if !defined (__AVR_TINY__) */
-
-#if !defined (__AVR_TINY__)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; memcopy from Address Space __pgmx to RAM
-;; R23:Z = Source Address
-;; X     = Destination Address
-;; Clobbers: __tmp_reg__, R23, R24, R25, X, Z
-
-#if defined (L_movmemx)
-
-#define HHI8  23
-#define LOOP  24
-
-DEFUN __movmemx_qi
-    ;; #Bytes to copy fity in 8 Bits (1..255)
-    ;; Zero-extend Loop Counter
-    clr     LOOP+1
-    ;; FALLTHRU
-ENDF __movmemx_qi
-
-DEFUN __movmemx_hi
-
-;; Read from where?
-    sbrc    HHI8, 7
-    rjmp    1f
-
-;; Read from Flash
-
-#if defined (__AVR_HAVE_ELPM__)
-    out     __RAMPZ__, HHI8
-#endif
-
-0:  ;; Load 1 Byte from Flash...
-
-#if defined (__AVR_HAVE_ELPMX__)
-    elpm    r0, Z+
-#elif defined (__AVR_HAVE_ELPM__)
-    elpm
-    adiw    r30, 1
-    adc     HHI8, __zero_reg__
-    out     __RAMPZ__, HHI8
-#elif defined (__AVR_HAVE_LPMX__)
-    lpm     r0, Z+
-#else
-    lpm
-    adiw    r30, 1
-#endif
-
-    ;; ...and store that Byte to RAM Destination
-    st      X+, r0
-    sbiw    LOOP, 1
-    brne    0b
-#if defined (__AVR_HAVE_ELPM__) && defined (__AVR_HAVE_RAMPD__)
-    ;; Reset RAMPZ to 0 so that EBI devices don't read garbage from RAM
-    out	__RAMPZ__, __zero_reg__
-#endif /* ELPM && RAMPD */
-    ret
-
-;; Read from RAM
-
-1:  ;; Read 1 Byte from RAM...
-    ld      r0, Z+
-    ;; and store that Byte to RAM Destination
-    st      X+, r0
-    sbiw    LOOP, 1
-    brne    1b
-    ret
-ENDF __movmemx_hi
-
-#undef HHI8
-#undef LOOP
-
-#endif /* L_movmemx */
-#endif /* !defined (__AVR_TINY__) */ 
-
-
-.section .text.libgcc.builtins, "ax", @progbits
-
-/**********************************
- * Find first set Bit (ffs)
- **********************************/
-
-#if defined (L_ffssi2)
-;; find first set bit
-;; r25:r24 = ffs32 (r25:r22)
-;; clobbers: r22, r26
-DEFUN __ffssi2
-    clr  r26
-    tst  r22
-    brne 1f
-    subi r26, -8
-    or   r22, r23
-    brne 1f
-    subi r26, -8
-    or   r22, r24
-    brne 1f
-    subi r26, -8
-    or   r22, r25
-    brne 1f
-    ret
-1:  mov  r24, r22
-    XJMP __loop_ffsqi2
-ENDF __ffssi2
-#endif /* defined (L_ffssi2) */
-
-#if defined (L_ffshi2)
-;; find first set bit
-;; r25:r24 = ffs16 (r25:r24)
-;; clobbers: r26
-DEFUN __ffshi2
-    clr  r26
-#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
-    ;; Some cores have problem skipping 2-word instruction
-    tst  r24
-    breq 2f
-#else
-    cpse r24, __zero_reg__
-#endif /* __AVR_HAVE_JMP_CALL__ */
-1:  XJMP __loop_ffsqi2
-2:  ldi  r26, 8
-    or   r24, r25
-    brne 1b
-    ret
-ENDF __ffshi2
-#endif /* defined (L_ffshi2) */
-
-#if defined (L_loop_ffsqi2)
-;; Helper for ffshi2, ffssi2
-;; r25:r24 = r26 + zero_extend16 (ffs8(r24))
-;; r24 must be != 0
-;; clobbers: r26
-DEFUN __loop_ffsqi2
-    inc  r26
-    lsr  r24
-    brcc __loop_ffsqi2
-    mov  r24, r26
-    clr  r25
-    ret
-ENDF __loop_ffsqi2
-#endif /* defined (L_loop_ffsqi2) */
-
-
-/**********************************
- * Count trailing Zeros (ctz)
- **********************************/
-
-#if defined (L_ctzsi2)
-;; count trailing zeros
-;; r25:r24 = ctz32 (r25:r22)
-;; clobbers: r26, r22
-;; ctz(0) = 255
-;; Note that ctz(0) in undefined for GCC
-DEFUN __ctzsi2
-    XCALL __ffssi2
-    dec  r24
-    ret
-ENDF __ctzsi2
-#endif /* defined (L_ctzsi2) */
-
-#if defined (L_ctzhi2)
-;; count trailing zeros
-;; r25:r24 = ctz16 (r25:r24)
-;; clobbers: r26
-;; ctz(0) = 255
-;; Note that ctz(0) in undefined for GCC
-DEFUN __ctzhi2
-    XCALL __ffshi2
-    dec  r24
-    ret
-ENDF __ctzhi2
-#endif /* defined (L_ctzhi2) */
-
-
-/**********************************
- * Count leading Zeros (clz)
- **********************************/
-
-#if defined (L_clzdi2)
-;; count leading zeros
-;; r25:r24 = clz64 (r25:r18)
-;; clobbers: r22, r23, r26
-DEFUN __clzdi2
-    XCALL __clzsi2
-    sbrs r24, 5
-    ret
-    mov_l r22, r18
-    mov_h r23, r19
-    mov_l r24, r20
-    mov_h r25, r21
-    XCALL __clzsi2
-    subi r24, -32
-    ret
-ENDF __clzdi2
-#endif /* defined (L_clzdi2) */
-
-#if defined (L_clzsi2)
-;; count leading zeros
-;; r25:r24 = clz32 (r25:r22)
-;; clobbers: r26
-DEFUN __clzsi2
-    XCALL __clzhi2
-    sbrs r24, 4
-    ret
-    mov_l r24, r22
-    mov_h r25, r23
-    XCALL __clzhi2
-    subi r24, -16
-    ret
-ENDF __clzsi2
-#endif /* defined (L_clzsi2) */
-
-#if defined (L_clzhi2)
-;; count leading zeros
-;; r25:r24 = clz16 (r25:r24)
-;; clobbers: r26
-DEFUN __clzhi2
-    clr  r26
-    tst  r25
-    brne 1f
-    subi r26, -8
-    or   r25, r24
-    brne 1f
-    ldi  r24, 16
-    ret
-1:  cpi  r25, 16
-    brsh 3f
-    subi r26, -3
-    swap r25
-2:  inc  r26
-3:  lsl  r25
-    brcc 2b
-    mov  r24, r26
-    clr  r25
-    ret
-ENDF __clzhi2
-#endif /* defined (L_clzhi2) */
-
-
-/**********************************
- * Parity
- **********************************/
-
-#if defined (L_paritydi2)
-;; r25:r24 = parity64 (r25:r18)
-;; clobbers: __tmp_reg__
-DEFUN __paritydi2
-    eor  r24, r18
-    eor  r24, r19
-    eor  r24, r20
-    eor  r24, r21
-    XJMP __paritysi2
-ENDF __paritydi2
-#endif /* defined (L_paritydi2) */
-
-#if defined (L_paritysi2)
-;; r25:r24 = parity32 (r25:r22)
-;; clobbers: __tmp_reg__
-DEFUN __paritysi2
-    eor  r24, r22
-    eor  r24, r23
-    XJMP __parityhi2
-ENDF __paritysi2
-#endif /* defined (L_paritysi2) */
-
-#if defined (L_parityhi2)
-;; r25:r24 = parity16 (r25:r24)
-;; clobbers: __tmp_reg__
-DEFUN __parityhi2
-    eor  r24, r25
-;; FALLTHRU
-ENDF __parityhi2
-
-;; r25:r24 = parity8 (r24)
-;; clobbers: __tmp_reg__
-DEFUN __parityqi2
-    ;; parity is in r24[0..7]
-    mov  __tmp_reg__, r24
-    swap __tmp_reg__
-    eor  r24, __tmp_reg__
-    ;; parity is in r24[0..3]
-    subi r24, -4
-    andi r24, -5
-    subi r24, -6
-    ;; parity is in r24[0,3]
-    sbrc r24, 3
-    inc  r24
-    ;; parity is in r24[0]
-    andi r24, 1
-    clr  r25
-    ret
-ENDF __parityqi2
-#endif /* defined (L_parityhi2) */
-
-
-/**********************************
- * Population Count
- **********************************/
-
-#if defined (L_popcounthi2)
-;; population count
-;; r25:r24 = popcount16 (r25:r24)
-;; clobbers: __tmp_reg__
-DEFUN __popcounthi2
-    XCALL __popcountqi2
-    push r24
-    mov  r24, r25
-    XCALL __popcountqi2
-    clr  r25
-    ;; FALLTHRU
-ENDF __popcounthi2
-
-DEFUN __popcounthi2_tail
-    pop   __tmp_reg__
-    add   r24, __tmp_reg__
-    ret
-ENDF __popcounthi2_tail
-#endif /* defined (L_popcounthi2) */
-
-#if defined (L_popcountsi2)
-;; population count
-;; r25:r24 = popcount32 (r25:r22)
-;; clobbers: __tmp_reg__
-DEFUN __popcountsi2
-    XCALL __popcounthi2
-    push  r24
-    mov_l r24, r22
-    mov_h r25, r23
-    XCALL __popcounthi2
-    XJMP  __popcounthi2_tail
-ENDF __popcountsi2
-#endif /* defined (L_popcountsi2) */
-
-#if defined (L_popcountdi2)
-;; population count
-;; r25:r24 = popcount64 (r25:r18)
-;; clobbers: r22, r23, __tmp_reg__
-DEFUN __popcountdi2
-    XCALL __popcountsi2
-    push  r24
-    mov_l r22, r18
-    mov_h r23, r19
-    mov_l r24, r20
-    mov_h r25, r21
-    XCALL __popcountsi2
-    XJMP  __popcounthi2_tail
-ENDF __popcountdi2
-#endif /* defined (L_popcountdi2) */
-
-#if defined (L_popcountqi2)
-;; population count
-;; r24 = popcount8 (r24)
-;; clobbers: __tmp_reg__
-DEFUN __popcountqi2
-    mov  __tmp_reg__, r24
-    andi r24, 1
-    lsr  __tmp_reg__
-    lsr  __tmp_reg__
-    adc  r24, __zero_reg__
-    lsr  __tmp_reg__
-    adc  r24, __zero_reg__
-    lsr  __tmp_reg__
-    adc  r24, __zero_reg__
-    lsr  __tmp_reg__
-    adc  r24, __zero_reg__
-    lsr  __tmp_reg__
-    adc  r24, __zero_reg__
-    lsr  __tmp_reg__
-    adc  r24, __tmp_reg__
-    ret
-ENDF __popcountqi2
-#endif /* defined (L_popcountqi2) */
-
-
-/**********************************
- * Swap bytes
- **********************************/
-
-;; swap two registers with different register number
-.macro bswap a, b
-    eor \a, \b
-    eor \b, \a
-    eor \a, \b
-.endm
-
-#if defined (L_bswapsi2)
-;; swap bytes
-;; r25:r22 = bswap32 (r25:r22)
-DEFUN __bswapsi2
-    bswap r22, r25
-    bswap r23, r24
-    ret
-ENDF __bswapsi2
-#endif /* defined (L_bswapsi2) */
-
-#if defined (L_bswapdi2)
-;; swap bytes
-;; r25:r18 = bswap64 (r25:r18)
-DEFUN __bswapdi2
-    bswap r18, r25
-    bswap r19, r24
-    bswap r20, r23
-    bswap r21, r22
-    ret
-ENDF __bswapdi2
-#endif /* defined (L_bswapdi2) */
-
-
-/**********************************
- * 64-bit shifts
- **********************************/
-
-#if defined (L_ashrdi3)
-
-#define SS __zero_reg__
-
-;; Arithmetic shift right
-;; r25:r18 = ashr64 (r25:r18, r17:r16)
-DEFUN __ashrdi3
-    sbrc    r25, 7
-    com     SS
-    ;; FALLTHRU
-ENDF  __ashrdi3
-
-;; Logic shift right
-;; r25:r18 = lshr64 (r25:r18, r17:r16)
-DEFUN __lshrdi3
-    ;; Signs are in SS (zero_reg)
-    mov     __tmp_reg__, r16
-0:  cpi     r16, 8
-    brlo 2f
-    subi    r16, 8
-    mov     r18, r19
-    mov     r19, r20
-    mov     r20, r21
-    mov     r21, r22
-    mov     r22, r23
-    mov     r23, r24
-    mov     r24, r25
-    mov     r25, SS
-    rjmp 0b
-1:  asr     SS
-    ror     r25
-    ror     r24
-    ror     r23
-    ror     r22
-    ror     r21
-    ror     r20
-    ror     r19
-    ror     r18
-2:  dec     r16
-    brpl 1b
-    clr     __zero_reg__
-    mov     r16, __tmp_reg__
-    ret
-ENDF __lshrdi3
-
-#undef SS
-
-#endif /* defined (L_ashrdi3) */
-
-#if defined (L_ashldi3)
-;; Shift left
-;; r25:r18 = ashl64 (r25:r18, r17:r16)
-;; This function does not clobber T.
-DEFUN __ashldi3
-    mov     __tmp_reg__, r16
-0:  cpi     r16, 8
-    brlo 2f
-    mov     r25, r24
-    mov     r24, r23
-    mov     r23, r22
-    mov     r22, r21
-    mov     r21, r20
-    mov     r20, r19
-    mov     r19, r18
-    clr     r18
-    subi    r16, 8
-    rjmp 0b
-1:  lsl     r18
-    rol     r19
-    rol     r20
-    rol     r21
-    rol     r22
-    rol     r23
-    rol     r24
-    rol     r25
-2:  dec     r16
-    brpl 1b
-    mov     r16, __tmp_reg__
-    ret
-ENDF __ashldi3
-#endif /* defined (L_ashldi3) */
-
-#if defined (L_rotldi3)
-;; Rotate left
-;; r25:r18 = rotl64 (r25:r18, r17:r16)
-DEFUN __rotldi3
-    push    r16
-0:  cpi     r16, 8
-    brlo 2f
-    subi    r16, 8
-    mov     __tmp_reg__, r25
-    mov     r25, r24
-    mov     r24, r23
-    mov     r23, r22
-    mov     r22, r21
-    mov     r21, r20
-    mov     r20, r19
-    mov     r19, r18
-    mov     r18, __tmp_reg__
-    rjmp 0b
-1:  lsl     r18
-    rol     r19
-    rol     r20
-    rol     r21
-    rol     r22
-    rol     r23
-    rol     r24
-    rol     r25
-    adc     r18, __zero_reg__
-2:  dec     r16
-    brpl 1b
-    pop     r16
-    ret
-ENDF __rotldi3
-#endif /* defined (L_rotldi3) */
-
-
-.section .text.libgcc.fmul, "ax", @progbits
-
-/***********************************************************/
-;;; Softmul versions of FMUL, FMULS and FMULSU to implement
-;;; __builtin_avr_fmul* if !AVR_HAVE_MUL
-/***********************************************************/
-
-#define A1 24
-#define B1 25
-#define C0 22
-#define C1 23
-#define A0 __tmp_reg__
-
-#ifdef L_fmuls
-;;; r23:r22 = fmuls (r24, r25) like in FMULS instruction
-;;; Clobbers: r24, r25, __tmp_reg__
-DEFUN __fmuls
-    ;; A0.7 = negate result?
-    mov  A0, A1
-    eor  A0, B1
-    ;; B1 = |B1|
-    sbrc B1, 7
-    neg  B1
-    XJMP __fmulsu_exit
-ENDF __fmuls
-#endif /* L_fmuls */
-
-#ifdef L_fmulsu
-;;; r23:r22 = fmulsu (r24, r25) like in FMULSU instruction
-;;; Clobbers: r24, r25, __tmp_reg__
-DEFUN __fmulsu
-    ;; A0.7 = negate result?
-    mov  A0, A1
-;; FALLTHRU
-ENDF __fmulsu
-
-;; Helper for __fmuls and __fmulsu
-DEFUN __fmulsu_exit
-    ;; A1 = |A1|
-    sbrc A1, 7
-    neg  A1
-#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
-    ;; Some cores have problem skipping 2-word instruction
-    tst  A0
-    brmi 1f
-#else
-    sbrs A0, 7
-#endif /* __AVR_HAVE_JMP_CALL__ */
-    XJMP  __fmul
-1:  XCALL __fmul
-    ;; C = -C iff A0.7 = 1
-    NEG2 C0
-    ret
-ENDF __fmulsu_exit
-#endif /* L_fmulsu */
-
-
-#ifdef L_fmul
-;;; r22:r23 = fmul (r24, r25) like in FMUL instruction
-;;; Clobbers: r24, r25, __tmp_reg__
-DEFUN __fmul
-    ; clear result
-    clr   C0
-    clr   C1
-    clr   A0
-1:  tst   B1
-    ;; 1.0 = 0x80, so test for bit 7 of B to see if A must to be added to C.
-2:  brpl  3f
-    ;; C += A
-    add   C0, A0
-    adc   C1, A1
-3:  ;; A >>= 1
-    lsr   A1
-    ror   A0
-    ;; B <<= 1
-    lsl   B1
-    brne  2b
-    ret
-ENDF __fmul
-#endif /* L_fmul */
-
-#undef A0
-#undef A1
-#undef B1
-#undef C0
-#undef C1
-
-#include "lib1funcs-fixed.S"
diff --git a/src/6502-c++.cpp b/src/6502-c++.cpp
index f13e3f9..15eabbb 100644
--- a/src/6502-c++.cpp
+++ b/src/6502-c++.cpp
@@ -16,10 +16,10 @@
 
 #include "include/6502.hpp"
 #include "include/assembly.hpp"
+#include "include/lib1funcs.hpp"
 #include "include/optimizer.hpp"
 #include "include/personalities/c64.hpp"
 
-
 int to_int(const std::string_view sv)
 {
   int result{};
@@ -875,9 +875,7 @@ std::vector<mos6502> run(const Personality &personality, std::istream &input)
 
   std::vector<AVR> instructions;
 
-  while (input.good()) {
-    std::string line;
-    getline(input, line);
+  const auto parse_line = [&](const auto &line) {
     try {
       std::smatch match;
       if (std::regex_match(line, match, Label)) {
@@ -902,6 +900,29 @@ std::vector<mos6502> run(const Personality &personality, std::istream &input)
     }
 
     ++lineno;
+  };
+
+  const auto parse_stream = [&](auto &stream) {
+    // Loop on the read result: testing good() first would parse a phantom empty line after the last read fails.
+    std::string line;
+    while (getline(stream, line)) {
+      parse_line(line);
+    }
+  };
+
+  const auto parse_string = [&](const auto &text) {
+    std::stringstream text_stream{std::string(text)};  // copy into a stream so parse_stream can consume it
+    parse_stream(text_stream);
+  };
+
+  parse_stream(input);
+
+  // Append the bundled __mulhi3 routine only when some parsed instruction's text mentions it.
+  const auto mentions_mulhi3 = [](const AVR &parsed) { return parsed.line_text.find("__mulhi3") != std::string::npos; };
+  const bool needs_mulhi3 = std::any_of(begin(instructions), end(instructions), mentions_mulhi3);
+
+  if (needs_mulhi3) {
+    parse_string(__mulhi3);
   }
 
   std::set<std::string> labels;
@@ -910,7 +931,7 @@ std::vector<mos6502> run(const Personality &personality, std::istream &input)
     if (i.type == ASMLine::Type::Label) { labels.insert(i.text); }
   }
 
-  std::set<std::string> used_labels{ "main", "__udivmodhi4", "__mulhi3" };
+  std::set<std::string> used_labels{ "main" };
 
   for (const auto &i : instructions) {
     const auto check_label = [&](const std::string &value) {
@@ -963,7 +984,6 @@ std::vector<mos6502> run(const Personality &personality, std::istream &input)
           i.text = new_labels.at(i.text);
         } catch (...) {
           spdlog::warn("Unused label: '{}', consider making function static until we remove unused functions", i.text);
-
         }
       }
     }