From 7182fc581797f037d85e48e62c69913f8b5b2d74 Mon Sep 17 00:00:00 2001
From: Stephen Heumann <stephenheumann@gmail.com>
Date: Thu, 29 Jun 2017 20:51:36 -0500
Subject: [PATCH] Use separate loops for the four parts of the SHA-1
 computation.

---
 sha1.asm    | 136 +-------------------------------
 sha1.macros | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 224 insertions(+), 132 deletions(-)

diff --git a/sha1.asm b/sha1.asm
index 52e27b5..ab09892 100644
--- a/sha1.asm
+++ b/sha1.asm
@@ -1,6 +1,5 @@
 	case	on
 	mcopy	sha1.macros
-	mcopy	rotate.macros
 
 * Direct page locations	
 ;chunk	gequ	0	; 8 bytes
@@ -92,137 +91,10 @@ SHA1_PROCESSCHUNK start
 	sta	e+2
 
 	ldx	#0
-loop	anop
-	ROTL4MOVE temp,a_,5
-	stx	idx
-	cpx	#60*4
-	bge	f_60
-	cpx	#40*4
-	bge	f_40
-	cpx	#20*4
-	bge	f_20
-
-* f_0 to f_19
-f_0	lda	c
-	eor	d
-	and	b
-	eor	d
-	clc
-	adc	#$7999
-	sta	f_plus_k
-	
-	lda	c+2
-	eor	d+2
-	and	b+2
-	eor	d+2
-	adc	#$5A82
-	sta	f_plus_k+2
-	bra	after_f
-
-* f_20 to f_39
-f_20	lda	b
-	eor	c
-	eor	d
-	clc
-	adc	#$EBA1
-	sta	f_plus_k
-	
-	lda	b+2
-	eor	c+2
-	eor	d+2
-	adc	#$6ED9
-	sta	f_plus_k+2
-	bra	after_f
-
-* f_40 to f_59
-f_40	lda	c
-	ora	d
-	and	b
-	sta	f40temp
-	lda	c
-	and	d
-	ora	f40temp
-	clc
-	adc	#$BCDC
-	sta	f_plus_k
-	
-	lda	c+2
-	ora	d+2
-	and	b+2
-	sta	f40temp
-	lda	c+2
-	and	d+2
-	ora	f40temp
-	adc	#$8F1B
-	sta	f_plus_k+2
-	bra	after_f
-
-* f_60 to f_79
-f_60	lda	b
-	eor	c
-	eor	d
-	clc
-	adc	#$C1D6
-	sta	f_plus_k
-	
-	lda	b+2
-	eor	c+2
-	eor	d+2
-	adc	#$CA62
-	sta	f_plus_k+2
-
-after_f	anop
-	ldx	idx
-	clc
-	lda	w,x
-	adc	temp
-	tay
-	lda	w+2,x
-	adc	temp+2
-	tax
-	clc
-	tya
-	adc	e
-	tay
-	txa
-	adc	e+2
-	tax
-	clc
-	tya
-	adc	f_plus_k
-	tay
-	txa
-	adc	f_plus_k+2
-	tax
-
-	lda	d
-	sta	e
-	lda	d+2
-	sta	e+2
-
-	lda	c
-	sta	d
-	lda	c+2
-	sta	d+2
-
-	ROTL4MOVE c,b,30
-	
-	lda	a_
-	sta	b
-	lda	a_+2
-	sta	b+2
-	
-	sty	a_
-	stx	a_+2
-
-	ldx	idx
-	inx
-	inx
-	inx
-	inx
-	cpx	#80*4
-	bge	endloop
-	jmp	loop
+	BlockLoopPart 1
+	BlockLoopPart 2
+	BlockLoopPart 3
+	BlockLoopPart 4
 
 endloop clc
 	lda	h0
diff --git a/sha1.macros b/sha1.macros
index 6fe9cb2..35f9759 100644
--- a/sha1.macros
+++ b/sha1.macros
@@ -1,3 +1,86 @@
+* Right-rotate 32-bit value in &loc (DP or 16-bit address) in place by &n positions (1..16 unrolled inline; &n>16 is emitted as ROTL4 by 32-&n; loop body is emitted before &i is tested, so &n must be >= 1). Clobbers A and carry; assumes 16-bit A/memory mode.
+	macro
+	ROTR4	&loc,&n
+	aif	&n>16,.dorotl
+	lda	&loc+2		;A = high word, used only to feed carry below
+	lcla	&i
+&i	seta	&n
+.rotrloop
+	lsr	a		;carry := next low bit of the high word
+	ror     &loc		;low word: carry in at bit 15, bit 0 out to carry
+	ror     &loc+2		;high word: old low-word bit 0 in at bit 15
+&i	seta	&i-1
+	aif	&i>0,.rotrloop
+	ago	.end
+.dorotl
+	ROTL4	&loc,32-&n
+.end
+	mend
+
+* Left-rotate 32-bit value in &loc (DP or 16-bit address) in place by &n positions (1..16 unrolled inline; &n>16 is emitted as ROTR4 by 32-&n; loop body is emitted before &i is tested, so &n must be >= 1). Clobbers A and carry; assumes 16-bit A/memory mode.
+	macro
+	ROTL4	&loc,&n
+	aif	&n>16,.dorotr
+        lda     &loc		;A = low word, used only to feed carry below
+	lcla	&i
+&i	seta	&n
+.rotlloop
+        asl     a		;carry := next high bit of the low word
+        rol     &loc+2		;high word: carry in at bit 0, bit 15 out to carry
+        rol     &loc		;low word: old high-word bit 15 in at bit 0
+&i	seta	&i-1
+	aif	&i>0,.rotlloop
+	ago	.end
+.dorotr
+	ROTR4	&loc,32-&n
+.end
+	mend
+
+* &to := &from ROTR4 &n  (copies the 32-bit value, then right-rotates &to; 1..16 unrolled inline, &n>16 is emitted as ROTL4MOVE by 32-&n; loop body is emitted before &i is tested, so &n must be >= 1). Clobbers A and carry; assumes 16-bit A/memory mode.
+	macro
+	ROTR4MOVE &to,&from,&n
+	aif	&n>16,.dorotl
+        lda     &from
+        sta     &to
+        lda     &from+2
+        sta     &to+2		;A keeps the high word to feed carry below
+	lcla	&i
+&i	seta	&n
+.rotrloop
+	lsr	a		;carry := next low bit of the high word
+	ror     &to		;low word: carry in at bit 15, bit 0 out to carry
+	ror     &to+2		;high word: old low-word bit 0 in at bit 15
+&i	seta	&i-1
+	aif	&i>0,.rotrloop
+	ago	.end
+.dorotl
+	ROTL4MOVE &to,&from,32-&n
+.end
+	mend
+
+* &to := &from ROTL4 &n  (copies the 32-bit value, then left-rotates &to; 1..16 unrolled inline, &n>16 is emitted as ROTR4MOVE by 32-&n; loop body is emitted before &i is tested, so &n must be >= 1). Clobbers A and carry; assumes 16-bit A/memory mode.
+	macro
+	ROTL4MOVE &to,&from,&n
+	aif	&n>16,.dorotr
+        lda     &from+2
+        sta     &to+2
+        lda     &from
+        sta     &to		;A keeps the low word to feed carry below
+	lcla	&i
+&i	seta	&n
+.rotlloop
+        asl     a		;carry := next high bit of the low word
+        rol     &to+2		;high word: carry in at bit 0, bit 15 out to carry
+        rol     &to		;low word: old high-word bit 15 in at bit 0
+&i	seta	&i-1
+	aif	&i>0,.rotlloop
+	ago	.end
+.dorotr
+	ROTR4MOVE &to,&from,32-&n
+.end
+	mend
+
+
 * This makes a function wrapper that is callable from C,
 * taking a pointer to the context structure as its argument.
 	macro
@@ -130,3 +213,140 @@
 	aif	&i<80,.loop2
 	mend
 
+
+* One part of the loop for processing blocks (&part is 1, 2, 3, or 4). X enters holding the byte offset into w (advanced by 4 per round); only the f/K code selected by &part is assembled; the loop runs until X reaches &part*20*4 and then falls through with X still holding that offset. Uses A, X, Y, idx, temp, f_plus_k (and f40temp for part 3).
+	macro
+	BlockLoopPart &part
+	
+loop&part anop
+	stx	idx		;save round offset: X is reused as a temporary below
+	ROTL4MOVE temp,a_,5
+
+* f_0 to f_19: f = d eor (b and (c eor d)), K = $5A827999
+	aif	&part<>1,.skip1
+	lda	c
+	eor	d
+	and	b
+	eor	d
+	clc
+	adc	#$7999		;low word of K
+	sta	f_plus_k
+	
+	lda	c+2
+	eor	d+2
+	and	b+2
+	eor	d+2
+	adc	#$5A82		;high word of K, plus carry from the low-word add
+	sta	f_plus_k+2
+.skip1
+
+* f_20 to f_39: f = b eor c eor d, K = $6ED9EBA1
+	aif	&part<>2,.skip2
+	lda	b
+	eor	c
+	eor	d
+	clc
+	adc	#$EBA1		;low word of K
+	sta	f_plus_k
+	
+	lda	b+2
+	eor	c+2
+	eor	d+2
+	adc	#$6ED9		;high word of K, plus carry from the low-word add
+	sta	f_plus_k+2
+.skip2
+
+* f_40 to f_59: f = (b and (c or d)) or (c and d), K = $8F1BBCDC
+	aif	&part<>3,.skip3
+	lda	c
+	ora	d
+	and	b
+	sta	f40temp		;b and (c or d), low word
+	lda	c
+	and	d
+	ora	f40temp
+	clc
+	adc	#$BCDC		;low word of K
+	sta	f_plus_k
+	
+	lda	c+2
+	ora	d+2
+	and	b+2
+	sta	f40temp		;b and (c or d), high word (sta leaves carry intact)
+	lda	c+2
+	and	d+2
+	ora	f40temp
+	adc	#$8F1B		;high word of K, plus carry from the low-word add
+	sta	f_plus_k+2
+.skip3
+
+* f_60 to f_79: f = b eor c eor d, K = $CA62C1D6
+	aif	&part<>4,.skip4
+	lda	b
+	eor	c
+	eor	d
+	clc
+	adc	#$C1D6		;low word of K
+	sta	f_plus_k
+	
+	lda	b+2
+	eor	c+2
+	eor	d+2
+	adc	#$CA62		;high word of K, plus carry from the low-word add
+	sta	f_plus_k+2
+.skip4
+
+	ldx	idx		;X = round offset again
+	clc
+	lda	w,x		;low word of the w entry at offset X
+	adc	temp		;+ temp (a_ rotated left by 5), low word
+	tay
+	lda	w+2,x		;high word of the same w entry
+	adc	temp+2		;carry continues from the low-word add
+	tax
+	clc
+	tya
+	adc	e		;+ e, low word (running sum kept in Y)
+	tay
+	txa
+	adc	e+2		;+ e, high word (running sum kept in X)
+	tax
+	clc
+	tya
+	adc	f_plus_k	;+ f + K, low word
+	tay
+	txa
+	adc	f_plus_k+2	;+ f + K, high word; 32-bit sum is now X:Y
+	tax
+
+	lda	d
+	sta	e		;e := d
+	lda	d+2
+	sta	e+2
+
+	lda	c
+	sta	d		;d := c
+	lda	c+2
+	sta	d+2
+
+	ROTL4MOVE c,b,30
+	
+	lda	a_
+	sta	b		;b := a_
+	lda	a_+2
+	sta	b+2
+	
+	sty	a_		;a_ := new sum, low word
+	stx	a_+2		;a_ := new sum, high word
+
+	ldx	idx		;reload round offset (X held the sum high word)
+	inx
+	inx
+	inx
+	inx
+	cpx	#&part*20*4	;end offset of this part (20 rounds of 4 bytes per part)
+	bge	endloop&part	;reached the end of this part
+	jmp	loop&part	;NOTE(review): jmp presumably because the body is out of short-branch range
+endloop&part anop
+	mend
+