From aab47e38e7fa04855b5f975d634afecf18bdaad2 Mon Sep 17 00:00:00 2001
From: Stephen Heumann <stephenheumann@gmail.com>
Date: Thu, 29 Jun 2017 22:22:06 -0500
Subject: [PATCH] Compute SHA-1 message schedule 20 elements at a time, reusing
 storage.

This reduces the storage needed and boosts performance, since the whole schedule is now contained in the direct page.
---
 sha1.asm    | 10 +++++--
 sha1.macros | 86 +++++++++++++++++++++++++----------------------------
 2 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/sha1.asm b/sha1.asm
index ab09892..1ddd857 100644
--- a/sha1.asm
+++ b/sha1.asm
@@ -62,9 +62,6 @@ sha1_processchunk start
 	end
 
 SHA1_PROCESSCHUNK start
-
-	ComputeSchedule
-	
 	lda	h0
 	sta	a_
 	lda	h0+2
@@ -90,10 +87,17 @@ SHA1_PROCESSCHUNK start
 	lda	h4+2
 	sta	e+2
 
+	ComputeSchedule 1
 	ldx	#0
 	BlockLoopPart 1
+	ComputeSchedule 2
+	ldx	#0
 	BlockLoopPart 2
+	ComputeSchedule 3
+	ldx	#0
 	BlockLoopPart 3
+	ComputeSchedule 4
+	ldx	#0
 	BlockLoopPart 4
 
 endloop clc
diff --git a/sha1.macros b/sha1.macros
index 35f9759..daf4ad8 100644
--- a/sha1.macros
+++ b/sha1.macros
@@ -105,81 +105,72 @@ dorotl
 * Macros to operate on elements of the message schedule (W)
 	macro
 &lab	lda_w	&i,&inc
+	lcla	&j
+&j	seta	&i
+.modloop1
+	aif	&j<20,.goodidx1
+&j	seta	&j-20
+	ago	.modloop1
+.goodidx1
 	aif	C:&inc<>0,.haveinc
 	lcla	&inc
 .haveinc
-	aif	w+(&i)*4+&inc>255,.bigidx
-&lab	lda	w+(&i)*4+&inc
-	ago	.end
-.bigidx
-&lab	ldx	#((&i)-16)*4+&inc
-	lda	w+16*4,x
-.end
+&lab	lda	w+(&j)*4+&inc
 	mend
 
 	macro
 &lab	eor_w	&i,&inc
+	lcla	&j
+&j	seta	&i
+.modloop2
+	aif	&j<20,.goodidx2
+&j	seta	&j-20
+	ago	.modloop2
+.goodidx2
 	aif	C:&inc<>0,.haveinc
 	lcla	&inc
 .haveinc
-	aif	w+(&i)*4+&inc>255,.bigidx
-&lab	eor	w+(&i)*4+&inc
-	ago	.end
-.bigidx
-&lab	ldx	#((&i)-16)*4+&inc
-	eor	w+16*4,x
-.end
+&lab	eor	w+(&j)*4+&inc
 	mend
 
 	macro
 &lab	sta_w	&i,&inc
+	lcla	&j
+&j	seta	&i
+.modloop3
+	aif	&j<20,.goodidx3
+&j	seta	&j-20
+	ago	.modloop3
+.goodidx3
 	aif	C:&inc<>0,.haveinc
 	lcla	&inc
 .haveinc
-	aif	w+(&i)*4+&inc>255,.bigidx
-&lab	sta	w+(&i)*4+&inc
-	ago	.end
-.bigidx
-&lab	ldx	#((&i)-16)*4+&inc
-	sta	w+16*4,x
-.end
-	mend
-
-	macro
-&lab	inc_w	&i,&inc
-	aif	C:&inc<>0,.haveinc
-	lcla	&inc
-.haveinc
-	aif	w+(&i)*4+&inc>255,.bigidx
-&lab	inc	w+(&i)*4+&inc
-	ago	.end
-.bigidx
-&lab	ldx	#((&i)-16)*4+&inc
-	inc	w+16*4,x
-.end
+&lab	sta	w+(&j)*4+&inc
 	mend
 
 	macro
 &lab	rol_w	&i,&inc
+	lcla	&j
+&j	seta	&i
+.modloop4
+	aif	&j<20,.goodidx4
+&j	seta	&j-20
+	ago	.modloop4
+.goodidx4
 	aif	C:&inc<>0,.haveinc
 	lcla	&inc
 .haveinc
-	aif	w+(&i)*4+&inc>255,.bigidx
-&lab	rol	w+(&i)*4+&inc
-	ago	.end
-.bigidx
-&lab	ldx	#((&i)-16)*4+&inc
-	rol	w+16*4,x
-.end
+&lab	rol	w+(&j)*4+&inc
 	mend
 
 
-* Compute the message schedule (W_0 to W_79) 
+* Compute one part of the message schedule (20 elements)
 	macro
-	ComputeSchedule
+	ComputeSchedule &part
 	lcla	&i
 
 ; Flip the endianness of W_0 to W_15 (the current chunk of the message)
+	aif	&part<>1,.skippart1
 .loop1
 	lda	w+&i*4
 	xba
@@ -190,8 +181,11 @@ dorotl
 	sta	w+&i*4
 &i	seta	&i+1
 	aif	&i<16,.loop1
+.skippart1
 
 ; compute the rest of the message schedule (W_16 to W_79)
+	aif	&part=1,.loop2
+&i	seta	(&part-1)*20
 .loop2
 	lda_w	&i-3
 	eor_w	&i-8
@@ -210,7 +204,7 @@ dorotl
 	rol_w	&i
 	
 &i	seta	&i+1
-	aif	&i<80,.loop2
+	aif	&i<&part*20,.loop2
 	mend
 
 
@@ -344,7 +338,7 @@ loop&part anop
 	inx
 	inx
 	inx
-	cpx	#&part*20*4
+	cpx	#20*4
 	bge	endloop&part
 	jmp	loop&part
 endloop&part anop