Unroll SHA-1 loop with variables substituted to avoid cycling values around.

SHA-1 has five state variables (a,b,c,d,e), and each iteration of the core loop cycles their values around (e.g. a->b), in some cases with modifications. By unrolling the loop by a factor of five and appropriately substituting the values operated on in each iteration, we can avoid actually doing the copies implied by this cycling, which gives a fairly significant performance gain.
2025-02-19 17:30:33 +00:00 · 2017-06-30 00:15:47 -05:00 · 2017-06-30 00:15:47 -05:00 · 2d8c85bd98
commit 2d8c85bd98
parent aab47e38e7
1 changed files with 74 additions and 76 deletions
--- a/sha1.macros
+++ b/sha1.macros
@ -20,64 +20,64 @@
 * Left-rotate 32-bit value in &loc (DP or 16-bit address) by &n positions
 	macro
 	ROTL4	&loc,&n
-	aif	&n>16,.dorotr
+	aif	&n>16,.dorotr2
        lda     &loc		;A is a scratch copy of &loc, used only to produce the carry
 	lcla	&i
 &i	seta	&n
-.rotlloop
+.rotlloop2
        asl     a		;carry := top bit of A (A is shifted in step with &loc)
        rol     &loc+2		;carry in -> bit 0 of &loc+2; carry := old top bit of &loc+2
        rol     &loc		;that bit wraps around into bit 0 of &loc
 &i	seta	&i-1
-	aif	&i>0,.rotlloop
-	ago	.end
-.dorotr
+	aif	&i>0,.rotlloop2
+	ago	.end2
+.dorotr2
 	ROTR4	&loc,32-&n
-.end
+.end2
 	mend

 * &to := &from ROTR4 &n
 	macro
 	ROTR4MOVE &to,&from,&n
-	aif	&n>16,.dorotl
+	aif	&n>16,.dorotl3
        lda     &from
        sta     &to
        lda     &from+2
        sta     &to+2
 	lcla	&i
 &i	seta	&n
-.rotrloop
+.rotrloop3
 	lsr	a		;carry := bit 0 of A (A is a copy of &from+2, shifted in step with &to+2)
 	ror     &to
 	ror     &to+2
 &i	seta	&i-1
-	aif	&i>0,.rotrloop
-	ago	.end
-dorotl
+	aif	&i>0,.rotrloop3
+	ago	.end3
+.dorotl3
 	ROTL4MOVE &to,&from,32-&n
-.end
+.end3
 	mend

 * &to := &from ROTL4 &n
 	macro
 	ROTL4MOVE &to,&from,&n
-	aif	&n>16,.dorotr
+	aif	&n>16,.dorotr4
        lda     &from+2
        sta     &to+2
        lda     &from		;loaded last so A keeps a scratch copy of this word
        sta     &to
 	lcla	&i
 &i	seta	&n
-.rotlloop
+.rotlloop4
        asl     a		;carry := top bit of A (A is shifted in step with &to)
        rol     &to+2		;carry in -> bit 0 of &to+2; carry := old top bit of &to+2
        rol     &to		;that bit wraps around into bit 0 of &to
 &i	seta	&i-1
-	aif	&i>0,.rotlloop
-	ago	.end
-.dorotr
+	aif	&i>0,.rotlloop4
+	ago	.end4
+.dorotr4
 	ROTR4MOVE &to,&from,32-&n
-.end
+.end4
 	mend


@ -208,67 +208,68 @@ dorotl
 	mend


-* One part of the loop for processing blocks (&part is 1, 2, 3, or 4)
+
+* One iteration of the loop for processing blocks.
+* The a,b,c,d,e variables are given as parameters so we can avoid cycling them.
 	macro
-	BlockLoopPart &part
-	
-loop&part anop
+	BlockLoopIter &a,&b,&c,&d,&e,&part
+
 	stx	idx
-	ROTL4MOVE temp,a_,5
+	ROTL4MOVE temp,&a,5

 * f_0 to f_19
 	aif	&part<>1,.skip1
-	lda	c
-	eor	d
-	and	b
-	eor	d
+	lda	&c
+	eor	&d
+	and	&b
+	eor	&d
 	clc
 	adc	#$7999
 	sta	f_plus_k
 	
-	lda	c+2
-	eor	d+2
-	and	b+2
-	eor	d+2
+	lda	&c+2
+	eor	&d+2
+	and	&b+2
+	eor	&d+2
 	adc	#$5A82
 	sta	f_plus_k+2
 .skip1

 * f_20 to f_39
 	aif	&part<>2,.skip2
-	lda	b
-	eor	c
-	eor	d
+	lda	&b
+	eor	&c
+	eor	&d
 	clc
 	adc	#$EBA1
 	sta	f_plus_k
 	
-	lda	b+2
-	eor	c+2
-	eor	d+2
+	lda	&b+2
+	eor	&c+2
+	eor	&d+2
 	adc	#$6ED9
 	sta	f_plus_k+2
 .skip2

 * f_40 to f_59
 	aif	&part<>3,.skip3
-	lda	c
-	ora	d
-	and	b
+	lda	&c
+	ora	&d
+	and	&b
 	sta	f40temp
-	lda	c
-	and	d
+	lda	&c
+	and	&d
 	ora	f40temp
 	clc
 	adc	#$BCDC
 	sta	f_plus_k
 	
-	lda	c+2
-	ora	d+2
-	and	b+2
+	lda	&c+2
+	ora	&d+2
+	and	&b+2
 	sta	f40temp
-	lda	c+2
-	and	d+2
+	lda	&c+2
+	and	&d+2
 	ora	f40temp
 	adc	#$8F1B
 	sta	f_plus_k+2
@ -276,16 +277,16 @@ loop&part anop

 * f_60 to f_79
 	aif	&part<>4,.skip4
-	lda	b
-	eor	c
-	eor	d
+	lda	&b
+	eor	&c
+	eor	&d
 	clc
 	adc	#$C1D6
 	sta	f_plus_k
 	
-	lda	b+2
-	eor	c+2
-	eor	d+2
+	lda	&b+2
+	eor	&c+2
+	eor	&d+2
 	adc	#$CA62
 	sta	f_plus_k+2
 .skip4
@ -300,44 +301,41 @@ loop&part anop
 	tax
 	clc
 	tya
-	adc	e
+	adc	&e
 	tay
 	txa
-	adc	e+2
+	adc	&e+2
 	tax
 	clc
 	tya
 	adc	f_plus_k
-	tay
+	sta	&e
 	txa
 	adc	f_plus_k+2
-	tax
+	sta	&e+2

-	lda	d
-	sta	e
-	lda	d+2
-	sta	e+2
-
-	lda	c
-	sta	d
-	lda	c+2
-	sta	d+2
-
-	ROTL4MOVE c,b,30
+	ROTL4	&b,30
 	
-	lda	a_
-	sta	b
-	lda	a_+2
-	sta	b+2
-	
-	sty	a_
-	stx	a_+2
-
 	ldx	idx
 	inx
 	inx
 	inx
 	inx
+	mend
+
+
+* One part of the loop for processing blocks (20 iterations)
+	macro
+	BlockLoopPart &part
+	
+loop&part anop
+	
+	BlockLoopIter a_,b,c,d,e,&part
+	BlockLoopIter e,a_,b,c,d,&part
+	BlockLoopIter d,e,a_,b,c,&part
+	BlockLoopIter c,d,e,a_,b,&part
+	BlockLoopIter b,c,d,e,a_,&part
+
 	cpx	#20*4
 	bge	endloop&part
 	jmp	loop&part