* Copyright (c) 2017 Stephen Heumann
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.


* Right-rotate 32-bit value in &loc (DP or 16-bit address) by &n positions.
* The low word is at &loc and the high word at &loc+2.
* &n must be an assembly-time constant in the range 0..31:
*   0      emits no code at all,
*   1..16  emits one lsr/ror/ror group per bit position,
*   17..31 is emitted as the equivalent left rotation (ROTL4 by 32-&n),
*          which needs fewer instructions.
* Any code emitted changes A and the N, Z and C flags.
	macro
	ROTR4	&loc,&n
	aif	&n=0,.end
	aif	&n>16,.dorotl
	lda	&loc+2
	lcla	&i
&i	seta	&n
.rotrloop
	lsr	a		;carry := bit 16, which moves into the low word
	ror	&loc
	ror	&loc+2
&i	seta	&i-1
	aif	&i>0,.rotrloop
	ago	.end
.dorotl
; Evaluate the complementary count here so that the nested macro gets
; a plain number even when the count operand is an expression.
	lcla	&cnt
&cnt	seta	32-(&n)
	ROTL4	&loc,&cnt
.end
	mend

* Left-rotate 32-bit value in &loc (DP or 16-bit address) by &n positions.
* The low word is at &loc and the high word at &loc+2.
* &n must be an assembly-time constant in the range 0..31:
*   0      emits no code at all,
*   1..16  emits one asl/rol/rol group per bit position,
*   17..31 is emitted as the equivalent right rotation (ROTR4 by 32-&n),
*          which needs fewer instructions.
* Any code emitted changes A and the N, Z and C flags.
	macro
	ROTL4	&loc,&n
	aif	&n=0,.end2
	aif	&n>16,.dorotr2
	lda	&loc
	lcla	&i
&i	seta	&n
.rotlloop2
	asl	a		;carry := bit 15, which moves into the high word
	rol	&loc+2
	rol	&loc
&i	seta	&i-1
	aif	&i>0,.rotlloop2
	ago	.end2
.dorotr2
; Evaluate the complementary count here so that the nested macro gets
; a plain number even when the count operand is an expression.
	lcla	&cnt
&cnt	seta	32-(&n)
	ROTR4	&loc,&cnt
.end2
	mend

* &to := &from ROTR4 &n
* Copies the 32-bit value at &from (low word first) to &to and then
* right-rotates the copy in place; &from itself is not written.
* &n must be an assembly-time constant in the range 0..31:
*   0      emits only the copy,
*   1..16  emits the copy plus one lsr/ror/ror group per bit position,
*   17..31 is emitted as ROTL4MOVE by 32-&n instead.
* Changes A and the N, Z and C flags.
	macro
	ROTR4MOVE &to,&from,&n
	aif	&n>16,.dorotl3
	lda	&from
	sta	&to
	lda	&from+2
	sta	&to+2
	aif	&n=0,.end3
	lcla	&i
&i	seta	&n
.rotrloop3
; A still holds the high word from the copy, so its bit 0 is bit 16.
	lsr	a		;carry := bit 16, which moves into the low word
	ror	&to
	ror	&to+2
&i	seta	&i-1
	aif	&i>0,.rotrloop3
	ago	.end3
.dorotl3
; Evaluate the complementary count here so that the nested macro gets
; a plain number even when the count operand is an expression.
	lcla	&cnt
&cnt	seta	32-(&n)
	ROTL4MOVE &to,&from,&cnt
.end3
	mend

* &to := &from ROTL4 &n
* Copies the 32-bit value at &from (high word first) to &to and then
* left-rotates the copy in place; &from itself is not written.
* &n must be an assembly-time constant in the range 0..31:
*   0      emits only the copy,
*   1..16  emits the copy plus one asl/rol/rol group per bit position,
*   17..31 is emitted as ROTR4MOVE by 32-&n instead.
* Changes A and the N, Z and C flags.
	macro
	ROTL4MOVE &to,&from,&n
	aif	&n>16,.dorotr4
	lda	&from+2
	sta	&to+2
	lda	&from
	sta	&to
	aif	&n=0,.end4
	lcla	&i
&i	seta	&n
.rotlloop4
; A still holds the low word from the copy, so its bit 15 is bit 15.
	asl	a		;carry := bit 15, which moves into the high word
	rol	&to+2
	rol	&to
&i	seta	&i-1
	aif	&i>0,.rotlloop4
	ago	.end4
.dorotr4
; Evaluate the complementary count here so that the nested macro gets
; a plain number even when the count operand is an expression.
	lcla	&cnt
&cnt	seta	32-(&n)
	ROTR4MOVE &to,&from,&cnt
.end4
	mend


* This makes a function wrapper that is callable from C,
* taking a pointer to the context structure as its argument.
; The wrapper loads the direct page register from the first word of the
; pointer argument, removes all 4 argument bytes from the stack, calls
; &fn with JSL, then restores the previous direct page and returns with
; RTL.  The second word of the pointer is pulled and thrown away, so
; the context presumably has to be usable as a direct page (bank 0) -
; TODO confirm against the callers.
; NOTE(review): the pull sizes below assume 16-bit A/X/Y (native mode)
; and entry via JSL with the argument directly above the 3-byte return
; address - confirm.
	macro
	CFunction &fn
; Lift the 3-byte return address off the stack: push B to pad it to
; 4 bytes, then pull the 4 bytes as two words into X and Y.
	phb
	plx
	ply
; A = direct page register on entry; D = first word of the argument.
	tdc
	pld
; Pull the remaining two argument bytes.  B is overwritten by these
; pulls but is reloaded from the saved copy just below.
	plb
	plb
; Put the return address back and reload B with its entry value.
	phy
	phx
	plb
; Keep the entry direct page on the stack across the call.
	pha
	jsl	&fn
	pld
	rtl
	mend


* Macros to operate on elements of the message schedule (W)
*
* lda_w: load A from one 16-bit half of schedule element &i.
* The schedule lives at label w as 4-byte elements and is addressed as a
* 20-element circular buffer: 20 is subtracted from &i until the result
* is below 20.  &inc is an optional byte offset inside the element; when
* it is omitted a local arithmetic symbol of that name is declared so the
* offset term still assembles.  &lab, if present, labels the instruction.
	macro
&lab	lda_w	&i,&inc
	lcla	&slot
&slot	seta	&i
	ago	.chk1
.wrap1
&slot	seta	&slot-20
.chk1
	aif	&slot>19,.wrap1
	aif	C:&inc<>0,.emit1
	lcla	&inc
.emit1
&lab	lda	w+(&slot)*4+&inc
	mend

	macro
&lab	eor_w	&i,&inc
; XOR A with one 16-bit half of a message schedule element.
; The element index is reduced by 20 until it is below 20 (the
; schedule at w is used as a 20-element circular buffer).  The
; optional second operand is a byte offset inside the element.
	lcla	&slot
&slot	seta	&i
	ago	.chk2
.wrap2
&slot	seta	&slot-20
.chk2
	aif	&slot>19,.wrap2
	aif	C:&inc<>0,.emit2
	lcla	&inc
.emit2
&lab	eor	w+(&slot)*4+&inc
	mend

	macro
&lab	sta_w	&i,&inc
; Store A into one 16-bit half of a message schedule element.
; The element index is reduced by 20 until it is below 20 (the
; schedule at w is used as a 20-element circular buffer).  The
; optional second operand is a byte offset inside the element.
	lcla	&slot
&slot	seta	&i
	ago	.chk3
.wrap3
&slot	seta	&slot-20
.chk3
	aif	&slot>19,.wrap3
	aif	C:&inc<>0,.emit3
	lcla	&inc
.emit3
&lab	sta	w+(&slot)*4+&inc
	mend

	macro
&lab	rol_w	&i,&inc
; Rotate one 16-bit half of a message schedule element left through
; carry, in memory.  The element index is reduced by 20 until it is
; below 20 (the schedule at w is used as a 20-element circular
; buffer).  The optional second operand is a byte offset inside the
; element.
	lcla	&slot
&slot	seta	&i
	ago	.chk4
.wrap4
&slot	seta	&slot-20
.chk4
	aif	&slot>19,.wrap4
	aif	C:&inc<>0,.emit4
	lcla	&inc
.emit4
&lab	rol	w+(&slot)*4+&inc
	mend


* Compute one part of the message schedule (20 elements).
* &part is an assembly-time constant selecting the group of 20 elements
* to generate: part 1 produces W_0..W_19, and part p in general produces
* W_(p-1)*20 .. W_p*20-1.  Results are written through sta_w/rol_w, i.e.
* into the 20-element circular buffer at w, replacing the previous part.
* For part 1 the first 16 elements are the message block already stored
* at w; they are byte-swapped in place and only W_16..W_19 are computed.
* The expansion is fully unrolled.  Changes A and flags (and X in part 1).
	macro
	ComputeSchedule &part
	lcla	&t

	aif	&part=1,.swap
&t	seta	(&part-1)*20
	ago	.expand

.swap
; Part 1 only: reverse the byte order of W_0 to W_15 (the current block
; of the message).  Each 16-bit half is byte-swapped and the two halves
; trade places.
	ldx	w+&t*4+2
	lda	w+&t*4
	xba
	sta	w+&t*4+2
	txa
	xba
	sta	w+&t*4
&t	seta	&t+1
	aif	&t<16,.swap

.expand
; W_t = (W_t-3 xor W_t-8 xor W_t-14 xor W_t-16) rotated left by one.
; The low half is stored unrotated first; asl then leaves its top bit
; in carry for the high half.
	lda_w	&t-16
	eor_w	&t-14
	eor_w	&t-8
	eor_w	&t-3
	sta_w	&t
	asl	a

; High half: rol shifts the carry in and leaves bit 31 in carry.
	lda_w	&t-16,2
	eor_w	&t-14,2
	eor_w	&t-8,2
	eor_w	&t-3,2
	rol	a
	sta_w	&t,2

; Finish the low half in memory, shifting bit 31 into bit 0.
	rol_w	&t

&t	seta	&t+1
	aif	&t<&part*20,.expand
	mend


* One iteration of the loop for processing blocks.
* The a,b,c,d,e variables are given as parameters so we can avoid cycling them.
; The generated code works on 32-bit values held as two 16-bit words
; (low word first) and computes
;   &e := ROTL(&a,5) + W + &e + f(&b,&c,&d) + k
;   &b := ROTL(&b,30)
; where W is the schedule element at w+&iter*4 indexed by the run-time
; byte offset in idx.  The new value is left in &e; the invoking macro
; permutes the argument order on each call instead of moving variables.
; f and k are chosen at assembly time by &part (1 to 4).  &part is not a
; parameter of this macro.  NOTE(review): it appears to be picked up from
; the invoking macro (BlockLoopPart) - confirm this scoping is intended.
; Uses temp and f_plus_k as scratch; changes A, X, Y and flags.
	macro
	BlockLoopIter &a,&b,&c,&d,&e,&iter

* f_0 to f_19
; f = ((c xor d) and b) xor d, then the constant $5A827999 is added.
; Only the low-word add is preceded by clc: the logical operations and
; the load/store in between leave carry alone, so the high-word adc
; picks up the carry out of the low word.  The same pattern is used in
; all four variants below.
	aif	&part<>1,.skip1
	lda	&c
	eor	&d
	and	&b
	eor	&d
	clc
	adc	#$7999
	sta	f_plus_k
	
	lda	&c+2
	eor	&d+2
	and	&b+2
	eor	&d+2
	adc	#$5A82
	sta	f_plus_k+2
.skip1

* f_20 to f_39
; f = b xor c xor d, then the constant $6ED9EBA1 is added.
	aif	&part<>2,.skip2
	lda	&b
	eor	&c
	eor	&d
	clc
	adc	#$EBA1
	sta	f_plus_k
	
	lda	&b+2
	eor	&c+2
	eor	&d+2
	adc	#$6ED9
	sta	f_plus_k+2
.skip2

* f_40 to f_59
; f = ((c or d) and b) or (c and d), then the constant $8F1BBCDC is
; added.  temp holds the first term of each half while the second is
; formed.
	aif	&part<>3,.skip3
	lda	&c
	ora	&d
	and	&b
	sta	temp
	lda	&c
	and	&d
	ora	temp
	clc
	adc	#$BCDC
	sta	f_plus_k
	
	lda	&c+2
	ora	&d+2
	and	&b+2
	sta	temp
	lda	&c+2
	and	&d+2
	ora	temp
	adc	#$8F1B
	sta	f_plus_k+2
.skip3

* f_60 to f_79
; f = b xor c xor d, then the constant $CA62C1D6 is added.
	aif	&part<>4,.skip4
	lda	&b
	eor	&c
	eor	&d
	clc
	adc	#$C1D6
	sta	f_plus_k
	
	lda	&b+2
	eor	&c+2
	eor	&d+2
	adc	#$CA62
	sta	f_plus_k+2
.skip4

; temp = a rotated left by 5.  The three 32-bit additions that follow
; keep the running sum in registers: Y holds the low word and X the
; high word.  X is first used as the index from idx and is only reused
; for the sum after the last indexed load.  Each addition starts with
; clc; tay/tya/txa/lda/sta do not disturb the carry between halves.
	ROTL4MOVE temp,&a,5
	ldx	idx
	clc
	lda	w+&iter*4,x
	adc	temp
	tay
	lda	w+&iter*4+2,x
	adc	temp+2
	tax
	clc
	tya
	adc	&e
	tay
	txa
	adc	&e+2
	tax
	clc
	tya
	adc	f_plus_k
	sta	&e
	txa
	adc	f_plus_k+2
	sta	&e+2

; b = b rotated left by 30, in place.
	ROTL4	&b,30
	
	mend


* One part of the loop for processing blocks (20 iterations).
* Emits a run-time loop, labelled loop&part ... endloop&part, whose body
* is five unrolled BlockLoopIter expansions with the five working
* variables permuted so that no values need to be moved between them.
* idx is the byte offset of the current group of five schedule elements;
* it starts at 0 and advances by 20 bytes per pass, giving four passes.
* On exit idx still holds the offset of the last pass (60) and A holds 80.
	macro
	BlockLoopPart &part

	stz	idx
loop&part anop

	BlockLoopIter a_,b,c,d,e,0
	BlockLoopIter e,a_,b,c,d,1
	BlockLoopIter d,e,a_,b,c,2
	BlockLoopIter c,d,e,a_,b,3
	BlockLoopIter b,c,d,e,a_,4

; Advance by five 4-byte elements; stop once all 20 have been used.
; The loop body is long, so an absolute jmp closes the loop.
	lda	idx
	clc
	adc	#20
	cmp	#80
	bcs	endloop&part
	sta	idx
	jmp	loop&part
endloop&part anop
	mend