JPEGView/Source/68020/CopyReallyFastScaledFrom32.a

;*********************************************************/
;* This source code copyright (c) 1991-2001, Aaron Giles */
;* See the Read Me file for licensing information.       */
;* Contact email: mac@aarongiles.com                     */
;*********************************************************/

;
;	On entry here we expect the following values:
;
;		srcBase = (long) pointer to the first source pixel
;		srcRow  = (long) rowBytes for the source pixmap
;		dstBase = (long) pointer to the first destination pixel
;		dstRow  = (long) rowBytes for the destination pixmap
;		itAddr  = (long) pointer to the inverse color table
;		ctAddr  = (long) pointer to the color table
;		theRgn  = (long) handle to the destination region
;		boxRect = (Rect) the bounding rectangle of this region
;		height  = (word) height of the region bounding box
;		width   = (word) width of the region bounding box
;		srcWidth = (word) width of the source rectangle
;		srcHeight = (word) height of the source rectangle
;		dstWidth = (word) width of the destination rectangle
;		dstHeight = (word) height of the destination rectangle
;		xRemainder = (word) starting X remainder for the source counter
;		yRemainder = (word) starting Y remainder for the source counter
;
;	The following registers will be modified:
;
;		d0,d1,d2
;		a0,a1
;
;	Internally, the register usage is as follows:
;
;		d0 = scratch
;		d1 = scratch
;		d2 = scratch [region buffer pointer handed to the dither routine]
;		d3 = scratch [box left handed to InitRegion]
;		d4 = (low word) X scale while in the column loop; 1.00 while advancing source rows
;		d4 = (high word) dstWidth/srcWidth (X scale) outside the column loop
;		d5 = (low word) remaining Y fraction of current source row
;		d6 = (low word) X amount still needed for the current dest. pixel
;		d6 = (high word) dstHeight/srcHeight (Y scale)
;		d7 = (low word) remaining X fraction of current source pixel
;
;		a0 = pointer into the source row (4 bytes per pixel)
;		a1 = pointer into the scaled-output buffer
;		a2 = inverse color table (loaded for the dither routine)
;		a3 = color table (loaded for the dither routine)
;		a4 = region buffer pointer, then the address of the dither routine
;		a6 = pointer to the CopyData structure
;
;	d3-d7/a2-a6 are saved on entry and restored on exit.
;
	movem.l	d3-d7/a2-a6,-(sp)	;save the ten registers that @ScaleExit restores
	move.l	48(sp),a6			;get the address of the CopyData structure in a6
								;(based off stack: 10 registers + 1 a6 link + 1 return addr.
								;					= 12 * 4 bytes = 48 bytes)
;
;	Create some room in the stack for our buffers: an even buffer and an odd
;	buffer of (width + 2) * 6 bytes each, followed by the scaled-output buffer.
;	The amount taken from the stack is pushed so @ScaleExit can give it back.
;
	clr.l	d0					;zero out d0
	move.w	width(a6),d0		;get width of destination in d0.l
	addq.l	#2,d0				;plus two for overflows
	lsl.l	#4,d0				;times 16 (2 * 6 + 4)
	addq.l	#4,d0				;add 4 for alignment
	sub.l	d0,sp				;get it from the stack space
	move.l	d0,-(sp)			;and save the amount
	lsr.l	#3,d0				;d0 = (width + 2) * 2 (the extra 4 shifts out)
	move.l	d0,d1				;save that in d1
	lsl.l	#1,d0				;d0 = (width + 2) * 4
	add.l	d1,d0				;   = (width + 2) * 6, the size of one even/odd buffer
	moveq.l	#13,d1				;offset by 10, plus 3 for rounding up
	add.l	sp,d1				;add in the stack pointer
	andi.l	#-4,d1				;make it longword-aligned	(-4 == $fffffffc)
	move.l	d1,evenAddr(a6)		;that's the even buffer
	add.l	d0,d1				;point to the odd buffer
	move.l	d1,oddAddr(a6)		;save that pointer
	add.l	d0,d1				;point to the scale buffer
	move.l	d1,outputAddr(a6)	;now we've got them all
;
;	Clear the dithering buffers to zeros
;
	move.l	oddAddr(a6),a0		;point to the odd buffer by default
	subq.l	#4,d1				;NOTE(review): this adjusts d1, but the dbra below counts on d0 -- possibly d0 was meant; confirm
	lsr.l	#1,d0				;one buffer's size in bytes -> words (we clear a word at a time)
	btst.b	#0,evodd+1(a6)		;check the even/odd flag
	bne.s	@ClearLoop			;if bit 0 is set, keep the odd buffer
	move.l	evenAddr(a6),a0		;otherwise point to the even buffer
@ClearLoop:
	clr.w	(a0)+				;clear this word
	dbra.w	d0,@ClearLoop		;loop d0.w + 1 times
;
;	Initialize the region data
;
	move.l	theRgn(a6),a0		;get theRgn in a0
	lea.l	rgnBuffer(a6),a1	;point a1 to the region buffer
	move.w	boxRect+0(a6),d2	;get box top in d2
	move.w	boxRect+2(a6),d3	;box left in d3
	move.w	height(a6),d4		;height in d4
	move.w	width(a6),d5		;width in d5
	jsr		InitRegion			;initialize the region (routine is not defined in this section)
	move.l	a0,-(sp)			;store a0, our pointer to the rgn, on the stack; it is popped
								;again before UpdateRegion and at @ScaleExit
;
;	Set up the source/destination quantities in the high words of d4-d7
;
	clr.l	d6					;clear out d6
	move.w	dstHeight(a6),d6	;get dest height there
	moveq.l	#10,d0				;get shift value in d0
	lsl.l	d0,d6				;shift it up by 10 bits (1.00 == 1024)
	divu.w	srcHeight(a6),d6	;divide by source height
	addq.w	#1,d6				;plus one to prevent overflows
	swap	d6					;Y scale now sits in the high word of d6
	clr.l	d4					;clear out d4
	move.w	dstWidth(a6),d4		;get destination width in d4
	lsl.l	d0,d4				;shift d4 up by 10 bits
	divu.w	srcWidth(a6),d4		;divide by source width
	addq.w	#1,d4				;plus one to prevent overflows
	swap	d4					;X scale now sits in the high word of d4
;
;	Set up the counter "rows" & reset the source Y remainder
;
	move.w	height(a6),rows(a6)	;set up the row counter
	move.w	yRemainder(a6),d5	;restore yRemainder
;
;	The outermost (row) loop begins here; set up our pointers into the data
;
@ScaleRowLoop:
	move.l	srcBase(a6),a0		;get pointer to source in a0
	move.l	outputAddr(a6),a1	;point to the scaled-output buffer with a1
;
;	Reset the "columns" counter and reset the source X remainder
;
	move.w	width(a6),columns(a6);reset the column counter
	move.w	xRemainder(a6),d7	;restore xRemainder
;
;	Choose the source row that feeds this destination row.  d5 holds what is
;	left of the current source row and 1024 is 1.00 of a destination row.  We
;	either stay on the current source row or step a0 one row down; srcBase
;	itself is only advanced after the row has been output.
;
;	srcRow is a long (it is read with move.l further down), so the row step
;	must be a long add.  A word add would fetch only the high half of the
;	long, which is zero for any rowBytes below 65536, and never move a0.
;
	move.w	#1024,d0			;get needed Y in d0
	cmp.w	d0,d5				;is the Y contribution completely contained?
	bgt.s	@ScaleCommon		;if so, go straight to the common scaling
	sub.w	d5,d0				;subtract off remaining source
	swap	d6					;get source Y contribution in d6
	cmp.w	d0,d6				;do we overlap next Y pixel only?
	ble.s	@NotScaleY2			;if not, skip ahead
	swap	d6					;swap d6 back
	cmp.w	d0,d5				;does the current row contribute more than the next one?
	bgt.s	@ScaleCommon		;if so, use the current row
	add.l	srcRow(a6),a0		;point to the next row (long add: srcRow is a long)
	bra.s	@ScaleCommon		;and *then* go for it
@NotScaleY2:
	swap	d6					;swap d6 back
	add.l	srcRow(a6),a0		;point to the next row (long add: srcRow is a long)
;
;	The common scaling loop; here we copy pixels from the current row to the destination
;
;
;	For every destination pixel exactly one source pixel is copied (4 bytes).
;	d7 tracks what is left of the current source pixel, d4 (low word) is the
;	X scale, and d6 (low word) is the X amount this destination pixel needs.
;
@ScaleCommon:
	swap	d4					;get the X contribution
@ScaleColLoop:
	move.w	#1024,d6			;initialize our X counter	(1024 == $400)
	cmp.w	d6,d7				;is the X contribution completely contained?
	ble.s	@ScaleX1			;if not, skip this
	move.l	(a0),(a1)+			;copy the current source pixel (source pointer stays put)
	bra.s	@ScaleEnd			;skip to the end
@ScaleX1:
	sub.w	d7,d6				;subtract remaining X from our needed X
	cmp.w	d6,d4				;do we overlap onto next pixel only?
	ble.s	@ScaleX2			;if not, skip
	cmp.w	d6,d7				;which pixel do we contribute?
	ble.s	@ScaleX1Add2		;if second one, skip ahead
	move.l	(a0)+,(a1)+			;copy the first pixel and increment
	bra.s	@ScaleEndReset		;skip to the end
@ScaleX1Add2:
	addq.l	#4,a0				;increment source first
	move.l	(a0),(a1)+			;copy second pixel now
	bra.s	@ScaleEndReset		;skip to the end
@ScaleX2:
	addq.l	#4,a0				;increment source
	move.l	(a0)+,(a1)+			;copy second pixel and increment
@ScaleX2Loop:
	sub.w	d4,d6				;subtract contribution
	cmp.w	d6,d4				;see if we have another whole contribution left
	bgt.s	@ScaleEndReset		;if not, reset the needed value
	addq.l	#4,a0				;increment source (this source pixel is skipped, not copied)
	bra.s	@ScaleX2Loop		;loop until we exit
@ScaleEndReset:
	move.w	d4,d7				;we are on a fresh source pixel: remaining = one full contribution
@ScaleEnd:
	sub.w	d6,d7				;subtract needed from remaining
	subq.w	#1,columns(a6)		;decrement the columns count
	bne.s	@ScaleColLoop		;loop until done
;
;	We now have a row; save our registers and set up to do a dithered copy
;
	swap	d4					;swap d4 back (X scale returns to the high word)
	movem.l	d4-d7,-(sp)			;save registers; d6 and d7 are reloaded below for the dither routine
	move.l	outputAddr(a6),a0	;source here (the row we just scaled)
	move.l	dstBase(a6),a1		;get destination address
	move.l	itAddr(a6),a2		;inverse color table
	move.l	ctAddr(a6),a3		;color table
	lea.l	rgnBuffer+4(a6),a4	;get address of rgnBuffer+4 in a4
	move.l	a4,d2				;hand the region buffer pointer over in d2
	move.l	evenAddr(a6),d0		;even address
	move.l	oddAddr(a6),d1		;odd address
	move.l	errTable(a6),d6		;point d6 to the error table
	move.w	width(a6),d7		;column count
	bchg.b	#0,evodd+1(a6)		;test the even/odd flag and flip it for the next row
	beq.s	@ScaleEven			;if it was zero, do an even row
	move.l	odd(a6),a4			;get address of routine
	jsr		(a4)				;otherwise, dither as an odd row
	bra.s	@ScaleRowEnd		;skip ahead
@ScaleEven:
	move.l	even(a6),a4			;get address of routine
	jsr		(a4)				;dither as an even row
@ScaleRowEnd:
	movem.l	(sp)+,d4-d7			;restore registers
;
;	Adjust ourselves for the next row
;
	subq.w	#1,rows(a6)			;decrement rows
	beq.s	@ScaleExit			;exit if done
	move.l	dstRow(a6),d0		;get destination row increment in d0
	add.l	d0,dstBase(a6)		;point to next destination row
	move.l	srcRow(a6),d0		;get source row increment in d0
	move.w	#1024,d4			;get 1.00 count in d4	(1024 == $400)
	swap	d6					;swap d6 beforehand for speed (Y scale into the low word)
@ScaleIncRow:
	cmp.w	d4,d5				;does the current source row still hold more than we need?
	bgt.s	@ScaleRowNext		;if so, stay on this source row
	add.l	d0,srcBase(a6)		;point to next source row
	sub.w	d5,d4				;adjust Y for remainder
	move.w	d6,d5				;reset Y count for the new pixel
	bra.s	@ScaleIncRow		;handle any further adjustments
@ScaleRowNext:
	swap	d6					;restore d6
	sub.w	d4,d5				;get the new Y multiplier
;
;	Update the region counters
;
	subq.w	#1,rgnBuffer(a6)	;decrement the Y region count
	bne.w	@ScaleRowLoop		;count not yet zero: do the next row with the same region data
	move.l	(sp)+,a0			;restore region pointer
	lea.l	rgnBuffer+4(a6),a1	;point a1 into the region buffer
	move.w	boxRect+2(a6),d0	;get left end of the box in d0
	jsr		UpdateRegion		;update our region (routine is not defined in this section)
	move.l	a0,-(sp)			;push new position again
	bra.w	@ScaleRowLoop		;loop for more

@ScaleExit:
	addq.l	#4,sp				;discard the saved region position
	move.l	(sp)+,d0			;d0 = size of the stack buffers, as pushed on entry
	adda.l	d0,sp				;release the buffers
	movem.l	(sp)+,d3-d7/a2-a6	;restore the registers saved on entry