Path: utzoo!utgpu!water!watmath!clyde!att!osu-cis!tut.cis.ohio-state.edu!mailrus!ncar!ames!pasteur!ucbvax!CORY.BERKELEY.EDU!dillon
From: dillon@CORY.BERKELEY.EDU (Matt Dillon)
Newsgroups: comp.sys.atari.st
Subject: ASSEMBLY MOVE/CLEAR/SET/COMPARE ROUTINES (was Clearing memory chain)
Message-ID: <8808160343.AA10248@cory.Berkeley.EDU>
Date: 16 Aug 88 03:43:42 GMT
Sender: daemon@ucbvax.BERKELEY.EDU
Lines: 291

>I think this problem could be solved quite simply by using one of the reserved
>fields in the GEMDOS executable program header.  One of these fields could be
>used for telling the OS that it does not have to clear the TPA before executing
	
	Half of this conversation is silly.  Since when is clearing memory
slow?  A properly written memory-set/clear function will use, say, 12 or 13
registers filled with the pattern and then loop on a movem.l instruction.

	Needless to say, this is *fast*.  Very fast, in fact.

	And before some of the less sophisticated start blabbering about
special cases, following is some *GENERAL* 68K code for clearing, moving,
and comparing memory.

	All routines work on arbitrary boundaries, and optimize according to
the block size and alignment, all the way to using multiple-register moves
to accomplish the goal.  bmov() will do either an ascending or descending
copy accordingly, allowing for overlapped moves.

SPECIAL NOTES:
	-All calls take 32 bit quantities for any pointers or integers.  Note
	 especially that the BSET() function takes a long for the fill
	 character even though only a char is used.  Simply modify the code
	 to fill your needs.

	-This assembles under the Aztec assembler.  Some modifications may
	 be required to work on other assemblers.  However, the code is
	 COMPLETELY self contained.

	-Code is set up for being called from C, with arguments pushed on
	 the stack in reverse argument order (i.e. first arg is 4(sp), second
	 is 8(sp), and third is 12(sp) on entry to the call)

	-D0/D1/A0/A1 are all assumed to be scratch and are not saved.

						-Matt

#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create:
#	bcmp.asm
#	bmov.asm
#	bset.asm
# This archive created: Mon Aug 15 20:36:56 1988
export PATH; PATH=/bin:/usr/bin:$PATH
echo shar: "extracting 'bcmp.asm'" '(772 characters)'
if test -f 'bcmp.asm'
then
	echo shar: "will not over-write existing file 'bcmp.asm'"
else
cat << \!Funky!Stuff! > 'bcmp.asm'

		public	_bcmp	    ; compare two blocks of memory

		;   BCMP(src, dst, len)
		;   char *src, *dst;
		;   long len;
		;
		;   Compares exactly len bytes at src and dst.
		;   Returns D0 = 1 when every byte matches (also for len == 0)
		;   and D0 = 0 at the first difference.  NOTE: this is the
		;   opposite sense of the Unix bcmp().
		;
		;   Arguments are 32 bits each at 4(sp), 8(sp), 12(sp).
		;   D0/D1/A0/A1 are overwritten; nothing else is touched.
		;
		;   Strategy:
		;     - src and dst of different parity: cmpm.l could never be
		;	word aligned on both sides, so compare bytes only.
		;     - both odd: compare one byte so that both become even.
		;     - then len/4 longwords, then the last 0..3 bytes.
		;
		;   Loop idiom: dbne only counts the low word of its register.
		;   When it runs out, "sub.l #$10000 / bcc" borrows one from
		;   the high word and re-enters the loop for another 65536
		;   passes, giving a full 32-bit count.  dbne is entered with
		;   Z forced so the first pass only tests the counter.

_bcmp:		move.l	4(sp),A0	;A0 = src
		move.l	8(sp),A1	;A1 = dst
		move.l	12(sp),D0	;D0 = bytes still to compare
		move.w	A0,D1
		add.w	A1,D1		;bit 0 = parity(src) xor parity(dst)
		btst.l	#0,D1
		bne	.bcbyte 	;parities differ: bytes only
		move.w	A0,D1
		btst.l	#0,D1
		beq	.bclong 	;both pointers already even
		tst.l	D0
		beq	.bcsame 	;len == 0: nothing to compare
		cmpm.b	(A0)+,(A1)+	;both odd: one byte makes both even
		bne	.bcfail
		subq.l	#1,D0

.bclong 	move.l	D0,D1
		lsr.l	#2,D1		;D1 = # of longwords (32 bit count)
		cmp.w	D0,D0		;force Z bit
		bra	.bcl2
.bcl1		cmpm.l	(A0)+,(A1)+
.bcl2		dbne	D1,.bcl1
		bne	.bcfail 	;left the loop on a mismatch
		sub.l	#$10000,D1	;borrow from the longword count
		bcc	.bcl1		;high word was non-zero: 64K more
		and.l	#3,D0		;0..3 trailing bytes, high word clear

.bcbyte 	cmp.w	D0,D0		;force Z bit
		bra	.bcb2
.bcb1		cmpm.b	(A0)+,(A1)+
.bcb2		dbne	D0,.bcb1
		bne	.bcfail
		sub.l	#$10000,D0	;borrow from the byte count
		bcc	.bcb1
.bcsame 	moveq.l #1,D0	    ;success!
		rts
.bcfail 	moveq.l #0,D0	    ;failure!
		rts

!Funky!Stuff!
fi  # end of overwriting check
echo shar: "extracting 'bmov.asm'" '(2664 characters)'
if test -f 'bmov.asm'
then
	echo shar: "will not over-write existing file 'bmov.asm'"
else
cat << \!Funky!Stuff! > 'bmov.asm'

		;   BMOV(src, dst, len)
		;
		;   char *src, *dst;
		;   long len;
		;
		;   Copy len bytes from src to dst.  The memory move algorithm
		;   is somewhat more of a mess since we must do it either
		;   ascending or descending so that overlapping blocks work:
		;
		;	dst == src	return at once
		;	dst below src	ascending copy	(.bmup)
		;	dst above src	descending copy (.bmdown), starting
		;			from src+len / dst+len
		;
		;   In each direction:
		;     - if either pointer is odd the whole block goes through
		;	the byte loop;
		;     - else, if len >= 259 and len/44 fits in 16 bits, 44-byte
		;	chunks are moved with movem.l through D2-D7/A2-A6,
		;	which are saved on and restored from the stack;
		;     - what is left is moved 8 bytes per pass (two move.l)
		;	and the final 0-7 bytes one at a time.
		;
		;   Arguments are read from 4(sp), 8(sp) and 12(sp).  D0, D1,
		;   A0 and A1 are overwritten; no result is loaded into D0.
		;
		;   Loop idiom: dbf only counts the low word of its register,
		;   so every "dbf / sub.l #$10000 / bcc" group borrows one
		;   from the high word and re-enters the loop, giving a full
		;   32-bit count.

		public	_bmov
_bmov:		move.l	4(sp),A0		;A0 = src
		move.l	8(sp),A1		;A1 = dst
		move.l	12(sp),D0		;D0 = len
		cmp.l	A0,A1		;move to self
		beq	.bmend
		bls	.bmup		;dst below src: ascending copy
.bmdown 	adda.l	D0,A0		;descending copy
		adda.l	D0,A1		;A0/A1 now point one past the ends
		move.w	A0,D1		;CHECK WORD ALIGNED (end pointers)
		btst.l	#0,D1
		bne	.bmdown1	;src end odd: bytes only
		move.w	A1,D1
		btst.l	#0,D1
		bne	.bmdown1	;dst end odd: bytes only
		cmp.l	#259,D0 	    ;chosen by calculation.
		blo	.bmdown8	    ;too short to pay for movem setup

		move.l	D0,D1		    ;overhead for bmd44: ~360
		divu	#44,D1		    ;D1 = remainder:quotient
		bvs	.bmdown8	    ;quotient > 65535 (len >= 44*65536
					    ;= 2,883,584); D0 still holds len
		movem.l D2-D7/A2-A6,-(sp)   ;use D2-D7/A2-A6 (11 regs)
		move.l	#11*4,D0	    ;D0 = 44, the chunk size
		bra	.bmd44b
.bmd44a 	sub.l	D0,A0		    ;8		total 214/44bytes
		movem.l (A0),D2-D7/A2-A6    ;12 + 8*11  4.86 cycles/byte
		movem.l D2-D7/A2-A6,-(A1)   ; 8 + 8*11
.bmd44b 	dbf	D1,.bmd44a	    ;10
		swap	D1		    ;remainder into the low word
		move.w	D1,D0		    ;D0 = remainder (D0<31:16> is
					    ;already 0 because D0 was 44)
		movem.l (sp)+,D2-D7/A2-A6

.bmdown8	move.w	D0,D1		    ;D1<2:0> = #bytes left later
		lsr.l	#3,D0		    ;divide by 8
		bra	.bmd8b
.bmd8a		move.l	-(A0),-(A1)         ;20         total 50/8bytes
		move.l	-(A0),-(A1)         ;20         = 6.25 cycles/byte
.bmd8b		dbf	D0,.bmd8a	    ;10
		sub.l	#$10000,D0	    ;borrow from the high word
		bcc	.bmd8a		    ;no borrow: 65536 more passes
		move.w	D1,D0		    ;D0 = 0 to 7 bytes
		and.l	#7,D0
		bne	.bmdown1
		rts

.bmd1a		move.b	-(A0),-(A1)         ;12         total 22/byte
.bmdown1				    ;		= 22 cycles/byte
.bmd1b		dbf	D0,.bmd1a	    ;10
		sub.l	#$10000,D0	    ;borrow from the high word
		bcc	.bmd1a
		rts

.bmup		move.w	A0,D1		    ;CHECK WORD ALIGNED
		btst.l	#0,D1
		bne	.bmup1		    ;src odd: bytes only
		move.w	A1,D1
		btst.l	#0,D1
		bne	.bmup1		    ;dst odd: bytes only
		cmp.l	#259,D0 	    ;chosen by calculation
		blo	.bmup8		    ;too short to pay for movem setup

		move.l	D0,D1		    ;overhead for bmu44: ~360
		divu	#44,D1		    ;D1 = remainder:quotient
		bvs	.bmup8		    ;quotient > 65535 (len >= 44*65536
					    ;= 2,883,584); D0 still holds len
		movem.l D2-D7/A2-A6,-(sp)   ;use D2-D7/A2-A6 (11 regs)
		move.l	#11*4,D0	    ;D0 = 44, the chunk size
		bra	.bmu44b
.bmu44a 	movem.l (A0)+,D2-D7/A2-A6   ;12 + 8*11  ttl 214/44bytes
		movem.l D2-D7/A2-A6,(A1)    ;8  + 8*11  4.86 cycles/byte
		add.l	D0,A1		    ;8
.bmu44b 	dbf	D1,.bmu44a	    ;10
		swap	D1		    ;remainder into the low word
		move.w	D1,D0		    ;D0 = remainder (D0<31:16> is
					    ;already 0 because D0 was 44)
		movem.l (sp)+,D2-D7/A2-A6

.bmup8		move.w	D0,D1		    ;D1<2:0> = #bytes left later
		lsr.l	#3,D0		    ;divide by 8
		bra	.bmu8b
.bmu8a		move.l	(A0)+,(A1)+         ;20         total 50/8bytes
		move.l	(A0)+,(A1)+         ;20         = 6.25 cycles/byte
.bmu8b		dbf	D0,.bmu8a	    ;10
		sub.l	#$10000,D0	    ;borrow from the high word
		bcc	.bmu8a		    ;no borrow: 65536 more passes
		move.w	D1,D0		    ;D0 = 0 to 7 bytes
		and.l	#7,D0
		bne	.bmup1
		rts

.bmu1a		move.b	(A0)+,(A1)+
.bmup1
.bmu1b		dbf	D0,.bmu1a
		sub.l	#$10000,D0	    ;borrow from the high word
		bcc	.bmu1a
.bmend		rts


!Funky!Stuff!
fi  # end of overwriting check
echo shar: "extracting 'bset.asm'" '(1702 characters)'
if test -f 'bset.asm'
then
	echo shar: "will not over-write existing file 'bset.asm'"
else
cat << \!Funky!Stuff! > 'bset.asm'

		public	_bzero		; clear a block of memory
		public	_bset		; fill a block of memory with one byte

		;   BZERO(buffer, len)
		;   BSET(buffer, len, byte)
		;
		;   char *buffer;
		;   long len;
		;   long byte;	 (pushed as a long; only the low byte is used)
		;
		;   Both entry points share one fill routine that works
		;   downward from buffer+len:
		;     - len <= 40: plain byte loop.
		;     - len >  40: store one byte if the end address is odd,
		;	then 48 bytes per movem.l from D2-D7/A1-A6, then
		;	longwords, then the last 0..3 bytes.
		;
		;   D2-D7/A2-A6 are saved and restored on the large path.
		;   D0, D1, A0 and A1 are overwritten.

_bzero: 	moveq.l #0,D1		; fill value 0
		bra	.bfargs
_bset:		move.b	12+3(sp),D1	; low byte of the long argument
.bfargs 	move.l	8(sp),D0	; D0 = len
		move.l	4(sp),A0	; A0 = buffer
		add.l	D0,A0		; A0 = one past the last byte
		cmp.l	#40,D0		; small/large threshold
		bhi	.bfbig
		bra	.bfsm2
.bfsm1		move.b	D1,-(A0)	; small fill, count fits in a word
.bfsm2		dbf	D0,.bfsm1
		rts

					; here len > 40
.bfbig		movem.l D2-D7/A2-A6,-(sp)
		move.l	A0,D2
		btst.l	#0,D2		; end address even?
		beq	.bfpat
		move.b	D1,-(A0)	; no: one byte makes it even
		subq.l	#1,D0

.bfpat		move.b	D1,D2		; replicate the byte:  D2 = ......xx
		lsl.w	#8,D2		;			    ....xx00
		move.b	D1,D2		;			    ....xxxx
		move.w	D2,D1		; keep a copy of the word in D1
		swap	D2		;			    xxxx....
		move.w	D1,D2		;			    xxxxxxxx
		move.l	D2,A1		; spread it over D2-D7/A1-A6
		move.l	D2,A2
		move.l	D2,A3
		move.l	D2,A4
		move.l	D2,A5
		move.l	D2,A6
		move.l	D2,D3
		move.l	D2,D4
		move.l	D2,D5
		move.l	D2,D6
		move.l	D2,D7		; 12 registers = 48 bytes

		moveq.l #48,D1		; bytes per movem.l
		bra	.bfblk2
.bfblk1 	movem.l D2-D7/A1-A6,-(A0)
.bfblk2 	sub.l	D1,D0		; another full 48 available?
		bpl	.bfblk1
		add.w	D1,D0		; D0.w = 0..47 bytes remaining

		bra	.bflw2
.bflw1		move.l	D2,-(A0)	; by 4's
.bflw2		subq.w	#4,D0
		bpl	.bflw1
		addq.w	#4,D0		; D0.w = 0..3 bytes remaining

		bra	.bfby2
.bfby1		move.b	D2,-(A0)	; by 1's
.bfby2		dbf	D0,.bfby1
		movem.l (sp)+,D2-D7/A2-A6
		rts

!Funky!Stuff!
fi  # end of overwriting check
exit 0
#	End of shell archive