Path: utzoo!utgpu!water!watmath!clyde!att!osu-cis!tut.cis.ohio-state.edu!mailrus!ncar!ames!pasteur!ucbvax!CORY.BERKELEY.EDU!dillon From: dillon@CORY.BERKELEY.EDU (Matt Dillon) Newsgroups: comp.sys.atari.st Subject: ASSEMBLY MOVE/CLEAR/SET/COMPARE ROUTINES (was Clearing memory chain) Message-ID: <8808160343.AA10248@cory.Berkeley.EDU> Date: 16 Aug 88 03:43:42 GMT Sender: daemon@ucbvax.BERKELEY.EDU Lines: 291 >I think this problem could be solved quite simply by using one of the reserved >fields in the GEMDOS executable program header. One of these fields could be >used for telling the OS that it does not have to clear the TPA before executing Half of this conversation is silly. Since when is clearing memory slow? A properly written memory-set/clear function will use, say, 12 or 13 registers filled with the pattern and then loop on a movem.l instruction. Needless to say, this is *fast*. Very fast, in fact. And before some of the less sophisticated start blabbering about special cases, following is some *GENERAL* 68K code for clearing, moving, and comparing memory. All routines work on arbitrary boundaries, and optimize according to the block size and alignment, all the way to using multiple-register moves to accomplish the goal. bmov() will do either an ascending or descending copy accordingly, allowing for overlapped moves. SPECIAL NOTES: -All calls take 32 bit quantities for any pointers or integers. Note especially that the BSET() function takes a long for the fill character even though only a char is used. Simply modify the code to fill your needs. -This assembles under the Aztec assembler. Some modifications may be required to work on other assemblers. However, the code is COMPLETELY self contained. -Code is set up for being called from C, with arguments pushed on the stack in reverse argument order (i.e. first arg is 4(sp), second is 8(sp), and third is 12(sp) on entry to the call) -D0/D1/A0/A1 are all assumed to be scratch and are not saved. -Matt #!
/bin/sh # This is a shell archive, meaning: # 1. Remove everything above the #! /bin/sh line. # 2. Save the resulting text in a file. # 3. Execute the file with /bin/sh (not csh) to create: # bcmp.asm # bmov.asm # bset.asm # This archive created: Mon Aug 15 20:36:56 1988 export PATH; PATH=/bin:/usr/bin:$PATH echo shar: "extracting 'bcmp.asm'" '(772 characters)' if test -f 'bcmp.asm' then echo shar: "will not over-write existing file 'bcmp.asm'" else cat << \!Funky!Stuff! > 'bcmp.asm' public _bcmp ; compare two blocks of memory ; BCMP(src, dst, len) ; char *src, *dst; ; long len; _bcmp: move.l 4(sp),A0 move.l 8(sp),A1 move.l 12(sp),D0 move.w D0,D1 ;longword align address neg.w D1 and.w #3,D1 cmp.w D0,D0 ;force Z bit bra .bc2 .bc1 cmpm.b (A0)+,(A1)+ .bc2 dbne D1,.bc1 bne .bcfail move.l D0,D1 lsr.l #2,D1 ;# of longwords to compare cmp.w D0,D0 ;force Z bit bra .bc11 .bc10 cmpm.l (A0)+,(A1)+ .bc11 dbne D1,.bc10 bne .bcfail sub.l #$10000,D0 bcc .bc10 and.w #3,D0 ;remaining bytes to compare cmp.w D0,D0 ;force Z bit bra .bc21 .bc20 cmpm.b (A0)+,(A1)+ .bc21 dbne D0,.bc20 bne .bcfail moveq.l #1,D0 ;success! rts .bcfail moveq.l #0,D0 ;failure! rts !Funky!Stuff! fi # end of overwriting check echo shar: "extracting 'bmov.asm'" '(2664 characters)' if test -f 'bmov.asm' then echo shar: "will not over-write existing file 'bmov.asm'" else cat << \!Funky!Stuff! > 'bmov.asm' ; BMOV(src, dst, len) ; ; char *src, *dst; ; long len; ; ; The memory move algorithm is somewhat more of a mess ; since we must do it either ascending or decending. public _bmov _bmov: move.l 4(sp),A0 move.l 8(sp),A1 move.l 12(sp),D0 cmp.l A0,A1 ;move to self beq .bmend bls .bmup .bmdown adda.l D0,A0 ;descending copy adda.l D0,A1 move.w A0,D1 ;CHECK WORD ALIGNED btst.l #0,D1 bne .bmdown1 move.w A1,D1 btst.l #0,D1 bne .bmdown1 cmp.l #259,D0 ;chosen by calculation. 
blo .bmdown8 move.l D0,D1 ;overhead for bmd44: ~360 divu #44,D1 bvs .bmdown8 ;too big (> 2,883,540) movem.l D2-D7/A2-A6,-(sp) ;use D2-D7/A2-A6 (11 regs) move.l #11*4,D0 bra .bmd44b .bmd44a sub.l D0,A0 ;8 total 214/44bytes movem.l (A0),D2-D7/A2-A6 ;12 + 8*11 4.86 cycles/byte movem.l D2-D7/A2-A6,-(A1) ; 8 + 8*11 .bmd44b dbf D1,.bmd44a ;10 swap D1 ;D0<15:7> already contain 0 move.w D1,D0 ;D0 = remainder movem.l (sp)+,D2-D7/A2-A6 .bmdown8 move.w D0,D1 ;D1<2:0> = #bytes left later lsr.l #3,D0 ;divide by 8 bra .bmd8b .bmd8a move.l -(A0),-(A1) ;20 total 50/8bytes move.l -(A0),-(A1) ;20 = 6.25 cycles/byte .bmd8b dbf D0,.bmd8a ;10 sub.l #$10000,D0 bcc .bmd8a move.w D1,D0 ;D0 = 0 to 7 bytes and.l #7,D0 bne .bmdown1 rts .bmd1a move.b -(A0),-(A1) ;12 total 22/byte .bmdown1 ; = 22 cycles/byte .bmd1b dbf D0,.bmd1a ;10 sub.l #$10000,D0 bcc .bmd1a rts .bmup move.w A0,D1 ;CHECK WORD ALIGNED btst.l #0,D1 bne .bmup1 move.w A1,D1 btst.l #0,D1 bne .bmup1 cmp.l #259,D0 ;chosen by calculation blo .bmup8 move.l D0,D1 ;overhead for bmu44: ~360 divu #44,D1 bvs .bmup8 ;too big (> 2,883,540) movem.l D2-D7/A2-A6,-(sp) ;use D2-D7/A2-A6 (11 regs) move.l #11*4,D0 bra .bmu44b .bmu44a movem.l (A0)+,D2-D7/A2-A6 ;12 + 8*11 ttl 214/44bytes movem.l D2-D7/A2-A6,(A1) ;8 + 8*11 4.86 cycles/byte add.l D0,A1 ;8 .bmu44b dbf D1,.bmu44a ;10 swap D1 ;D0<15:7> already contain 0 move.w D1,D0 ;D0 = remainder movem.l (sp)+,D2-D7/A2-A6 .bmup8 move.w D0,D1 ;D1<2:0> = #bytes left later lsr.l #3,D0 ;divide by 8 bra .bmu8b .bmu8a move.l (A0)+,(A1)+ ;20 total 50/8bytes move.l (A0)+,(A1)+ ;20 = 6.25 cycles/byte .bmu8b dbf D0,.bmu8a ;10 sub.l #$10000,D0 bcc .bmu8a move.w D1,D0 ;D0 = 0 to 7 bytes and.l #7,D0 bne .bmup1 rts .bmu1a move.b (A0)+,(A1)+ .bmup1 .bmu1b dbf D0,.bmu1a sub.l #$10000,D0 bcc .bmu1a .bmend rts !Funky!Stuff! fi # end of overwriting check echo shar: "extracting 'bset.asm'" '(1702 characters)' if test -f 'bset.asm' then echo shar: "will not over-write existing file 'bset.asm'" else cat << \!Funky!Stuff! 
> 'bset.asm'
;-----------------------------------------------------------------------
; bzero(buffer, len)            - zero a block of memory
; bset(buffer, len, byte)       - set a block of memory to (byte val)
;
;       char *buffer;
;       long  len;
;       long  byte;     (must be passed as a long though only a byte)
;
; Called from C: arguments on the stack, first argument at 4(sp).
; D0/D1/A0/A1 are scratch; D2-D7/A2-A6 are saved and restored on the
; large-block path, the only place they are used.
;
; The block is filled from its end toward its start.  Blocks of up to
; 40 bytes are filled one byte at a time.  Larger blocks write one byte
; first if the end address is odd, then 48 bytes per movem.l through
; 12 registers, then longwords, then the last 0..3 bytes.
;-----------------------------------------------------------------------

        public  _bzero
        public  _bset

_bzero:
        moveq.l #0,D1                   ; fill value is zero
        bra     .bsargs
_bset:
        move.b  12+3(sp),D1             ; fill value: low byte of the long argument
.bsargs move.l  4(sp),A0                ; A0 = buffer
        move.l  8(sp),D0                ; D0 = len
        add.l   D0,A0                   ; A0 = end of block (fill runs downward)
        cmp.l   #40,D0                  ; unscientifically chosen
        bhi     .bsbig

        bra     .bssm2                  ; small block: simple byte loop
.bssm1  move.b  D1,-(A0)
.bssm2  dbf     D0,.bssm1
        rts

.bsbig  movem.l D2-D7/A2-A6,-(sp)       ; more than 40 bytes to fill
        move.l  A0,D2
        btst.l  #0,D2                   ; is the end address even?
        beq     .bseven
        move.b  D1,-(A0)                ; no: one byte makes it even
        subq.l  #1,D0

.bseven move.b  D1,D2                   ; replicate the byte: D2 = ......xx
        lsl.w   #8,D2                   ;                          ....xx00
        move.b  D1,D2                   ;                          ....xxxx
        move.w  D2,D1                   ; D1.w = xxxx
        swap    D2                      ;                          xxxx....
        move.w  D1,D2                   ; D2 = xxxxxxxx
        move.l  D2,A1                   ; copy the pattern into all 12 registers
        move.l  D2,A2
        move.l  D2,A3
        move.l  D2,A4
        move.l  D2,A5
        move.l  D2,A6
        move.l  D2,D3
        move.l  D2,D4
        move.l  D2,D5
        move.l  D2,D6
        move.l  D2,D7                   ; D2-D7/A1-A6 (12 registers)

        move.l  #12*4,D1                ; bytes per transfer (48)
        sub.l   D1,D0                   ; pre subtract
        bmi     .bsrest
.bs48   movem.l D2-D7/A1-A6,-(A0)
        sub.l   D1,D0
        bpl     .bs48

.bsrest add.w   D1,D0                   ; undo the last subtract: D0.w = 0..47
        move.w  #4,D1                   ; by 4's
        sub.w   D1,D0
        bmi     .bstail
.bs4    move.l  D2,-(A0)
        sub.w   D1,D0
        bpl     .bs4

.bstail add.w   D1,D0                   ; undo the last subtract: D0.w = 0..3
        bra     .bs1b
.bs1a   move.b  D2,-(A0)                ; by 1's
.bs1b   dbf     D0,.bs1a
        movem.l (sp)+,D2-D7/A2-A6
        rts
!Funky!Stuff!
fi # end of overwriting check
exit 0
#       End of shell archive