APLib decruncher 10% faster ;)

SyX · 10:17, 21 April 12

While working on my next project, I have made a few changes to the APLib decruncher (based on the optimizations made by Antonio Villena to the exomizer decruncher); my tests show that it is now 10-12% faster.

The decruncher code takes 4 extra bytes (from 163 to 167 bytes) and, very importantly for people using the firmware, the code uses AF'; because of that you will need to disable interrupts while decrunching and preserve AF'.

The code is here:

Code Select

; ---------------------------------------------------------------------------
; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain
; extra optimizations by SyX
;
; hl = source
; de = dest
; ---------------------------------------------------------------------------
; Main decompression loop.
; Register roles, as used by the code below:
;   HL  = read pointer into the packed data
;   DE  = write pointer into the output
;   A   = bit buffer (starts as 128, i.e. only the marker bit)
;   IXH = 1 while LWM = 0, set to 0 (B after LDIR) once a match has been copied
;   IY  = offset of the most recent match, reloaded by ap_r0_gamma
;   AF' = used as scratch so that A (the bit buffer) survives byte handling
; Returns through RET Z in apbranch3 when the end-of-stream byte is read.
depack
    LD   A,128                      ; empty bit buffer: first ap_getbit loads a byte
apbranch1
    LDI                             ; copy one literal byte from source to dest
aploop0
    LD   IXH,1                      ; LWM = 0
aploop
    CALL ap_getbit
    JR   NC,apbranch1               ; bit 0: literal
    CALL ap_getbit
    JR   NC,apbranch2               ; bits 10: match with gamma-coded offset
    LD   B,0
    CALL ap_getbit
    JR   NC,apbranch3               ; bits 110: short match
    LD   C,16                       ; get an offset (bit 4 is the stop marker for the 4-bit read)
apget4bits
    CALL ap_getbit
    RL   C
    JR   NC,apget4bits              ; loop until the marker bit falls out of C
    JR   NZ,apbranch4               ; non-zero offset: copy an earlier byte
    EX   AF,AF'                     ; keep the bit buffer safe in AF'
    LD   A,B                        ; B = 0
apwritebyte
    LD   (DE),A                     ; write a 0 (or the byte fetched in apbranch4)
    EX   AF,AF'                     ; bit buffer back in A
    INC  DE
    JR   aploop0
apbranch4
    AND  A                          ; clear carry for SBC (A itself is unchanged)
    EX   DE,HL                      ; write a previous byte (1-15 away from dest)
    SBC  HL,BC                      ; HL = dest - offset
    EX   AF,AF'                     ; keep the bit buffer safe in AF'
    LD   A,(HL)
    ADD  HL,BC                      ; HL = dest again
    EX   DE,HL
    JR   apwritebyte
apbranch3
    LD   C,(HL)                     ; use 7 bit offset, length = 2 or 3
    INC  HL
    EX   AF,AF'                     ; keep the bit buffer safe in AF'
    OR   A                          ; clear carry so RR C shifts a 0 into bit 7
    RR   C                          ; C = offset, carry = low bit (length selector)
    RET  Z                          ; if a zero is encountered here, it is EOF
    LD   A,2
    ADC  A,B                        ; A = 2 + carry (B = 0): length 2 or 3
    PUSH HL                         ; save source pointer, popped after LDIR
    LD   IYH,B                      ; remember this offset in IY
    LD   IYL,C
    LD   H,D
    LD   L,E
    SBC  HL,BC                      ; HL = dest - offset
    LD   C,A                        ; BC = length
    EX   AF,AF'                     ; bit buffer back in A
    JR   ap_finishup2
apbranch2
    CALL ap_getgamma                ; use a gamma code * 256 for offset, another gamma code for length
    DEC  C
    EX   AF,AF'                     ; keep the bit buffer safe in AF'
    LD   A,C
    SUB  IXH                        ; subtract one more when LWM = 0
    JR   Z,ap_r0_gamma              ; if gamma code is 2, use old r0 offset,
    DEC  A
    ; do I even need this code?
    ; bc=bc*256+(hl), lazy 16bit way
    LD   B,A                        ; high byte of offset
    EX   AF,AF'                     ; bit buffer back in A
    LD   C,(HL)                     ; low byte of offset comes straight from the stream
    INC  HL
    LD   IYH,B                      ; remember this offset in IY
    LD   IYL,C
    PUSH BC                         ; offset on the stack while the length is read
    CALL ap_getgamma
    EX   (SP),HL                    ; bc = len, hl=offs
    PUSH DE
    EX   DE,HL                      ; DE = offset, HL = dest
    EX   AF,AF'                     ; A is needed for the compare
    LD   A,4
    CP   D
    JR   NC,apskip2                 ; high byte of offset <= 4: no extra length
    INC  BC
apskip2 
    EX   AF,AF'                     ; bit buffer back in A
    OR   A                          ; clear carry for SBC
    LD   HL,127
    SBC  HL,DE
    JR   C,apskip3                  ; offset > 127: no extra length
    INC  BC
    INC  BC
apskip3
    POP  HL                         ; bc = len, de = offs, hl=junk
    PUSH HL
    OR   A                          ; clear carry for SBC
ap_finishup
    SBC  HL,DE
    POP  DE                         ; hl=dest-offs, bc=len, de = dest
ap_finishup2
    LDIR                            ; copy the match
    POP  HL                         ; restore source pointer
    LD   IXH,B                      ; B = 0 after LDIR
    JR   aploop
ap_r0_gamma 
    EX   AF,AF'                     ; bit buffer back in A
    CALL ap_getgamma                ; and a new gamma code for length
    PUSH HL
    PUSH DE
    EX   DE,HL
    LD   D,IYH                      ; DE = offset of the previous match
    LD   E,IYL
    JR   ap_finishup                ; carry is clear here (ap_getgamma exits on NC)
; ap_getbit: return the next bit of the packed stream in carry.
; A is the bit buffer, HL the source pointer (advanced when a new byte is loaded).
ap_getbit
    ADD  A,A                        ; shift the next bit out into carry
    RET  NZ                         ; buffer not empty yet: done
    LD   A,(HL)                     ; buffer empty: fetch a fresh byte
    INC  HL
    RLA                             ; carry shifted out above becomes the new marker; top bit goes to carry
    RET
; ap_getgamma: read a gamma-coded number from the bit stream into BC.
; BC starts at 1; each round shifts one data bit into BC and then reads a
; continuation bit. The loop ends on a 0 continuation bit, so carry is clear on return.
ap_getgamma
    LD   BC,1
ap_getgammaloop
    CALL ap_getbit
    RL   C                          ; shift the data bit into BC
    RL   B
    CALL ap_getbit
    JR   C,ap_getgammaloop          ; continuation bit set: read another data bit
    RET

Metalbrain · 16:32, 24 November 12

Hi SyX!
I didn't see this till today. Last week I optimized both versions (speed & size optimized) of aPLib depackers, and what you did here was a speed optimization of the size optimized one, resulting in a nice speed/size tradeoff.
Here are my last versions:
1 - Speed optimized, 185 bytes:

Code Select


; aPPack decompressor 
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain

;hl = source
;de = dest

; depack - aPLib stream decompressor (speed-optimised variant).
; In:   hl = source (packed data), de = destination
; Regs: a   = bit buffer (unread bits followed by a marker bit)
;       ixh = 1 while LWM = 0, 0 once a match has just been copied
;       iy  = offset of the most recent match (reloaded by ap_r0_gamma)
;       af' = scratch accumulator/flags used by the match paths
; Returns through "ret z" in apbranch3 when the end-of-stream byte is read.
;
; apbranch3 (rr c) and apbranch4 (sbc hl,bc) use the carry held in AF'
; immediately after ex af,af'. Each match path leaves that carry clear as
; long as offsets do not exceed the data already written, but on entry it
; is whatever the caller left in AF'. It is therefore cleared explicitly
; here (2 extra bytes, executed once).
depack:         or    a			;carry = 0
                ex    af,af'		;AF' now holds a clear carry
                ld    a,128		;empty bit buffer: marker bit only
apbranch1:      ldi			;copy one literal byte
aploop2:        ld    ixh,1		;LWM = 0
aploop:	        call  ap_getbit
                jr    nc,apbranch1	;0: literal
                call  ap_getbit
                jr    nc,apbranch2	;10: match with gamma-coded offset
                call  ap_getbit
                jr    nc,apbranch3	;110: short match

                ld    bc,16		;get an offset (bit 4 marks the end of the 4-bit read)
apget4bits:     call  ap_getbit
                rl    c
                jr    nc,apget4bits	;until the marker bit falls out of c
                jr    nz,apbranch4	;non-zero offset: copy an earlier byte
                ex    de,hl
                ld    (hl),b		;write a 0
                ex    de,hl
                inc   de
                jp    aploop2
apbranch4:      ex    af,af'		;save bit buffer; carry from AF' is clear
                ex    de,hl 		;write a previous byte (1-15 away from dest)
                sbc   hl,bc		;hl = dest - offset
                ld    a,(hl)
                add   hl,bc		;hl = dest again
                ld    (hl),a
                ex    af,af'		;bit buffer back in a
                ex    de,hl
                inc   de
                jp    aploop2

apbranch3:      ld    c,(hl)		;use 7 bit offset, length = 2 or 3
                inc   hl
                ex    af,af'		;save bit buffer; carry from AF' is clear
                rr    c			;c = offset, carry = length selector
                ret   z		;if a zero is found here, it's EOF
                ld    a,2
                ld    b,0
                adc   a,b		;a = 2 + carry
                push  hl		;save source pointer
                ld    iyh,b		;remember offset in iy
                ld    iyl,c
                ld    h,d
                ld    l,e
                sbc   hl,bc		;hl = dest - offset
                ld    c,a		;bc = length
                ex    af,af'		;bit buffer back in a
                ldir
                pop   hl		;restore source pointer
                ld    ixh,b		;b = 0 after ldir
                jp    aploop
apbranch2:      call  ap_getgamma	;use a gamma code * 256 for offset, another gamma code for length
                dec   c
                ex    af,af'		;save bit buffer
                ld    a,c
                sub   ixh		;subtract one more when LWM = 0
                jr    z,ap_r0_gamma	;reuse the previous offset
                dec   a
                ;bc=bc*256+(hl), lazy 16bit way
                ld    b,a
                ld    c,(hl)
                inc   hl
                ld    iyh,b		;remember offset in iy
                ld    iyl,c

                push  bc

                call  ap_getgamma2	;switches back to the bit buffer, then reads the length

                ex    (sp),hl		;bc = len, hl=offs
                push  de
                ex    de,hl

                ex    af,af'		;save bit buffer
                ld    a,4
                cp    d
                jr    nc,apskip2	;high byte of offset <= 4: no extra length
                inc   bc
                or    a			;clear carry for sbc
apskip2:        ld    hl,127
                sbc   hl,de
                jr    c,apskip3		;offset > 127: no extra length
                inc   bc
                inc   bc
apskip3:        pop   hl		;bc = len, de = offs, hl=junk
                push  hl
                or    a
                sbc   hl,de
                ex    af,af'		;bit buffer back in a
                pop   de		;hl=dest-offs, bc=len, de = dest
                ldir
                pop   hl
                ld    ixh,b		;b = 0 after ldir
                jp    aploop

ap_r0_gamma:	call  ap_getgamma2	;and a new gamma code for length
                push  hl
                push  de
                ex    de,hl

                ld    d,iyh		;de = previous match offset
                ld    e,iyl
                sbc   hl,de		;carry is clear: ap_getgamma returns on nc
                pop   de		;hl=dest-offs, bc=len, de = dest
                ldir
                pop   hl
                ld    ixh,b
                jp    aploop

; ap_getgamma2: entry used while the bit buffer is parked in AF'; swaps it back first.
; ap_getgamma:  read a gamma-coded number into bc (starts at 1, one data bit
;               then one continuation bit per round). Carry is clear on return.
ap_getgamma2:   ex    af,af'
ap_getgamma:    ld    bc,1
ap_getgammaloop:call  ap_getbit
                rl    c			;shift data bit into bc
                rl    b
                call  ap_getbit
                jr    c,ap_getgammaloop	;continuation bit set: keep reading
                ret

; ap_getbit: return the next stream bit in carry. a = bit buffer, hl = source pointer.
ap_getbit:      add   a,a		;shift next bit into carry
                ret   nz		;buffer not empty: done
                ld    a,(hl)		;refill from the stream
                inc   hl
                rla			;old marker (in carry) re-enters at bit 0, top bit goes to carry
                ret

2 - Size optimized: 156 bytes:

Code Select


; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain

;hl = source
;de = dest

; depack - size-optimised variant.
; hl = source, de = dest. The bit buffer lives in ixl (ap_getbit loads it into a
; and stores it back), so a is free for byte handling and AF' is not used.
; ixh = 1 while LWM = 0; iy = offset of the most recent match.
; ap_getbit2 skips the reload from ixl and is only called where a still holds
; the bit buffer from the preceding getbit call.
depack:		ld	ixl,128		;empty bit buffer: marker bit only
apbranch1:	ldi			;copy one literal byte
aploop0:	ld	ixh,1		;LWM = 0
aploop:		call 	ap_getbit
		jr 	nc,apbranch1	;0: literal
		call 	ap_getbit2
		jr 	nc,apbranch2	;10: match with gamma-coded offset
		ld 	bc,16		;b = 0 for apbranch3, c = 4-bit read with stop marker
		call 	ap_getbit2
		jr 	nc,apbranch3	;110: short match
apget4bits:	call 	ap_getbit2
		rl 	c
		jr	nc,apget4bits	;until the marker bit falls out of c
		ld 	a,b		;a = 0 (ld does not touch the flags from rl c)
		jr 	z,apwritebyte	;offset 0: write a zero byte
		and	a		;clear carry for sbc
		ex 	de,hl 		;write a previous byte (1-15 away from dest)
		sbc 	hl,bc
		ld 	a,(hl)
		add	hl,bc
		ex 	de,hl
apwritebyte:	ld 	(de),a		;write a 0 (or the byte fetched above)
		inc 	de
		jr	aploop0
apbranch3:	ld 	c,(hl)		;use 7 bit offset, length = 2 or 3
		inc 	hl
		rr 	c		;carry was clear (arrived via jr nc); carry out = length selector
		ret 	z		;if a zero is encountered here, it's EOF
		ld	a,2
		adc	a,b		;a = 2 + carry (b = 0)
		push 	hl		;save source pointer
		push	bc
		pop	iy		;remember offset in iy
		ld 	h,d
		ld 	l,e
		sbc 	hl,bc		;hl = dest - offset
		ld 	c,a		;bc = length
		jr	ap_finishup2
apbranch2:	call 	ap_getgamma	;use a gamma code * 256 for offset, another gamma code for length
		dec 	c
		ld	a,c
		sub	ixh		;subtract one more when LWM = 0
		jr 	z,ap_r0_gamma		;if gamma code is 2, use old r0 offset,
		dec 	a
		;do I even need this code?
		;bc=bc*256+(hl), lazy 16bit way
		ld 	b,a
		ld 	c,(hl)
		inc 	hl
		push	bc
		pop	iy		;remember offset in iy

		push 	bc
		
		call 	ap_getgamma

		ex 	(sp),hl		;bc = len, hl=offs
		push 	de
		ex 	de,hl

		ld	a,4
		cp	d
		jr 	nc,apskip2	;high byte of offset <= 4: no extra length
		inc 	bc
		or	a		;clear carry for sbc
apskip2:	ld 	hl,127
		sbc 	hl,de
		jr 	c,apskip3	;offset > 127: no extra length
		inc 	bc
		inc 	bc
apskip3:	pop 	hl		;bc = len, de = offs, hl=junk
		push 	hl
		or 	a
ap_finishup:	sbc 	hl,de
		pop 	de		;hl=dest-offs, bc=len, de = dest
ap_finishup2:	ldir
		pop 	hl		;restore source pointer
		ld	ixh,b		;b = 0 after ldir
		jr 	aploop

ap_r0_gamma:	call 	ap_getgamma		;and a new gamma code for length
		push 	hl
		push 	de
		ex	de,hl

		push	iy
		pop	de		;de = previous match offset
		jr 	ap_finishup	;carry is clear: ap_getgamma returns on nc

; ap_getbit:  load the bit buffer from ixl, then fall into ap_getbit2.
; ap_getbit2: return the next stream bit in carry; expects the bit buffer in a.
; Both store the updated buffer back to ixl before returning.
ap_getbit:	ld	a,ixl
ap_getbit2:	add	a,a		;shift next bit into carry
		jr	nz,ap_endbit	;buffer not empty: just store it back
		ld	a,(hl)		;refill from the stream
		inc	hl
		rla			;old marker re-enters at bit 0, top bit goes to carry
ap_endbit:	ld	ixl,a
		ret

; ap_getgamma: read a gamma-coded number into bc (starts at 1, one data bit then
; one continuation bit per round). Carry is clear on return.
ap_getgamma:	ld 	bc,1
ap_getgammaloop:call 	ap_getbit	;rl c / rl b do not change a, but the loop re-enters here with a reloaded from ixl
		rl 	c
		rl 	b
		call 	ap_getbit2	;a still holds the bit buffer
		jr 	c,ap_getgammaloop
		ret

antoniovillena · 17:10, 24 November 12

To Syx and Metalbrain

I suggest this change to the speed versions of your decrunchers. The idea was originally from Urusergi.

Change every call of ap_getbit for

Code Select



        add     a, a                    ; shift the next bit into carry at the call site
        call    z, ap_getbit            ; only call the refill code when the buffer has run empty

And delete the first 2 lines of ap_getbit (add a,a / ret nz).

SyX · 18:54, 24 November 12

Fantastic work Metalbrain!!!

Yes, antoniovillena, i was waiting at the end of the exomizer optimizations to adapt all the nice ideas in that thread

PD: For the cpcwiki people, the famous exomizer optimization thread is in wos, although the really hard work was started by Urusergi in the spanish forum. And aside from now having a superfast exomizer decruncher, we have another present: antoniovillena's Ticks tool, a Z80 emulator for the command line (it passes the ZEXALL Z80 tests) that has been used for benchmarking the crunchers.

Metalbrain · 19:06, 24 November 12

Quote from: antoniovillena on 17:10, 24 November 12
I suggest this change to the speed versions of your decrunchers. The idea was originally from Urusergi.

Change every call of ap_getbit for

Code Select Expand
add a, a call z, ap_getbit

And delete the first 2 lines of ap_getbit (add a,a/jr nz).

Thanks! It's even faster without call, inlining it, 197 bytes:

Code Select


; aPPack decompressor 
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain

;hl = source
;de = dest

; depack - aPLib stream decompressor, getbit inlined at every use.
; In:   hl = source (packed data), de = destination
; Regs: a   = bit buffer (unread bits followed by a marker bit)
;       ixh = 1 while LWM = 0, 0 once a match has just been copied
;       iy  = offset of the most recent match (reloaded by ap_r0_gamma)
;       af' = scratch accumulator/flags used by the match paths
; Returns through "ret z" in apbranch3 when the end-of-stream byte is read.
;
; apbranch3 (rr c) and apbranch4 (sbc hl,bc) use the carry held in AF'
; immediately after ex af,af'. Each match path leaves that carry clear as
; long as offsets do not exceed the data already written, but on entry it
; is whatever the caller left in AF'. It is therefore cleared explicitly
; here (2 extra bytes, executed once).
depack:		or	a		;carry = 0
		ex	af,af'		;AF' now holds a clear carry
		ld	a,128		;empty bit buffer: marker bit only
apbranch1:	ldi			;copy one literal byte
aploop2:	ld	ixh,1		;LWM = 0
aploop:		add	a,a		;inline getbit: next bit into carry
                jr      nz,apnogetbit1
        	ld	a,(hl)		;buffer empty: refill
		inc	hl
		rla
apnogetbit1:	jr 	nc,apbranch1	;0: literal
		add	a,a
                jr      nz,apnogetbit2
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit2:	jr 	nc,apbranch2	;10: match with gamma-coded offset
		add	a,a
                jr      nz,apnogetbit3
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit3:	jr 	nc,apbranch3	;110: short match
		ld	bc,16		;get an offset
apget4bits:	add	a,a
                jr      nz,apnogetbit4
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit4:	rl 	c
		jr	nc,apget4bits	;until the marker bit falls out of c
		jr 	nz,apbranch4	;non-zero offset: copy an earlier byte
		ex	de,hl
		ld 	(hl),b		;write a 0
		ex	de,hl
		inc 	de
		jp	aploop2
apbranch4:	ex	af,af'		;save bit buffer; carry from AF' is clear
		ex 	de,hl 		;write a previous byte (1-15 away from dest)
		sbc 	hl,bc		;hl = dest - offset
		ld 	a,(hl)
		add	hl,bc		;hl = dest again
		ld 	(hl),a
		ex	af,af'		;bit buffer back in a
		ex 	de,hl
		inc 	de
		jp	aploop2

apbranch3:	ld 	c,(hl)		;use 7 bit offset, length = 2 or 3
		inc 	hl
		ex	af,af'		;save bit buffer; carry from AF' is clear
		rr 	c		;c = offset, carry = length selector
		ret 	z		;if a zero is found here, it's EOF
		ld	a,2
		ld 	b,0
		adc	a,b		;a = 2 + carry
		push 	hl		;save source pointer
		ld	iyh,b		;remember offset in iy
		ld	iyl,c
		ld 	h,d
		ld 	l,e
		sbc 	hl,bc		;hl = dest - offset
		ld 	c,a		;bc = length
		ex	af,af'		;bit buffer back in a
		ldir
		pop 	hl		;restore source pointer
		ld	ixh,b		;b = 0 after ldir
		jp	aploop
apbranch2:	call 	ap_getgamma	;use a gamma code * 256 for offset, another gamma code for length
		dec	c
		ex	af,af'		;save bit buffer
		ld	a,c
		sub	ixh		;subtract one more when LWM = 0
		jr	z,ap_r0_gamma	;reuse the previous offset
		dec	a

		;do I even need this code?
		;bc=bc*256+(hl), lazy 16bit way
		ld 	b,a
		ld 	c,(hl)
		inc 	hl
		ld	iyh,b		;remember offset in iy
		ld	iyl,c

		push 	bc

		call 	ap_getgamma2	;switches back to the bit buffer, then reads the length

		ex 	(sp),hl		;bc = len, hl=offs
		push 	de
		ex 	de,hl

		ex	af,af'		;save bit buffer
		ld	a,4
		cp	d
		jr 	nc,apskip2	;high byte of offset <= 4: no extra length
		inc 	bc
		or	a		;clear carry for sbc
apskip2:	ld 	hl,127
		sbc 	hl,de
		jr 	c,apskip3	;offset > 127: no extra length
		inc 	bc
		inc 	bc
apskip3:	pop 	hl		;bc = len, de = offs, hl=junk
		push 	hl
		or 	a
		sbc 	hl,de
		ex	af,af'		;bit buffer back in a
		pop 	de		;hl=dest-offs, bc=len, de = dest
		ldir
		pop 	hl
		ld	ixh,b		;b = 0 after ldir
		jp 	aploop

ap_r0_gamma:	call 	ap_getgamma2	;and a new gamma code for length
		push 	hl
		push 	de
		ex	de,hl

		ld	d,iyh		;de = previous match offset
		ld	e,iyl
		sbc 	hl,de		;carry is clear: ap_getgamma returns on nc
		pop 	de		;hl=dest-offs, bc=len, de = dest
		ldir
		pop 	hl
		ld	ixh,b
		jp 	aploop

; ap_getgamma2: entry used while the bit buffer is parked in AF'; swaps it back first.
; ap_getgamma:  read a gamma-coded number into bc (starts at 1, one data bit then
;               one continuation bit per round, getbit inlined). Carry clear on return.
ap_getgamma2:	ex	af,af'
ap_getgamma:	ld 	bc,1
ap_getgammaloop:add	a,a		;data bit into carry
                jr      nz,apnogetbit5
        	ld	a,(hl)		;buffer empty: refill
		inc	hl
		rla
apnogetbit5:	rl 	c		;shift data bit into bc
		rl 	b
		add	a,a		;continuation bit into carry
                jr      nz,apnogetbit6
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit6:	jr 	c,ap_getgammaloop
		ret

Metalbrain · 19:34, 24 November 12

Inlining the first steps of ap_getgamma we can get even faster speeds at a price of 15 bytes (17 from the 8th) per bit optimized (up till all 15 max bits), but the speed gains are smaller every time (because higher numbers are less frequent). As an example, here's one with 2 bits inlined, taking 227 bytes in total:

Code Select


; aPPack decompressor 
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain

;hl = source
;de = dest

; depack - aPLib stream decompressor, getbit inlined at every use
; (used together with the partly unrolled ap_getgamma).
; In:   hl = source (packed data), de = destination
; Regs: a   = bit buffer (unread bits followed by a marker bit)
;       ixh = 1 while LWM = 0, 0 once a match has just been copied
;       iy  = offset of the most recent match (reloaded by ap_r0_gamma)
;       af' = scratch accumulator/flags used by the match paths
; Returns through "ret z" in apbranch3 when the end-of-stream byte is read.
;
; apbranch3 (rr c) and apbranch4 (sbc hl,bc) use the carry held in AF'
; immediately after ex af,af'. Each match path leaves that carry clear as
; long as offsets do not exceed the data already written, but on entry it
; is whatever the caller left in AF'. It is therefore cleared explicitly
; here (2 extra bytes, executed once).
depack:		or	a		;carry = 0
		ex	af,af'		;AF' now holds a clear carry
		ld	a,128		;empty bit buffer: marker bit only
apbranch1:	ldi			;copy one literal byte
aploop2:	ld	ixh,1		;LWM = 0
aploop:		add	a,a		;inline getbit: next bit into carry
                jr      nz,apnogetbit1
        	ld	a,(hl)		;buffer empty: refill
		inc	hl
		rla
apnogetbit1:	jr 	nc,apbranch1	;0: literal
		add	a,a
                jr      nz,apnogetbit2
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit2:	jr 	nc,apbranch2	;10: match with gamma-coded offset
		add	a,a
                jr      nz,apnogetbit3
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit3:	jr 	nc,apbranch3	;110: short match
		ld	bc,16		;get an offset
apget4bits:	add	a,a
                jr      nz,apnogetbit4
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit4:	rl 	c
		jr	nc,apget4bits	;until the marker bit falls out of c
		jr 	nz,apbranch4	;non-zero offset: copy an earlier byte
		ex	de,hl
		ld 	(hl),b		;write a 0
		ex	de,hl
		inc 	de
		jp	aploop2
apbranch4:	ex	af,af'		;save bit buffer; carry from AF' is clear
		ex 	de,hl 		;write a previous byte (1-15 away from dest)
		sbc 	hl,bc		;hl = dest - offset
		ld 	a,(hl)
		add	hl,bc		;hl = dest again
		ld 	(hl),a
		ex	af,af'		;bit buffer back in a
		ex 	de,hl
		inc 	de
		jp	aploop2

apbranch3:	ld 	c,(hl)		;use 7 bit offset, length = 2 or 3
		inc 	hl
		ex	af,af'		;save bit buffer; carry from AF' is clear
		rr 	c		;c = offset, carry = length selector
		ret 	z		;if a zero is found here, it's EOF
		ld	a,2
		ld 	b,0
		adc	a,b		;a = 2 + carry
		push 	hl		;save source pointer
		ld	iyh,b		;remember offset in iy
		ld	iyl,c
		ld 	h,d
		ld 	l,e
		sbc 	hl,bc		;hl = dest - offset
		ld 	c,a		;bc = length
		ex	af,af'		;bit buffer back in a
		ldir
		pop 	hl		;restore source pointer
		ld	ixh,b		;b = 0 after ldir
		jp	aploop
apbranch2:	call 	ap_getgamma	;use a gamma code * 256 for offset, another gamma code for length
		dec	c
		ex	af,af'		;save bit buffer
		ld	a,c
		sub	ixh		;subtract one more when LWM = 0
		jr	z,ap_r0_gamma	;reuse the previous offset
		dec	a

		;do I even need this code?
		;bc=bc*256+(hl), lazy 16bit way
		ld 	b,a
		ld 	c,(hl)
		inc 	hl
		ld	iyh,b		;remember offset in iy
		ld	iyl,c

		push 	bc

		call 	ap_getgamma2	;switches back to the bit buffer, then reads the length

		ex 	(sp),hl		;bc = len, hl=offs
		push 	de
		ex 	de,hl

		ex	af,af'		;save bit buffer
		ld	a,4
		cp	d
		jr 	nc,apskip2	;high byte of offset <= 4: no extra length
		inc 	bc
		or	a		;clear carry for sbc
apskip2:	ld 	hl,127
		sbc 	hl,de
		jr 	c,apskip3	;offset > 127: no extra length
		inc 	bc
		inc 	bc
apskip3:	pop 	hl		;bc = len, de = offs, hl=junk
		push 	hl
		or 	a
		sbc 	hl,de
		ex	af,af'		;bit buffer back in a
		pop 	de		;hl=dest-offs, bc=len, de = dest
		ldir
		pop 	hl
		ld	ixh,b		;b = 0 after ldir
		jp 	aploop

ap_r0_gamma:	call 	ap_getgamma2	;and a new gamma code for length
		push 	hl
		push 	de
		ex	de,hl

		ld	d,iyh		;de = previous match offset
		ld	e,iyl
		sbc 	hl,de		;carry is clear: ap_getgamma returns on nc
		pop 	de		;hl=dest-offs, bc=len, de = dest
		ldir
		pop 	hl
		ld	ixh,b
		jp 	aploop

; ap_getgamma2: entry used while the bit buffer is parked in AF'; swaps it back first.
; ap_getgamma:  read a gamma-coded number into bc. The first two data bits are
;               unrolled and only shift c (the value cannot reach b yet); the
;               generic loop that follows shifts the full bc pair.
;               Every exit is on a 0 continuation bit, so carry is clear on return.
ap_getgamma2:	ex	af,af'
ap_getgamma:	ld 	bc,1
                add	a,a		;1st data bit
                jr      nz,apnogetbit5
        	ld	a,(hl)		;buffer empty: refill
		inc	hl
		rla
apnogetbit5:	rl 	c
		add	a,a		;1st continuation bit
                jr      nz,apnogetbit6
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit6:	ret     nc
                add	a,a		;2nd data bit
                jr      nz,apnogetbit7
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit7:	rl 	c
		add	a,a		;2nd continuation bit
                jr      nz,apnogetbit8
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit8:    ret     nc
ap_getgammaloop:add	a,a		;further data bits: generic loop
                jr      nz,apnogetbit9
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit9:	rl 	c
                rl      b
		add	a,a		;continuation bit
                jr      nz,apnogetbit10
        	ld	a,(hl)
		inc	hl
		rla
apnogetbit10:   jr 	c,ap_getgammaloop
		ret

TFM · 19:22, 26 November 12

So... time for an updated release now?

SyX · 19:40, 26 November 12

Metalbrain has pasted his optimized versions of the decruncher here, you can use them without problems

TFM · 20:08, 26 November 12

Sorry, I talk about an update for everybody and not only the few elite people reading this thread. It would be a good thing to provide access to this new nice routine for everybody out there, right?

antoniovillena · 13:42, 16 December 12

I have modified the last Metalbrain version:

Code Select



; aPPack decompressor 
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain


;hl = source
;de = dest


; depack - aPLib stream decompressor; the bit-buffer refill code (ap1..ap4)
; is moved out of line so the common "no refill" case falls through.
; In:   hl = source (packed data), de = destination
; Regs: a   = bit buffer (unread bits followed by a marker bit)
;       ixh = 1 while LWM = 0, 0 once a match has just been copied
;       iy  = offset of the most recent match (reloaded by ap_r0_gamma)
;       af' = scratch accumulator/flags used by the match paths
; Returns through "ret z" in apbranch3 when the end-of-stream byte is read.
;
; apbranch3 (rr c) and apbranch4 (sbc hl,bc) use the carry held in AF'
; immediately after ex af,af'. Each match path leaves that carry clear as
; long as offsets do not exceed the data already written, but on entry it
; is whatever the caller left in AF'. It is therefore cleared explicitly
; here (2 extra bytes, executed once; no relative jump spans this point).
depack:         or      a               ;carry = 0
                ex      af, af'         ;AF' now holds a clear carry
                ld      a, 128          ;empty bit buffer: marker bit only
apbranch1:      ldi                     ;copy one literal byte
aploop2:        ld      ixh, 1          ;LWM = 0
aploop:         add     a, a            ;next bit into carry
                jr      z, ap1          ;buffer empty: refill out of line
                jr      nc, apbranch1   ;0: literal
apnogetbit1:    add     a, a
                jr      z, ap2
                jr      nc, apbranch2   ;10: match with gamma-coded offset
apnogetbit2:    add     a, a
                jr      z, ap3
                jr      nc, apbranch3   ;110: short match
apnogetbit3:    ld      bc, 16           ;get an offset
apget4bits:     add     a, a
                jr      z, ap4
apnogetbit4:    rl      c
                jp      nc, apget4bits  ;until the marker bit falls out of c


                jr      nz, apbranch4   ;non-zero offset: copy an earlier byte
                ex      de, hl
                ld      (hl), b          ;write a 0
                ex      de, hl
                inc     de
                jp      aploop2


ap1:            ld      a, (hl)         ;refill for the first token bit
                inc     hl
                rla
                jr      c, apnogetbit1
                jp      apbranch1


ap4:            ld      a, (hl)         ;refill inside the 4-bit offset read
                inc     hl
                rla
                jp      apnogetbit4


apbranch4:      ex      af, af'         ;save bit buffer; carry from AF' is clear
                ex      de, hl          ;write a previous byte (1-15 away from dest)
                sbc     hl, bc          ;hl = dest - offset
                ld      a, (hl)
                add     hl, bc          ;hl = dest again
                ld      (hl), a
                ex      af, af'         ;bit buffer back in a
                ex      de, hl
                inc     de
                jp      aploop2


ap3:            ld      a, (hl)         ;refill for the third token bit
                inc     hl
                rla
                jr      c, apnogetbit3  ;bit 1: continue; bit 0: fall into apbranch3
apbranch3:      ld      c, (hl)          ;use 7 bit offset, length = 2 or 3
                inc     hl
                ex      af, af'         ;save bit buffer; carry from AF' is clear
                rr      c               ;c = offset, carry = length selector
                ret     z               ;if a zero is found here, it's EOF
                ld      a, 2
                ld      b, 0
                adc     a, b            ;a = 2 + carry
                push    hl              ;save source pointer
                ld      iyh, b          ;remember offset in iy
                ld      iyl, c
                ld      h, d
                ld      l, e
                sbc     hl, bc          ;hl = dest - offset
                ld      c, a            ;bc = length
                ex      af, af'         ;bit buffer back in a
                ldir
                pop     hl              ;restore source pointer
                ld      ixh, b          ;b = 0 after ldir
                jp      aploop
ap2:            ld      a, (hl)         ;refill for the second token bit
                inc     hl
                rla
                jr      c, apnogetbit2  ;bit 1: continue; bit 0: fall into apbranch2
apbranch2:      call    ap_getgamma     ;use a gamma code * 256 for offset, another gamma code for length
                dec     c
                ex      af, af'         ;save bit buffer
                ld      a, c
                sub     ixh             ;subtract one more when LWM = 0
                jr      z, ap_r0_gamma  ;reuse the previous offset
                dec     a


                ;do I even need this code?
                ;bc=bc*256+(hl), lazy 16bit way
                ld      b, a
                ld      c, (hl)
                inc     hl
                ld      iyh, b          ;remember offset in iy
                ld      iyl, c


                push    bc


                call    ap_getgamma2    ;switches back to the bit buffer, then reads the length


                ex      (sp), hl         ;bc = len, hl=offs
                push    de
                ex      de, hl


                ex      af, af'         ;save bit buffer
                ld      a, 4
                cp      d
                jr      nc, apskip2     ;high byte of offset <= 4: no extra length
                inc     bc
                or      a               ;clear carry for sbc
apskip2:        ld      hl, 127
                sbc     hl, de
                jr      c, apskip3      ;offset > 127: no extra length
                inc     bc
                inc     bc
apskip3:        pop     hl              ;bc = len, de = offs, hl=junk
                push    hl
                or      a
                sbc     hl, de
                ex      af, af'         ;bit buffer back in a
                pop     de              ;hl=dest-offs, bc=len, de = dest
                ldir
                pop     hl
                ld      ixh, b          ;b = 0 after ldir
                jp      aploop


ap_r0_gamma:    call    ap_getgamma2    ;and a new gamma code for length
                push    hl
                push    de
                ex      de, hl


                ld      d, iyh          ;de = previous match offset
                ld      e, iyl
                sbc     hl, de          ;carry is clear: ap_getgamma returns on nc
                pop     de              ;hl=dest-offs, bc=len, de = dest
                ldir
                pop     hl
                ld      ixh, b
                jp      aploop


; Out-of-line bit-buffer refills for ap_getgamma (ap5..ap10). Each one loads a
; fresh byte, shifts its top bit into carry and jumps back to the point that
; consumes that bit.
ap5:            ld      a,(hl)
                inc     hl
                rla
                jp      apnogetbit5
ap6:            ld      a, (hl)
                inc     hl
                rla
                jp      apnogetbit6
ap7:            ld      a, (hl)
                inc     hl
                rla
                jp      apnogetbit7
ap8:            ld      a, (hl)
                inc     hl
                rla
                jp      apnogetbit8
ap9:            ld      a, (hl)
                inc     hl
                rla
                jp      apnogetbit9


ap10:           ld      a, (hl)         ;refill for the loop's continuation bit
                inc     hl
                rla
                ret     nc              ;continuation bit 0: number complete
                jp      ap_getgammaloop


; ap_getgamma2: entry used while the bit buffer is parked in AF'; swaps it back first.
; ap_getgamma:  read a gamma-coded number into bc. Two data bits are unrolled
;               (shifting c only), then the generic loop shifts the bc pair.
;               Every exit is on a 0 continuation bit, so carry is clear on return.
ap_getgamma2:   ex      af, af'
ap_getgamma:    ld      bc, 1
                add     a, a            ;1st data bit
                jr      z, ap5
apnogetbit5:    rl      c
                add     a, a            ;1st continuation bit
                jr      z, ap6
apnogetbit6:    ret     nc
                add     a, a            ;2nd data bit
                jr      z, ap7
apnogetbit7:    rl      c
                add     a, a            ;2nd continuation bit
                jr      z, ap8
apnogetbit8:    ret     nc
ap_getgammaloop:add     a, a            ;further data bits
                jr      z, ap9
apnogetbit9:    rl      c
                rl      b
                add     a, a            ;continuation bit
                jr      z, ap10
apnogetbit10:   ret     nc 
                jp      ap_getgammaloop

The only change I have made is to give the shorter path of the conditional jump (7 cycles, jump not taken) to the most common case (no new byte needed, 7/8 of the time), and the longer one (12 cycles, jump taken) to the rarest case (a new byte is needed, 1/8 of the time).

db6128 · 16:34, 16 December 12

Just to check that I understand your wording properly:

Quote from: antoniovillena on 13:42, 16 December 12The only change that I have made is favor the shorter branch of the conditional jump (7 cycles) to the most common case (not need a new byte, 7/8 of the times), and the larger one (12 cycles) to the most rare case (need a new byte, 1/8 of the times).

Do you mean that the branching condition usually is not true and so it falls through to the next instruction (2 NOPs) instead of jumping to the supplied offset (3 NOPs)?

antoniovillena · 21:59, 16 December 12

Quote from: db6128 on 16:34, 16 December 12
Just to check that I understand your wording properly:Do you mean that the branching condition usually is not true and so it falls through to the next instruction (2 NOPs) instead of jumping to the supplied offset (3 NOPs)?

I say that this code:

Code Select


; Original form: the common case (buffer not empty) takes the jr.
aploop:         add     a, a            ; next bit into carry
                jr      nz, apnogetbit1 ; usually taken: skip the refill
                ld      a, (hl)         ; buffer empty: refill
                inc     hl
                rla
apnogetbit1:    jr      nc,apbranch1
                add     a, a

can be replaced for this faster one:

Code Select


; Reworked form: the common case (buffer not empty) falls through the jr z;
; the refill is moved out of line to ap1.
aploop:         add     a, a            ; next bit into carry
                jr      z, ap1          ; rarely taken: buffer empty
                jr      nc, apbranch1
apnogetbit1:    add     a, a
                ...
ap1:            ld      a, (hl)         ; refill, then redo the branch on the new bit
                inc     hl
                rla
                jr      c, apnogetbit1
                jp      apbranch1

db6128 · 14:08, 17 December 12

OK, so I guess that after the ADD A,A the Z flag is usually not set, meaning that the following JR can now take 2 NOPs instead of 3.

Just interested in different optimisations like this after having to use a lot of them myself recently.

antoniovillena · 22:05, 17 December 12

Yes, you are right. In other machines is even better (7 cycles vs 12 cycles)

db6128 · 23:01, 17 December 12

Thanks for the info! I'm trying out APLib for a little program at the moment, and this is a good and simple optimisation that I could replace, even if I end up using one of the more size- rather than speed-optimised ones.

I suppose JR doesn't lose out too much by being rounded up to 4 T-states. DJNZ has it worse, theoretically faster than DEC B:JR NZ, but rounded up to the same speed on the CPC. I guess this is 'good' in a way if we have to use other registers to loop!

News:

APLib decruncher 10% faster ;)