While working on my next project, I made a few changes to the aPLib decruncher (based on the optimizations Antonio Villena made to the Exomizer decruncher); my tests show that it is now 10-12% faster.
The decruncher code takes 4 extra bytes (from 163 to 167 bytes) and, very important for people using the firmware, the code uses AF'; because of that you will need to disable interrupts while decrunching and preserve AF'.
The code is here:
; ---------------------------------------------------------------------------
; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain
; extra optimizations by SyX
;
; hl = source
; de = dest
;
; Register usage, as set up by the code below:
;   A   = bit buffer (unread bits followed by a marker bit)
;   A'  = scratch accumulator; AF and AF' are exchanged around every use
;   IXH = 1 after a literal or single byte (LWM = 0), 0 right after a block copy
;   IY  = offset of the last match, reused by ap_r0_gamma
; The routine leaves through RET Z in apbranch3 (EOF); AF and AF' are still
; exchanged at that point.
; ---------------------------------------------------------------------------
depack
LD A,128 ; marker bit only, so the first ap_getbit fetches a byte from (HL)
apbranch1
LDI ; literal: copy one byte from source to dest
aploop0
LD IXH,1 ; LWM = 0
aploop
CALL ap_getbit
JR NC,apbranch1 ; tag 0: literal
CALL ap_getbit
JR NC,apbranch2 ; tag 10: match with gamma coded offset and length
LD B,0 ; B = 0 so BC can serve as a 16 bit offset in the short forms
CALL ap_getbit
JR NC,apbranch3 ; tag 110: short match
LD C,16 ; get an offset
apget4bits
CALL ap_getbit
RL C ; shift the bit in; the preset bit 4 drops out as carry after 4 bits
JR NC,apget4bits
JR NZ,apbranch4 ; Z/NZ still comes from RL C: non-zero offset
EX AF,AF' ; park the bit buffer in AF'
LD A,B ; A = 0
apwritebyte
LD (DE),A ; write a 0, or the byte fetched by apbranch4
EX AF,AF' ; bit buffer back in A
INC DE
JR aploop0
apbranch4
AND A ; clear carry for SBC (A itself is unchanged)
EX DE,HL ; write a previous byte (1-15 away from dest)
SBC HL,BC ; HL = dest - offset
EX AF,AF' ; park the bit buffer in AF'
LD A,(HL) ; byte to repeat
ADD HL,BC ; HL = dest again
EX DE,HL
JR apwritebyte
apbranch3
LD C,(HL) ; use 7 bit offset, length = 2 or 3
INC HL
EX AF,AF' ; park the bit buffer in AF'
OR A ; clear carry so RR shifts a 0 into bit 7
RR C ; C = offset, carry = length bit
RET Z ; if a zero is encountered here, it is EOF
LD A,2
ADC A,B ; A = 2 + length bit (B is 0); carry is clear afterwards
PUSH HL ; save the source pointer
LD IYH,B
LD IYL,C ; IY = offset, kept for ap_r0_gamma
LD H,D
LD L,E
SBC HL,BC ; HL = dest - offset
LD C,A ; BC = length
EX AF,AF' ; bit buffer back in A
JR ap_finishup2
apbranch2
CALL ap_getgamma ; use a gamma code * 256 for offset, another gamma code for length
DEC C
EX AF,AF' ; park the bit buffer in AF'
LD A,C
SUB IXH ; A = (gamma - 1) - IXH
JR Z,ap_r0_gamma ; if gamma code is 2, use old r0 offset,
DEC A
; do I even need this code?
; bc=bc*256+(hl), lazy 16bit way
LD B,A ; high byte of the offset
EX AF,AF' ; bit buffer back in A
LD C,(HL) ; low byte of the offset is read straight from the source
INC HL
LD IYH,B
LD IYL,C ; IY = offset, kept for ap_r0_gamma
PUSH BC
CALL ap_getgamma
EX (SP),HL ; bc = len, hl=offs, source pointer now on the stack
PUSH DE
EX DE,HL
EX AF,AF' ; park the bit buffer in AF'
LD A,4
CP D
JR NC,apskip2 ; offset high byte <= 4: no extra length
INC BC
apskip2
EX AF,AF' ; bit buffer back in A
OR A ; clear carry for SBC
LD HL,127
SBC HL,DE
JR C,apskip3 ; offset > 127: no extra length
INC BC
INC BC
apskip3
POP HL ; bc = len, de = offs, hl = dest (pushed above)
PUSH HL
OR A
ap_finishup
SBC HL,DE ; entered from ap_r0_gamma with carry clear (ap_getgamma ends on a 0 bit)
POP DE ; hl=dest-offs, bc=len, de = dest
ap_finishup2
LDIR ; copy the match
POP HL ; restore the source pointer
LD IXH,B ; B is 0 after LDIR
JR aploop
ap_r0_gamma
EX AF,AF' ; bit buffer back in A
CALL ap_getgamma ; and a new gamma code for length
PUSH HL
PUSH DE
EX DE,HL
LD D,IYH
LD E,IYL ; DE = offset of the previous match
JR ap_finishup
; ap_getbit: return the next bit of the stream in the carry flag.
; A holds the unread bits followed by a marker bit. When ADD A,A leaves
; A = 0 only the marker was left (it is now in carry), so a new byte is read
; from (HL) and RLA shifts the marker back in at bit 0 while moving the first
; bit of the new byte into carry.
ap_getbit
ADD A,A
RET NZ ; bits left in the buffer: carry already holds the bit
LD A,(HL)
INC HL
RLA
RET
; ap_getgamma: read a gamma coded number into BC.
; Starts from 1 and alternates data bit / continuation bit; returns with
; carry clear (the last continuation bit read was 0).
ap_getgamma
LD BC,1
ap_getgammaloop
CALL ap_getbit
RL C
RL B ; BC = BC*2 + data bit
CALL ap_getbit
JR C,ap_getgammaloop ; continuation bit set: another data bit follows
RET
Hi SyX!
I didn't see this till today. Last week I optimized both versions (speed & size optimized) of aPLib depackers, and what you did here was a speed optimization of the size optimized one, resulting in a nice speed/size tradeoff.
Here are my last versions:
1 - Speed optimized, 185 bytes:
; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain
;
; In:    hl = source
;        de = dest
; Uses:  A = bit buffer, A' = scratch (AF' is clobbered), BC, DE, HL,
;        IXH (1 = previous item was a literal/single byte, 0 = block copy),
;        IY (offset of the last match)
; Exit:  ret z in apbranch3 (EOF), with AF and AF' still exchanged
;
; apbranch3 and apbranch4 switch to AF' and immediately feed the carry held
; in F' into rr c / sbc hl,bc. Every match path leaves that carry clear (as
; long as dest >= offset), but nothing established it before the first match,
; so the first short match or single-byte copy depended on whatever the caller
; left in F'. The two instructions at depack clear it once; this adds 2 bytes
; to the routine and costs nothing inside the loop.
depack:         or a                    ;clear carry...
                ex af,af'               ;...and park it in F'
                ld a,128                ;marker bit only: first ap_getbit fetches a byte
apbranch1:      ldi                     ;copy a literal byte
aploop2:        ld ixh,1                ;previous item is not a block copy
aploop:         call ap_getbit
                jr nc,apbranch1         ;tag 0: literal
                call ap_getbit
                jr nc,apbranch2         ;tag 10: match, gamma coded
                call ap_getbit
                jr nc,apbranch3         ;tag 110: short match
                ld bc,16                ;get an offset
apget4bits:     call ap_getbit
                rl c                    ;the preset bit 4 drops out as carry after 4 bits
                jr nc,apget4bits
                jr nz,apbranch4         ;non-zero offset: copy an earlier byte
                ex de,hl
                ld (hl),b               ;write a 0
                ex de,hl
                inc de
                jp aploop2
apbranch4:      ex af,af'               ;A' = scratch; carry from F' is clear
                ex de,hl                ;write a previous byte (1-15 away from dest)
                sbc hl,bc               ;hl = dest - offset
                ld a,(hl)
                add hl,bc               ;hl = dest again; leaves carry clear
                ld (hl),a
                ex af,af'
                ex de,hl
                inc de
                jp aploop2
apbranch3:      ld c,(hl)               ;use 7 bit offset, length = 2 or 3
                inc hl
                ex af,af'               ;A' = scratch; carry from F' is clear
                rr c                    ;c = offset, carry = length bit
                ret z                   ;if a zero is found here, it's EOF
                ld a,2
                ld b,0
                adc a,b                 ;a = 2 + length bit; carry clear afterwards
                push hl                 ;save the source pointer
                ld iyh,b
                ld iyl,c                ;iy = offset, kept for ap_r0_gamma
                ld h,d
                ld l,e
                sbc hl,bc               ;hl = dest - offset
                ld c,a                  ;bc = length
                ex af,af'
                ldir
                pop hl                  ;restore the source pointer
                ld ixh,b                ;b = 0 after ldir
                jp aploop
apbranch2:      call ap_getgamma        ;use a gamma code * 256 for offset, another gamma code for length
                dec c
                ex af,af'
                ld a,c
                sub ixh                 ;a = (gamma - 1) - ixh
                jr z,ap_r0_gamma        ;zero: reuse the previous offset
                dec a
                                        ;bc=bc*256+(hl), lazy 16bit way
                ld b,a
                ld c,(hl)
                inc hl
                ld iyh,b
                ld iyl,c                ;iy = offset, kept for ap_r0_gamma
                push bc
                call ap_getgamma2       ;swaps the bit buffer back into A first
                ex (sp),hl              ;bc = len, hl=offs
                push de
                ex de,hl
                ex af,af'
                ld a,4
                cp d
                jr nc,apskip2           ;offset high byte <= 4: no extra length
                inc bc
                or a                    ;clear carry for the sbc below
apskip2:        ld hl,127
                sbc hl,de
                jr c,apskip3            ;offset > 127: no extra length
                inc bc
                inc bc
apskip3:        pop hl                  ;bc = len, de = offs, hl = dest
                push hl
                or a
                sbc hl,de               ;leaves carry clear for F'
                ex af,af'
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh,b
                jp aploop
ap_r0_gamma:    call ap_getgamma2       ;and a new gamma code for length
                push hl
                push de
                ex de,hl
                ld d,iyh
                ld e,iyl
                sbc hl,de               ;carry is clear: ap_getgamma ends on a 0 bit
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh,b
                jp aploop
;ap_getgamma: read a gamma coded number into BC (starts from 1); returns with
;carry clear. ap_getgamma2 is the entry for callers that still have the bit
;buffer parked in AF'.
ap_getgamma2: ex af,af' ;bring the bit buffer back into A
ap_getgamma: ld bc,1
ap_getgammaloop:call ap_getbit
rl c
rl b ;bc = bc*2 + data bit
call ap_getbit
jr c,ap_getgammaloop ;continuation bit set: another data bit follows
ret
;ap_getbit: next stream bit -> carry. A = unread bits followed by a marker
;bit; when the shift leaves A = 0 only the marker was left, so a new byte is
;loaded and rla shifts the marker (now in carry) back in at bit 0.
ap_getbit: add a,a
ret nz ;bits left in the buffer
ld a,(hl)
inc hl
rla
ret
2 - Size optimized: 156 bytes:
; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain
;hl = source
;de = dest
;The bit buffer is kept in IXL, so A is free as scratch between bit reads.
;ap_getbit reloads A from IXL; ap_getbit2 is the entry used when A still
;holds the buffer from the previous bit read.
;IXH = 1 after a literal/single byte, 0 right after a block copy.
;IY  = offset of the last match, reused by ap_r0_gamma.
depack: ld ixl,128 ;marker bit only: the first ap_getbit fetches a byte
apbranch1: ldi ;copy a literal byte
aploop0: ld ixh,1 ;LWM = 0
aploop: call ap_getbit
jr nc,apbranch1 ;tag 0: literal
call ap_getbit2
jr nc,apbranch2 ;tag 10: match, gamma coded
ld bc,16 ;b = 0 for both short forms, c = seed for the 4 bit offset
call ap_getbit2
jr nc,apbranch3 ;tag 110: short match
apget4bits: call ap_getbit2
rl c ;the preset bit 4 drops out as carry after 4 bits
jr nc,apget4bits
ld a,b ;a = 0; ld leaves the flags from rl c intact
jr z,apwritebyte ;offset 0: write a zero byte
and a ;clear carry for sbc
ex de,hl ;write a previous byte (1-15 away from dest)
sbc hl,bc
ld a,(hl)
add hl,bc
ex de,hl
apwritebyte: ld (de),a ;write a 0, or the byte fetched just above
inc de
jr aploop0
apbranch3: ld c,(hl) ;use 7 bit offset, length = 2 or 3
inc hl
rr c ;carry is clear here (the tag bit just read was 0); c = offset, carry = length bit
ret z ;if a zero is encountered here, it's EOF
ld a,2
adc a,b ;a = 2 + length bit (b is 0)
push hl ;save the source pointer
push bc
pop iy ;iy = offset, kept for ap_r0_gamma
ld h,d
ld l,e
sbc hl,bc ;hl = dest - offset (carry is clear after the adc)
ld c,a ;bc = length
jr ap_finishup2
apbranch2: call ap_getgamma ;use a gamma code * 256 for offset, another gamma code for length
dec c
ld a,c
sub ixh ;a = (gamma - 1) - ixh
jr z,ap_r0_gamma ;if gamma code is 2, use old r0 offset,
dec a
;do I even need this code?
;bc=bc*256+(hl), lazy 16bit way
ld b,a
ld c,(hl)
inc hl
push bc
pop iy ;iy = offset, kept for ap_r0_gamma
push bc
call ap_getgamma
ex (sp),hl ;bc = len, hl=offs
push de
ex de,hl
ld a,4
cp d
jr nc,apskip2 ;offset high byte <= 4: no extra length
inc bc
or a ;clear carry for the sbc below
apskip2: ld hl,127
sbc hl,de
jr c,apskip3 ;offset > 127: no extra length
inc bc
inc bc
apskip3: pop hl ;bc = len, de = offs, hl = dest
push hl
or a
ap_finishup: sbc hl,de ;entered from ap_r0_gamma with carry clear
pop de ;hl=dest-offs, bc=len, de = dest
ap_finishup2: ldir
pop hl ;restore the source pointer
ld ixh,b ;b = 0 after ldir
jr aploop
ap_r0_gamma: call ap_getgamma ;and a new gamma code for length
push hl
push de
ex de,hl
push iy
pop de ;de = offset of the previous match
jr ap_finishup
;ap_getbit: next stream bit -> carry, bit buffer kept in IXL.
;ap_getbit2 skips the reload and is only valid while A still holds the buffer.
ap_getbit: ld a,ixl
ap_getbit2: add a,a
jr nz,ap_endbit ;bits left in the buffer
ld a,(hl) ;only the marker was left: fetch a new byte
inc hl
rla ;marker back in at bit 0, first new bit into carry
ap_endbit: ld ixl,a ;store the buffer back (ld does not touch the carry)
ret
;ap_getgamma: read a gamma coded number into BC (starts from 1); returns with
;carry clear.
ap_getgamma: ld bc,1
ap_getgammaloop:call ap_getbit
rl c
rl b ;bc = bc*2 + data bit (a is untouched, so ap_getbit2 can follow)
call ap_getbit2
jr c,ap_getgammaloop ;continuation bit set: another data bit follows
ret
To Syx and Metalbrain
I suggest this change to the speed versions of your decrunchers. The idea was originally from Urusergi.
Replace every call to ap_getbit with
add a, a
call z, ap_getbit
and delete the first 2 lines of ap_getbit (add a,a / ret nz in the speed version).
Fantastic work Metalbrain!!! :)
Yes, antoniovillena, I was waiting for the end of the Exomizer optimizations to adapt all the nice ideas from that thread :)
PS: For the cpcwiki people, the famous Exomizer optimization thread is on WoS (http://www.worldofspectrum.org/forums/showthread.php?t=41523), although the really hard work was started by Urusergi in the Spanish forum (http://www.amstrad.es/forum/viewtopic.php?f=6&t=2608). And aside from now having a superfast Exomizer decruncher, we have another present: antoniovillena's Ticks tool (http://retrolandia.net/foro/showthread.php?tid=43), a command-line Z80 emulator (it passes the ZEXALL Z80 tests ;) ) that has been used to benchmark the decrunchers.
Quote from: antoniovillena on 17:10, 24 November 12
I suggest this change to the speed versions of your decrunchers. The idea was originally from Urusergi.
Change every call of ap_getbit for
add a, a
call z, ap_getbit
And delete the first 2 lines of ap_getbit (add a,a/jr nz).
Thanks! It's even faster without call, inlining it, 197 bytes:
; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain
;
; In:    hl = source
;        de = dest
; Uses:  A = bit buffer, A' = scratch (AF' is clobbered), BC, DE, HL,
;        IXH (1 = previous item was a literal/single byte, 0 = block copy),
;        IY (offset of the last match)
; Exit:  ret z in apbranch3 (EOF), with AF and AF' still exchanged
;
; Bit reads are inlined: add a,a shifts the next bit into carry; a zero
; result means only the marker bit was left, so a new byte is fetched and
; rla shifts the marker back in.
;
; apbranch3 and apbranch4 switch to AF' and immediately feed the carry held
; in F' into rr c / sbc hl,bc. Every match path leaves that carry clear (as
; long as dest >= offset), but nothing established it before the first match,
; so the first short match or single-byte copy depended on whatever the caller
; left in F'. The two instructions at depack clear it once; this adds 2 bytes
; to the routine and costs nothing inside the loop.
depack:         or a                    ;clear carry...
                ex af,af'               ;...and park it in F'
                ld a,128                ;marker bit only: first bit read fetches a byte
apbranch1:      ldi                     ;copy a literal byte
aploop2:        ld ixh,1                ;previous item is not a block copy
aploop:         add a,a
                jr nz,apnogetbit1
                ld a,(hl)
                inc hl
                rla
apnogetbit1:    jr nc,apbranch1         ;tag 0: literal
                add a,a
                jr nz,apnogetbit2
                ld a,(hl)
                inc hl
                rla
apnogetbit2:    jr nc,apbranch2         ;tag 10: match, gamma coded
                add a,a
                jr nz,apnogetbit3
                ld a,(hl)
                inc hl
                rla
apnogetbit3:    jr nc,apbranch3         ;tag 110: short match
                ld bc,16                ;get an offset
apget4bits:     add a,a
                jr nz,apnogetbit4
                ld a,(hl)
                inc hl
                rla
apnogetbit4:    rl c                    ;the preset bit 4 drops out as carry after 4 bits
                jr nc,apget4bits
                jr nz,apbranch4         ;non-zero offset: copy an earlier byte
                ex de,hl
                ld (hl),b               ;write a 0
                ex de,hl
                inc de
                jp aploop2
apbranch4:      ex af,af'               ;A' = scratch; carry from F' is clear
                ex de,hl                ;write a previous byte (1-15 away from dest)
                sbc hl,bc               ;hl = dest - offset
                ld a,(hl)
                add hl,bc               ;hl = dest again; leaves carry clear
                ld (hl),a
                ex af,af'
                ex de,hl
                inc de
                jp aploop2
apbranch3:      ld c,(hl)               ;use 7 bit offset, length = 2 or 3
                inc hl
                ex af,af'               ;A' = scratch; carry from F' is clear
                rr c                    ;c = offset, carry = length bit
                ret z                   ;if a zero is found here, it's EOF
                ld a,2
                ld b,0
                adc a,b                 ;a = 2 + length bit; carry clear afterwards
                push hl                 ;save the source pointer
                ld iyh,b
                ld iyl,c                ;iy = offset, kept for ap_r0_gamma
                ld h,d
                ld l,e
                sbc hl,bc               ;hl = dest - offset
                ld c,a                  ;bc = length
                ex af,af'
                ldir
                pop hl                  ;restore the source pointer
                ld ixh,b                ;b = 0 after ldir
                jp aploop
apbranch2:      call ap_getgamma        ;use a gamma code * 256 for offset, another gamma code for length
                dec c
                ex af,af'
                ld a,c
                sub ixh                 ;a = (gamma - 1) - ixh
                jr z,ap_r0_gamma        ;zero: reuse the previous offset
                dec a
                                        ;do I even need this code?
                                        ;bc=bc*256+(hl), lazy 16bit way
                ld b,a
                ld c,(hl)
                inc hl
                ld iyh,b
                ld iyl,c                ;iy = offset, kept for ap_r0_gamma
                push bc
                call ap_getgamma2       ;swaps the bit buffer back into A first
                ex (sp),hl              ;bc = len, hl=offs
                push de
                ex de,hl
                ex af,af'
                ld a,4
                cp d
                jr nc,apskip2           ;offset high byte <= 4: no extra length
                inc bc
                or a                    ;clear carry for the sbc below
apskip2:        ld hl,127
                sbc hl,de
                jr c,apskip3            ;offset > 127: no extra length
                inc bc
                inc bc
apskip3:        pop hl                  ;bc = len, de = offs, hl = dest
                push hl
                or a
                sbc hl,de               ;leaves carry clear for F'
                ex af,af'
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh,b
                jp aploop
ap_r0_gamma:    call ap_getgamma2       ;and a new gamma code for length
                push hl
                push de
                ex de,hl
                ld d,iyh
                ld e,iyl
                sbc hl,de               ;carry is clear: ap_getgamma ends on a 0 bit
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh,b
                jp aploop
;ap_getgamma: read a gamma coded number into BC (starts from 1), with both
;bit reads inlined; returns with carry clear. ap_getgamma2 is the entry for
;callers that still have the bit buffer parked in AF'.
ap_getgamma2: ex af,af' ;bring the bit buffer back into A
ap_getgamma: ld bc,1
ap_getgammaloop:add a,a ;data bit -> carry
jr nz,apnogetbit5
ld a,(hl) ;buffer exhausted: fetch a new byte
inc hl
rla
apnogetbit5: rl c
rl b ;bc = bc*2 + data bit
add a,a ;continuation bit -> carry
jr nz,apnogetbit6
ld a,(hl)
inc hl
rla
apnogetbit6: jr c,ap_getgammaloop ;set: another data bit follows
ret
Inlining the first steps of ap_getgamma we can get even faster speeds at a price of 15 bytes (17 from the 8th) per bit optimized (up till all 15 max bits), but the speed gains are smaller every time (because higher numbers are less frequent). As an example, here's one with 2 bits inlined, taking 227 bytes in total:
; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain
;
; In:    hl = source
;        de = dest
; Uses:  A = bit buffer, A' = scratch (AF' is clobbered), BC, DE, HL,
;        IXH (1 = previous item was a literal/single byte, 0 = block copy),
;        IY (offset of the last match)
; Exit:  ret z in apbranch3 (EOF), with AF and AF' still exchanged
;
; Bit reads are inlined: add a,a shifts the next bit into carry; a zero
; result means only the marker bit was left, so a new byte is fetched and
; rla shifts the marker back in.
;
; apbranch3 and apbranch4 switch to AF' and immediately feed the carry held
; in F' into rr c / sbc hl,bc. Every match path leaves that carry clear (as
; long as dest >= offset), but nothing established it before the first match,
; so the first short match or single-byte copy depended on whatever the caller
; left in F'. The two instructions at depack clear it once; this adds 2 bytes
; to the routine and costs nothing inside the loop.
depack:         or a                    ;clear carry...
                ex af,af'               ;...and park it in F'
                ld a,128                ;marker bit only: first bit read fetches a byte
apbranch1:      ldi                     ;copy a literal byte
aploop2:        ld ixh,1                ;previous item is not a block copy
aploop:         add a,a
                jr nz,apnogetbit1
                ld a,(hl)
                inc hl
                rla
apnogetbit1:    jr nc,apbranch1         ;tag 0: literal
                add a,a
                jr nz,apnogetbit2
                ld a,(hl)
                inc hl
                rla
apnogetbit2:    jr nc,apbranch2         ;tag 10: match, gamma coded
                add a,a
                jr nz,apnogetbit3
                ld a,(hl)
                inc hl
                rla
apnogetbit3:    jr nc,apbranch3         ;tag 110: short match
                ld bc,16                ;get an offset
apget4bits:     add a,a
                jr nz,apnogetbit4
                ld a,(hl)
                inc hl
                rla
apnogetbit4:    rl c                    ;the preset bit 4 drops out as carry after 4 bits
                jr nc,apget4bits
                jr nz,apbranch4         ;non-zero offset: copy an earlier byte
                ex de,hl
                ld (hl),b               ;write a 0
                ex de,hl
                inc de
                jp aploop2
apbranch4:      ex af,af'               ;A' = scratch; carry from F' is clear
                ex de,hl                ;write a previous byte (1-15 away from dest)
                sbc hl,bc               ;hl = dest - offset
                ld a,(hl)
                add hl,bc               ;hl = dest again; leaves carry clear
                ld (hl),a
                ex af,af'
                ex de,hl
                inc de
                jp aploop2
apbranch3:      ld c,(hl)               ;use 7 bit offset, length = 2 or 3
                inc hl
                ex af,af'               ;A' = scratch; carry from F' is clear
                rr c                    ;c = offset, carry = length bit
                ret z                   ;if a zero is found here, it's EOF
                ld a,2
                ld b,0
                adc a,b                 ;a = 2 + length bit; carry clear afterwards
                push hl                 ;save the source pointer
                ld iyh,b
                ld iyl,c                ;iy = offset, kept for ap_r0_gamma
                ld h,d
                ld l,e
                sbc hl,bc               ;hl = dest - offset
                ld c,a                  ;bc = length
                ex af,af'
                ldir
                pop hl                  ;restore the source pointer
                ld ixh,b                ;b = 0 after ldir
                jp aploop
apbranch2:      call ap_getgamma        ;use a gamma code * 256 for offset, another gamma code for length
                dec c
                ex af,af'
                ld a,c
                sub ixh                 ;a = (gamma - 1) - ixh
                jr z,ap_r0_gamma        ;zero: reuse the previous offset
                dec a
                                        ;do I even need this code?
                                        ;bc=bc*256+(hl), lazy 16bit way
                ld b,a
                ld c,(hl)
                inc hl
                ld iyh,b
                ld iyl,c                ;iy = offset, kept for ap_r0_gamma
                push bc
                call ap_getgamma2       ;swaps the bit buffer back into A first
                ex (sp),hl              ;bc = len, hl=offs
                push de
                ex de,hl
                ex af,af'
                ld a,4
                cp d
                jr nc,apskip2           ;offset high byte <= 4: no extra length
                inc bc
                or a                    ;clear carry for the sbc below
apskip2:        ld hl,127
                sbc hl,de
                jr c,apskip3            ;offset > 127: no extra length
                inc bc
                inc bc
apskip3:        pop hl                  ;bc = len, de = offs, hl = dest
                push hl
                or a
                sbc hl,de               ;leaves carry clear for F'
                ex af,af'
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh,b
                jp aploop
ap_r0_gamma:    call ap_getgamma2       ;and a new gamma code for length
                push hl
                push de
                ex de,hl
                ld d,iyh
                ld e,iyl
                sbc hl,de               ;carry is clear: ap_getgamma ends on a 0 bit
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh,b
                jp aploop
;ap_getgamma: read a gamma coded number into BC (starts from 1); returns with
;carry clear. The first two data/continuation bit pairs are unrolled and skip
;rl b: starting from bc = 1, two shifts cannot carry out of c.
;ap_getgamma2 is the entry for callers that still have the bit buffer parked
;in AF'.
ap_getgamma2: ex af,af' ;bring the bit buffer back into A
ap_getgamma: ld bc,1
add a,a ;1st data bit -> carry
jr nz,apnogetbit5
ld a,(hl)
inc hl
rla
apnogetbit5: rl c
add a,a ;1st continuation bit -> carry
jr nz,apnogetbit6
ld a,(hl)
inc hl
rla
apnogetbit6: ret nc ;0: value complete
add a,a ;2nd data bit -> carry
jr nz,apnogetbit7
ld a,(hl)
inc hl
rla
apnogetbit7: rl c
add a,a ;2nd continuation bit -> carry
jr nz,apnogetbit8
ld a,(hl)
inc hl
rla
apnogetbit8: ret nc ;0: value complete
ap_getgammaloop:add a,a ;generic loop for the remaining bits
jr nz,apnogetbit9
ld a,(hl)
inc hl
rla
apnogetbit9: rl c
rl b ;bc = bc*2 + data bit
add a,a
jr nz,apnogetbit10
ld a,(hl)
inc hl
rla
apnogetbit10: jr c,ap_getgammaloop ;continuation bit set: another data bit follows
ret
So... time for an updated release now?
Metalbrain has pasted his optimized versions of the decruncher here, you can use them without problems :)
Sorry, I meant an update for everybody, not only for the few elite people reading this thread. It would be a good thing to give everybody out there access to this nice new routine, right?
I have modified the last Metalbrain version:
; aPPack decompressor
; original source by dwedit
; very slightly adapted by utopian
; optimized by Metalbrain
;
; In:    hl = source
;        de = dest
; Uses:  A = bit buffer, A' = scratch (AF' is clobbered), BC, DE, HL,
;        IXH (1 = previous item was a literal/single byte, 0 = block copy),
;        IY (offset of the last match)
; Exit:  ret z in apbranch3 (EOF), with AF and AF' still exchanged
;
; Bit reads are inlined as add a,a / jr z,apN: the jump is only taken when
; the bit buffer is exhausted, and the refill code (ap1..ap4) lives out of
; line so the common case falls through.
;
; apbranch3 and apbranch4 switch to AF' and immediately feed the carry held
; in F' into rr c / sbc hl,bc. Every match path leaves that carry clear (as
; long as dest >= offset), but nothing established it before the first match,
; so the first short match or single-byte copy depended on whatever the caller
; left in F'. The two instructions at depack clear it once; this adds 2 bytes
; to the routine and costs nothing inside the loop.
depack:         or a                    ;clear carry...
                ex af, af'              ;...and park it in F'
                ld a, 128               ;marker bit only: first bit read fetches a byte
apbranch1:      ldi                     ;copy a literal byte
aploop2:        ld ixh, 1               ;previous item is not a block copy
aploop:         add a, a
                jr z, ap1               ;buffer exhausted: refill
                jr nc, apbranch1        ;tag 0: literal
apnogetbit1:    add a, a
                jr z, ap2
                jr nc, apbranch2        ;tag 10: match, gamma coded
apnogetbit2:    add a, a
                jr z, ap3
                jr nc, apbranch3        ;tag 110: short match
apnogetbit3:    ld bc, 16               ;get an offset
apget4bits:     add a, a
                jr z, ap4
apnogetbit4:    rl c                    ;the preset bit 4 drops out as carry after 4 bits
                jp nc, apget4bits
                jr nz, apbranch4        ;non-zero offset: copy an earlier byte
                ex de, hl
                ld (hl), b              ;write a 0
                ex de, hl
                inc de
                jp aploop2
ap1:            ld a, (hl)              ;refill for the 1st tag bit
                inc hl
                rla
                jr c, apnogetbit1
                jp apbranch1
ap4:            ld a, (hl)              ;refill inside the 4 bit offset loop
                inc hl
                rla
                jp apnogetbit4
apbranch4:      ex af, af'              ;A' = scratch; carry from F' is clear
                ex de, hl               ;write a previous byte (1-15 away from dest)
                sbc hl, bc              ;hl = dest - offset
                ld a, (hl)
                add hl, bc              ;hl = dest again; leaves carry clear
                ld (hl), a
                ex af, af'
                ex de, hl
                inc de
                jp aploop2
ap3:            ld a, (hl)              ;refill for the 3rd tag bit
                inc hl
                rla
                jr c, apnogetbit3       ;otherwise fall into apbranch3
apbranch3:      ld c, (hl)              ;use 7 bit offset, length = 2 or 3
                inc hl
                ex af, af'              ;A' = scratch; carry from F' is clear
                rr c                    ;c = offset, carry = length bit
                ret z                   ;if a zero is found here, it's EOF
                ld a, 2
                ld b, 0
                adc a, b                ;a = 2 + length bit; carry clear afterwards
                push hl                 ;save the source pointer
                ld iyh, b
                ld iyl, c               ;iy = offset, kept for ap_r0_gamma
                ld h, d
                ld l, e
                sbc hl, bc              ;hl = dest - offset
                ld c, a                 ;bc = length
                ex af, af'
                ldir
                pop hl                  ;restore the source pointer
                ld ixh, b               ;b = 0 after ldir
                jp aploop
ap2:            ld a, (hl)              ;refill for the 2nd tag bit
                inc hl
                rla
                jr c, apnogetbit2       ;otherwise fall into apbranch2
apbranch2:      call ap_getgamma        ;use a gamma code * 256 for offset, another gamma code for length
                dec c
                ex af, af'
                ld a, c
                sub ixh                 ;a = (gamma - 1) - ixh
                jr z, ap_r0_gamma       ;zero: reuse the previous offset
                dec a
                                        ;do I even need this code?
                                        ;bc=bc*256+(hl), lazy 16bit way
                ld b, a
                ld c, (hl)
                inc hl
                ld iyh, b
                ld iyl, c               ;iy = offset, kept for ap_r0_gamma
                push bc
                call ap_getgamma2       ;swaps the bit buffer back into A first
                ex (sp), hl             ;bc = len, hl=offs
                push de
                ex de, hl
                ex af, af'
                ld a, 4
                cp d
                jr nc, apskip2          ;offset high byte <= 4: no extra length
                inc bc
                or a                    ;clear carry for the sbc below
apskip2:        ld hl, 127
                sbc hl, de
                jr c, apskip3           ;offset > 127: no extra length
                inc bc
                inc bc
apskip3:        pop hl                  ;bc = len, de = offs, hl = dest
                push hl
                or a
                sbc hl, de              ;leaves carry clear for F'
                ex af, af'
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh, b
                jp aploop
ap_r0_gamma:    call ap_getgamma2       ;and a new gamma code for length
                push hl
                push de
                ex de, hl
                ld d, iyh
                ld e, iyl
                sbc hl, de              ;carry is clear: ap_getgamma ends on a 0 bit
                pop de                  ;hl=dest-offs, bc=len, de = dest
                ldir
                pop hl
                ld ixh, b
                jp aploop
;ap5..ap10: out-of-line refills for the bit reads in ap_getgamma below. Each
;one fetches a new byte, shifts the marker back in with rla and resumes right
;after the jr z that jumped here.
ap5: ld a,(hl)
inc hl
rla
jp apnogetbit5
ap6: ld a, (hl)
inc hl
rla
jp apnogetbit6
ap7: ld a, (hl)
inc hl
rla
jp apnogetbit7
ap8: ld a, (hl)
inc hl
rla
jp apnogetbit8
ap9: ld a, (hl)
inc hl
rla
jp apnogetbit9
ap10: ld a, (hl)
inc hl
rla
ret nc ;continuation bit 0: value complete
jp ap_getgammaloop
;ap_getgamma: read a gamma coded number into BC (starts from 1); returns with
;carry clear. The first two data/continuation bit pairs are unrolled and skip
;rl b: starting from bc = 1, two shifts cannot carry out of c.
;ap_getgamma2 is the entry for callers that still have the bit buffer parked
;in AF'.
ap_getgamma2: ex af, af' ;bring the bit buffer back into A
ap_getgamma: ld bc, 1
add a, a ;1st data bit -> carry
jr z, ap5 ;taken only when the buffer is exhausted
apnogetbit5: rl c
add a, a ;1st continuation bit -> carry
jr z, ap6
apnogetbit6: ret nc ;0: value complete
add a, a ;2nd data bit -> carry
jr z, ap7
apnogetbit7: rl c
add a, a ;2nd continuation bit -> carry
jr z, ap8
apnogetbit8: ret nc ;0: value complete
ap_getgammaloop:add a, a ;generic loop for the remaining bits
jr z, ap9
apnogetbit9: rl c
rl b ;bc = bc*2 + data bit
add a, a
jr z, ap10
apnogetbit10: ret nc ;continuation bit 0: value complete
jp ap_getgammaloop
The only change I have made is to give the shorter path of the conditional jump (not taken, 7 cycles) to the most common case (no new byte needed, 7/8 of the time) and the longer one (taken, 12 cycles) to the rarer case (a new byte is needed, 1/8 of the time).
Just to check that I understand your wording properly:
Quote from: antoniovillena on 13:42, 16 December 12The only change that I have made is favor the shorter branch of the conditional jump (7 cycles) to the most common case (not need a new byte, 7/8 of the times), and the larger one (12 cycles) to the most rare case (need a new byte, 1/8 of the times).
Do you mean that the branching condition usually is not true and so it falls through to the next instruction (2 NOPs) instead of jumping to the supplied offset (3 NOPs)?
Quote from: db6128 on 16:34, 16 December 12
Just to check that I understand your wording properly:Do you mean that the branching condition usually is not true and so it falls through to the next instruction (2 NOPs) instead of jumping to the supplied offset (3 NOPs)?
I say that this code:
aploop: add a, a ;next bit -> carry, Z only when the buffer is exhausted
jr nz, apnogetbit1 ;taken in the common case (the slower, 12 cycle path)
ld a, (hl)
inc hl
rla
apnogetbit1: jr nc,apbranch1
add a, a
can be replaced for this faster one:
aploop: add a, a ;next bit -> carry, Z only when the buffer is exhausted
jr z, ap1 ;not taken in the common case (the faster, 7 cycle path)
jr nc, apbranch1
apnogetbit1: add a, a
...
ap1: ld a, (hl) ;out-of-line refill, reached only when the buffer is exhausted
inc hl
rla
jr c, apnogetbit1
jp apbranch1
OK, so I guess that after the ADD A,A the Z flag is usually not set, meaning that the following JR can now take 2 NOPs instead of 3.
Just interested in different optimisations like this after having to use a lot of them myself recently (http://www.cpcwiki.eu/forum/programming/amstrifejohn-conways-game-of-life-for-the-cpcfast-many-features-on-the-way!/msg54525/#msg54525). :D
Yes, you are right. On other machines it is even better (7 cycles vs 12 cycles).
Thanks for the info! I'm trying out APLib for a little program at the moment, and this is a good and simple optimisation that I could replace, even if I end up using one of the more size- rather than speed-optimised ones.
I suppose JR doesn't lose out too much by being rounded up to 4 T-states. DJNZ has it worse, theoretically faster than DEC B:JR NZ, but rounded up to the same speed on the CPC. I guess this is 'good' in a way if we have to use other registers to loop!