diff --git a/mandel.s b/mandel.s index d8b25e1..4444274 100644 --- a/mandel.s +++ b/mandel.s @@ -30,8 +30,8 @@ FRX = $ec .endmacro ; inner loop for imul16 -; bitnum < 8: 25 or 30 cycles -; bitnum >= 8: 39 or 44 cycles +; bitnum < 8: 25 or 41 cycles +; bitnum >= 8: 30 or 46 cycles .macro bitmul16 arg1, arg2, result, bitnum .local zero .local one @@ -54,7 +54,7 @@ FRX = $ec .endif bne one ; 2 cyc -zero: ; 18 cyc / 23 cyc +zero: ; 18 cyc, 23 cyc lsr result + 3 ; 5 cyc ror result + 2 ; 5 cyc ror result + 1 ; 5 cyc @@ -65,7 +65,7 @@ zero: ; 18 cyc / 23 cyc .endif jmp next ; 3 cyc -one: ; 32 cyc / 37 cyc +one: ; 32 cyc, 37 cyc ; 16-bit add on the top bits clc ; 2 cyc lda result + 2 ; 3 cyc @@ -73,13 +73,16 @@ one: ; 32 cyc / 37 cyc sta result + 2 ; 3 cyc lda result + 3 ; 3 cyc adc arg2 + 1 ; 3 cyc - ror a ; 2 cyc - get a jump on the shift + ror a ; 2 cyc sta result + 3 ; 3 cyc ror result + 2 ; 5 cyc ror result + 1 ; 5 cyc .if bitnum >= 8 + ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte + ; when it's all uninitialized data ror result ; 5 cyc .endif + next: .endmacro @@ -118,8 +121,8 @@ positive: ; of a larger routine ; 424 to 672 cycles .repeat 16, bitnum - ; first half: 24 to 40 cycles - ; second half: 29 to 44 cycles + ; first half: 22 to 40 cycles + ; second half: 29 to 47 cycles bitmul16 arg1, arg2, result, bitnum .endrepeat