Apply jamey's suggestion of skipping the add for high-byte multiplies
Rather than storing 0 into the high bytes and then adding the high-byte multiplication result later, write that result directly in place. This saves a few cycles on every iteration, and it adds up nicely. View 1 overview render times: 130XE: 10.050 ms/px - 4m56s; 800XL: 10.906 ms/px - 5m21s
This commit is contained in:
parent
d157fe1306
commit
582ddf497f
2 changed files with 4 additions and 28 deletions
29
mandel.s
29
mandel.s
|
@ -464,20 +464,6 @@ viewport_oy:
|
||||||
sta dest + 1
|
sta dest + 1
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; input: arg as u8
|
|
||||||
; input/output: dest as u16
|
|
||||||
; clobbers a, x
|
|
||||||
.macro sqr8_add16 dest, arg
|
|
||||||
ldx arg
|
|
||||||
clc
|
|
||||||
lda sqr_lobyte,x
|
|
||||||
adc dest
|
|
||||||
sta dest
|
|
||||||
lda sqr_hibyte,x
|
|
||||||
adc dest + 1
|
|
||||||
sta dest + 1
|
|
||||||
.endmacro
|
|
||||||
|
|
||||||
.segment "TABLES"
|
.segment "TABLES"
|
||||||
; lookup table for top byte -> PORTB value for bank-switch
|
; lookup table for top byte -> PORTB value for bank-switch
|
||||||
.align 256
|
.align 256
|
||||||
|
@ -760,9 +746,8 @@ inner_loop:
|
||||||
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
||||||
|
|
||||||
imul8 result, arg1, arg2, xe
|
imul8 result, arg1, arg2, xe
|
||||||
lda #0
|
|
||||||
sta result + 2
|
imul8 result + 2, arg1 + 1, arg2 + 1, xe
|
||||||
sta result + 3
|
|
||||||
|
|
||||||
imul8 inter, arg1 + 1, arg2, xe
|
imul8 inter, arg1 + 1, arg2, xe
|
||||||
add16 result + 1, result + 1, inter
|
add16 result + 1, result + 1, inter
|
||||||
|
@ -772,9 +757,6 @@ inner_loop:
|
||||||
add16 result + 1, result + 1, inter
|
add16 result + 1, result + 1, inter
|
||||||
add_carry result + 3
|
add_carry result + 3
|
||||||
|
|
||||||
imul8 inter, arg1 + 1, arg2 + 1, xe
|
|
||||||
add16 result + 2, result + 2, inter
|
|
||||||
|
|
||||||
; In case of negative inputs, adjust high word
|
; In case of negative inputs, adjust high word
|
||||||
; https://stackoverflow.com/a/28827013
|
; https://stackoverflow.com/a/28827013
|
||||||
lda arg1 + 1
|
lda arg1 + 1
|
||||||
|
@ -807,9 +789,8 @@ arg2_pos:
|
||||||
; h*h*256*256 + h*l*256 + h*l*256 + l*l
|
; h*h*256*256 + h*l*256 + h*l*256 + l*l
|
||||||
|
|
||||||
sqr8 result, arg
|
sqr8 result, arg
|
||||||
lda #0
|
|
||||||
sta result + 2
|
sqr8 result + 2, arg + 1
|
||||||
sta result + 3
|
|
||||||
|
|
||||||
imul8 inter, arg + 1, arg, xe
|
imul8 inter, arg + 1, arg, xe
|
||||||
add16 result + 1, result + 1, inter
|
add16 result + 1, result + 1, inter
|
||||||
|
@ -817,8 +798,6 @@ arg2_pos:
|
||||||
add16 result + 1, result + 1, inter
|
add16 result + 1, result + 1, inter
|
||||||
add_carry result + 3
|
add_carry result + 3
|
||||||
|
|
||||||
sqr8_add16 result + 2, arg + 1
|
|
||||||
|
|
||||||
rts ; 6 cyc
|
rts ; 6 cyc
|
||||||
.endscope
|
.endscope
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
3
todo.md
3
todo.md
|
@ -1,8 +1,5 @@
|
||||||
things to try:
|
things to try:
|
||||||
|
|
||||||
* skip add on the top-byte multiply in sqr8/mul8
|
|
||||||
* should save a few cycles, suggestion by jamey
|
|
||||||
|
|
||||||
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
|
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
|
||||||
|
|
||||||
* y-axis mirror optimization
|
* y-axis mirror optimization
|
||||||
|
|
Loading…
Reference in a new issue