Shaves 3 seconds off initial view runtime on XE :D

Instead of relying solely on the JMP thunks added to
imul16_func and sqr16_func, three call sites within the
mandelbrot iteration function are now patched directly to
jsr to the XE versions, saving roughly 15 cycles per iteration.

OK, so it's not a lot, but every second counts. ;)

with XE code disabled:
1539 us/iter
5m13s

with old XE code:
1417 us/iter
4m48s

with new XE code:
1406 us/iter
4m45s
This commit is contained in:
Brooke Vibber 2025-09-06 19:53:25 -07:00
commit 29cd3d968f

147
mandel.s
View file

@ -485,6 +485,7 @@ input_max:
.endmacro
; 6 * bytes cycles
; 4 * bytes bytes
.macro copy bytes, dest, arg
.repeat bytes, byte ; 6 * bytes cycles
lda arg + byte ; 3 cyc
@ -493,6 +494,7 @@ input_max:
.endmacro
; copy16 dest, arg
; Two-byte form of the copy macro: expands to `copy 2, dest, arg`.
; 12 cycles
; 8 bytes
; NOTE(review): the cycle and byte counts presumably assume that both
; operands assemble to zero-page addressing -- confirm against callers.
; Code that patches at a fixed offset past a copy16 expansion depends
; on this size.
.macro copy16 dest, arg
copy 2, dest, arg
.endmacro
@ -538,6 +540,8 @@ input_max:
; input: arg1, arg2 as fixed4.12
; output: dest as fixed8.24
; patch point: the jsr opcode sits 16 bytes into the imul16 expansion
; (after the two copy16 expansions); its two-byte target address
; therefore lives at offsets +17 (low byte) and +18 (high byte).
; Keep this value in sync with the macro body if the preamble changes.
imul16_patch_offset = 16
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
@ -547,6 +551,8 @@ input_max:
; input: arg as fixed4.12
; output: dest as fixed8.24
; patch point: the jsr opcode sits 8 bytes into the sqr16 expansion
; (after the single copy16 expansion); its two-byte target address
; therefore lives at offsets +9 (low byte) and +10 (high byte).
; Keep this value in sync with the macro body if the preamble changes.
sqr16_patch_offset = 8
.macro sqr16 dest, arg
copy16 FR0, arg ; 12 cyc
jsr sqr16_func ; ? cyc
@ -692,71 +698,6 @@ bank_switch_table:
.endif
.endmacro
; imul8xe_init
; Probes for bank-switched extended RAM. If the probe fails, returns
; without changing anything else. If it succeeds, overwrites the first
; three bytes of imul16_func and sqr16_func with `jmp` instructions to
; the XE implementations, then fills the lookup table by calling
; imul8xe_init_section once for each of banks 0-3.
; Takes no register inputs. Writes EXTENDED_RAM, FR1, FR2, ptr and
; ptr + 1, and self-modifies imul16_func / sqr16_func.
; NOTE(review): bank_switch is a macro defined elsewhere; which
; registers it clobbers is not visible from this proc.
.proc imul8xe_init
; probe: store 0 with bank 0 selected, store 1 with bank 1 selected,
; then read back with bank 0 selected
bank_switch 0
lda #0
sta EXTENDED_RAM
bank_switch 1
lda #1
sta EXTENDED_RAM
bank_switch 0
lda EXTENDED_RAM
; reading back 0 means the second store did not land on the same byte,
; so the banks are distinct and the XE code path can be enabled
beq init
; no bank switching available, we just overwrite the value in base ram
rts
init:
; patch imul16_func into a forwarding thunk to imul16xe_func
; (3 bytes: opcode, target low byte, target high byte)
lda #$4c ; 'jmp' opcode
sta imul16_func
lda #.lobyte(imul16xe_func)
sta imul16_func + 1
lda #.hibyte(imul16xe_func)
sta imul16_func + 2
; ditto for sqr16_func -> sqr16xe_func
lda #$4c ; 'jmp' opcode
sta sqr16_func
lda #.lobyte(sqr16xe_func)
sta sqr16_func + 1
lda #.hibyte(sqr16xe_func)
sta sqr16_func + 2
; create the lookup table
; go through the input set, in four 16KB chunks
; proc-local aliases; `result` is not referenced again inside this proc
arg1 = FR1
arg2 = FR2
result = FR0
; start with both multipliers at zero and ptr = $4000
; (presumably the start of the banked window -- TODO confirm)
lda #$00
sta arg1
sta arg2
sta ptr
lda #$40
sta ptr + 1
; each call below handles one bank; arg1/arg2/ptr are not reset here
; between calls, so any carry-over of state is up to
; imul8xe_init_section (not visible in this view)
; $00 * $00 -> $3f * $ff
bank_switch 0
jsr imul8xe_init_section
; $40 * $00 -> $7f * $ff
bank_switch 1
jsr imul8xe_init_section
; $80 * $00 -> $bf * $ff
bank_switch 2
jsr imul8xe_init_section
; $c0 * $00 -> $ff * $ff
bank_switch 3
jsr imul8xe_init_section
; NOTE(review): bank 3 is still selected on return
rts
.endproc
; Initialize a 16 KB chunk of the table
; input: multipliers in temp
@ -1156,12 +1097,15 @@ keep_going:
shift_round_16 zy, 3
; zx_2 = zx * zx
fixup_sqr16_1:
sqr16 zx_2, zx + 2
; zy_2 = zy * zy
fixup_sqr16_2:
sqr16 zy_2, zy + 2
; zx_zy = zx * zy
fixup_imul16_1:
imul16 zx_zy, zx + 2, zy + 2
; dist = zx_2 + zy_2
@ -2162,3 +2106,76 @@ countdown_done:
jsr draw_string
rts
.endproc
; imul8xe_init
; Probes for bank-switched extended RAM. If the probe fails, returns
; without changing anything else. If it succeeds:
;   1. overwrites the first three bytes of imul16_func and sqr16_func
;      with `jmp` instructions to the XE implementations,
;   2. rewrites the jsr target at three call sites inside the
;      mandelbrot proc (fixup_imul16_1, fixup_sqr16_1, fixup_sqr16_2)
;      so they call the XE implementations directly,
;   3. fills the lookup table by calling imul8xe_init_section once for
;      each of banks 0-3.
; Takes no register inputs. Writes EXTENDED_RAM, FR1, FR2, ptr and
; ptr + 1, and self-modifies the code locations listed above.
; The call-site patches depend on imul16_patch_offset and
; sqr16_patch_offset matching the position of the jsr opcode inside
; the imul16 / sqr16 macro expansions.
; NOTE(review): bank_switch is a macro defined elsewhere; which
; registers it clobbers is not visible from this proc.
.proc imul8xe_init
; probe: store 0 with bank 0 selected, store 1 with bank 1 selected,
; then read back with bank 0 selected
bank_switch 0
lda #0
sta EXTENDED_RAM
bank_switch 1
lda #1
sta EXTENDED_RAM
bank_switch 0
lda EXTENDED_RAM
; reading back 0 means the second store did not land on the same byte,
; so the banks are distinct and the XE code path can be enabled
beq init
; no bank switching available, we just overwrite the value in base ram
rts
init:
; patch imul16_func into a forwarding thunk to imul16xe_func
; (3 bytes: opcode, target low byte, target high byte)
lda #$4c ; 'jmp' opcode
sta imul16_func
lda #.lobyte(imul16xe_func)
sta imul16_func + 1
; same low byte also goes into the jsr operand at the imul16 call site
; (+1 skips the jsr opcode itself, which is left untouched)
sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1
lda #.hibyte(imul16xe_func)
sta imul16_func + 2
; high byte of the call-site jsr operand
sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2
; ditto for sqr16_func -> sqr16xe_func
lda #$4c ; 'jmp' opcode
sta sqr16_func
lda #.lobyte(sqr16xe_func)
sta sqr16_func + 1
; low byte of the jsr operand at both sqr16 call sites
sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1
sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1
lda #.hibyte(sqr16xe_func)
sta sqr16_func + 2
; high byte of the jsr operand at both sqr16 call sites
sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2
sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2
; create the lookup table
; go through the input set, in four 16KB chunks
; proc-local aliases; `result` is not referenced again inside this proc
arg1 = FR1
arg2 = FR2
result = FR0
; start with both multipliers at zero and ptr = $4000
; (presumably the start of the banked window -- TODO confirm)
lda #$00
sta arg1
sta arg2
sta ptr
lda #$40
sta ptr + 1
; each call below handles one bank; arg1/arg2/ptr are not reset here
; between calls, so any carry-over of state is up to
; imul8xe_init_section (not visible in this view)
; $00 * $00 -> $3f * $ff
bank_switch 0
jsr imul8xe_init_section
; $40 * $00 -> $7f * $ff
bank_switch 1
jsr imul8xe_init_section
; $80 * $00 -> $bf * $ff
bank_switch 2
jsr imul8xe_init_section
; $c0 * $00 -> $ff * $ff
bank_switch 3
jsr imul8xe_init_section
; NOTE(review): bank 3 is still selected on return
rts
.endproc