Shaves 3 seconds off initial view runtime on XE :D

Instead of relying solely on the JMP thunks added to imul16_func and sqr16_func, three call sites within the mandelbrot iteration function are patched directly to jsr to the XE versions, saving about 15 cycles per iteration. OK, so it's not a lot, but every second counts. ;) Timings: with XE code disabled: 1539 us/iter, 5m13s; with old XE code: 1417 us/iter, 4m48s; with new XE code: 1406 us/iter, 4m45s.
2025-09-06 19:53:25 -07:00 · 2025-09-06 19:53:25 -07:00 · 29cd3d968f
commit 29cd3d968f
parent b46e6fb343
1 changed files with 82 additions and 65 deletions
--- a/mandel.s
+++ b/mandel.s
@@ -485,6 +485,7 @@ input_max:
 .endmacro

 ; 6 * bytes cycles
+; 4 * bytes bytes
 .macro copy bytes, dest, arg
    .repeat bytes, byte ; 6 * bytes cycles
        lda arg + byte  ; 3 cyc
@@ -493,6 +494,7 @@ input_max:
 .endmacro

 ; 12 cycles
+; 8 bytes
 .macro copy16 dest, arg
    copy 2, dest, arg ; two-byte instance of the generic copy macro; dest/arg are passed through unchanged
 .endmacro
@@ -538,6 +540,8 @@ input_max:

 ; input: arg1, arg2 as fixed4.12
 ; output: dest as fixed8.24
+; patch point jsr at 16 bytes in
+imul16_patch_offset = 16
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
@@ -547,6 +551,8 @@ input_max:

 ; input: arg as fixed4.12
 ; output: dest as fixed8.24
+; patch point jsr at 8 bytes in
+sqr16_patch_offset = 8
 .macro sqr16 dest, arg
    copy16 FR0, arg   ; 12 cyc
    jsr sqr16_func    ; ? cyc
@@ -692,71 +698,6 @@ bank_switch_table:
    .endif
 .endmacro

-.proc imul8xe_init
-
-    bank_switch 0
-    lda #0
-    sta EXTENDED_RAM
-    bank_switch 1
-    lda #1
-    sta EXTENDED_RAM
-    bank_switch 0
-    lda EXTENDED_RAM
-    beq init
-
-    ; no bank switching available, we just overwrite the value in base ram
-    rts
-
-init:
-
-    ; patch imul16_func into a forwarding thunk to imul16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta imul16_func
-    lda #.lobyte(imul16xe_func)
-    sta imul16_func + 1
-    lda #.hibyte(imul16xe_func)
-    sta imul16_func + 2
-
-    ; ditto for sqr16_func -> sqr16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta sqr16_func
-    lda #.lobyte(sqr16xe_func)
-    sta sqr16_func + 1
-    lda #.hibyte(sqr16xe_func)
-    sta sqr16_func + 2
-
-    ; create the lookup table
-    ; go through the input set, in four 16KB chunks
-
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-
-    lda #$00
-    sta arg1
-    sta arg2
-    sta ptr
-    lda #$40
-    sta ptr + 1
-
-    ; $00 * $00 -> $3f * $ff
-    bank_switch 0
-    jsr imul8xe_init_section
-
-    ; $40 * $00 -> $7f * $ff
-    bank_switch 1
-    jsr imul8xe_init_section
-
-    ; $80 * $00 -> $bf * $ff
-    bank_switch 2
-    jsr imul8xe_init_section
-
-    ; $c0 * $00 -> $ff * $ff
-    bank_switch 3
-    jsr imul8xe_init_section
-
-    rts
-.endproc

 ; Initialize a 16 KB chunk of the table
 ; input: multipliers in temp
@@ -1156,12 +1097,15 @@ keep_going:
    shift_round_16 zy, 3

    ; zx_2 = zx * zx
+fixup_sqr16_1:
    sqr16 zx_2, zx + 2

    ; zy_2 = zy * zy
+fixup_sqr16_2:
    sqr16 zy_2, zy + 2

    ; zx_zy = zx * zy
+fixup_imul16_1:
    imul16 zx_zy, zx + 2, zy + 2

    ; dist = zx_2 + zy_2
@@ -2162,3 +2106,76 @@ countdown_done:
    jsr draw_string
    rts
 .endproc
+
+.proc imul8xe_init
+    ; Probe for banked extended RAM; if found, redirect multiply/square to the XE routines and build the lookup table.
+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM        ; probe: store 0 while bank 0 is selected
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM        ; probe: store 1 at the same address while bank 1 is selected
+    bank_switch 0
+    lda EXTENDED_RAM        ; still 0 only if the bank-1 store landed in different memory
+    beq init                ; Z set -> banks are distinct, go install the XE code paths
+
+    ; No bank switching available: the second store simply overwrote the first
+    ; in base RAM, so leave the non-XE routines untouched and return.
+    rts
+
+init:
+
+    ; Patch imul16_func into a forwarding thunk to imul16xe_func, and also
+    ; rewrite the jsr operand at the labelled call site so it calls the XE
+    ; routine directly instead of going through the thunk.
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1     ; jmp target, low byte
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1 ; jsr operand low byte (jsr sits imul16_patch_offset bytes into the macro expansion)
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2     ; jmp target, high byte
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2 ; jsr operand high byte
+
+    ; Ditto for sqr16_func -> sqr16xe_func, with two direct call sites to patch.
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1      ; jmp target, low byte
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1 ; first call site, jsr operand low byte
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1 ; second call site, jsr operand low byte
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2      ; jmp target, high byte
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2 ; first call site, jsr operand high byte
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2 ; second call site, jsr operand high byte
+    ; NOTE(review): the patch offsets depend on the byte size of the imul16/sqr16 macro expansions; re-check them if those macros or their operand addressing change.
+
+    ; create the lookup table
+    ; go through the input set, in four 16KB chunks
+
+    arg1 = FR1              ; local aliases for the working registers
+    arg2 = FR2
+    result = FR0
+
+    lda #$00
+    sta arg1                ; both multipliers start at $00
+    sta arg2
+    sta ptr                 ; ptr low byte = $00
+    lda #$40
+    sta ptr + 1             ; ptr = $4000 (presumably the base of the banked window - confirm)
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc