Runtime detection of XE-style extended memory

Uses the "big multiplication table" in 64 KB of extended memory if bank switching appears to work; otherwise falls back to the table-of-squares lookups. The initial view clocks in at 13.133 ms/px for the XE version and still 14.211 ms/px for the 400/800/XL version. Tested in an emulator with 130XE and XL + Ultimate 1MB upgrade configurations, and the base implementation in an emulated 800XL.
2024-12-27 18:37:03 -08:00 · 2024-12-27 18:37:03 -08:00 · 83cba4afa3
commit 83cba4afa3
parent ee1c268705
1 changed file with 66 additions and 9 deletions
--- a/mandel.s
+++ b/mandel.s
@ -347,14 +347,6 @@ fill_masks:
    neg 4, arg
 .endmacro

-; 518 - 828 cyc
-.macro imul16 dest, arg1, arg2
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
-    copy32 dest, FR2  ; 24 cyc
-.endmacro
-
 .macro shift_round_16 arg, shift
    .repeat shift
        shl32 arg
@ -365,7 +357,7 @@ fill_masks:
 ; imul16_round dest, arg1, arg2, shift
 ;
 ; 16-bit x 16-bit multiply that keeps a single 16-bit word of the product.
 ;   1. copies arg1 into FR0 and arg2 into FR1
 ;   2. calls imul16_func, which leaves the product in FR2
 ;   3. applies shift_round_16 to FR2 with the given shift count
 ;   4. copies the 16-bit word at FR2 + 2 (the upper half of FR2) into dest
 ;
 ; FR0, FR1 and FR2 are all overwritten by the macro itself.
 ; NOTE(review): imul16_func can be patched at init time into a jump to
 ; imul16xe_func, so the cost of the jsr depends on which routine is active
 ; at runtime; the cycle count on that line still needs to be measured.
 .macro imul16_round dest, arg1, arg2, shift
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
+    jsr imul16_func   ; ? cyc
    shift_round_16 FR2, shift
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
@ -505,6 +497,30 @@ done:

 .proc imul8xe_init

+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0
+    lda EXTENDED_RAM
+    beq init
+
+    ; no bank switching available: both stores hit the same base RAM byte, which
+    ; is left overwritten with 1; return without patching imul16_func
+    rts
+
+init:
+
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+
+    ; create the lookup table
    ; go through the input set, in four 16KB chunks

    arg1 = FR1
@ -615,6 +631,47 @@ inner_loop:
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2

+    imul8 result, arg1, arg2
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1
+arg2_pos:
+
+    rts ; 6 cyc
+.endproc
+
+.proc imul16xe_func
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2
+
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+
    imul8xe result, arg1, arg2
    lda #0
    sta result + 2