From 83cba4afa3e28cc8f6b0377c9edc49e60af36187 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Fri, 27 Dec 2024 18:37:03 -0800 Subject: [PATCH] Runtime detection of XE-style extended memory Uses the "big multiplication table" in 64KB of extended memory if bank switching appears to work, otherwise uses the table of squares lookups. Initial view clocks in at 13.133 ms/px for the XE version and still 14.211 ms/px for the 400/800/XL version. Tested in emulator with 130XE and XL+Ultimate 1MB upgrade configs, and base implementation on the 800XL emulator. --- mandel.s | 75 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/mandel.s b/mandel.s index 3ff91d1..d198989 100644 --- a/mandel.s +++ b/mandel.s @@ -347,14 +347,6 @@ fill_masks: neg 4, arg .endmacro -; 518 - 828 cyc -.macro imul16 dest, arg1, arg2 - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc - copy32 dest, FR2 ; 24 cyc -.endmacro - .macro shift_round_16 arg, shift .repeat shift shl32 arg @@ -365,7 +357,7 @@ fill_masks: .macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc + jsr imul16_func ; ? cyc shift_round_16 FR2, shift copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -505,6 +497,30 @@ done: .proc imul8xe_init + bank_switch 0 + lda #0 + sta EXTENDED_RAM + bank_switch 1 + lda #1 + sta EXTENDED_RAM + bank_switch 0 + lda EXTENDED_RAM + beq init + + ; no bank switching available, we just overwrite the value in base ram + rts + +init: + + ; patch imul16_func into a forwarding thunk to imul16xe_func + lda #$4c ; 'jmp' opcode + sta imul16_func + lda #.lobyte(imul16xe_func) + sta imul16_func + 1 + lda #.hibyte(imul16xe_func) + sta imul16_func + 2 + + ; create the lookup table ; go through the input set, in four 16KB chunks arg1 = FR1 @@ -615,6 +631,47 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + imul8 result, arg1, arg2 + lda #0 + sta result + 2 + sta result + 3 + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter + + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg1 + 1 + bpl arg1_pos + sub16 result + 2, result + 2, arg2 +arg1_pos: + lda arg2 + 1 + bpl arg2_pos + sub16 result + 2, result + 2, arg1 +arg2_pos: + + rts ; 6 cyc +.endproc + +.proc imul16xe_func + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + imul8xe result, arg1, arg2 lda #0 sta result + 2