Runtime detection of XE-style extended memory

Uses the "big multiplication table" in 64KB of extended memory if
bank switching appears to work, otherwise uses the table of squares
lookups.

Initial view clocks in at 13.133 ms/px for the XE version and still
14.211 ms/px for the 400/800/XL version.

Tested in emulator with 130XE and XL+Ultimate 1MB upgrade configs,
and base implementation on the 800XL emulator.
This commit is contained in:
Brooke Vibber 2024-12-27 18:37:03 -08:00
parent ee1c268705
commit 83cba4afa3

View file

@ -347,14 +347,6 @@ fill_masks:
neg 4, arg
.endmacro
; 518 - 828 cyc
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; 470-780 cyc
copy32 dest, FR2 ; 24 cyc
.endmacro
.macro shift_round_16 arg, shift
.repeat shift
shl32 arg
@ -365,7 +357,7 @@ fill_masks:
.macro imul16_round dest, arg1, arg2, shift
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; 470-780 cyc
jsr imul16_func ; ? cyc
shift_round_16 FR2, shift
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro
@ -505,6 +497,30 @@ done:
.proc imul8xe_init
bank_switch 0
lda #0
sta EXTENDED_RAM
bank_switch 1
lda #1
sta EXTENDED_RAM
bank_switch 0
lda EXTENDED_RAM
beq init
; no bank switching available, we just overwrite the value in base ram
rts
init:
; patch imul16_func into a forwarding thunk to imul16xe_func
lda #$4c ; 'jmp' opcode
sta imul16_func
lda #.lobyte(imul16xe_func)
sta imul16_func + 1
lda #.hibyte(imul16xe_func)
sta imul16_func + 2
; create the lookup table
; go through the input set, in four 16KB chunks
arg1 = FR1
@ -615,6 +631,47 @@ inner_loop:
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2
lda #0
sta result + 2
sta result + 3
imul8 inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc
.endproc
.proc imul16xe_func
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
inter = temp2
; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8xe result, arg1, arg2
lda #0
sta result + 2