Runtime detection of XE-style extended memory
Uses the "big multiplication table" in 64KB of extended memory if bank switching appears to work; otherwise falls back to the table-of-squares lookups. The initial view clocks in at 13.133 ms/px for the XE version and still 14.211 ms/px for the 400/800/XL version. Tested in an emulator with 130XE and XL + Ultimate 1MB upgrade configurations, and the base implementation on the 800XL emulator.
This commit is contained in:
parent
ee1c268705
commit
83cba4afa3
1 changed files with 66 additions and 9 deletions
75
mandel.s
75
mandel.s
|
@ -347,14 +347,6 @@ fill_masks:
|
|||
neg 4, arg
|
||||
.endmacro
|
||||
|
||||
; imul16 dest, arg1, arg2
; Multiply two 16-bit values by calling imul16_func and store the 32-bit
; result.
;   dest - destination of the 32-bit result (copied from FR2)
;   arg1 - 16-bit operand, copied into FR0 before the call
;   arg2 - 16-bit operand, copied into FR1 before the call
; Overwrites FR0 and FR1 (the argument slots) and FR2 (the result slot).
; NOTE(review): the hunk header (-347,14 +347,6) suggests this macro is the
; text removed by this commit -- confirm against the post-commit file.
; Total cost per the per-line counts below: 12 + 12 + (470..780) + 24.
; 518 - 828 cyc
|
||||
.macro imul16 dest, arg1, arg2
|
||||
copy16 FR0, arg1 ; 12 cyc
|
||||
copy16 FR1, arg2 ; 12 cyc
|
||||
jsr imul16_func ; 470-780 cyc
|
||||
copy32 dest, FR2 ; 24 cyc
|
||||
.endmacro
|
||||
|
||||
.macro shift_round_16 arg, shift
|
||||
.repeat shift
|
||||
shl32 arg
|
||||
|
@ -365,7 +357,7 @@ fill_masks:
|
|||
; imul16_round dest, arg1, arg2, shift
; Multiply two 16-bit values by calling imul16_func, apply shift_round_16 to
; the 32-bit value in FR2, and store 16 bits taken from FR2 + 2 into dest.
;   dest  - destination of the 16-bit result
;   arg1  - 16-bit operand, copied into FR0 before the call
;   arg2  - 16-bit operand, copied into FR1 before the call
;   shift - passed through to shift_round_16 (used there as a .repeat count
;           of shl32; any rounding step is outside this view -- TODO confirm)
; Overwrites FR0, FR1 and FR2.
; NOTE(review): the two `jsr imul16_func` lines below differ only in their
; cycle comment and look like the removed/added pair of this diff hunk; the
; real source presumably contains a single call -- confirm before assembling.
.macro imul16_round dest, arg1, arg2, shift
|
||||
copy16 FR0, arg1 ; 12 cyc
|
||||
copy16 FR1, arg2 ; 12 cyc
|
||||
jsr imul16_func ; 470-780 cyc
|
||||
jsr imul16_func ; ? cyc
|
||||
shift_round_16 FR2, shift
|
||||
copy16 dest, FR2 + 2 ; 12 cyc
|
||||
.endmacro
|
||||
|
@ -505,6 +497,30 @@ done:
|
|||
|
||||
.proc imul8xe_init
|
||||
|
||||
bank_switch 0
|
||||
lda #0
|
||||
sta EXTENDED_RAM
|
||||
bank_switch 1
|
||||
lda #1
|
||||
sta EXTENDED_RAM
|
||||
bank_switch 0
|
||||
lda EXTENDED_RAM
|
||||
beq init
|
||||
|
||||
; no bank switching available: the second store just overwrote the first value in base RAM
|
||||
rts
|
||||
|
||||
init:
|
||||
|
||||
; patch imul16_func into a forwarding thunk to imul16xe_func
|
||||
lda #$4c ; 'jmp' opcode
|
||||
sta imul16_func
|
||||
lda #.lobyte(imul16xe_func)
|
||||
sta imul16_func + 1
|
||||
lda #.hibyte(imul16xe_func)
|
||||
sta imul16_func + 2
|
||||
|
||||
; create the lookup table
|
||||
; go through the input set, in four 16KB chunks
|
||||
|
||||
arg1 = FR1
|
||||
|
@ -615,6 +631,47 @@ inner_loop:
|
|||
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
||||
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
||||
|
||||
imul8 result, arg1, arg2
|
||||
lda #0
|
||||
sta result + 2
|
||||
sta result + 3
|
||||
|
||||
imul8 inter, arg1 + 1, arg2
|
||||
add16 result + 1, result + 1, inter
|
||||
add_carry result + 3
|
||||
|
||||
imul8 inter, arg1, arg2 + 1
|
||||
add16 result + 1, result + 1, inter
|
||||
add_carry result + 3
|
||||
|
||||
imul8 inter, arg1 + 1, arg2 + 1
|
||||
add16 result + 2, result + 2, inter
|
||||
|
||||
; In case of negative inputs, adjust high word
|
||||
; https://stackoverflow.com/a/28827013
|
||||
lda arg1 + 1
|
||||
bpl arg1_pos
|
||||
sub16 result + 2, result + 2, arg2
|
||||
arg1_pos:
|
||||
lda arg2 + 1
|
||||
bpl arg2_pos
|
||||
sub16 result + 2, result + 2, arg1
|
||||
arg2_pos:
|
||||
|
||||
rts ; 6 cyc
|
||||
.endproc
|
||||
|
||||
.proc imul16xe_func
|
||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
||||
result = FR2 ; 32-bit result
|
||||
inter = temp2
|
||||
|
||||
; h1l1 * h2l2
|
||||
; (h1*256 + l1) * (h2*256 + l2)
|
||||
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
||||
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
||||
|
||||
imul8xe result, arg1, arg2
|
||||
lda #0
|
||||
sta result + 2
|
||||
|
|
Loading…
Reference in a new issue