Compare commits

...

11 commits

Author SHA1 Message Date
052a19b6aa Merge pull request 'xe' () from xe into main
Reviewed-on: https://brooke.vibber.net/git/git/brooke/mandel-6502/pulls/1
2024-12-28 02:40:01 +00:00
83cba4afa3 Runtime detection of XE-style extended memory
Uses the "big multiplication table" in 64KB of extended memory if
bank switching appears to work, otherwise uses the table of squares
lookups.

Initial view clocks in at 13.133 ms/px for the XE version and still
14.211 ms/px for the 400/800/XL version.

Tested in emulator with 130XE and XL+Ultimate 1MB upgrade configs,
and base implementation on the 800XL emulator.
2024-12-27 18:37:03 -08:00
ee1c268705 it works 2024-12-26 21:49:13 -08:00
e84a990789 tweaks: 2024-12-26 21:41:03 -08:00
0cde31905e runs but doesn't work 2024-12-26 18:35:37 -08:00
45c5a4cb2d called, gets lost 2024-12-26 18:20:10 -08:00
34ce9da030 builds, not used yet 2024-12-26 18:17:01 -08:00
a9d551a98d first draft initializer 2024-12-26 17:50:59 -08:00
829d2860e8 :P 2024-12-26 12:04:01 -08:00
f996c3cbcd provisional maybe
old mode runs in 81-92 cycles

provisional code runs in 58-77 cycles

if it works ;)
2024-12-25 12:47:37 -08:00
405cec6d51 WIP imul8 via table experiments
planning to try a 64KB table of 8x7-bit multiplies in the high memory
on a 130XE or other high-memory-capable machine

not yet working or finished

too many cycles of overhead per invocation
2024-12-25 10:51:27 -08:00
2 changed files with 459 additions and 41 deletions

175
imul8xe.s Normal file
View file

@ -0,0 +1,175 @@
; Zero-page location used by the table initializer as its running 16-bit product
FR0 = $d4 ; float48
; Port written by the bank-switching code in this file to pick the visible bank
PORTB = $d301
; Base address used for every multiplication-table access
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
; Lookup table: multiplier byte -> PORTB value that maps in the 16 KiB bank
; holding that multiplier's rows of the table.
; The top two bits of the index are the bank number and must land in PORTB
; bits 2-3 (hence >> 4). The $e1 base keeps bit 4 clear (CPU sees extended
; RAM at $4000-$7fff) and bit 5 set (ANTIC keeps reading main memory).
; Page-aligned so the absolute,x read never pays a page-crossing cycle.
.align 256
bankswitch:
.repeat 256, i
    .byte ((i & $c0) >> 4) | $e1
.endrepeat
; imul8xe dest, arg1, arg2
; Unsigned 8-bit x 8-bit -> 16-bit multiply via the table in extended RAM.
;   dest : product is stored little-endian at dest, dest + 1
;   arg1 : factor; bits 7-1 give the low byte of the table pointer
;   arg2 : factor; bits 5-0 give the table page, bits 7-6 the bank
; The table holds (arg1 & $fe) * arg2, so an odd arg1 adds arg2 once more.
; 58-77 cycles
; clobbers a, x, y, dest to dest + 3
; The selected bank is left mapped on exit (see note in the body).
.macro imul8xe dest, arg1, arg2
.local done
.local output
.local ptr
    output = dest
    ptr = dest + 2          ; scratch space assumed

    ; bottom 14 bits except the LSB are the per-bank table index
    ; add $4000 for the bank pointer
    lda arg1                ; 3 cyc
    and #$fe                ; 2 cyc
    sta ptr                 ; 3 cyc
    lda arg2                ; 3 cyc
    and #$3f                ; 2 cyc
    clc                     ; 2 cyc
    adc #$40                ; 2 cyc
    sta ptr + 1             ; 3 cyc

    ; top 2 bits are the table bank selector
    ; (the table label is `bankswitch`; `bank_switch` is only a macro name)
    ldx arg2                ; 3 cyc
    lda bankswitch,x        ; 4 cyc
    sta PORTB               ; 4 cyc

    ; copy the entry into output
    ldy #0                  ; 2 cyc
    lda (ptr),y             ; 5 cyc
    sta output              ; 3 cyc
    iny                     ; 2 cyc
    lda (ptr),y             ; 5 cyc
    sta output+1            ; 3 cyc

    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; if we need to use them anywhere else
    ;;; restore memory
    ;;lda #$81 ; 2 cyc - disabled
    ;;sta PORTB ; 4 cyc - disabled

    ; check that 1 bit we skipped to fit into space
    lda arg1                ; 3 cyc
    and #1                  ; 2 cyc
    beq done                ; 2 cyc

    ; add the second param one last time for the skipped bit
    clc                     ; 2 cyc
    lda arg2                ; 3 cyc
    adc output              ; 3 cyc
    sta output              ; 3 cyc
    lda #0                  ; 2 cyc
    adc output+1            ; 3 cyc
    sta output+1            ; 3 cyc
done:
.endmacro
; bank_switch bank
; Map 16 KiB extended-RAM bank `bank` (0-3) into the CPU's $4000-$7fff window.
; The bank number goes into PORTB bits 2-3 (hence << 2). The $e1 base keeps
; bit 4 clear (CPU access to extended RAM) and bit 5 set (ANTIC stays on
; main memory).
; clobbers a
.macro bank_switch bank
    lda #((bank << 2) | $e1)
    sta PORTB
.endmacro
; imul8xe_init
; Build the full multiplication table, one 16 KiB bank at a time.
; arg1/arg2 are zeroed here and then carried across the four calls to
; imul8xe_init_section, which advances arg2 by $40 per bank.
; clobbers: a, plus everything imul8xe_init_section clobbers
.proc imul8xe_init
    ; go through the input set, in four 16KB chunks
    arg1 = FR1
    arg2 = FR2

    lda #$00
    sta arg1
    sta arg2

    ; $00 * $00 -> $3f * $ff
    bank_switch 0
    jsr imul8xe_init_section

    ; $40 * $00 -> $7f * $ff
    bank_switch 1
    jsr imul8xe_init_section

    ; $80 * $00 -> $bf * $ff
    bank_switch 2
    jsr imul8xe_init_section

    ; $c0 * $00 -> $ff * $ff
    bank_switch 3
    jsr imul8xe_init_section

    rts
.endproc
; imul8xe_init_section
; Initialize the 16 KiB chunk of the table that is currently mapped at
; EXTENDED_RAM. The chunk is 64 pages, one per arg2 value; each page holds
; 128 little-endian 16-bit entries, entry k being (2 * k) * arg2.
; input:    arg2 (FR2) = multiplier for the first page of this chunk
; output:   arg2 advanced by $40, ready for the next chunk;
;           arg1 (FR1) is stepped by 2 per entry and wraps to its entry value
; clobbers: a, y, result (FR0, FR0 + 1), ptr (temp2, temp2 + 1)
.proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    result = FR0
    ptr = temp2

    lda #.lobyte(EXTENDED_RAM)
    sta ptr
    lda #.hibyte(EXTENDED_RAM)
    sta ptr + 1
    ldy #0

    ; outer loop: $00 -> $3f
outer_loop:
    ; reset result to 0
    lda #0
    sta result
    sta result + 1

    ; inner loop: $00 -> $ff
inner_loop:
    ; copy result to data set: low byte at ptr, high byte at ptr + 1
    lda result
    sta (ptr),y
    lda result + 1
    iny
    sta (ptr),y
    dey                     ; keep y = 0 for the next entry

    ; result += 2 * arg2, as two 16-bit additions of arg2
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1          ; carry goes into the high byte

    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1

    ; inner loop check: entries are 2 bytes, so the low pointer byte wraps
    ; to zero after 128 entries
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop

    ; outer loop check: stop after the last page of the 16 KiB window
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #.hibyte(EXTENDED_RAM + $4000)
    bne outer_loop

    rts
.endproc

325
mandel.s
View file

@ -74,6 +74,9 @@ width = 160
half_width = width >> 1
stride = width >> 2
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
PORTB = $D301 ; memory & bank-switch for XL/XE
DMACTL = $D400
DLISTL = $D402
DLISTH = $D403
@ -344,14 +347,6 @@ fill_masks:
neg 4, arg
.endmacro
; imul16 dest, arg1, arg2
; 16-bit x 16-bit multiply wrapper around imul16_func: the operands go to
; FR0 and FR1, the routine is called, and the 32-bit value in FR2 is copied
; to dest. Per-line cycle counts are the author's figures.
; 518 - 828 cyc
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; 470-780 cyc
copy32 dest, FR2 ; 24 cyc
.endmacro
.macro shift_round_16 arg, shift
.repeat shift
shl32 arg
@ -362,7 +357,7 @@ fill_masks:
; imul16_round dest, arg1, arg2, shift
; Multiply two 16-bit values with imul16_func, run shift_round_16 on the
; 32-bit product in FR2, and store the upper 16 bits (FR2 + 2) in dest.
; imul16_func is called exactly once: it clobbers its operands in FR0/FR1,
; so a repeated call would multiply leftover values.
.macro imul16_round dest, arg1, arg2, shift
    copy16 FR0, arg1            ; 12 cyc
    copy16 FR1, arg2            ; 12 cyc
    jsr imul16_func             ; ? cyc
    shift_round_16 FR2, shift
    copy16 dest, FR2 + 2        ; 12 cyc
.endmacro
@ -372,54 +367,259 @@ fill_masks:
.local under256
.local next
.local small_product
; circa 92 cycles? this doesn't seem right
; 81-92 cycles
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; setup: 6 cycles
;ldx mul_factor_x
lda mul_factor_a ; 3 cyc
clc ; (a + x)^2/2: 23 cycles
adc mul_factor_x
tax
bcc under256
lda mul_hibyte512,x
bcs next
; (a + x)^2/2
clc ; 2 cyc
adc mul_factor_x ; 3 cyc
tax ; 2 cyc
bcc under256 ; 2 cyc
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
under256:
lda mul_hibyte256,x
sec
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
next:
sta mul_product_hi
lda mul_lobyte256,x
sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc
ldx mul_factor_a ; - a^2/2: 20 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
ldx mul_factor_x ; + x & a & 1: 22 cycles
txa ; (this is a kludge to correct a
and mul_factor_a ; roundoff error that makes odd * odd too low)
and #1
; + x & a & 1:
; (this is a kludge to correct a
; roundoff error that makes odd * odd too low)
ldx mul_factor_x ; 3 cyc
txa ; 2 cyc
and mul_factor_a ; 3 cyc
and #1 ; 2 cyc
clc
adc mul_product_lo
bcc small_product
inc mul_product_hi
clc ; 2 cyc
adc mul_product_lo ; 3 cyc
bcc small_product ; 2 cyc
inc mul_product_hi ; 5 cyc
; - x^2/2
small_product:
sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
.endscope
.endmacro
; PORTB value for each possible multiplier byte: the byte's top two bits are
; the bank number, placed in PORTB bits 2-3 on top of the $e1 base value.
;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
bank_switch_table:
.repeat 256, index
    .byte $e1 | ((index >> 6) << 2)
.endrepeat
; bank_switch bank
; Select extended-RAM bank `bank` (0-3) by writing PORTB: the bank number is
; shifted into bits 2-3 and combined with the $e1 base value.
; clobbers a
.macro bank_switch bank
    lda #($e1 | (bank << 2))
    sta PORTB
.endmacro
; imul8xe dest, arg1, arg2
; Table-driven 8-bit x 8-bit -> 16-bit multiply using the banked table.
;   dest : receives the product, low byte first, at dest and dest + 1
;   arg1 : factor whose bits 7-1 index within a table page
;   arg2 : factor whose bits 5-0 choose the page and bits 7-6 the bank
; Timing: 58-77 cycles
; Clobbers: x, y, dest to dest + 3 (dest + 2 / dest + 3 hold the pointer)
.macro imul8xe dest, arg1, arg2
.local no_fixup
.local product
.local entry
    product = dest
    entry = dest + 2            ; scratch space assumed

    ; Pointer low byte: arg1 with its LSB dropped (entries are 2 bytes wide).
    lda arg1                    ; 3 cyc
    and #$fe                    ; 2 cyc
    sta entry                   ; 3 cyc

    ; Pointer high byte: low 6 bits of arg2, offset into the $4000 window.
    lda arg2                    ; 3 cyc
    and #$3f                    ; 2 cyc
    clc                         ; 2 cyc
    adc #$40                    ; 2 cyc
    sta entry + 1               ; 3 cyc

    ; Bank: looked up from the whole of arg2 (only its top 2 bits matter).
    ldx arg2                    ; 3 cyc
    lda bank_switch_table,x     ; 4 cyc
    sta PORTB                   ; 4 cyc

    ; Fetch the 16-bit table entry.
    ldy #0                      ; 2 cyc
    lda (entry),y               ; 5 cyc
    sta product                 ; 3 cyc
    iny                         ; 2 cyc
    lda (entry),y               ; 5 cyc
    sta product+1               ; 3 cyc

    ; The bank is deliberately left selected: skipping the restore saves
    ; 6 cycles per multiply. Anything else that needs base RAM in this
    ; 16kb window has to switch it back itself.
    ;;; restore memory
    ;;lda #$81 ; 2 cyc - disabled
    ;;sta PORTB ; 4 cyc - disabled

    ; The table ignored arg1's LSB; finished unless that bit is set.
    lda arg1                    ; 3 cyc
    and #1                      ; 2 cyc
    beq no_fixup                ; 2 cyc

    ; Odd arg1: the product is one arg2 short, so add it in (16-bit add).
    clc                         ; 2 cyc
    lda arg2                    ; 3 cyc
    adc product                 ; 3 cyc
    sta product                 ; 3 cyc
    lda #0                      ; 2 cyc
    adc product+1               ; 3 cyc
    sta product+1               ; 3 cyc
no_fixup:
.endmacro
; imul8xe_init
; Probe for XE-style bank-switched extended RAM. When it is present, patch
; imul16_func to jump to imul16xe_func and build the multiplication table
; across the four 16 KiB banks.
;
; Probe: store 0 at EXTENDED_RAM with bank 0 selected, store 1 with bank 1
; selected, then read back with bank 0 selected. Reading 0 means the two
; stores hit different memory, so bank switching works.
;
; When the probe fails, PORTB is put back to its value on entry, so machines
; without extended RAM keep the memory configuration they were started with.
; The probe byte written to base RAM at EXTENDED_RAM is not restored.
;
; clobbers: a, y, FR0, FR0 + 1, FR1, FR2, temp2, temp2 + 1
;           (one stack byte is used temporarily)
.proc imul8xe_init
    arg1 = FR1
    arg2 = FR2

    ; remember the caller's PORTB so the failure path can undo the probe
    lda PORTB
    pha

    bank_switch 0
    lda #0
    sta EXTENDED_RAM
    bank_switch 1
    lda #1
    sta EXTENDED_RAM
    bank_switch 0
    lda EXTENDED_RAM
    beq init

    ; no bank switching available, we just overwrote the value in base ram;
    ; restore PORTB and leave imul16_func untouched
    pla
    sta PORTB
    rts

init:
    ; drop the saved PORTB: from here on a table bank stays mapped
    pla

    ; patch imul16_func into a forwarding thunk to imul16xe_func
    lda #$4c                        ; 'jmp' opcode
    sta imul16_func
    lda #.lobyte(imul16xe_func)
    sta imul16_func + 1
    lda #.hibyte(imul16xe_func)
    sta imul16_func + 2

    ; create the lookup table
    ; go through the input set, in four 16KB chunks
    lda #$00
    sta arg1
    sta arg2

    ; $00 * $00 -> $3f * $ff
    bank_switch 0
    jsr imul8xe_init_section

    ; $40 * $00 -> $7f * $ff
    bank_switch 1
    jsr imul8xe_init_section

    ; $80 * $00 -> $bf * $ff
    bank_switch 2
    jsr imul8xe_init_section

    ; $c0 * $00 -> $ff * $ff
    bank_switch 3
    jsr imul8xe_init_section

    rts
.endproc
; imul8xe_init_section
; Fill the 16 KB bank currently mapped at $4000-$7fff with one chunk of the
; multiplication table: 64 pages, one per multiplier value, each holding 128
; little-endian 16-bit products (entry k = 2 * k * multiplier).
; input:    FR2 = multiplier for the first page
; output:   FR2 advanced by 64; FR1 is bumped by 2 per entry and so ends
;           each page at the value it started with
; clobbers: a, y, FR0, FR0 + 1, temp2, temp2 + 1
.proc imul8xe_init_section
    even_factor = FR1
    page_factor = FR2
    product = FR0
    cursor = temp2

    ; cursor = $4000, y = 0
    lda #$00
    sta cursor
    lda #$40
    sta cursor + 1
    ldy #0

    ; one pass per page: $40 -> $7f
next_page:
    ; every page starts at 0 * page_factor
    lda #0
    sta product
    sta product + 1

    ; one pass per 2-byte entry: 128 per page
next_entry:
    ; store the current product, low byte then high byte
    lda product
    sta (cursor),y
    iny
    lda product + 1
    sta (cursor),y
    dey

    ; the even factor advances by 2, so the product grows by page_factor twice
    .repeat 2
        clc
        lda page_factor
        adc product
        sta product
        lda #0
        adc product + 1
        sta product + 1
    .endrepeat

    ; step to the next entry; the low cursor byte hits zero at end of page
    inc even_factor
    inc even_factor
    inc cursor
    inc cursor
    bne next_entry

    ; step to the next page and multiplier; stop once past $7fxx
    inc page_factor
    inc cursor + 1
    lda cursor + 1
    cmp #$80
    bne next_page

    rts
.endproc
.proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
@ -461,6 +661,47 @@ arg2_pos:
rts ; 6 cyc
.endproc
; imul16xe_func
; Signed 16-bit x 16-bit -> 32-bit multiply assembled from four imul8xe
; table lookups. imul8xe_init patches imul16_func to jump here when
; extended RAM is detected.
;   in:  FR0 = first 16-bit operand, FR1 = second 16-bit operand
;   out: FR2 = 32-bit product
;   scratch: temp2 (partial products; imul8xe also uses temp2 + 2 / + 3)
.proc imul16xe_func
    arg1 = FR0      ; 16-bit arg (clobbered)
    arg2 = FR1      ; 16-bit arg (clobbered)
    result = FR2    ; 32-bit result
    partial = temp2

    ; With arg1 = h1:l1 and arg2 = h2:l2,
    ;   arg1 * arg2 = (h1*256 + l1) * (h2*256 + l2)
    ;               = h1*h2*65536 + (h1*l2 + h2*l1)*256 + l1*l2

    ; l1 * l2 -> bytes 0-1. The macro uses bytes 2-3 as scratch, so they
    ; are cleared afterwards.
    imul8xe result, arg1, arg2
    lda #0
    sta result + 2
    sta result + 3

    ; h1 * l2 -> added at byte 1, carry rippled into byte 3
    imul8xe partial, arg1 + 1, arg2
    add16 result + 1, result + 1, partial
    add_carry result + 3

    ; l1 * h2 -> added at byte 1, carry rippled into byte 3
    imul8xe partial, arg1, arg2 + 1
    add16 result + 1, result + 1, partial
    add_carry result + 3

    ; h1 * h2 -> added at byte 2
    imul8xe partial, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, partial

    ; The lookups are unsigned; for each negative operand subtract the
    ; other operand from the high word.
    ; https://stackoverflow.com/a/28827013
    lda arg1 + 1
    bpl arg1_nonneg
    sub16 result + 2, result + 2, arg2
arg1_nonneg:

    lda arg2 + 1
    bpl arg2_nonneg
    sub16 result + 2, result + 2, arg1
arg2_nonneg:

    rts ; 6 cyc
.endproc
.macro round16 arg
; Round top 16 bits of 32-bit fixed-point number in-place
.local increment
@ -966,6 +1207,8 @@ zero_byte_loop:
.proc start
jsr imul8xe_init
; ox = 0; oy = 0; zoom = 0
; count_frames = 0; count_pixels = 0
lda #0