Compare commits
46 commits
Author | SHA1 | Date | |
---|---|---|---|
87caa52543 | |||
d8601bb856 | |||
7985ea9a39 | |||
cc83c76706 | |||
2e8893fd78 | |||
81bf7f3c43 | |||
1e0f577e09 | |||
d2f41f9644 | |||
2fcb30b76a | |||
13257309dc | |||
7184b8e03f | |||
4a1e35699a | |||
0d086a179c | |||
61eb1aaf21 | |||
b56dc1e98b | |||
0a7293d8bc | |||
ec42f672d4 | |||
67649d4743 | |||
ed79c80b16 | |||
e6cbe0bc6b | |||
6db8cef82d | |||
9b7f6b8937 | |||
3bd9b1ac31 | |||
63e74d5152 | |||
14125a398a | |||
71d8d93abc | |||
64a6cf50f3 | |||
100c0f3314 | |||
e51aa91e4e | |||
c4b98c7be2 | |||
70d2c91f03 | |||
acac5a8df4 | |||
883f926e57 | |||
0c63430dd9 | |||
3ab5006aa3 | |||
f903272335 | |||
8ad996981a | |||
15fc5367f9 | |||
2118890977 | |||
0fc5ba914f | |||
2b0167226e | |||
504457595a | |||
0fcf4d6676 | |||
d83b811444 | |||
f32cc5fa7c | |||
052a19b6aa |
7 changed files with 668 additions and 488 deletions
4
Makefile
4
Makefile
|
@ -2,8 +2,8 @@
|
|||
|
||||
all : mandel.xex
|
||||
|
||||
mandel.xex : mandel.o tables.o
|
||||
ld65 -C ./atari-asm-xex.cfg -o $@ $+
|
||||
mandel.xex : mandel.o tables.o atari-asm-xex.cfg
|
||||
ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
|
||||
|
||||
%.o : %.s
|
||||
ca65 -o $@ $<
|
||||
|
|
28
atari-asm-xex.cfg
Normal file
28
atari-asm-xex.cfg
Normal file
|
@ -0,0 +1,28 @@
|
|||
# ld65 linker configuration for an Atari XEX executable built from assembly.
# Memory is laid out so that $4000-$7fff (expanded RAM access window) and
# $a000-$bfff (BASIC cartridge) stay unused by the program image.

FEATURES {
    # Default load address of the MAIN area; available below as %S.
    STARTADDRESS: default = $2E00;
}

SYMBOLS {
    # Export the start address so code can refer to it.
    __STARTADDRESS__: type = export, value = %S;
}

MEMORY {
    # Zero page area; not written to the output file.
    ZP:     file = "", define = yes, start = $0082, size = $007E;

    # Main program area: from the start address up to (not including) $4000.
    MAIN:   file = %O, define = yes, start = %S,    size = $4000 - %S;

    # Keep $4000-$7fff clear for expanded RAM access window
    TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000;

    # Keep $a000-$bfff clear for BASIC cartridge
}

FILES {
    %O: format = atari;
}

FORMATS {
    # Run address of the executable is the symbol `start`.
    atari: runad = start;
}

SEGMENTS {
    ZEROPAGE: load = ZP,     type = zp,  optional = yes;
    EXTZP:    load = ZP,     type = zp,  optional = yes;               # to enable modules to be able to link to C and assembler programs
    CODE:     load = MAIN,   type = rw,                  define = yes;
    RODATA:   load = MAIN,   type = ro,  optional = yes;
    DATA:     load = MAIN,   type = rw,  optional = yes;
    BSS:      load = MAIN,   type = bss, optional = yes, define = yes;

    # Lookup tables live in their own page-aligned segment at $8000.
    TABLES:   load = TABLES, type = ro,  optional = yes, align = 256;
}
|
175
imul8xe.s
175
imul8xe.s
|
@ -1,175 +0,0 @@
|
|||
; ---------------------------------------------------------------------------
; Addresses used by the expanded-RAM multiplier
; ---------------------------------------------------------------------------
FR0          = $d4      ; float48
PORTB        = $d301    ; written to switch banks

EXTENDED_RAM = $4000    ; 16KiB bank on the XE

; ---------------------------------------------------------------------------
; bankswitch: 256-entry lookup table, indexed by a multiplier byte.
; The top two bits of the index pick the bank; the entry is the value to
; store in PORTB for that bank. Page-aligned so that indexing with X never
; crosses a page.
;
; NOTE(review): the bank number is shifted into bits 1-2 here. 130XE
; documentation is commonly read as using PORTB bits 2-3 for bank select --
; confirm against hardware docs before relying on this table.
; ---------------------------------------------------------------------------
    .align 256
bankswitch:
    .repeat 256, i
        .byte $c1 | ((i >> 6) << 1)
    .endrepeat
|
||||
|
||||
; ---------------------------------------------------------------------------
; imul8xe dest, arg1, arg2
;
; 8-bit x 8-bit -> 16-bit multiply using the lookup table that
; imul8xe_init builds in expanded RAM.
;
;   dest, dest+1     receive the product (low byte first)
;   dest+2, dest+3   are used as a scratch pointer; dest must therefore be
;                    in zero page for the (ptr),y loads to assemble
;   arg1, arg2       the two 8-bit operands (read only)
;
; The table holds products only for even values of arg1; an odd arg1 is
; handled by adding arg2 once more after the lookup.
;
; Clobbers: A, X, Y, dest .. dest+3.
; Side effect: PORTB is left pointing at the table bank (see note below).
; About 57-75 cycles.
; ---------------------------------------------------------------------------
.macro imul8xe dest, arg1, arg2
    .local done, output, ptr

    output = dest
    ptr    = dest + 2           ; scratch space assumed

    ; bottom 14 bits except the LSB are the per-bank table index:
    ; low pointer byte = arg1 with bit 0 dropped
    lda arg1                    ; 3 cyc
    and #$fe                    ; 2 cyc
    sta ptr                     ; 3 cyc

    ; high pointer byte = low 6 bits of arg2, offset into the $4000 window.
    ; Bit 6 is known clear after the AND, so ORA #$40 equals adding $40.
    lda arg2                    ; 3 cyc
    and #$3f                    ; 2 cyc
    ora #$40                    ; 2 cyc
    sta ptr + 1                 ; 3 cyc

    ; top 2 bits of arg2 are the table bank selector
    ldx arg2                    ; 3 cyc
    lda bankswitch,x            ; 4 cyc - table label, not the bank_switch macro
    sta PORTB                   ; 4 cyc

    ; copy the 16-bit entry into output
    ldy #0                      ; 2 cyc
    lda (ptr),y                 ; 5 cyc
    sta output                  ; 3 cyc
    iny                         ; 2 cyc
    lda (ptr),y                 ; 5 cyc
    sta output + 1              ; 3 cyc

    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; if we need to use them anywhere else
    ;;; restore memory
    ;;lda #$81                  ; 2 cyc - disabled
    ;;sta PORTB                 ; 4 cyc - disabled

    ; check that 1 bit we skipped to fit into space
    lda arg1                    ; 3 cyc
    and #1                      ; 2 cyc
    beq done                    ; 2 cyc (3 if taken)

    ; add the second param one last time for the skipped bit
    clc                         ; 2 cyc
    lda arg2                    ; 3 cyc
    adc output                  ; 3 cyc
    sta output                  ; 3 cyc
    lda #0                      ; 2 cyc
    adc output + 1              ; 3 cyc
    sta output + 1              ; 3 cyc

done:
.endmacro
|
||||
|
||||
; bank_switch bank
; Store the PORTB value for the given bank number (an assemble-time
; constant; the init code uses 0-3). Clobbers A.
;
; NOTE(review): the bank number lands in PORTB bits 1-2 with this shift;
; confirm against 130XE documentation, which is commonly read as using
; bits 2-3 for bank select.
.macro bank_switch bank
    lda #($c1 | (bank << 1))
    sta PORTB
.endmacro
|
||||
|
||||
; ---------------------------------------------------------------------------
; imul8xe_init
;
; Fill the expanded-RAM multiplication table, one 16KB bank at a time.
; Each call to imul8xe_init_section fills the currently selected bank and
; leaves arg2 advanced by $40, so the four calls cover arg2 = $00..$ff.
;
; Clobbers: A, plus whatever imul8xe_init_section clobbers.
; Side effect: bank 3 is still switched in via PORTB on return.
; ---------------------------------------------------------------------------
.proc imul8xe_init

    ; go through the input set, in four 16KB chunks

    arg1 = FR1
    arg2 = FR2

    ; both multiplier counters start at zero
    lda #$00
    sta arg1
    sta arg2

    ; $00 * $00 -> $3f * $ff
    bank_switch 0
    jsr imul8xe_init_section

    ; $40 * $00 -> $7f * $ff
    bank_switch 1
    jsr imul8xe_init_section

    ; $80 * $00 -> $bf * $ff
    bank_switch 2
    jsr imul8xe_init_section

    ; $c0 * $00 -> $ff * $ff
    bank_switch 3
    jsr imul8xe_init_section

    rts
.endproc
|
||||
|
||||
; ---------------------------------------------------------------------------
; imul8xe_init_section
;
; Initialize a 16 KB chunk of the table: the bank currently mapped at
; EXTENDED_RAM ($4000-$7fff).
;
; Layout: one 256-byte page per arg2 value (64 pages per bank). Within a
; page, the 16-bit product arg1 * arg2 for each even arg1 is stored at
; offset arg1 (low byte) and arg1 + 1 (high byte).
;
; input:    arg1 (FR1) = 0, arg2 (FR2) = first multiplier for this bank
; output:   arg1 back at 0, arg2 advanced by $40 (ready for the next bank)
; clobbers: A, Y (0 on return), FR0/FR0+1 (running product), temp2/temp2+1
; ---------------------------------------------------------------------------
.proc imul8xe_init_section
    arg1   = FR1
    arg2   = FR2
    result = FR0
    ptr    = temp2

    ; ptr = start of the bank window
    lda #<EXTENDED_RAM
    sta ptr
    lda #>EXTENDED_RAM
    sta ptr + 1

    ldy #0

    ; outer loop: 64 pages, one per arg2 value
outer_loop:

    ; reset result to 0 (arg1 = 0 at the start of every page)
    lda #0
    sta result
    sta result + 1

    ; inner loop: arg1 = $00, $02, ... $fe
inner_loop:

    ; copy result to data set: low byte at offset 0, high byte at offset 1
    lda result
    sta (ptr),y
    iny
    lda result + 1
    sta (ptr),y
    dey                         ; back to offset 0 for the next entry

    ; result += 2 * arg2, done as two 16-bit additions of arg2
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1

    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1

    ; inner loop check: step arg1 and the pointer by one 2-byte entry;
    ; the low pointer byte wraps to 0 after 128 entries
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop

    ; outer loop check: next multiplier, next page; stop at the end of
    ; the bank window ($8000), not at its start
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #>(EXTENDED_RAM + $4000)
    bne outer_loop

    rts

.endproc
|
26
readme.md
26
readme.md
|
@ -14,33 +14,37 @@ Non-goals:
|
|||
|
||||
Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
|
||||
|
||||
-- brooke, january 2023 - february 2024
|
||||
-- brooke, january 2023 - december 2024
|
||||
|
||||
## Current state
|
||||
|
||||
Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
|
||||
Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
|
||||
|
||||
The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
|
||||
The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
|
||||
|
||||
The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.
|
||||
* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
|
||||
* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
|
||||
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
|
||||
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
|
||||
|
||||
The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
|
||||
The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
|
||||
|
||||
Iterations are capped at 255.
|
||||
|
||||
The pixels are run in a progressive layout to get the basic shape on screen faster.
|
||||
|
||||
## Next steps
|
||||
There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
|
||||
|
||||
Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
|
||||
There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
|
||||
|
||||
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
|
||||
|
||||
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
|
||||
(done)
|
||||
There's some cute color cycling.
|
||||
|
||||
## Deps and build instructions
|
||||
|
||||
I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
|
||||
|
||||
Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
|
||||
|
||||
## Todo
|
||||
|
||||
See ideas in `todo.md`.
|
||||
|
|
12
tables.js
12
tables.js
|
@ -22,7 +22,10 @@ console.log(
|
|||
.export mul_lobyte256
|
||||
.export mul_hibyte256
|
||||
.export mul_hibyte512
|
||||
.export sqr_lobyte
|
||||
.export sqr_hibyte
|
||||
|
||||
; (i * i + 1) / 2 for the multiplier
|
||||
.align 256
|
||||
mul_lobyte256:
|
||||
${db((i) => squares[i] & 0xff)}
|
||||
|
@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
|
|||
mul_hibyte512:
|
||||
${db((i) => (squares[i + 256] >> 8) & 0xff)}
|
||||
|
||||
; (i * i) for the plain squares
|
||||
.align 256
|
||||
sqr_lobyte:
|
||||
${db((i) => (i * i) & 0xff)}
|
||||
|
||||
.align 256
|
||||
sqr_hibyte:
|
||||
${db((i) => ((i * i) >> 8) & 0xff)}
|
||||
|
||||
`);
|
||||
|
|
19
todo.md
Normal file
19
todo.md
Normal file
|
@ -0,0 +1,19 @@
|
|||
things to try:
|
||||
|
||||
* skip add on the top-byte multiply in sqr8/mul8
|
||||
* should save a few cycles, suggestion by jamey
|
||||
|
||||
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
|
||||
|
||||
* try 3.13 fixed point instead of 4.12 for more precision
|
||||
* can we get away without the extra bit?
|
||||
* since the exit-compare space would be 6.26, I think so
|
||||
|
||||
* y-axis mirror optimization
|
||||
|
||||
* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
|
||||
* maybe redo tiering to just 4x4, 2x2, 1x1?
|
||||
|
||||
* extract viewport for display & re-input via keyboard
|
||||
|
||||
* fujinet screenshot/viewport uploader
|
Loading…
Reference in a new issue