Compare commits

..

No commits in common. "main" and "xe" have entirely different histories.
main ... xe

7 changed files with 489 additions and 669 deletions

View file

@ -2,8 +2,8 @@
all : mandel.xex all : mandel.xex
mandel.xex : mandel.o tables.o atari-asm-xex.cfg mandel.xex : mandel.o tables.o
ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o ld65 -C ./atari-asm-xex.cfg -o $@ $+
%.o : %.s %.o : %.s
ca65 -o $@ $< ca65 -o $@ $<

View file

@ -1,28 +0,0 @@
FEATURES {
STARTADDRESS: default = $2E00;
}
SYMBOLS {
__STARTADDRESS__: type = export, value = %S;
}
MEMORY {
ZP: file = "", define = yes, start = $0082, size = $007E;
MAIN: file = %O, define = yes, start = %S, size = $4000 - %S;
# Keep $4000-7fff clear for expanded RAM access window
TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000;
# Keep $a000-$bfff clear for BASIC cartridge
}
FILES {
%O: format = atari;
}
FORMATS {
atari: runad = start;
}
SEGMENTS {
ZEROPAGE: load = ZP, type = zp, optional = yes;
EXTZP: load = ZP, type = zp, optional = yes; # to enable modules to be able to link to C and assembler programs
CODE: load = MAIN, type = rw, define = yes;
RODATA: load = MAIN, type = ro optional = yes;
DATA: load = MAIN, type = rw optional = yes;
BSS: load = MAIN, type = bss, optional = yes, define = yes;
TABLES: load = TABLES, type = ro, optional = yes, align = 256;
}

175
imul8xe.s Normal file
View file

@ -0,0 +1,175 @@
FR0 = $d4 ; float48
PORTB = $d301
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
; lookup table for top byte -> PORTB value for bank-switch
.align 256
bankswitch:
.repeat 256, i
.byte ((i & $c0) >> 5) | $c1
.endrepeat
; 58-77 cycles
; clobbers x, y, dest to dest + 3
.macro imul8xe dest, arg1, arg2
.local done
.local output
.local ptr
output = dest
ptr = dest + 2 ; scratch space assumed
; bottom 14 bits except the LSB are the per-bank table index
; add $4000 for the bank pointer
lda arg1 ; 3 cyc
and #$fe ; 2 cyc
sta ptr ; 3 cyc
lda arg2 ; 3 cyc
and #$3f ; 2 cyc
clc ; 2 cyc
adc #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; top 2 bits are the table bank selector
ldx arg2 ; 3 cyc
lda bank_switch,x ; 4 cyc
sta PORTB ; 4 cyc
; copy the entry into output
ldy #0 ; 2 cyc
lda (ptr),y ; 5 cyc
sta output ; 3 cyc
iny ; 2 cyc
lda (ptr),y ; 5 cyc
sta output+1 ; 3 cyc
; note: we are not restoring memory to save 6 cycles!
; this means those 16kb have to be switched back to base RAM
; if we need to use them anywhere else
;;; restore memory
;;lda #$81 ; 2 cyc - disabled
;;sta PORTB ; 4 cyc - disabled
; check that 1 bit we skipped to fit into space
lda arg1 ; 3 cyc
and #1 ; 2 cyc
beq done ; 2 cyc
; add the second param one last time for the skipped bit
clc ; 2 cyc
lda arg2 ; 3 cyc
adc output ; 3 cyc
sta output ; 3 cyc
lda #0 ; 2 cyc
adc output+1 ; 3 cyc
sta output+1 ; 3 cyc
done:
.endmacro
.macro bank_switch bank
lda #((bank << 1) | $c1)
sta PORTB
.endmacro
proc imul8xe_init
; go through the input set, in four 16KB chunks
arg1 = FR1
arg2 = FR2
result = FR0
lda #$00
sta arg1
sta arg2
; $00 * $00 -> $3f * $ff
bank_switch 0
jsr imul8xe_init_section
; $40 * $00 -> $7f * $ff
bank_switch 1
jsr imul8xe_init_section
; $80 * $00 -> $bf * $ff
bank_switch 2
jsr imul8xe_init_section
; $c0 * $00 -> $ff * $ff
bank_switch 3
jsr imul8xe_init_section
rts
endproc
; Initialize a 16 KB chunk of the table
; input: multipliers in temp
; output: new multipliers in temp
; clobbers: temp, temp2
proc imul8xe_init_section
arg1 = FR1
arg2 = FR2
result = FR0
ptr = temp2
lda #$00
sta ptr
lda #$40
sta ptr + 1
ldx #0
ldy #0
; outer loop: $00 -> $3f
outer_loop:
; reset result to 0
lda #0
sta result
sta result + 1
; inner loop: $00 -> $ff
inner_loop:
; copy result to data set
lda result
sta (ptr),y
lda result + 1
sta (ptr),y
; result += 2 * arg2
clc
lda arg2
adc result
sta result
lda #0
adc result + 1
sta result
lda arg2
adc result
sta result
lda #0
adc result + 1
sta result
; inner loop check
inc arg1
inc arg1
inc ptr
inc ptr
bne inner_loop
; outer loop check
inc arg2
inc ptr + 1
lda ptr + 1
cmp #$40
bne outer_loop
rts
endproc

894
mandel.s

File diff suppressed because it is too large Load diff

View file

@ -14,37 +14,33 @@ Non-goals:
Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals. Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
-- brooke, january 2023 - december 2024 -- brooke, january 2023 - february 2024
## Current state ## Current state
Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys. Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.
* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
Iterations are capped at 255. Iterations are capped at 255.
The pixels are run in a progressive layout to get the basic shape on screen faster. The pixels are run in a progressive layout to get the basic shape on screen faster.
There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D ## Next steps
There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
There's some cute color cycling. Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
(done)
## Deps and build instructions ## Deps and build instructions
I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that. I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices. Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
## Todo
See ideas in `todo.md`.

View file

@ -22,10 +22,7 @@ console.log(
.export mul_lobyte256 .export mul_lobyte256
.export mul_hibyte256 .export mul_hibyte256
.export mul_hibyte512 .export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i + 1) / 2 for the multiplier
.align 256 .align 256
mul_lobyte256: mul_lobyte256:
${db((i) => squares[i] & 0xff)} ${db((i) => squares[i] & 0xff)}
@ -38,13 +35,4 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
mul_hibyte512: mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)} ${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`); `);

19
todo.md
View file

@ -1,19 +0,0 @@
things to try:
* skip add on the top-byte multiply in sqr8/mul8
* should save a few cycles, suggestion by jamey
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra bit?
* since exit compare space would be 6.26 i think so
* y-axis mirror optimization
* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
* maybe redo tiering to just 4x4, 2x2, 1x1?
* extract viewport for display & re-input via keyboard
* fujinet screenshot/viewport uploader