Compare commits

..

No commits in common. "main" and "xe" have entirely different histories.

11 changed files with 1651 additions and 2348 deletions

View file

@ -2,11 +2,8 @@
all : mandel.xex
mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg
ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib
mandel.s : mandel.c mandel.h
cc65 -o $@ mandel.c
mandel.xex : mandel.o tables.o
ld65 -C ./atari-asm-xex.cfg -o $@ $+
%.o : %.s
ca65 -o $@ $<
@ -16,7 +13,6 @@ tables.s : tables.js
clean :
rm -f tables.s
rm -f mandel.s
rm -f *.o
rm -f *.xex
rm -f mandel.map

View file

@ -1,28 +0,0 @@
# ld65 linker configuration producing a single Atari-format output file (%O).
# Default start address is $2E00; %S below refers to that start address.
FEATURES {
STARTADDRESS: default = $2E00;
}
SYMBOLS {
__STARTADDRESS__: type = export, value = %S;
}
MEMORY {
# Zero page area $0082-$00ff; not written to the output file (file = "").
ZP: file = "", define = yes, start = $0082, size = $007E;
# Main area runs from the start address up to (not including) $4000.
MAIN: file = %O, define = yes, start = %S, size = $4000 - %S;
# Keep $4000-7fff clear for expanded RAM access window
# Table area occupies $8000-$9fff.
TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000;
# Keep $a000-$bfff clear for BASIC cartridge
}
FILES {
%O: format = atari;
}
FORMATS {
# The run address of the output file is the symbol `start`.
atari: runad = start;
}
SEGMENTS {
ZEROPAGE: load = ZP, type = zp, optional = yes;
EXTZP: load = ZP, type = zp, optional = yes; # to enable modules to be able to link to C and assembler programs
CODE: load = MAIN, type = rw, define = yes;
# NOTE(review): the next two lines have no comma between `type = ...` and
# `optional = yes`, unlike every other line here -- confirm ld65 accepts this.
RODATA: load = MAIN, type = ro optional = yes;
DATA: load = MAIN, type = rw optional = yes;
BSS: load = MAIN, type = bss, optional = yes, define = yes;
# Lookup tables are placed in the TABLES area on a 256-byte boundary.
TABLES: load = TABLES, type = ro, optional = yes, align = 256;
}

View file

@ -1,69 +0,0 @@
# Sample linker configuration for C programs using the Atari binary file support.
# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex
FEATURES {
STARTADDRESS: default = $8000;
}
SYMBOLS {
__SYSTEM_CHECK__: type = import; # force inclusion of "system check" load chunk
__STACKSIZE__: type = weak, value = $0800; # 2k stack
__STARTADDRESS__: type = export, value = %S;
__RESERVED_MEMORY__: type = weak, value = $0000;
__SYSCHKHDR__: type = export, value = 0; # Disable system check header
__SYSCHKTRL__: type = export, value = 0; # Disable system check trailer
# Table area starts right after the $0300-byte system check chunk at $2E00
# (i.e. at $3100) and is 6 pages ($0600 bytes) long by default.
__TABLESEG_START__: type = weak, value = $2E00 + $0300;
__TABLESEG_SIZE__: type = weak, value = 6 * $100;
# Default bank-switch window: $4000-$7fff.
__BANKSY_START__: type = weak, value = $4000;
__BANKSY_SIZE__: type = weak, value = $4000;
# MAIN (below) ends __STACKSIZE__ + __RESERVED_MEMORY__ bytes short of this address.
__FRAMEBUFFER_START__: type = weak, value = $A000;
}
MEMORY {
# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP.
ZP: file = "", define = yes, start = $0082, size = $007E;
# "system check" load chunk
SYSCHKCHNK: file = %O, start = $2E00, size = $0300;
# Note $a000-$bfff is against the BASIC cartridge, may require booting with OPTION.
TABLES: file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__;
# We reserve $4000-7fff for the bank-switch window.
# In theory we could keep data and code here that we only use on 48k/64k systems.
# This area is not written to the output file (file = "").
BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__;
# "main program" load chunk
# With the default symbol values above this is $8000-$97ff ($1800 bytes).
MAIN: file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S;
}
FILES {
%O: format = atari;
}
FORMATS {
atari: runad = start,
initad = SYSCHKCHNK: __SYSTEM_CHECK__;
}
SEGMENTS {
ZEROPAGE: load = ZP, type = zp;
EXTZP: load = ZP, type = zp, optional = yes;
SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes;
TABLES: load = TABLES, type = ro, optional = yes, align = 256;
# NOTE(review): this segment is spelled BANKSWICH (no 'T') while the memory
# area it loads into is BANKSWITCH -- confirm which spelling the sources use.
BANKSWICH: load = BANKSWITCH, type = ro, optional = yes;
STARTUP: load = MAIN, type = ro, define = yes;
LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized
LOWCODE: load = MAIN, type = ro, define = yes, optional = yes;
ONCE: load = MAIN, type = ro, optional = yes;
CODE: load = MAIN, type = ro, define = yes;
RODATA: load = MAIN, type = ro;
DATA: load = MAIN, type = rw;
INIT: load = MAIN, type = rw, optional = yes;
BSS: load = MAIN, type = bss, define = yes;
}
# Constructor / destructor / interruptor tables: constructors are collected in
# the ONCE segment, the other two in RODATA.
FEATURES {
CONDES: type = constructor,
label = __CONSTRUCTOR_TABLE__,
count = __CONSTRUCTOR_COUNT__,
segment = ONCE;
CONDES: type = destructor,
label = __DESTRUCTOR_TABLE__,
count = __DESTRUCTOR_COUNT__,
segment = RODATA;
CONDES: type = interruptor,
label = __INTERRUPTOR_TABLE__,
count = __INTERRUPTOR_COUNT__,
segment = RODATA,
import = __CALLIRQ__;
}

175
imul8xe.s Normal file
View file

@ -0,0 +1,175 @@
; Fixed addresses used by the expanded-RAM multiply code in this file.
; NOTE(review): FR1, FR2 and temp2 are also referenced further down but are
; not defined here -- presumably supplied by the file that includes this one.
FR0 = $d4 ; float48
PORTB = $d301 ; written with a bank-select value before the $4000 window is read/written
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
; lookup table for top byte -> PORTB value for bank-switch
; Indexed by a whole byte; only its top two bits (i & $c0) affect the entry,
; so each run of 64 consecutive entries holds the same value. The value has
; the same form as the bank_switch macro's operand: (bank << 1) | $c1.
; The table is page-aligned and exactly 256 bytes long, so an X-indexed read
; stays within one page.
; NOTE(review): this places the 2-bit bank number in bits 1-2 of the PORTB
; value. 130XE documentation usually describes bits 2-3 as the bank select --
; confirm against the hardware reference (and keep bank_switch in sync).
.align 256
bankswitch:
.repeat 256, i
.byte ((i & $c0) >> 5) | $c1
.endrepeat
; imul8xe dest, arg1, arg2
;
; Table-driven 8-bit multiply using the lookup table that imul8xe_init
; builds in the bank-switched window at EXTENDED_RAM.
;
;   dest, dest+1  <- 16-bit table entry for (arg1 & $fe, arg2), low byte
;                    first, plus arg2 once more when arg1 is odd
;   dest+2, dest+3   used as a scratch pointer; must be in zero page because
;                    it is dereferenced with (zp),y addressing
;
; 56-75 cycles, going by the per-instruction counts below
; clobbers a, x, y, dest to dest + 3
; Leaves PORTB set to the bank selected by arg2 (see note further down).
.macro imul8xe dest, arg1, arg2
    .local done
    .local output
    .local ptr
    output = dest
    ptr = dest + 2          ; scratch space assumed

    ; bottom 14 bits except the LSB are the per-bank table index
    ; low byte of the pointer: arg1 with its LSB dropped
    lda arg1                ; 3 cyc
    and #$fe                ; 2 cyc
    sta ptr                 ; 3 cyc
    ; high byte: low 6 bits of arg2, offset into the $4000 window.
    ; Bit 6 is always clear after the AND, so OR-ing in $40 is the same as
    ; adding $40 and needs no carry setup.
    lda arg2                ; 3 cyc
    and #$3f                ; 2 cyc
    ora #$40                ; 2 cyc
    sta ptr + 1             ; 3 cyc

    ; top 2 bits are the table bank selector
    ldx arg2                ; 3 cyc
    lda bankswitch,x        ; 4 cyc - table label, not the bank_switch macro
    sta PORTB               ; 4 cyc

    ; copy the entry into output
    ldy #0                  ; 2 cyc
    lda (ptr),y             ; 5 cyc
    sta output              ; 3 cyc
    iny                     ; 2 cyc
    lda (ptr),y             ; 5 cyc
    sta output+1            ; 3 cyc

    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; if we need to use them anywhere else
    ;;; restore memory
    ;;lda #$81              ; 2 cyc - disabled
    ;;sta PORTB             ; 4 cyc - disabled

    ; check that 1 bit we skipped to fit into space
    lda arg1                ; 3 cyc
    and #1                  ; 2 cyc
    beq done                ; 2 cyc

    ; add the second param one last time for the skipped bit
    clc                     ; 2 cyc
    lda arg2                ; 3 cyc
    adc output              ; 3 cyc
    sta output              ; 3 cyc
    lda #0                  ; 2 cyc
    adc output+1            ; 3 cyc
    sta output+1            ; 3 cyc
done:
.endmacro
; bank_switch bank
; Writes the PORTB value for a constant bank number: $c1 with the bank number
; OR-ed in one bit position up.
; clobbers a
.macro bank_switch bank
    lda #($c1 | (bank << 1))
    sta PORTB
.endmacro
; imul8xe_init
; Builds the multiplication lookup table by selecting banks 0..3 in turn and
; calling imul8xe_init_section once per bank.
;
; arg1 (FR1) and arg2 (FR2) are zeroed first; arg2 is not reset between
; sections, so each section carries on from where the previous one stopped.
; Registers are not preserved. Returns with bank 3 still selected in PORTB.
.proc imul8xe_init
    ; go through the input set, in four 16KB chunks
    arg1 = FR1
    arg2 = FR2

    lda #$00
    sta arg1
    sta arg2

    ; $00 * $00 -> $3f * $ff
    bank_switch 0
    jsr imul8xe_init_section

    ; $40 * $00 -> $7f * $ff
    bank_switch 1
    jsr imul8xe_init_section

    ; $80 * $00 -> $bf * $ff
    bank_switch 2
    jsr imul8xe_init_section

    ; $c0 * $00 -> $ff * $ff
    bank_switch 3
    jsr imul8xe_init_section

    rts
.endproc
; Initialize a 16 KB chunk of the table
;
; Fills the window at EXTENDED_RAM ($4000-$7fff) of the currently selected
; bank. Each page holds the entries for one value of arg2: 128 little-endian
; 16-bit products (even arg1) * arg2 for arg1 = 0, 2, 4, ... 254. This is the
; layout the imul8xe macro reads (low byte at offset 0, high byte at offset 1).
;
; input:    arg2 (FR2) = multiplier for the first page of this bank
;           arg1 (FR1) = even multiplier counter (0 on entry from imul8xe_init)
; output:   arg2 advanced by $40, ready for the next bank;
;           arg1 back at its entry value
; clobbers: a, y, result (FR0, FR0+1), temp2, temp2+1
.proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    result = FR0
    ptr = temp2

    ; point at the start of the bank window
    lda #<EXTENDED_RAM
    sta ptr
    lda #>EXTENDED_RAM
    sta ptr + 1

    ldy #0

    ; outer loop: one page per arg2 value, $40 pages per bank
outer_loop:
    ; reset result to 0  (0 * arg2)
    lda #0
    sta result
    sta result + 1

    ; inner loop: arg1 = $00, $02, ... $fe
inner_loop:
    ; copy result to data set: low byte at +0, high byte at +1
    lda result
    sta (ptr),y
    iny
    lda result + 1
    sta (ptr),y
    dey

    ; result += 2 * arg2, as two 16-bit additions of arg2
    .repeat 2
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1
    .endrepeat

    ; inner loop check: step to the next even arg1 / next 2-byte slot;
    ; the low pointer byte wraps to 0 after 128 entries
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop

    ; outer loop check: next arg2, next page; stop at the end of the window
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #>(EXTENDED_RAM + $4000)
    bne outer_loop

    rts
.endproc

File diff suppressed because it is too large Load diff

View file

@ -1,15 +0,0 @@
/**
* The UI and I/O wrapper for the Mandelbrot runner, in C.
*
* For the moment *all* logic is in mandel-core.s, I'm just
* trying to get this to run within a cc65 environment.
* Eventually just the inner loop fun will live in there.
*/
#include <stdlib.h>
#include <stdio.h>
#include "mandel.h"
/* Program entry point: does nothing but call mandel_start(). */
void main(void) {
mandel_start();
}

View file

@ -1,4 +0,0 @@
#include <inttypes.h>
// From mandel-core.s:
extern void mandel_start(void);

1462
mandel.s Normal file

File diff suppressed because it is too large Load diff

View file

@ -14,37 +14,33 @@ Non-goals:
Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
-- brooke, january 2023 - december 2024
-- brooke, january 2023 - february 2024
## Current state
Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.
The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
Iterations are capped at 255.
The pixels are run in a progressive layout to get the basic shape on screen faster.
There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
## Next steps
There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
There's some cute color cycling.
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
(done)
## Deps and build instructions
I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
## Todo
See ideas in `todo.md`.

View file

@ -22,10 +22,7 @@ console.log(
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i + 1) / 2 for the multiplier
.align 256
mul_lobyte256:
${db((i) => squares[i] & 0xff)}
@ -38,13 +35,4 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`);

17
todo.md
View file

@ -1,17 +0,0 @@
things to try:
* fix status bar to show elapsed time, per-iter time, per-pixel iter count
* 'turbo' mode disabling graphics in full or part
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* maybe clean up the load/layout of the big mul table
* consider alternate lookup tables in the top 16KB under ROM
* y-axis mirror optimization
* extract viewport for display & re-input via keyboard
* fujinet screenshot/viewport uploader