Compare commits

..

4 commits

Author SHA1 Message Date
3553ce986f shave some cycles off 16-bit squaring with shift instead of add
also fix the comments about how many cycles shift takes
2024-12-31 15:29:40 -08:00
0f49760aa5 unify tables for squaring and multiplication 2024-12-31 02:26:24 -08:00
f06aed0c00 set results from both 8-bit squares first
Since the results from the lo and hi squares don't overlap or overflow,
they can be written directly to the final output location without doing
any addition. Then only the multiplication that goes in the middle needs
any adds.
2024-12-31 02:22:31 -08:00
aee587388d eliminate mul_hibyte512 table
This costs an extra half cycle on average, assuming uniform distribution
of multiplication inputs. I don't think a half cycle is worth an extra
256-byte table.
2024-12-31 02:01:45 -08:00
8 changed files with 331 additions and 964 deletions

View file

@ -2,11 +2,8 @@
all : mandel.xex
mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg
ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib
mandel.s : mandel.c mandel.h
cc65 -o $@ mandel.c
mandel.xex : mandel.o tables.o atari-asm-xex.cfg
ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
%.o : %.s
ca65 -o $@ $<
@ -16,7 +13,6 @@ tables.s : tables.js
clean :
rm -f tables.s
rm -f mandel.s
rm -f *.o
rm -f *.xex
rm -f mandel.map

View file

@ -1,69 +0,0 @@
# Sample linker configuration for C programs using the Atari binary file support.
# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex
FEATURES {
STARTADDRESS: default = $8000;
}
SYMBOLS {
__SYSTEM_CHECK__: type = import; # force inclusion of "system check" load chunk
__STACKSIZE__: type = weak, value = $0800; # 2k stack
__STARTADDRESS__: type = export, value = %S;
__RESERVED_MEMORY__: type = weak, value = $0000;
__SYSCHKHDR__: type = export, value = 0; # Disable system check header
__SYSCHKTRL__: type = export, value = 0; # Disable system check trailer
__TABLESEG_START__: type = weak, value = $2E00 + $0300;
__TABLESEG_SIZE__: type = weak, value = 6 * $100;
__BANKSY_START__: type = weak, value = $4000;
__BANKSY_SIZE__: type = weak, value = $4000;
__FRAMEBUFFER_START__: type = weak, value = $A000;
}
MEMORY {
# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP.
ZP: file = "", define = yes, start = $0082, size = $007E;
# "system check" load chunk
SYSCHKCHNK: file = %O, start = $2E00, size = $0300;
# Note: $A000-$BFFF overlaps the BASIC cartridge address range; may require booting with OPTION held.
TABLES: file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__;
# We reserve $4000-7fff for the bank-switch window.
# In theory we could keep data and code here that we only use on 48k/64k systems.
BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__;
# "main program" load chunk
MAIN: file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S;
}
FILES {
%O: format = atari;
}
FORMATS {
atari: runad = start,
initad = SYSCHKCHNK: __SYSTEM_CHECK__;
}
SEGMENTS {
ZEROPAGE: load = ZP, type = zp;
EXTZP: load = ZP, type = zp, optional = yes;
SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes;
TABLES: load = TABLES, type = ro, optional = yes, align = 256;
BANKSWICH: load = BANKSWITCH, type = ro, optional = yes;
STARTUP: load = MAIN, type = ro, define = yes;
LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized
LOWCODE: load = MAIN, type = ro, define = yes, optional = yes;
ONCE: load = MAIN, type = ro, optional = yes;
CODE: load = MAIN, type = ro, define = yes;
RODATA: load = MAIN, type = ro;
DATA: load = MAIN, type = rw;
INIT: load = MAIN, type = rw, optional = yes;
BSS: load = MAIN, type = bss, define = yes;
}
FEATURES {
CONDES: type = constructor,
label = __CONSTRUCTOR_TABLE__,
count = __CONSTRUCTOR_COUNT__,
segment = ONCE;
CONDES: type = destructor,
label = __DESTRUCTOR_TABLE__,
count = __DESTRUCTOR_COUNT__,
segment = RODATA;
CONDES: type = interruptor,
label = __INTERRUPTOR_TABLE__,
count = __INTERRUPTOR_COUNT__,
segment = RODATA,
import = __CALLIRQ__;
}

View file

@ -1,15 +0,0 @@
/**
* The UI and I/O wrapper for the Mandelbrot runner, in C.
*
* For the moment *all* logic is in mandel-core.s; I'm just
* trying to get this to run within a cc65 environment.
* Eventually only the inner loop will live in there.
*/
#include <stdlib.h>
#include <stdio.h>
#include "mandel.h"
void main(void) {
mandel_start();
}

View file

@ -1,4 +0,0 @@
#include <inttypes.h>
// From mandel-core.s:
extern void mandel_start(void);

File diff suppressed because it is too large Load diff

View file

@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g
## Current state
Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The Mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
The Mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
Iterations are capped at 255.
@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e
## Todo
See ideas in `todo.md`.
See ideas in `todo.md`.

View file

@ -11,40 +11,19 @@ function db(func) {
return lines.join('\n');
}
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log(
`.segment "TABLES"
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
.export mul_lobyte
.export mul_hibyte
; (i * i + 1) / 2 for the multiplier
; (i * i) / 2 for the multiplier
.align 256
mul_lobyte256:
${db((i) => squares[i] & 0xff)}
mul_lobyte:
${db((i) => ((i * i) >> 1) & 0xff)}
.align 256
mul_hibyte256:
${db((i) => (squares[i] >> 8) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
mul_hibyte:
${db((i) => ((i * i) >> 9) & 0xff)}
`);

12
todo.md
View file

@ -1,17 +1,15 @@
things to try:
* fix status bar to show elapsed time, per-iter time, per-pixel iter count
* 'turbo' mode disabling graphics in full or part
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* maybe clean up the load/layout of the big mul table
* consider alternate lookup tables in the top 16KB under ROM
* try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra bit?
* y-axis mirror optimization
* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
* maybe redo tiering to just 4x4, 2x2, 1x1?
* extract viewport for display & re-input via keyboard
* fujinet screenshot/viewport uploader