9 changed files with 299 additions and 1357 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,3 @@
 *.o
 *.xex
-tables.s
 .DS_Store
--- a/.mailmap
+++ b/.mailmap
@ -1,2 +0,0 @@
-Brooke Vibber <bvibber@pobox.com>
-Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>
--- a/8
+++ b/8
@ -2,17 +2,13 @@

 all : mandel.xex

-mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+%.xex : %.o
+	ld65 -C atari-asm-xex.cfg -o $@ $<

 %.o : %.s
 	ca65 -o $@ $<

-tables.s : tables.js
-	node tables.js > tables.s
-
 clean :
-	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex

--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@ -1,28 +0,0 @@
-FEATURES {
-    STARTADDRESS: default = $2E00;
-}
-SYMBOLS {
-    __STARTADDRESS__: type = export, value = %S;
-}
-MEMORY {
-    ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
-    # Keep $4000-7fff clear for expanded RAM access window
-    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
-    # Keep $a000-$bfff clear for BASIC cartridge
-}
-FILES {
-    %O: format = atari;
-}
-FORMATS {
-    atari: runad = start;
-}
-SEGMENTS {
-    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
-    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
-    CODE:     load = MAIN,    type = rw,                  define = yes;
-    RODATA:   load = MAIN,    type = ro   optional = yes;
-    DATA:     load = MAIN,    type = rw   optional = yes;
-    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
-}
--- a/mandel.s
+++ b/mandel.s
--- a/readme.md
+++ b/readme.md
@ -14,37 +14,30 @@ Non-goals:

 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.

-- brooke, january 2023 - december 2024
+-- brion, january 2023

 ## Current state

-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
+Basic rendering is functional, but there is no interactive behavior (zoom/pan) yet and no benchmarking has been done.

-The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.

-* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
-* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
-* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
-* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
+The main loop is a basic add-and-shift using 16-bit adds, which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). It runs in 470-780 cycles depending on input.

-The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
+The Mandelbrot calculations are done using 4.12-precision fixed-point numbers. It may be possible to squish this down to 3.13.

 Iterations are capped at 255.

-The pixels are run in a progressive layout to get the basic shape on screen faster.
+## Next steps

-There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
+Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves rendering speed!

-There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, the point can never escape. This is a big time saver in Fractint.

-There's some cute color cycling.
+I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.

 ## Deps and build instructions

 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.

 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
-
-## Todo
-
-See ideas in `todo.md`.
--- a/tables.js
+++ b/tables.js
@ -1,50 +0,0 @@
-function db(func) {
-    let lines = [];
-    for (let i = 0; i < 256; i += 16) {
-        let items = [];
-        for (let j = 0; j < 16; j++) {
-            let x = i + j;
-            items.push(func(x));
-        }
-        lines.push('    .byte ' + items.join(', '));
-    }
-    return lines.join('\n');
-}
-
-let squares = [];
-for (let i = 0; i < 512; i++) {
-    squares.push(Math.trunc((i * i + 1) / 2));
-}
-
-console.log(
-`.segment "TABLES"
-
-.export mul_lobyte256
-.export mul_hibyte256
-.export mul_hibyte512
-.export sqr_lobyte
-.export sqr_hibyte
-
-; (i * i + 1) / 2 for the multiplier
-.align 256
-mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
-
-.align 256
-mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
-
-.align 256
-mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
-
-; (i * i) for the plain squares
-.align 256
-sqr_lobyte:
-${db((i) => (i * i) & 0xff)}
-
-.align 256
-sqr_hibyte:
-${db((i) => ((i * i) >> 8) & 0xff)}
-
-`);
--- a/testme.js
+++ b/testme.js
@ -1,41 +0,0 @@
-// a*x = (a + x)^2/2 - a^2/2 - x^2/2
-
-function half_square(x) {
-    return Math.round(x * x / 2) & 0xffff >>> 0;
-}
-
-function mul8(a, b) {
-    let result = half_square(a + b) & 0xffff;
-    result = (result - half_square(a)) & 0xffff;
-    result = (result - half_square(b)) & 0xffff;
-    result = (result + (b & a & 1)) & 0xffff;
-    return result >>> 0;
-}
-
-function mul16(a, b) {
-    let ah = (a & 0xff00) >>> 8;
-    let al = (a & 0x00ff) >>> 0;
-    let bh = (b & 0xff00) >>> 8;
-    let bl = (b & 0x00ff) >>> 0;
-    let result = (mul8(al, bl) & 0xffff) >>> 0;
-    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
-    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
-    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
-    return result;
-}
-
-let max = 65536;
-//let max = 256;
-//let max = 128;
-//let max = 8;
-
-for (let a = 0; a < max; a++) {
-    for (let b = 0; b < max; b++) {
-        let expected = Math.imul(a, b) >>> 0;
-        //let actual = mul8(a, b);
-        let actual = mul16(a, b);
-        if (expected !== actual) {
-            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
-        }
-    }
-}
--- a/todo.md
+++ b/todo.md
@ -1,17 +0,0 @@
-things to try:
-
-* fix status bar to show elapsed time, per-iter time, per-pixel iter count
-
-* 'turbo' mode disabling graphics in full or part
-
-* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
-
-* maybe clean up the load/layout of the big mul table
-
-* consider alternate lookup tables in the top 16KB under ROM
-
-* y-axis mirror optimization
-
-* extract viewport for display & re-input via keyboard
-
-* fujinet screenshot/viewport uploader