shave some cycles off 16-bit squaring with shift instead of add

also fix the comments about how many cycles shift takes
unify tables for squaring and multiplication
2024-12-31 15:29:40 -08:00 · 2024-12-31 02:26:24 -08:00 · 2024-12-31 02:22:31 -08:00 · 2024-12-31 02:01:45 -08:00
8 changed files with 349 additions and 983 deletions
--- a/10
+++ b/10
@@ -2,11 +2,8 @@

 all : mandel.xex

-mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg
-	ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib
-
-mandel.s : mandel.c mandel.h
-	cc65 -o $@ mandel.c
+mandel.xex : mandel.o tables.o atari-asm-xex.cfg
+	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o

 %.o : %.s
 	ca65 -o $@ $<
@@ -16,7 +13,6 @@ tables.s : tables.js

 clean :
 	rm -f tables.s
-	rm -f mandel.s
 	rm -f *.o
 	rm -f *.xex
-	rm -f mandel.map
+
--- a/atari-xex.cfg
+++ b/atari-xex.cfg
@@ -1,69 +0,0 @@
-# Sample linker configuration for C programs using the Atari binary file support.
-# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex
-FEATURES {
-    STARTADDRESS: default = $8000;
-}
-SYMBOLS {
-    __SYSTEM_CHECK__:    type = import;  # force inclusion of "system check" load chunk
-    __STACKSIZE__:       type = weak, value = $0800; # 2k stack
-    __STARTADDRESS__:    type = export, value = %S;
-    __RESERVED_MEMORY__: type = weak, value = $0000;
-    __SYSCHKHDR__:       type = export, value = 0; # Disable system check header
-    __SYSCHKTRL__:       type = export, value = 0; # Disable system check trailer
-    __TABLESEG_START__:    type = weak, value = $2E00 + $0300;
-    __TABLESEG_SIZE__:     type = weak, value = 6 * $100;
-    __BANKSY_START__:  type = weak, value = $4000;
-    __BANKSY_SIZE__:   type = weak, value = $4000;
-    __FRAMEBUFFER_START__: type = weak, value = $A000;
-}
-MEMORY {
-# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP.
-    ZP:         file = "", define = yes, start = $0082, size = $007E;
-# "system check" load chunk
-    SYSCHKCHNK: file = %O,               start = $2E00, size = $0300;
-# Note $a000-$bfff is against the BASIC cartridge, may require booting with OPTION.
-    TABLES:     file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__;
-# We reserve $4000-7fff for the bank-switch window.
-# In theory we could keep data and code here that we only use on 48k/64k systems.
-    BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__;
-# "main program" load chunk
-    MAIN:       file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S;
-}
-FILES {
-    %O: format = atari;
-}
-FORMATS {
-    atari: runad = start,
-           initad = SYSCHKCHNK: __SYSTEM_CHECK__;
-}
-SEGMENTS {
-    ZEROPAGE:  load = ZP,         type = zp;
-    EXTZP:     load = ZP,         type = zp,                optional = yes;
-    SYSCHK:    load = SYSCHKCHNK, type = rw,  define = yes, optional = yes;
-    TABLES:    load = TABLES,     type = ro,  optional = yes, align = 256;
-    BANKSWICH: load = BANKSWITCH, type = ro,  optional = yes;
-    STARTUP:   load = MAIN,       type = ro,  define = yes;
-    LOWBSS:    load = MAIN,       type = rw,                optional = yes;  # not zero initialized
-    LOWCODE:   load = MAIN,       type = ro,  define = yes, optional = yes;
-    ONCE:      load = MAIN,       type = ro,                optional = yes;
-    CODE:      load = MAIN,       type = ro,  define = yes;
-    RODATA:    load = MAIN,       type = ro;
-    DATA:      load = MAIN,       type = rw;
-    INIT:      load = MAIN,       type = rw,                optional = yes;
-    BSS:       load = MAIN,       type = bss, define = yes;
-}
-FEATURES {
-    CONDES: type    = constructor,
-            label   = __CONSTRUCTOR_TABLE__,
-            count   = __CONSTRUCTOR_COUNT__,
-            segment = ONCE;
-    CONDES: type    = destructor,
-            label   = __DESTRUCTOR_TABLE__,
-            count   = __DESTRUCTOR_COUNT__,
-            segment = RODATA;
-    CONDES: type    = interruptor,
-            label   = __INTERRUPTOR_TABLE__,
-            count   = __INTERRUPTOR_COUNT__,
-            segment = RODATA,
-            import  = __CALLIRQ__;
-}
--- a/mandel.c
+++ b/mandel.c
@@ -1,15 +0,0 @@
-/**
- * The UI and I/O wrapper for the Mandelbrot runner, in C.
- *
- * For the moment *all* logic is in mandel-core.s, I'm just
- * trying to get this to run within a cc65 environment.
- * Eventually just the inner loop fun will live in there.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "mandel.h"
-
-void main(void) {
-    mandel_start();
-}
--- a/mandel.h
+++ b/mandel.h
@@ -1,4 +0,0 @@
-#include <inttypes.h>
-
-// From mandel-core.s:
-extern void mandel_start(void);
--- a/mandel-core.s
+++ b/mandel-core.s
--- a/readme.md
+++ b/readme.md
@@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g

 ## Current state

-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.

 The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.

@@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication

-The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.

 Iterations are capped at 255.

@@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e

 ## Todo

-See ideas in `todo.md`.
+See ideas in `todo.md`.
--- a/tables.js
+++ b/tables.js
@@ -11,40 +11,19 @@ function db(func) {
    return lines.join('\n');
 }

-let squares = [];
-for (let i = 0; i < 512; i++) {
-    squares.push(Math.trunc((i * i + 1) / 2));
-}
-
 console.log(
 `.segment "TABLES"

-.export mul_lobyte256
-.export mul_hibyte256
-.export mul_hibyte512
-.export sqr_lobyte
-.export sqr_hibyte
+.export mul_lobyte
+.export mul_hibyte

-; (i * i + 1) / 2 for the multiplier
+; (i * i) / 2 for the multiplier
 .align 256
-mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
+mul_lobyte:
+${db((i) => ((i * i) >> 1) & 0xff)}

 .align 256
-mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
-
-.align 256
-mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
-
-; (i * i) for the plain squares
-.align 256
-sqr_lobyte:
-${db((i) => (i * i) & 0xff)}
-
-.align 256
-sqr_hibyte:
-${db((i) => ((i * i) >> 8) & 0xff)}
+mul_hibyte:
+${db((i) => ((i * i) >> 9) & 0xff)}

 `);
--- a/todo.md
+++ b/todo.md
@@ -1,17 +1,15 @@
 things to try:

-* fix status bar to show elapsed time, per-iter time, per-pixel iter count
-
-* 'turbo' mode disabling graphics in full or part
-
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D

-* maybe clean up the load/layout of the big mul table
-
-* consider alternate lookup tables in the top 16KB under ROM
+* try 3.13 fixed point instead of 4.12 for more precision
+  * can we get away without the extra bit?

 * y-axis mirror optimization

+* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
+  * maybe redo tiering to just 4x4, 2x2, 1x1?
+
 * extract viewport for display & re-input via keyboard

 * fujinet screenshot/viewport uploader
Author	SHA1	Message	Date
Jamey Sharp	3553ce986f	shave some cycles off 16-bit squaring with shift instead of add; also fix the comments about how many cycles shift takes	2024-12-31 15:29:40 -08:00
Jamey Sharp	0f49760aa5	unify tables for squaring and multiplication	2024-12-31 02:26:24 -08:00
Jamey Sharp	f06aed0c00	set results from both 8-bit squares first. Since the results from the lo and hi squares don't overlap or overflow, they can be written directly to the final output location without doing any addition. Then only the multiplication that goes in the middle needs any adds.	2024-12-31 02:22:31 -08:00
Jamey Sharp	aee587388d	eliminate mul_hibyte512 table. This costs an extra half cycle on average, assuming uniform distribution of multiplication inputs. I don't think a half cycle is worth an extra 256-byte table.	2024-12-31 02:01:45 -08:00