7 changed files with 489 additions and 669 deletions
--- a/4
+++ b/4
@ -2,8 +2,8 @@

 all : mandel.xex

-mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+

 %.o : %.s
 	ca65 -o $@ $<
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@ -1,28 +0,0 @@
-FEATURES {
-    STARTADDRESS: default = $2E00;
-}
-SYMBOLS {
-    __STARTADDRESS__: type = export, value = %S;
-}
-MEMORY {
-    ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
-    # Keep $4000-7fff clear for expanded RAM access window
-    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
-    # Keep $a000-$bfff clear for BASIC cartridge
-}
-FILES {
-    %O: format = atari;
-}
-FORMATS {
-    atari: runad = start;
-}
-SEGMENTS {
-    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
-    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
-    CODE:     load = MAIN,    type = rw,                  define = yes;
-    RODATA:   load = MAIN,    type = ro   optional = yes;
-    DATA:     load = MAIN,    type = rw   optional = yes;
-    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
-}
--- a/imul8xe.s
+++ b/imul8xe.s
@ -0,0 +1,175 @@
+; Addresses used by the expanded-RAM multiplier.
+FR0          = $d4   ; float48 (used as a zero-page workspace)
+PORTB        = $d301 ; written to select the expanded-RAM bank
+
+
+EXTENDED_RAM = $4000 ; 16KiB bank on the XE
+
+; Page-aligned lookup table: index by a byte, get the PORTB value that
+; selects the bank named by that byte's top two bits (bank n -> (n << 1) | $c1).
+; Page alignment means an absolute,x read never crosses a page.
+; NOTE(review): this places the bank number in PORTB bits 1-2, the same
+; encoding the bank_switch macro uses. Confirm against the 130XE PORTB
+; layout, where bank select is usually documented as bits 2-3.
+.align 256
+bankswitch:
+    .repeat 256, i
+        .byte $c1 | ((i >> 6) << 1)
+    .endrepeat
+
+; imul8xe dest, arg1, arg2
+; 8-bit x 8-bit -> 16-bit unsigned multiply via the table in expanded RAM.
+;   dest, dest+1   receive the product (low byte first)
+;   dest+2, dest+3 are used as a scratch pointer (must be in zero page)
+;   arg1, arg2     are read only
+; 59-77 cycles with zero-page operands (59 when arg1 is even, 77 when odd)
+; clobbers a, x, y, dest to dest + 3
+; Leaves the selected expanded-RAM bank mapped in (PORTB is not restored).
+.macro imul8xe dest, arg1, arg2
+.local done
+.local output
+.local ptr
+
+    output = dest
+    ptr = dest + 2 ; scratch space assumed
+
+    ; bottom 14 bits except the LSB are the per-bank table index
+    ; add $4000 for the bank pointer
+    lda arg1     ; 3 cyc
+    and #$fe     ; 2 cyc
+    sta ptr      ; 3 cyc
+    lda arg2     ; 3 cyc
+    and #$3f     ; 2 cyc
+    clc          ; 2 cyc
+    adc #$40     ; 2 cyc
+    sta ptr + 1  ; 3 cyc
+
+    ; top 2 bits are the table bank selector
+    ; (the lookup table label is `bankswitch`; `bank_switch` is the macro
+    ; that takes a constant bank number and cannot be indexed)
+    ldx arg2          ; 3 cyc
+    lda bankswitch,x  ; 4 cyc - table is page-aligned, no page-cross penalty
+    sta PORTB         ; 4 cyc
+
+
+    ; copy the entry into output
+    ; (ptr is even and y <= 1, so these reads never cross a page)
+    ldy #0       ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output   ; 3 cyc
+    iny          ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output+1 ; 3 cyc
+
+    ; note: we are not restoring memory to save 6 cycles!
+    ; this means those 16kb have to be switched back to base RAM
+    ; if we need to use them anywhere else
+    ;;; restore memory
+    ;;lda #$81     ; 2 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
+
+    ; check that 1 bit we skipped to fit into space
+    lda arg1     ; 3 cyc
+    and #1       ; 2 cyc
+    beq done     ; 2 cyc (3 when taken)
+
+    ; add the second param one last time for the skipped bit:
+    ; the table holds (arg1 & $fe) * arg2, so odd arg1 needs + arg2
+    clc          ; 2 cyc
+    lda arg2     ; 3 cyc
+    adc output   ; 3 cyc
+    sta output   ; 3 cyc
+    lda #0       ; 2 cyc
+    adc output+1 ; 3 cyc
+    sta output+1 ; 3 cyc
+
+done:
+.endmacro
+
+; bank_switch bank
+; Map expanded-RAM bank `bank` (a constant; callers here pass 0-3) by
+; writing PORTB. Uses the same encoding as the bankswitch lookup table.
+; clobbers a
+.macro bank_switch bank
+    lda #($c1 | (bank << 1))
+    sta PORTB
+.endmacro
+
+; Build the full multiplication table used by the imul8xe macro.
+; Selects each of the four expanded-RAM banks in turn and lets
+; imul8xe_init_section fill it; arg2 carries over between sections so
+; each bank covers the next $40 multiplier values.
+; clobbers: a, plus everything imul8xe_init_section clobbers
+; On return bank 3 is still mapped in (PORTB is not restored).
+; NOTE(review): FR1 and FR2 are not defined in this file; assumed to be
+; zero-page equates provided by the including source - confirm.
+.proc imul8xe_init
+
+    ; go through the input set, in four 16KB chunks
+
+    arg1 = FR1
+    arg2 = FR2
+
+    ; start both multipliers at zero
+    lda #$00
+    sta arg1
+    sta arg2
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc
+
+; Initialize a 16 KB chunk of the table (the bank currently mapped at $4000)
+; Layout matches the imul8xe macro: page $40 + (arg2 & $3f), offset
+; (arg1 & $fe), holds the 16-bit product (arg1 & $fe) * arg2, low byte first.
+; input:  arg2 (FR2) = first multiplier for this bank; the bank must
+;         already be selected via PORTB
+; output: arg2 advanced by $40, ready for the next bank; arg1 (FR1) is
+;         stepped by 2 per entry and wraps back to its entry value
+; clobbers: a, y, result (FR0, FR0+1), ptr (temp2, temp2+1)
+; NOTE(review): FR1, FR2 and temp2 are not defined in this file; temp2
+; must be a zero-page pair for the (ptr),y addressing - confirm.
+.proc imul8xe_init_section
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+    ptr = temp2
+
+    ; ptr = $4000, the start of the bank window
+    lda #$00
+    sta ptr
+    lda #$40
+    sta ptr + 1
+
+    ; outer loop: one 256-byte page per arg2 value, pages $40 -> $7f
+outer_loop:
+
+    ; reset result to 0 (product for arg1 = 0)
+    lda #0
+    sta result
+    sta result + 1
+
+    ; inner loop: even arg1 values $00 -> $fe, one 2-byte entry each
+inner_loop:
+
+    ; copy result to data set: low byte at offset 0, high byte at offset 1
+    ldy #0
+    lda result
+    sta (ptr),y
+    iny
+    lda result + 1
+    sta (ptr),y
+
+    ; result += 2 * arg2 (arg1 advances by 2 per entry)
+    clc
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result + 1
+    clc             ; already clear (products fit in 16 bits); be explicit
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result + 1
+
+    ; inner loop check: the low pointer byte wraps to 0 after 128 entries
+    inc arg1
+    inc arg1
+    inc ptr
+    inc ptr
+    bne inner_loop
+
+    ; outer loop check: stop once the pointer leaves the $4000-$7fff window
+    inc arg2
+    inc ptr + 1
+    lda ptr + 1
+    cmp #$80
+    bne outer_loop
+
+    rts
+
+.endproc
--- a/mandel.s
+++ b/mandel.s
--- a/readme.md
+++ b/readme.md
@ -14,37 +14,33 @@ Non-goals:

 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.

-- brooke, january 2023 - december 2024
+-- brooke, january 2023 - february 2024

 ## Current state

-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
+Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.

-The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.

-* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
-* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
-* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
-* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
+The main loop is a basic add-and-shift, using 16-bit adds, which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). It runs in 470-780 cycles depending on input.

-The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.

 Iterations are capped at 255.

 The pixels are run in a progressive layout to get the basic shape on screen faster.

-There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
+## Next steps

-There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!

-There's some cute color cycling.
+Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+
+I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
+(done)

 ## Deps and build instructions

 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.

 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
-
-## Todo
-
-See ideas in `todo.md`.
--- a/tables.js
+++ b/tables.js
@ -22,10 +22,7 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
-.export sqr_lobyte
-.export sqr_hibyte

-; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@ -38,13 +35,4 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}

-; (i * i) for the plain squares
-.align 256
-sqr_lobyte:
-${db((i) => (i * i) & 0xff)}
-
-.align 256
-sqr_hibyte:
-${db((i) => ((i * i) >> 8) & 0xff)}
-
 `);
--- a/todo.md
+++ b/todo.md
@ -1,19 +0,0 @@
-things to try:
-
-* skip add on the top-byte multiply in sqr8/mul8
-  * should save a few cycles, suggestion by jamey
-
-* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
-
-* try 3.13 fixed point instead of 4.12 for more precision
-  * can we get away without the extra bit?
-  * since exit compare space would be 6.26 i think so
-
-* y-axis mirror optimization
-
-* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
-  * maybe redo tiering to just 4x4, 2x2, 1x1?
-
-* extract viewport for display & re-input via keyboard
-
-* fujinet screenshot/viewport uploader