7 changed files with 489 additions and 669 deletions
--- a/4
+++ b/4
@ -2,8 +2,8 @@
 all : mandel.xex
-mandel.xex : mandel.o tables.o atari-asm-xex.cfg
+mandel.xex : mandel.o tables.o
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 %.o : %.s
 	ca65 -o $@ $<
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@ -1,28 +0,0 @@
 FEATURES {
    STARTADDRESS: default = $2E00;
 }
 SYMBOLS {
    __STARTADDRESS__: type = export, value = %S;
 }
 MEMORY {
    ZP:      file = "", define = yes, start = $0082, size = $007E;
    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
    # Keep $4000-7fff clear for expanded RAM access window
    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
    # Keep $a000-$bfff clear for BASIC cartridge
 }
 FILES {
    %O: format = atari;
 }
 FORMATS {
    atari: runad = start;
 }
 # Segment-to-memory-area mapping. Every attribute is comma-separated so
 # that all lines follow the same syntax.
 SEGMENTS {
    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
    CODE:     load = MAIN,    type = rw,                  define = yes;
    RODATA:   load = MAIN,    type = ro,  optional = yes;
    DATA:     load = MAIN,    type = rw,  optional = yes;
    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
    # Lookup tables live in their own area above the expanded-RAM window.
    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
 }
--- a/imul8xe.s
+++ b/imul8xe.s
@ -0,0 +1,175 @@
 ; Zero-page workspace and hardware addresses used by the expanded-RAM
 ; 8x8 multiply.
 FR0    = $d4 ; float48
 PORTB = $d301
 EXTENDED_RAM = $4000 ; 16KiB bank on the XE
 ; bankswitch[v] = PORTB value selecting the bank named by the top two
 ; bits of v, so "ldx value / lda bankswitch,x" replaces shifting and
 ; masking at run time. Page-aligned so the indexed read never crosses
 ; a page boundary. The encoding matches the bank_switch macro:
 ; $c1 | (bank << 1).
 ; NOTE(review): this places the bank number in PORTB bits 1-2; 130XE
 ; documentation describes the bank select as bits 2-3 -- confirm
 ; against hardware/emulator before relying on four distinct banks.
 .align 256
 bankswitch:
    .repeat 256, value
        .byte $c1 | ((value >> 6) << 1)
    .endrepeat
 ; imul8xe dest, arg1, arg2
 ; Unsigned 8-bit x 8-bit -> 16-bit multiply using the product table
 ; held in expanded RAM.
 ;   dest : 4 bytes; dest..dest+1 receive the product (low, high),
 ;          dest+2..dest+3 are used as a scratch pointer
 ;   arg1 : multiplier; only bits 7..1 index the table, bit 0 is
 ;          handled by one extra add of arg2 at the end
 ;   arg2 : multiplicand; top 2 bits pick the bank, low 6 bits the page
 ; 58-77 cycles
 ; clobbers a, x, y, dest to dest + 3
 ; Leaves PORTB switched to the expanded-RAM bank (see note below).
 .macro imul8xe dest, arg1, arg2
 .local done
 .local output
 .local ptr
    output = dest
    ptr = dest + 2 ; scratch space assumed
    ; bottom 14 bits except the LSB are the per-bank table index
    ; add $4000 for the bank pointer
    lda arg1     ; 3 cyc
    and #$fe     ; 2 cyc
    sta ptr      ; 3 cyc
    lda arg2     ; 3 cyc
    and #$3f     ; 2 cyc
    clc          ; 2 cyc
    adc #$40     ; 2 cyc
    sta ptr + 1  ; 3 cyc
    ; top 2 bits are the table bank selector
    ; (index the bankswitch lookup table; bank_switch is the macro name,
    ; not a data label)
    ldx arg2          ; 3 cyc
    lda bankswitch,x  ; 4 cyc
    sta PORTB         ; 4 cyc
    ; copy the entry into output
    ldy #0       ; 2 cyc
    lda (ptr),y  ; 5 cyc
    sta output   ; 3 cyc
    iny          ; 2 cyc
    lda (ptr),y  ; 5 cyc
    sta output+1 ; 3 cyc
    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; if we need to use them anywhere else
    ;;; restore memory
    ;;lda #$81     ; 2 cyc - disabled
    ;;sta PORTB    ; 4 cyc - disabled
    ; check that 1 bit we skipped to fit into space
    lda arg1     ; 3 cyc
    and #1       ; 2 cyc
    beq done     ; 2 cyc
    ; add the second param one last time for the skipped bit
    clc          ; 2 cyc
    lda arg2     ; 3 cyc
    adc output   ; 3 cyc
    sta output   ; 3 cyc
    lda #0       ; 2 cyc
    adc output+1 ; 3 cyc
    sta output+1 ; 3 cyc
 done:
 .endmacro
 ; bank_switch bank
 ; Store the PORTB value for a constant bank number (0-3 as used by
 ; imul8xe_init). Clobbers a.
 .macro bank_switch bank
    lda #($c1 | (bank << 1))
    sta PORTB
 .endmacro
 ; Build the full multiplication table used by the imul8xe macro by
 ; filling four 16 KB expanded-RAM banks in turn.
 ; arg2 is zeroed here and carried across the four calls, so each call to
 ; imul8xe_init_section continues where the previous one stopped.
 ; Clobbers a plus whatever imul8xe_init_section clobbers; PORTB is left
 ; selecting the last bank written.
 ; NOTE(review): FR1 and FR2 are not defined in this part of the file;
 ; presumably they are declared elsewhere -- confirm.
 .proc imul8xe_init
    ; go through the input set, in four 16KB chunks
    arg1 = FR1
    arg2 = FR2
    result = FR0
    lda #$00
    sta arg1
    sta arg2
    ; $00 * $00 -> $3f * $ff
    bank_switch 0
    jsr imul8xe_init_section
    ; $40 * $00 -> $7f * $ff
    bank_switch 1
    jsr imul8xe_init_section
    ; $80 * $00 -> $bf * $ff
    bank_switch 2
    jsr imul8xe_init_section
    ; $c0 * $00 -> $ff * $ff
    bank_switch 3
    jsr imul8xe_init_section
    rts
 .endproc
 ; Initialize a 16 KB chunk of the table: fills the EXTENDED_RAM window
 ; ($4000-$7fff) of the currently selected bank.
 ; Layout matches the lookup in imul8xe: one 256-byte page per arg2
 ; value, holding 128 little-endian 16-bit entries (even arg1) * arg2.
 ; input: arg2 (FR2) = first multiplicand for this bank, arg1 (FR1)
 ; output: arg2 advanced by $40; arg1 back at its entry value
 ; clobbers: a, x, y, result (FR0..FR0+1), ptr (temp2..temp2+1)
 .proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    result = FR0
    ptr = temp2
    lda #<EXTENDED_RAM
    sta ptr
    lda #>EXTENDED_RAM
    sta ptr + 1
    ldx #0      ; not used by the loops; kept so x still returns as 0
    ldy #0
    ; outer loop: $00 -> $3f
 outer_loop:
    ; reset result to 0
    lda #0
    sta result
    sta result + 1
    ; inner loop: $00 -> $ff
 inner_loop:
    ; copy result to data set: low byte at offset 0, high byte at
    ; offset 1 (imul8xe reads them back with y = 0 and y = 1)
    lda result
    sta (ptr),y
    iny
    lda result + 1
    sta (ptr),y
    dey         ; back to offset 0 for the next entry
    ; result += 2 * arg2, as two 16-bit adds of arg2
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1
    ; inner loop check: two bytes per entry, so the low pointer byte
    ; wraps to 0 after 128 entries
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop
    ; outer loop check: stop after 64 pages, when the pointer reaches
    ; the end of the 16 KB window (comparing against the start page
    ; would run through all 256 pages of memory)
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #>(EXTENDED_RAM + $4000)
    bne outer_loop
    rts
 .endproc
--- a/mandel.s
+++ b/mandel.s
--- a/readme.md
+++ b/readme.md
@ -14,37 +14,33 @@ Non-goals:
 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
-- brooke, january 2023 - december 2024
+-- brooke, january 2023 - february 2024
 ## Current state
-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
+Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
-The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
-* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
+The main loop is a basic add-and-shift using 16-bit adds, which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). It runs in 470-780 cycles depending on input.
 * an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
-The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
 Iterations are capped at 255.
 The pixels are run in a progressive layout to get the basic shape on screen faster.
-There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
+## Next steps
-There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
-There's some cute color cycling.
+Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
 (done)
 ## Deps and build instructions
 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
 ## Todo
 See ideas in `todo.md`.
--- a/tables.js
+++ b/tables.js
@ -22,10 +22,7 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
 .export sqr_lobyte
 .export sqr_hibyte
 ; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@ -38,13 +35,4 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 ; (i * i) for the plain squares
 .align 256
 sqr_lobyte:
 ${db((i) => (i * i) & 0xff)}
 .align 256
 sqr_hibyte:
 ${db((i) => ((i * i) >> 8) & 0xff)}
 `);
--- a/todo.md
+++ b/todo.md
@ -1,19 +0,0 @@
 things to try:
 * skip add on the top-byte multiply in sqr8/mul8
  * should save a few cycles, suggestion by jamey
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 * try 3.13 fixed point instead of 4.12 for more precision
  * can we get away without the extra bit?
  * since exit compare space would be 6.26 i think so
 * y-axis mirror optimization
 * 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
  * maybe redo tiering to just 4x4, 2x2, 1x1?
 * extract viewport for display & re-input via keyboard
 * fujinet screenshot/viewport uploader