sixth viewport

tweak viewports
skip experimental 6th viewport that got forgotten and limit max zoom to 7 (range 0-7) which is what looks good
2025-01-01 21:15:38 -08:00 · 2025-01-01 15:45:26 -08:00 · 2025-01-01 15:37:12 -08:00 · 2024-12-31 22:10:27 -08:00 · 2024-12-31 20:13:11 -08:00 · 2024-12-31 17:49:13 -08:00
7 changed files with 707 additions and 504 deletions
--- a/4
+++ b/4
@ -2,8 +2,8 @@
 all : mandel.xex
-mandel.xex : mandel.o tables.o
+mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ $+
+	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
 %.o : %.s
 	ca65 -o $@ $<
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@ -0,0 +1,28 @@
 # ld65 linker configuration for an Atari XEX executable built from assembly.
 # Memory is split into two load areas so that $4000-$7fff stays free for the
 # expanded RAM access window and $a000-$bfff stays free for the BASIC cartridge.
 FEATURES {
    # Default load address; referenced below as %S.
    STARTADDRESS: default = $2E00;
 }
 SYMBOLS {
    __STARTADDRESS__: type = export, value = %S;
 }
 MEMORY {
    # Zero page area: not written to the output file (file = "").
    ZP:      file = "", define = yes, start = $0082, size = $007E;
    # Main program area: from the start address up to (not including) $4000.
    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
    # Keep $4000-7fff clear for expanded RAM access window
    # Lookup tables area: $8000-$9fff.
    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
    # Keep $a000-$bfff clear for BASIC cartridge
 }
 FILES {
    %O: format = atari;
 }
 FORMATS {
    # Run address of the executable is the symbol `start`.
    atari: runad = start;
 }
 SEGMENTS {
    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
    CODE:     load = MAIN,    type = rw,                  define = yes;
    RODATA:   load = MAIN,    type = ro   optional = yes;
    DATA:     load = MAIN,    type = rw   optional = yes;
    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
    # Page-aligned (align = 256) read-only tables, placed in the TABLES area.
    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
 }
--- a/imul8xe.s
+++ b/imul8xe.s
@ -1,175 +0,0 @@
 FR0    = $d4 ; float48
 PORTB = $d301
 EXTENDED_RAM = $4000 ; 16KiB bank on the XE
 ; Base PORTB value used while the expanded RAM window is mapped in:
 ;   bit 0 = 1  OS ROM enabled
 ;   bit 1 = 1  BASIC ROM disabled
 ;   bits 2-3   expanded RAM bank number (OR'd in by the users below)
 ;   bit 4 = 0  CPU sees expanded RAM at $4000-$7fff
 ;   bit 5 = 1  ANTIC keeps seeing main RAM
 ;   bit 7 = 1  self-test ROM disabled
 PORTB_XE_BASE = $e3
 ; lookup table for top byte -> PORTB value for bank-switch
 ; the top 2 bits of the index are the bank number, which belongs in
 ; PORTB bits 2-3 (bit 1 is the BASIC enable and must not be touched)
 .align 256
 bankswitch:
    .repeat 256, i
        .byte ((i & $c0) >> 4) | PORTB_XE_BASE
    .endrepeat
 ; 8-bit x 8-bit -> 16-bit unsigned multiply via the expanded RAM table.
 ; dest:  receives the 16-bit product (dest, dest + 1);
 ;        dest + 2 and dest + 3 are used as a scratch pointer
 ; arg1:  first 8-bit factor (memory operand)
 ; arg2:  second 8-bit factor (memory operand)
 ; The table holds (arg1 & $fe) * arg2; the skipped low bit of arg1 is
 ; handled by adding arg2 once more at the end.
 ; 58-77 cycles
 ; clobbers x, y, dest to dest + 3
 .macro imul8xe dest, arg1, arg2
 .local done
 .local output
 .local ptr
    output = dest
    ptr = dest + 2 ; scratch space assumed
    ; bottom 14 bits except the LSB are the per-bank table index
    ; add $4000 for the bank pointer
    lda arg1     ; 3 cyc
    and #$fe     ; 2 cyc
    sta ptr      ; 3 cyc
    lda arg2     ; 3 cyc
    and #$3f     ; 2 cyc
    clc          ; 2 cyc
    adc #$40     ; 2 cyc
    sta ptr + 1  ; 3 cyc
    ; top 2 bits are the table bank selector
    ; (index the bankswitch table; bank_switch is the macro name, not a label)
    ldx arg2          ; 3 cyc
    lda bankswitch,x  ; 4 cyc
    sta PORTB         ; 4 cyc
    ; copy the entry into output
    ldy #0       ; 2 cyc
    lda (ptr),y  ; 5 cyc
    sta output   ; 3 cyc
    iny          ; 2 cyc
    lda (ptr),y  ; 5 cyc
    sta output+1 ; 3 cyc
    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; if we need to use them anywhere else
    ;;; restore memory
    ;;lda #$81     ; 2 cyc - disabled
    ;;sta PORTB    ; 4 cyc - disabled
    ; check that 1 bit we skipped to fit into space
    lda arg1     ; 3 cyc
    and #1       ; 2 cyc
    beq done     ; 2 cyc
    ; add the second param one last time for the skipped bit
    clc          ; 2 cyc
    lda arg2     ; 3 cyc
    adc output   ; 3 cyc
    sta output   ; 3 cyc
    lda #0       ; 2 cyc
    adc output+1 ; 3 cyc
    sta output+1 ; 3 cyc
 done:
 .endmacro
 ; Map expanded RAM bank `bank` (compile-time constant 0-3) into the
 ; $4000-$7fff window. Uses the same PORTB encoding as the bankswitch table.
 ; clobbers a
 .macro bank_switch bank
    lda #((bank << 2) | PORTB_XE_BASE)
    sta PORTB
 .endmacro
 ; Build the full 64 KiB multiplication table in expanded RAM.
 ; Zeroes both multipliers, then fills each of the four 16 KiB banks in turn;
 ; imul8xe_init_section leaves arg2 advanced by $40 so the next bank
 ; continues where the previous one stopped.
 ; clobbers: a, plus everything imul8xe_init_section clobbers
 .proc imul8xe_init
    ; go through the input set, in four 16KB chunks
    arg1 = FR1
    arg2 = FR2
    lda #$00
    sta arg1
    sta arg2
    ; $00 * $00 -> $3f * $ff
    bank_switch 0
    jsr imul8xe_init_section
    ; $40 * $00 -> $7f * $ff
    bank_switch 1
    jsr imul8xe_init_section
    ; $80 * $00 -> $bf * $ff
    bank_switch 2
    jsr imul8xe_init_section
    ; $c0 * $00 -> $ff * $ff
    bank_switch 3
    jsr imul8xe_init_section
    rts
 .endproc
 ; Initialize a 16 KB chunk of the table
 ; Each page of the window holds one value of arg2; within a page, the
 ; 16-bit entry at even offset n is n * arg2, stored low byte first.
 ; input: arg1 (FR1) = 0, arg2 (FR2) = first multiplier of this chunk,
 ;        target bank already mapped in at EXTENDED_RAM
 ; output: arg2 advanced by $40 (ready for the next chunk), arg1 back at 0
 ; clobbers: a, y, result (FR0, FR0 + 1), temp2, temp2 + 1
 .proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    result = FR0
    ptr = temp2
    ; ptr = start of the expanded RAM window
    lda #<EXTENDED_RAM
    sta ptr
    lda #>EXTENDED_RAM
    sta ptr + 1
    ; y stays 0 between entries; it is only bumped to reach the high byte
    ldy #0
    ; outer loop: $00 -> $3f
 outer_loop:
    ; reset result to 0
    lda #0
    sta result
    sta result + 1
    ; inner loop: $00 -> $ff
 inner_loop:
    ; copy result to data set: low byte at ptr, high byte at ptr + 1
    lda result
    sta (ptr),y
    iny
    lda result + 1
    sta (ptr),y
    dey
    ; result += 2 * arg2 (two 16-bit additions of the 8-bit arg2)
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1
    ; inner loop check: entries are 2 bytes, so step by 2 until the
    ; low pointer byte wraps to 0 (128 entries per page)
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop
    ; outer loop check: next multiplier, next page; stop once the
    ; pointer leaves the 16 KB window ($4000-$7fff)
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #>(EXTENDED_RAM + $4000)
    bne outer_loop
    rts
 .endproc
--- a/mandel.s
+++ b/mandel.s
--- a/readme.md
+++ b/readme.md
@ -14,33 +14,37 @@ Non-goals:
 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
-- brooke, january 2023 - february 2024
+-- brooke, january 2023 - december 2024
 ## Current state
-Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
-The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
-The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.
+* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
 * an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
-The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
+The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
 Iterations are capped at 255.
 The pixels are run in a progressive layout to get the basic shape on screen faster.
-## Next steps
+There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
-Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
+There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
-Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+There's some cute color cycling.
 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
 (done)
 ## Deps and build instructions
 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
 ## Todo
 See ideas in `todo.md`.
--- a/tables.js
+++ b/tables.js
@ -22,7 +22,10 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
 .export sqr_lobyte
 .export sqr_hibyte
 ; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 ; (i * i) for the plain squares
 .align 256
 sqr_lobyte:
 ${db((i) => (i * i) & 0xff)}
 .align 256
 sqr_hibyte:
 ${db((i) => ((i * i) >> 8) & 0xff)}
 `);
--- a/todo.md
+++ b/todo.md
@ -0,0 +1,12 @@
 things to try:
 * skip add on the top-byte multiply in sqr8/mul8
  * should save a few cycles, suggestion by jamey
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 * y-axis mirror optimization
 * extract viewport for display & re-input via keyboard
 * fujinet screenshot/viewport uploader
Author	SHA1	Message	Date
Brooke Vibber	dcf5a3f59e	sixth viewport	2025-01-01 21:15:38 -08:00
Brooke Vibber	837082cf56	tweak viewports skip experimental 6th viewport that got forgotten and limit max zoom to 7 (range 0-7) which is what looks good	2025-01-01 15:45:26 -08:00
Brooke Vibber	65fcb44934	3.13 / 6.26 gives nicer results!	2025-01-01 15:37:12 -08:00
Brooke Vibber	c424f1b8bc	fill in scanlines during tiering	2024-12-31 22:10:27 -08:00
Brooke Vibber	49fe315529	'wide pixels' should get better color on the composite video because the scanlines will be fuller of data	2024-12-31 20:13:11 -08:00
Brooke Vibber	f1ebb21bcb	wip not working wide pixels	2024-12-31 17:49:13 -08:00
Brooke Vibber	87caa52543	add viewport number 5 full zoom	2024-12-31 15:45:03 -08:00
Brooke Vibber	d8601bb856	fix fix	2024-12-31 15:03:43 -08:00
Brooke Vibber	7985ea9a39	fix panning for 32-bi	2024-12-31 14:45:38 -08:00
Brooke Vibber	cc83c76706	update docs for 32-bit intermediates	2024-12-31 14:16:43 -08:00
Brooke Vibber	2e8893fd78	haha fuck me	2024-12-31 13:54:53 -08:00
Brooke Vibber	81bf7f3c43	tweak	2024-12-31 09:53:22 -08:00
Brooke Vibber	1e0f577e09	wip	2024-12-31 09:09:11 -08:00
Brooke Vibber	d2f41f9644	wip	2024-12-31 09:02:42 -08:00
Brooke Vibber	2fcb30b76a	wip	2024-12-31 08:56:59 -08:00
Brooke Vibber	13257309dc	init fix	2024-12-31 08:34:02 -08:00
Brooke Vibber	7184b8e03f	wip	2024-12-31 08:24:47 -08:00
Brooke Vibber	4a1e35699a	wip	2024-12-31 08:24:44 -08:00
Brooke Vibber	0d086a179c	wip	2024-12-31 08:23:04 -08:00
Brooke Vibber	61eb1aaf21	notes	2024-12-31 05:11:26 -08:00
Brooke Vibber	b56dc1e98b	notes	2024-12-30 20:38:33 -08:00
Brooke Vibber	0a7293d8bc	do 4x4 2x2 1x1 only in prep for bigger pixels	2024-12-30 19:52:35 -08:00
Brooke Vibber	ec42f672d4	use an 8-item z buffer for slightly fasterness	2024-12-30 19:48:28 -08:00
Brooke Vibber	67649d4743	annotations, tweak	2024-12-30 19:17:02 -08:00
Brooke Vibber	ed79c80b16	update readme	2024-12-30 16:50:25 -08:00
Brooke Vibber	e6cbe0bc6b	notes	2024-12-30 16:43:18 -08:00
Brooke Vibber	6db8cef82d	51-70 cycles for xe :D	2024-12-30 15:17:50 -08:00
Brooke Vibber	9b7f6b8937	add a viewport in the front spike	2024-12-30 14:22:03 -08:00
Brooke Vibber	3bd9b1ac31	micro-optimizations in imul8xe 53-72 cycles overview in 10.896 ms/px	2024-12-30 14:09:02 -08:00
Brooke Vibber	63e74d5152	tweak	2024-12-30 13:44:31 -08:00
Brooke Vibber	14125a398a	cycle 'in' not 'out'	2024-12-30 11:35:45 -08:00
Brooke Vibber	71d8d93abc	even better palette cycling	2024-12-30 11:33:55 -08:00
Brooke Vibber	64a6cf50f3	awesome new palette cycler	2024-12-30 10:21:52 -08:00
Brooke Vibber	100c0f3314	1/2/3 selectable viewports	2024-12-30 09:19:41 -08:00
Brooke Vibber	e51aa91e4e	notes	2024-12-30 06:48:04 -08:00
Brooke Vibber	c4b98c7be2	optimize out a temporary down to 11.076 ms/px on xe	2024-12-30 05:35:22 -08:00
Brooke Vibber	70d2c91f03	fix bank switch on xl/xe was accidentally enabling basic rom :D 5m46s - 11.759 ms/px - 800xl 5m30s - 11.215 ms/px - 130xe	2024-12-30 03:56:35 -08:00
Brooke Vibber	acac5a8df4	moving the framebuffer into the basic space fails on 130xe and 800xl for some reason works on 800 as expected	2024-12-29 21:19:55 -08:00
Brooke Vibber	883f926e57	split memory, wip appears to work on 800 but xl/xe overlap basic lol	2024-12-29 21:06:48 -08:00
Brooke Vibber	0c63430dd9	wip tables segment to be	2024-12-29 20:37:58 -08:00
Brooke Vibber	3ab5006aa3	wip refacotring	2024-12-29 17:56:14 -08:00
Brooke Vibber	f903272335	refactoring and start on squares	2024-12-29 17:37:06 -08:00
Brooke Vibber	8ad996981a	whoops	2024-12-29 13:19:58 -08:00
Brooke Vibber	15fc5367f9	switck with the overview as default fo rnow	2024-12-29 13:18:54 -08:00
Brooke Vibber	2118890977	add an alternate viewport (compile-time currently) zoomed to max	2024-12-29 13:10:35 -08:00
Brooke Vibber	0fc5ba914f	fix pan/zoom bug was missing an rts on update_palette this happened to fall through to keycheck which if timing was wrong would dutifully process the viewport change and return to update_palette's caller which in turn was -not- expecting to reset the outer loop fixed	2024-12-29 12:29:36 -08:00
Brooke Vibber	2b0167226e	todos	2024-12-28 20:44:27 -08:00
Brooke Vibber	504457595a	correct zoom border checks	2024-12-28 18:11:35 -08:00
Brooke Vibber	0fcf4d6676	comment tweak	2024-12-28 17:40:21 -08:00
Brooke Vibber	d83b811444	remove stray copy of the expanded-ram imul it's not finished or working, just keep the core one :D	2024-12-28 15:13:06 -08:00
Brooke Vibber	f32cc5fa7c	whoops	2024-12-27 19:15:19 -08:00
Brooke Vibber	052a19b6aa	Merge pull request 'xe' (#1 ) from xe into main Reviewed-on: https://brooke.vibber.net/git/git/brooke/mandel-6502/pulls/1	2024-12-28 02:40:01 +00:00