From fa0de6dc776a875ed97f74c4261d91629ee58fb7 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 16 Sep 2025 21:29:40 -0700 Subject: [PATCH 1/6] WIP savings of half a cycle per imul8_xe Uses X to cache arg1, which is always used, instead of arg2, which is only used on odds. Should save half a cycle per imul8_xe, untested --- mandel.s | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mandel.s b/mandel.s index b0f9c28..ec9f17f 100644 --- a/mandel.s +++ b/mandel.s @@ -461,7 +461,7 @@ input_max: sub 4, dest, arg1, arg2 .endmacro -; 3 + 5 * bytes cycles +; 3 + 5 * (bytes - 1) cycles .macro shl bytes, arg asl arg ; 3 cyc .repeat bytes-1, i @@ -469,17 +469,17 @@ input_max: .endrepeat .endmacro -; 13 cycles +; 8 cycles .macro shl16 arg shl 2, arg .endmacro -; 18 cycles +; 13 cycles .macro shl24 arg shl 3, arg .endmacro -; 23 cycles +; 18 cycles .macro shl32 arg shl 4, arg .endmacro @@ -529,11 +529,11 @@ input_max: neg 4, arg .endmacro -; 11-27 + 23 * shift cycles -; 103-119 cycles for shift=4 +; 11-27 + 18 * shift cycles +; 65-81 cycles for shift=3 .macro shift_round_16 arg, shift .repeat shift - shl32 arg ; 23 cycles + shl32 arg ; 18 cycles .endrepeat round16 arg ; 11-27 cycles .endmacro @@ -588,7 +588,7 @@ bank_switch_table: .macro imul8 dest, arg1, arg2, xe .if xe ; using 64KB lookup table - ; 51-70 cycles + ; 50-70 cycles ; clobbers x, y, dest, ptr .scope output = dest @@ -600,13 +600,13 @@ bank_switch_table: ; bottom 14 bits except the LSB are the per-bank table index ; add $4000 for the bank pointer - txa ; 2 cyc and #$3f ; 2 cyc ora #$40 ; 2 cyc sta ptr + 1 ; 3 cyc ; copy the entry into output lda arg1 ; 3 cyc + tax ; 2 cyc and #$fe ; 2 cyc tay ; 2 cyc lda (ptr),y ; 5 cyc @@ -623,13 +623,13 @@ bank_switch_table: ;;sta PORTB ; 4 cyc - disabled ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc + txa ; 2 cyc and #1 ; 2 cyc beq done ; 2 cyc ; add arg2 one last time for the skipped bit clc ; 2 cyc - txa ; 2 
cyc + lda arg2 ; 3 cyc adc output ; 3 cyc sta output ; 3 cyc lda #0 ; 2 cyc From 6479cf530c1c584f33b96f2b19885d02415863bb Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 16 Sep 2025 21:29:40 -0700 Subject: [PATCH 2/6] update some timings --- mandel.s | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mandel.s b/mandel.s index b0f9c28..b52f24a 100644 --- a/mandel.s +++ b/mandel.s @@ -461,7 +461,7 @@ input_max: sub 4, dest, arg1, arg2 .endmacro -; 3 + 5 * bytes cycles +; 3 + 5 * (bytes - 1) cycles .macro shl bytes, arg asl arg ; 3 cyc .repeat bytes-1, i @@ -469,17 +469,17 @@ input_max: .endrepeat .endmacro -; 13 cycles +; 8 cycles .macro shl16 arg shl 2, arg .endmacro -; 18 cycles +; 13 cycles .macro shl24 arg shl 3, arg .endmacro -; 23 cycles +; 18 cycles .macro shl32 arg shl 4, arg .endmacro @@ -529,11 +529,11 @@ input_max: neg 4, arg .endmacro -; 11-27 + 23 * shift cycles -; 103-119 cycles for shift=4 +; 11-27 + 18 * shift cycles +; 65-81 cycles for shift=3 .macro shift_round_16 arg, shift .repeat shift - shl32 arg ; 23 cycles + shl32 arg ; 18 cycles .endrepeat round16 arg ; 11-27 cycles .endmacro From b27be3c1592c26609a26b6d0f82dcaf88aad5763 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 28 Dec 2025 09:23:38 -0800 Subject: [PATCH 3/6] Add a C shell, which currently just passes through This is a first step toward moving the UI to C and adding file and network I/O in C. The fractal core will remain in assembler as well as the multiplier. 
--- Makefile | 10 ++++--- atari-xex.cfg | 62 +++++++++++++++++++++++++++++++++++++++ mandel.s => mandel-core.s | 4 +-- mandel.c | 15 ++++++++++ mandel.h | 4 +++ 5 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 atari-xex.cfg rename mandel.s => mandel-core.s (99%) create mode 100644 mandel.c create mode 100644 mandel.h diff --git a/Makefile b/Makefile index 711adcd..c94074b 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,11 @@ all : mandel.xex -mandel.xex : mandel.o tables.o atari-asm-xex.cfg - ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o +mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg + ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib + +mandel.s : mandel.c mandel.h + cc65 -o $@ mandel.c %.o : %.s ca65 -o $@ $< @@ -13,8 +16,7 @@ tables.s : tables.js clean : rm -f tables.s + rm -f mandel.s rm -f *.o rm -f *.xex rm -f mandel.map - - diff --git a/atari-xex.cfg b/atari-xex.cfg new file mode 100644 index 0000000..ee41c4c --- /dev/null +++ b/atari-xex.cfg @@ -0,0 +1,62 @@ +# Sample linker configuration for C programs using the Atari binary file support. +# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex +FEATURES { + STARTADDRESS: default = $2000; +} +SYMBOLS { + __SYSTEM_CHECK__: type = import; # force inclusion of "system check" load chunk + __STACKSIZE__: type = weak, value = $0800; # 2k stack + __STARTADDRESS__: type = export, value = %S; + __RESERVED_MEMORY__: type = weak, value = $0000; + __SYSCHKHDR__: type = export, value = 0; # Disable system check header + __SYSCHKTRL__: type = export, value = 0; # Disable system check trailer +} +MEMORY { + ZP: file = "", define = yes, start = $0082, size = $007E; +# "system check" load chunk + SYSCHKCHNK: file = %O, start = $2E00, size = $0300; +# "main program" load chunk +# Note we reserve $4000-7fff for the bank-switch window. 
+ #MAIN: file = %O, define = yes, start = %S, size = $BC20 - __STACKSIZE__ - __RESERVED_MEMORY__ - %S; + MAIN: file = %O, define = yes, start = %S, size = $4000 - __STACKSIZE__ - __RESERVED_MEMORY__ - %S; + +# Note $a000-$bfff is against the BASIC cartridge, may require booting with OPTION. + TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000; +} +FILES { + %O: format = atari; +} +FORMATS { + atari: runad = start, + initad = SYSCHKCHNK: __SYSTEM_CHECK__; +} +SEGMENTS { + ZEROPAGE: load = ZP, type = zp; + EXTZP: load = ZP, type = zp, optional = yes; + SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes; + STARTUP: load = MAIN, type = ro, define = yes; + LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized + LOWCODE: load = MAIN, type = ro, define = yes, optional = yes; + ONCE: load = MAIN, type = ro, optional = yes; + CODE: load = MAIN, type = ro, define = yes; + RODATA: load = MAIN, type = ro; + DATA: load = MAIN, type = rw; + INIT: load = MAIN, type = rw, optional = yes; + BSS: load = MAIN, type = bss, define = yes; + TABLES: load = TABLES, type = ro, optional = yes, align = 256; +} +FEATURES { + CONDES: type = constructor, + label = __CONSTRUCTOR_TABLE__, + count = __CONSTRUCTOR_COUNT__, + segment = ONCE; + CONDES: type = destructor, + label = __DESTRUCTOR_TABLE__, + count = __DESTRUCTOR_COUNT__, + segment = RODATA; + CONDES: type = interruptor, + label = __INTERRUPTOR_TABLE__, + count = __INTERRUPTOR_COUNT__, + segment = RODATA, + import = __CALLIRQ__; +} diff --git a/mandel.s b/mandel-core.s similarity index 99% rename from mandel.s rename to mandel-core.s index b52f24a..6ebb089 100644 --- a/mandel.s +++ b/mandel-core.s @@ -361,7 +361,7 @@ z_buffer: .word 0 .endrepeat -.export start +.export _mandel_start ;max_fill_level = 6 max_fill_level = 3 @@ -1745,7 +1745,7 @@ zero_byte_loop: rts .endproc -.proc start +.proc _mandel_start jsr imul8xe_init diff --git a/mandel.c b/mandel.c new file mode 100644 index 
0000000..f287fa3 --- /dev/null +++ b/mandel.c @@ -0,0 +1,15 @@ +/** + * The UI and I/O wrapper for the Mandelbrot runner, in C. + * + * For the moment *all* logic is in mandel-core.s, I'm just + * trying to get this to run within a cc65 environment. + * Eventually just the inner loop fun will live in there. + */ + +#include +#include +#include "mandel.h" + +void main(void) { + mandel_start(); +} \ No newline at end of file diff --git a/mandel.h b/mandel.h new file mode 100644 index 0000000..e43fad7 --- /dev/null +++ b/mandel.h @@ -0,0 +1,4 @@ +#include + +// From mandel-core.s: +extern void mandel_start(void); From 97fdc12565c9f2e6b853b0e97688179195ee5281 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 28 Dec 2025 12:32:57 -0800 Subject: [PATCH 4/6] Put the tables before the main code, and shrink the segment Leaves more room for code and dynamic data/stack --- atari-xex.cfg | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/atari-xex.cfg b/atari-xex.cfg index ee41c4c..e9090b4 100644 --- a/atari-xex.cfg +++ b/atari-xex.cfg @@ -10,18 +10,19 @@ SYMBOLS { __RESERVED_MEMORY__: type = weak, value = $0000; __SYSCHKHDR__: type = export, value = 0; # Disable system check header __SYSCHKTRL__: type = export, value = 0; # Disable system check trailer + __TABLESEG_SIZE__: type = weak, value = 6 * $100; + __FRAMEBUFFER_START__: type = weak, value = $a000; } MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; # "system check" load chunk SYSCHKCHNK: file = %O, start = $2E00, size = $0300; -# "main program" load chunk # Note we reserve $4000-7fff for the bank-switch window. - #MAIN: file = %O, define = yes, start = %S, size = $BC20 - __STACKSIZE__ - __RESERVED_MEMORY__ - %S; - MAIN: file = %O, define = yes, start = %S, size = $4000 - __STACKSIZE__ - __RESERVED_MEMORY__ - %S; - # Note $a000-$bfff is against the BASIC cartridge, may require booting with OPTION. 
- TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000; + TABLES: file = %O, define = yes, start = %S, size = __TABLESEG_SIZE__; +# "main program" load chunk + MAIN: file = %O, define = yes, start = %S + __TABLESEG_SIZE__, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - __TABLESEG_SIZE__ - %S; + } FILES { %O: format = atari; @@ -34,6 +35,7 @@ SEGMENTS { ZEROPAGE: load = ZP, type = zp; EXTZP: load = ZP, type = zp, optional = yes; SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes; + TABLES: load = TABLES, type = ro, optional = yes, align = 256; STARTUP: load = MAIN, type = ro, define = yes; LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized LOWCODE: load = MAIN, type = ro, define = yes, optional = yes; @@ -43,7 +45,6 @@ SEGMENTS { DATA: load = MAIN, type = rw; INIT: load = MAIN, type = rw, optional = yes; BSS: load = MAIN, type = bss, define = yes; - TABLES: load = TABLES, type = ro, optional = yes, align = 256; } FEATURES { CONDES: type = constructor, From a93dd00e3697f9af47bb09a118722678b597a4cf Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 28 Dec 2025 12:55:08 -0800 Subject: [PATCH 5/6] Rearrange the segments a bit * put TABLES in low memory, before the bank-switch window * reserve the bank-switch window * put the rest of the code after that and before the framebuffer. So TABLES lives just below $4000 and MAIN lives in $8000-$9fff, below the framebuffer at $a000 (less the stack reservation). We could also split some more code and/or data into low memory, and/or move the tables not used in extended-memory mode into the bank-switch window so they take no address space on XE or expanded-memory machines. --- atari-xex.cfg | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/atari-xex.cfg b/atari-xex.cfg index e9090b4..467d9d4 100644 --- a/atari-xex.cfg +++ b/atari-xex.cfg @@ -1,7 +1,7 @@ # Sample linker configuration for C programs using the Atari binary file support. 
# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex FEATURES { - STARTADDRESS: default = $2000; + STARTADDRESS: default = $8000; } SYMBOLS { __SYSTEM_CHECK__: type = import; # force inclusion of "system check" load chunk @@ -10,19 +10,24 @@ SYMBOLS { __RESERVED_MEMORY__: type = weak, value = $0000; __SYSCHKHDR__: type = export, value = 0; # Disable system check header __SYSCHKTRL__: type = export, value = 0; # Disable system check trailer + __TABLESEG_START__: type = weak, value = $2E00 + $0300; __TABLESEG_SIZE__: type = weak, value = 6 * $100; - __FRAMEBUFFER_START__: type = weak, value = $a000; + __BANKSY_START__: type = weak, value = $4000; + __BANKSY_SIZE__: type = weak, value = $4000; + __FRAMEBUFFER_START__: type = weak, value = $A000; } MEMORY { +# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP. ZP: file = "", define = yes, start = $0082, size = $007E; # "system check" load chunk SYSCHKCHNK: file = %O, start = $2E00, size = $0300; -# Note we reserve $4000-7fff for the bank-switch window. # Note $a000-$bfff is against the BASIC cartridge, may require booting with OPTION. - TABLES: file = %O, define = yes, start = %S, size = __TABLESEG_SIZE__; + TABLES: file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__; +# We reserve $4000-7fff for the bank-switch window. +# In theory we could keep data and code here that we only use on 48k/64k systems. 
+ BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__; # "main program" load chunk - MAIN: file = %O, define = yes, start = %S + __TABLESEG_SIZE__, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - __TABLESEG_SIZE__ - %S; - + MAIN: file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S; } FILES { %O: format = atari; @@ -36,6 +41,7 @@ SEGMENTS { EXTZP: load = ZP, type = zp, optional = yes; SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes; TABLES: load = TABLES, type = ro, optional = yes, align = 256; + BANKSWITCH: load = BANKSWITCH, type = ro, optional = yes; STARTUP: load = MAIN, type = ro, define = yes; LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized LOWCODE: load = MAIN, type = ro, define = yes, optional = yes; From 25c37a1188b8726f95cee5615fc94cbb34765b0c Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 8 Apr 2026 19:58:27 -0700 Subject: [PATCH 6/6] zeropage tweaks * switched zero-page from hardcoded assignments to symbols * moved most non-hotpath stuff out to .data * merged ptr and pixel_ptr Slight slowdown in Atari800MacX from 5m13s to 5m15s --- mandel-core.s | 103 +++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/mandel-core.s b/mandel-core.s index 6ebb089..34dff8b 100644 --- a/mandel-core.s +++ b/mandel-core.s @@ -1,44 +1,44 @@ -; Our zero-page vars -ox = $80 ; fixed6.26: center point x -oy = $84 ; fixed6.26: center point y -cx = $88 ; fixed6.26: c_x -cy = $8c ; fixed6.26: c_y +.zeropage -zx = $90 ; fixed6.26: z_x -zy = $94 ; fixed6.26: z_y -zx_2 = $98 ; fixed6.26: z_x^2 -zy_2 = $9c ; fixed6.26: z_y^2 +ox: .res 4 ; fixed6.26: center point x +oy: .res 4 ; fixed6.26: center point y +cx: .res 4 ; fixed6.26: c_x +cy: .res 4 ; fixed6.26: c_y -zx_zy = $a0 ; fixed6.26: z_x * z_y -dist = $a4 ; fixed6.26: z_x^2 + z_y^2 -sx = $a8 ; i16: screen pixel x 
-sy = $aa ; i16: screen pixel y -z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not -z_buffer_start = $ad ; u8: index into z_buffer -z_buffer_end = $ae ; u8: index into z_buffer -iter = $af ; u8: iteration count +zx: .res 4 ; fixed6.26: z_x +zy: .res 4 ; fixed6.26: z_y +zx_2: .res 4 ; fixed6.26: z_x^2 +zy_2: .res 4 ; fixed6.26: z_y^2 -ptr = $b0 ; u16 -pixel_ptr = $b2 ; u16 -zoom = $b4 ; u8: zoom shift level -fill_level = $b5 ; u8 -pixel_color = $b6 ; u8 -pixel_mask = $b7 ; u8 -pixel_shift = $b8 ; u8 -pixel_offset = $b9 ; u8 -palette_offset = $ba ; u8 -chroma_offset = $bb ; u8 -palette_ticks = $bc ; u8 -chroma_ticks = $bd ; u8 -count_frames = $be ; u8 -; free space $bf +zx_zy: .res 4 ; fixed6.26: z_x * z_y +dist: .res 4 ; fixed6.26: z_x^2 + z_y^2 -count_iters = $c0 ; u16 -text_col = $c2 ; u8 -text_row = $c3 ; u8 -; free space c4-cb -temp = $cc ; u16 -temp2 = $ce ; u16 +z_buffer_active: .res 1 ; boolean: 1 if we triggered the lake, 0 if not +z_buffer_start: .res 1 ; u8: index into z_buffer +z_buffer_end: .res 1 ; u8: index into z_buffer +iter: .res 1 ; u8: iteration count +ptr: .res 2 ; u16: shared pointer, also replaces the old pixel_ptr +temp: .res 2 ; u16 +temp2: .res 2 ; u16 + +.data +; not on the hot path, so kept in .data rather than zero page +sx: .res 2 ; i16: screen pixel x +sy: .res 2 ; i16: screen pixel y +zoom: .res 1 ; u8: zoom shift level +fill_level: .res 1 ; u8 +pixel_color: .res 1 ; u8 +pixel_mask: .res 1 ; u8 +pixel_shift: .res 1 ; u8 +pixel_offset: .res 1 ; u8 +palette_offset: .res 1 ; u8 +chroma_offset: .res 1 ; u8 +palette_ticks: .res 1 ; u8 +chroma_ticks: .res 1 ; u8 +count_frames: .res 1 ; u8 +count_iters: .res 2 ; u16 +text_col: .res 1 ; u8 +text_row: .res 1 ; u8 palette_delay = 23 chroma_delay = 137 @@ -131,6 +131,8 @@ KEY_E = 42 KEY_X = 22 KEY_Y = 43 +.data + .struct float48 exponent .byte mantissa .byte 5 @@ -142,7 +144,6 @@ KEY_Y = 43 .import sqr_lobyte .import sqr_hibyte -.data strings: str_self: @@ -1253,21 +1254,21 @@ enough: negative: ; temp1 = top half lda #.lobyte(framebuffer_top + stride * half_height) 
- sta pixel_ptr + sta ptr lda #.hibyte(framebuffer_top + stride * half_height) - sta pixel_ptr + 1 + sta ptr + 1 jmp point positive: lda #.lobyte(framebuffer_bottom) - sta pixel_ptr + sta ptr lda #.hibyte(framebuffer_bottom) - sta pixel_ptr + 1 + sta ptr + 1 point: - ; pixel_ptr += sy * stride + ; ptr += sy * stride ; temp * 40 ; = temp * 32 + temp * 8 ; = (temp << 5) + (temp << 3) @@ -1275,10 +1276,10 @@ point: shl16 temp shl16 temp shl16 temp - add16 pixel_ptr, pixel_ptr, temp + add16 ptr, ptr, temp shl16 temp shl16 temp - add16 pixel_ptr, pixel_ptr, temp + add16 ptr, ptr, temp ; Ok so temp1 points to the start of the line, which is 40 bytes. ; Get the byte and bit offsets @@ -1318,20 +1319,20 @@ shift_done: draw_pixel: ; read, mask, or, write - lda (pixel_ptr),y + lda (ptr),y and pixel_mask ora pixel_color - sta (pixel_ptr),y + sta (ptr),y dex beq done clc lda #40 - adc pixel_ptr - sta pixel_ptr + adc ptr + sta ptr lda #0 - adc pixel_ptr + 1 - sta pixel_ptr + 1 + adc ptr + 1 + sta ptr + 1 jmp draw_pixel done: