From 87caa52543f3aec6ff3c87dc79dd734182d6be87 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 15:45:03 -0800 Subject: [PATCH 01/32] add viewport number 5 full zoom --- mandel.s | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mandel.s b/mandel.s index b8985b3..9546b68 100644 --- a/mandel.s +++ b/mandel.s @@ -290,18 +290,21 @@ viewport_zoom: .byte 6 .byte 8 .byte 6 + .byte 8 viewport_ox: .dword $00000000 .dword $ff110000 .dword $ff110000 .dword $fe400000 + .dword $fe3b0000 viewport_oy: .dword $00000000 .dword $ffb60000 .dword $ffbe0000 .dword $00000000 + .dword $fffe0000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -1399,6 +1402,8 @@ number_keys: beq three cpy #KEY_4 beq four + cpy #KEY_5 + beq five jmp skip_char one: @@ -1412,6 +1417,9 @@ three: jmp load_key_viewport four: ldx #3 + jmp load_key_viewport +five: + ldx #4 ; fall through load_key_viewport: jsr load_viewport From f1ebb21bcbf9861d19c1dcb9e38f37503b1d22ee Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 17:49:13 -0800 Subject: [PATCH 02/32] wip not working wide pixels --- mandel.s | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mandel.s b/mandel.s index 9546b68..f777cbc 100644 --- a/mandel.s +++ b/mandel.s @@ -234,9 +234,9 @@ display_list_len = display_list_end - display_list_start color_map: .byte 0 .repeat 85 - .byte 1 - .byte 2 - .byte 3 + .byte %01010101 + .byte %10101010 + .byte %11111111 .endrepeat @@ -285,6 +285,11 @@ fill_masks: .byte %00000001 .byte %00000000 +pixel_masks: + .byte $ff + .byte $0f + .byte $03 + viewport_zoom: .byte 1 .byte 6 @@ -1130,8 +1135,11 @@ enough: ; iter -> color ldx iter lda color_map,x + ldx fill_level + and pixel_masks,x sta pixel_color - lda #(255 - 3) + lda pixel_masks,x + eor #$ff sta pixel_mask ; sy -> line base address in temp From 49fe3155294c0e392904a85154bf7dbe9d2e7808 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 20:13:11 -0800 Subject: 
[PATCH 03/32] 'wide pixels' should get better color on the composite video because the scanlines will be fuller of data --- mandel.s | 17 +++++++---------- todo.md | 3 +-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/mandel.s b/mandel.s index f777cbc..472f613 100644 --- a/mandel.s +++ b/mandel.s @@ -286,9 +286,9 @@ fill_masks: .byte %00000000 pixel_masks: - .byte $ff - .byte $0f - .byte $03 + .byte %11111111 + .byte %11110000 + .byte %11000000 viewport_zoom: .byte 1 @@ -1188,18 +1188,15 @@ point: ; pixel_mask <<= pixel_shift (shifting in ones) and #3 sta pixel_shift - lda #3 - sec - sbc pixel_shift tax shift_loop: beq shift_done - asl pixel_color - asl pixel_color + lsr pixel_color + lsr pixel_color sec - rol pixel_mask + ror pixel_mask sec - rol pixel_mask + ror pixel_mask dex jmp shift_loop shift_done: diff --git a/todo.md b/todo.md index 284d653..e8cffe3 100644 --- a/todo.md +++ b/todo.md @@ -11,8 +11,7 @@ things to try: * y-axis mirror optimization -* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering - * maybe redo tiering to just 4x4, 2x2, 1x1? 
+* try filling in the extra scanlines on 4x4 and 2x2 tiering * extract viewport for display & re-input via keyboard From c424f1b8bc784c1b3bdbed15bb841a068b637039 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 22:10:27 -0800 Subject: [PATCH 04/32] fill in scanlines during tiering --- mandel.s | 17 +++++++++++++++++ todo.md | 2 -- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 472f613..1f5a06f 100644 --- a/mandel.s +++ b/mandel.s @@ -1201,6 +1201,10 @@ shift_loop: jmp shift_loop shift_done: + ldy fill_level + ldx fill_masks,y + inx + ; pixel_offset = temp >> 2 lda temp lsr a @@ -1208,12 +1212,25 @@ shift_done: sta pixel_offset tay +draw_pixel: ; read, mask, or, write lda (pixel_ptr),y and pixel_mask ora pixel_color sta (pixel_ptr),y + dex + beq done + clc + lda #40 + adc pixel_ptr + sta pixel_ptr + lda #0 + adc pixel_ptr + 1 + sta pixel_ptr + 1 + jmp draw_pixel + +done: rts .endproc diff --git a/todo.md b/todo.md index e8cffe3..7ab092b 100644 --- a/todo.md +++ b/todo.md @@ -11,8 +11,6 @@ things to try: * y-axis mirror optimization -* try filling in the extra scanlines on 4x4 and 2x2 tiering - * extract viewport for display & re-input via keyboard * fujinet screenshot/viewport uploader From 65fcb44934d1eedd4ec149082674ac491eef76f8 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 1 Jan 2025 15:37:12 -0800 Subject: [PATCH 05/32] 3.13 / 6.26 gives nicer results! 
--- mandel.s | 86 +++++++++++++++++++++++++++++-------------------------- readme.md | 4 +-- todo.md | 4 --- 3 files changed, 48 insertions(+), 46 deletions(-) diff --git a/mandel.s b/mandel.s index 1f5a06f..9996c53 100644 --- a/mandel.s +++ b/mandel.s @@ -1,16 +1,16 @@ ; Our zero-page vars -ox = $80 ; fixed8.24: center point x -oy = $84 ; fixed8.24: center point y -cx = $88 ; fixed8.24: c_x -cy = $8c ; fixed8.24: c_y +ox = $80 ; fixed6.26: center point x +oy = $84 ; fixed6.26: center point y +cx = $88 ; fixed6.26: c_x +cy = $8c ; fixed6.26: c_y -zx = $90 ; fixed8.24: z_x -zy = $94 ; fixed8.24: z_y -zx_2 = $98 ; fixed8.24: z_x^2 -zy_2 = $9c ; fixed8.24: z_y^2 +zx = $90 ; fixed6.26: z_x +zy = $94 ; fixed6.26: z_y +zx_2 = $98 ; fixed6.26: z_x^2 +zy_2 = $9c ; fixed6.26: z_y^2 -zx_zy = $a0 ; fixed8.24: z_x * z_y -dist = $a4 ; fixed8.24: z_x^2 + z_y^2 +zx_zy = $a0 ; fixed6.26: z_x * z_y +dist = $a4 ; fixed6.26: z_x^2 + z_y^2 sx = $a8 ; i16: screen pixel x sy = $aa ; i16: screen pixel y z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not @@ -189,11 +189,11 @@ aspect: ; ; 184h is the equiv of 220.8h at square pixels ; 320 / 220.8 = 1.45 display aspect ratio -aspect_x: ; fixed4.16 5/4 - .word 5 << (12 - 2) +aspect_x: ; fixed3.13 5/4 + .word 5 << (13 - 2) -aspect_y: ; fixed4.16 3/4 - .word 3 << (12 - 2) +aspect_y: ; fixed3.13 3/4 + .word 3 << (13 - 2) ms_per_frame: ; float48 16.66666667 .byte 64 ; exponent/sign @@ -291,25 +291,26 @@ pixel_masks: .byte %11000000 viewport_zoom: - .byte 1 - .byte 6 - .byte 8 - .byte 6 + .byte 0 + .byte 5 + .byte 7 + .byte 5 + .byte 7 .byte 8 viewport_ox: - .dword $00000000 - .dword $ff110000 - .dword $ff110000 - .dword $fe400000 - .dword $fe3b0000 + .dword ($00000000 & $3fffffff) << 2 + .dword ($ff110000 & $3fffffff) << 2 + .dword ($ff110000 & $3fffffff) << 2 + .dword ($fe400000 & $3fffffff) << 2 + .dword ($fe3b0000 & $3fffffff) << 2 viewport_oy: - .dword $00000000 - .dword $ffb60000 - .dword $ffbe0000 - .dword $00000000 - 
.dword $fffe0000 + .dword ($00000000 & $3fffffff) << 2 + .dword ($ffb60000 & $3fffffff) << 2 + .dword ($ffbe0000 & $3fffffff) << 2 + .dword ($00000000 & $3fffffff) << 2 + .dword ($fffe0000 & $3fffffff) << 2 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -883,8 +884,8 @@ next: .proc mandelbrot ; input: - ; cx: position scaled to 8.24 fixed point - -128..+127.9 - ; cy: position scaled to 8.24 + ; cx: position scaled to 6.26 fixed point - -32..+31.9 + ; cy: position scaled to 6.26 ; ; output: ; iter: iteration count at escape or 0 @@ -942,7 +943,7 @@ loop: keep_going: .macro quick_exit arg, max - ; arg: fixed8.24 + ; arg: fixed6.26 ; max: integer .local positive .local negative @@ -955,12 +956,12 @@ keep_going: bmi negative positive: - cmp #max + cmp #(max << 2) bmi all_done ; 'less than' jmp exit_path negative: - cmp #(256 - max) + cmp #(256 - (max << 2)) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' @@ -980,7 +981,7 @@ keep_going: all_done: .endmacro - ; 8.24: (-128 .. 127.9) + ; 6.26: (-32 .. 31.9) ; zx = zx_2 - zy_2 + cx sub32 zx, zx_2, zy_2 add32 zx, zx, cx @@ -991,9 +992,9 @@ keep_going: add32 zy, zy, cy quick_exit zy, 2 - ; convert 8.24 -> 4.12: (-8 .. +7.9) - shift_round_16 zx, 4 - shift_round_16 zy, 4 + ; convert 6.26 -> 3.13: (-4 .. 
+3.9) + shift_round_16 zx, 3 + shift_round_16 zy, 3 ; zx_2 = zx * zx sqr16 zx_2, zx + 2 @@ -1115,9 +1116,9 @@ enough: .endmacro .macro zoom_factor dest, src, aspect - ; output: dest: fixed8.24 - ; input: src: fixed4.12 - ; aspect: fixed4.12 + ; output: dest: fixed6.26 + ; input: src: fixed3.13 + ; aspect: fixed3.13 ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest @@ -1426,6 +1427,8 @@ number_keys: beq four cpy #KEY_5 beq five + cpy #KEY_6 + beq six jmp skip_char one: @@ -1442,6 +1445,9 @@ four: jmp load_key_viewport five: ldx #4 + jmp load_key_viewport +six: + ldx #5 ; fall through load_key_viewport: jsr load_viewport diff --git a/readme.md b/readme.md index d60644c..2c9efc1 100644 --- a/readme.md +++ b/readme.md @@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g ## Current state -Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys. +Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. @@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication -The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. +The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates. Iterations are capped at 255. 
diff --git a/todo.md b/todo.md index 7ab092b..1d46281 100644 --- a/todo.md +++ b/todo.md @@ -5,10 +5,6 @@ things to try: * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D -* try 3.13 fixed point instead of 4.12 for more precision - * can we get away without the extra bit? - * since exit compare space would be 6.26 i think so - * y-axis mirror optimization * extract viewport for display & re-input via keyboard From 837082cf56d0b6325788da5d71e444c04f50fb69 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 1 Jan 2025 15:45:26 -0800 Subject: [PATCH 06/32] tweak viewports skip experimental 6th viewport that got forgotten and limit max zoom to 7 (range 0-7) which is what looks good --- mandel.s | 8 +------- readme.md | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 9996c53..8bd1a27 100644 --- a/mandel.s +++ b/mandel.s @@ -296,7 +296,6 @@ viewport_zoom: .byte 7 .byte 5 .byte 7 - .byte 8 viewport_ox: .dword ($00000000 & $3fffffff) << 2 @@ -1393,7 +1392,7 @@ skip_char: plus: lda zoom - cmp #8 + cmp #7 bpl skip_char inc zoom jmp done @@ -1427,8 +1426,6 @@ number_keys: beq four cpy #KEY_5 beq five - cpy #KEY_6 - beq six jmp skip_char one: @@ -1445,9 +1442,6 @@ four: jmp load_key_viewport five: ldx #4 - jmp load_key_viewport -six: - ldx #5 ; fall through load_key_viewport: jsr load_viewport diff --git a/readme.md b/readme.md index 2c9efc1..881890a 100644 --- a/readme.md +++ b/readme.md @@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g ## Current state -Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. +Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 5 preset viewports via the number keys. 
The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. From dcf5a3f59e1c9c5081a556e93838304abb038c36 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 1 Jan 2025 21:15:38 -0800 Subject: [PATCH 07/32] sixth viewport --- mandel.s | 8 ++++++++ readme.md | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 8bd1a27..6be37d2 100644 --- a/mandel.s +++ b/mandel.s @@ -296,6 +296,7 @@ viewport_zoom: .byte 7 .byte 5 .byte 7 + .byte 7 viewport_ox: .dword ($00000000 & $3fffffff) << 2 @@ -303,6 +304,7 @@ viewport_ox: .dword ($ff110000 & $3fffffff) << 2 .dword ($fe400000 & $3fffffff) << 2 .dword ($fe3b0000 & $3fffffff) << 2 + .dword $fd220000 viewport_oy: .dword ($00000000 & $3fffffff) << 2 @@ -310,6 +312,7 @@ viewport_oy: .dword ($ffbe0000 & $3fffffff) << 2 .dword ($00000000 & $3fffffff) << 2 .dword ($fffe0000 & $3fffffff) << 2 + .dword $ff000000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -1426,6 +1429,8 @@ number_keys: beq four cpy #KEY_5 beq five + cpy #KEY_6 + beq six jmp skip_char one: @@ -1442,6 +1447,9 @@ four: jmp load_key_viewport five: ldx #4 + jmp load_key_viewport +six: + ldx #5 ; fall through load_key_viewport: jsr load_viewport diff --git a/readme.md b/readme.md index 881890a..2c9efc1 100644 --- a/readme.md +++ b/readme.md @@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g ## Current state -Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 5 preset viewports via the number keys. +Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. 
From d157fe1306267caa489a70dd176593873445820b Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 4 Jan 2025 10:06:12 -0800 Subject: [PATCH 08/32] Faster pixel skipping on 4x4, 2x2 tiers Iterate at fill_masks[fill_level]+1 instead of every pixel and then skipping, saves a smidge of time view 1 with expanded memory: 10.514 ms/px before 10.430 ms/px after --- mandel.s | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 6be37d2..210799a 100644 --- a/mandel.s +++ b/mandel.s @@ -1737,12 +1737,15 @@ update_status: skipped: + ; sx += fill_level[fill_masks] + 1 + ldx fill_level + lda fill_masks,x clc - lda sx - adc #1 + adc #1 ; will never carry + adc sx sta sx - lda sx + 1 - adc #0 + lda #0 + adc sx + 1 sta sx + 1 lda sx @@ -1752,12 +1755,15 @@ skipped: loop_sx_done: + ; sy += fill_level[fill_masks] + 1 + ldx fill_level + lda fill_masks,x clc - lda sy - adc #1 + adc #1 ; will never carry + adc sy sta sy - lda sy + 1 - adc #0 + lda #0 + adc sy + 1 sta sy + 1 lda sy From 582ddf497f3c4f1aeae39201b2490dff14ff7f16 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 4 Jan 2025 10:53:51 -0800 Subject: [PATCH 09/32] apply jamey's suggestion of skipping add for high byte muls rather than saving 0 into the high bytes, then adding the high-byte multiplication later, write it directly in place. this saves a few cycles on every iteration, and it adds up nicely. 
View 1 overview render times: 130XE: 10.050 ms/px - 4m56s 800XL: 10.906 ms/px - 5m21s --- mandel.s | 29 ++++------------------------- todo.md | 3 --- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/mandel.s b/mandel.s index 210799a..526953a 100644 --- a/mandel.s +++ b/mandel.s @@ -464,20 +464,6 @@ viewport_oy: sta dest + 1 .endmacro -; input: arg as u8 -; input/output: dest as u16 -; clobbers a, x -.macro sqr8_add16 dest, arg - ldx arg - clc - lda sqr_lobyte,x - adc dest - sta dest - lda sqr_hibyte,x - adc dest + 1 - sta dest + 1 -.endmacro - .segment "TABLES" ; lookup table for top byte -> PORTB value for bank-switch .align 256 @@ -760,9 +746,8 @@ inner_loop: ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 imul8 result, arg1, arg2, xe - lda #0 - sta result + 2 - sta result + 3 + + imul8 result + 2, arg1 + 1, arg2 + 1, xe imul8 inter, arg1 + 1, arg2, xe add16 result + 1, result + 1, inter @@ -772,9 +757,6 @@ inner_loop: add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1, xe - add16 result + 2, result + 2, inter - ; In case of negative inputs, adjust high word ; https://stackoverflow.com/a/28827013 lda arg1 + 1 @@ -807,9 +789,8 @@ arg2_pos: ; h*h*256*256 + h*l*256 + h*l*256 + l*l sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 + + sqr8 result + 2, arg + 1 imul8 inter, arg + 1, arg, xe add16 result + 1, result + 1, inter @@ -817,8 +798,6 @@ arg2_pos: add16 result + 1, result + 1, inter add_carry result + 3 - sqr8_add16 result + 2, arg + 1 - rts ; 6 cyc .endscope .endmacro diff --git a/todo.md b/todo.md index 1d46281..a78a2d5 100644 --- a/todo.md +++ b/todo.md @@ -1,8 +1,5 @@ things to try: -* skip add on the top-byte multiply in sqr8/mul8 - * should save a few cycles, suggestion by jamey - * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * y-axis mirror optimization From d2bf77dc26218ae1c2a342fd424a7d532d064904 Mon Sep 17 00:00:00 2001 From: Brooke Vibber 
Date: Sat, 4 Jan 2025 12:13:27 -0800 Subject: [PATCH 10/32] todo notes --- todo.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/todo.md b/todo.md index a78a2d5..6807ae2 100644 --- a/todo.md +++ b/todo.md @@ -1,7 +1,15 @@ things to try: +* fix status bar to show elapsed time, per-iter time, per-pixel iter count + +* 'turbo' mode disabling graphics in full or part + * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D +* maybe clean up the load/layout of the big mul table + +* consider alternate lookup tables in the top 16KB under ROM + * y-axis mirror optimization * extract viewport for display & re-input via keyboard From 7e5ca79d9a4bd419a3a004f7c96a612c9e41cee7 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 4 Jan 2025 14:25:25 -0800 Subject: [PATCH 11/32] move total_ms, total_pixels out of zero page this frees up 12 bytes of zero page space and costs no measurable time as these variables are not in the hot path and there was only a tiny bit of difference. 
--- mandel.s | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/mandel.s b/mandel.s index 526953a..317e3b1 100644 --- a/mandel.s +++ b/mandel.s @@ -33,8 +33,7 @@ chroma_ticks = $bd ; u8 count_frames = $be ; u8 count_pixels = $bf ; u8 -total_pixels = $c0 ; float48 -total_ms = $c6 ; float48 +; free space c0-cb temp = $cc ; u16 temp2 = $ce ; u16 @@ -63,6 +62,7 @@ FADD = $DA66 ; ADDITION (FR0 += FR1) FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) FDIV = $DB28 ; DIVISION (FR0 /= FR1) +ZFR0 = $DA44 ; clear FR0 ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) @@ -203,6 +203,16 @@ ms_per_frame: ; float48 16.66666667 .byte $66 .byte $67 +total_pixels: ; float48 + .repeat 6 + .byte 0 + .endrepeat + +total_ms: ; float48 + .repeat 6 + .byte 0 + .endrepeat + display_list_start: ; 24 lines overscan .repeat 3 @@ -1565,10 +1575,13 @@ main_loop: sta count_pixels ; total_ms = 0.0; total_pixels = 0.0 - ldx #total_ms - jsr ZF1 - ldx #total_pixels - jsr ZF1 + jsr ZFR0 + ldx #.lobyte(total_ms) + ldy #.hibyte(total_ms) + jsr FST0R + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) + jsr FST0R jsr clear_screen jsr status_bar @@ -1691,19 +1704,19 @@ update_status: jsr FMUL ; FR0 += total_ms - ldx #total_ms - ldy #0 + ldx #.lobyte(total_ms) + ldy #.hibyte(total_ms) jsr FLD1R jsr FADD ; total_ms = FR0 - ldx #total_ms - ldy #0 + ldx #.lobyte(total_ms) + ldy #.hibyte(total_ms) jsr FST0R ; FR0 /= total_pixels - ldx #total_pixels - ldy #0 + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) jsr FLD1R jsr FDIV From eaa00a055ac6ff39291a42b458b3e41806025035 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 4 Jan 2025 18:46:51 -0800 Subject: [PATCH 12/32] wip changing time units it does this weird thing where sometimes it's reading out wrong digits and then switches to 
expected unit of sec/px work in progress no clue what's going on --- mandel.s | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mandel.s b/mandel.s index 317e3b1..f053748 100644 --- a/mandel.s +++ b/mandel.s @@ -141,7 +141,7 @@ str_self: .byte "MANDEL-6502" str_self_end: str_speed: - .byte " ms/px" + .byte " sec/px" str_speed_end: str_run: .byte " RUN" @@ -195,9 +195,9 @@ aspect_x: ; fixed3.13 5/4 aspect_y: ; fixed3.13 3/4 .word 3 << (13 - 2) -ms_per_frame: ; float48 16.66666667 - .byte 64 ; exponent/sign - .byte $16 ; BCD digits +sec_per_frame: ; float48 0.016666667 + .byte 63 ; exponent/sign + .byte $01 ; BCD digits .byte $66 .byte $66 .byte $66 @@ -208,7 +208,7 @@ total_pixels: ; float48 .byte 0 .endrepeat -total_ms: ; float48 +total_sec: ; float48 .repeat 6 .byte 0 .endrepeat @@ -1574,10 +1574,10 @@ main_loop: sta count_frames sta count_pixels - ; total_ms = 0.0; total_pixels = 0.0 + ; total_sec = 0.0; total_pixels = 0.0 jsr ZFR0 - ldx #.lobyte(total_ms) - ldy #.hibyte(total_ms) + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) jsr FST0R ldx #.lobyte(total_pixels) ldy #.hibyte(total_pixels) @@ -1697,21 +1697,21 @@ update_status: sta count_frames jsr IFP - ; FR0 *= ms_per_frame - ldx #.lobyte(ms_per_frame) - ldy #.hibyte(ms_per_frame) + ; FR0 *= sec_per_frame + ldx #.lobyte(sec_per_frame) + ldy #.hibyte(sec_per_frame) jsr FLD1R jsr FMUL - ; FR0 += total_ms - ldx #.lobyte(total_ms) - ldy #.hibyte(total_ms) + ; FR0 += total_sec + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) jsr FLD1R jsr FADD - ; total_ms = FR0 - ldx #.lobyte(total_ms) - ldy #.hibyte(total_ms) + ; total_sec = FR0 + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) jsr FST0R ; FR0 /= total_pixels From 918d15e8139d21c15f05776bfdb6780000a687f9 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 5 Jan 2025 14:05:24 -0800 Subject: [PATCH 13/32] wip us/iter counter seems wrong, gives 32 all the time and that seems too small --- mandel.s | 
103 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 35 deletions(-) diff --git a/mandel.s b/mandel.s index f053748..09af6a5 100644 --- a/mandel.s +++ b/mandel.s @@ -31,9 +31,10 @@ chroma_offset = $bb ; u8 palette_ticks = $bc ; u8 chroma_ticks = $bd ; u8 count_frames = $be ; u8 -count_pixels = $bf ; u8 +; free space $bf -; free space c0-cb +count_iters = $c0 ; u16 +; free space c2-cb temp = $cc ; u16 temp2 = $ce ; u16 @@ -58,6 +59,7 @@ LBUFF = $0580 ; result buffer for FASC routine ; FP ROM routine vectors FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) +FPI = $D9D2 ; floating point to integer FADD = $DA66 ; ADDITION (FR0 += FR1) FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) @@ -141,7 +143,7 @@ str_self: .byte "MANDEL-6502" str_self_end: str_speed: - .byte " sec/px" + .byte "us/iter: " str_speed_end: str_run: .byte " RUN" @@ -154,7 +156,7 @@ str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done -speed_precision = 6 +speed_precision = 5 speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 speed_len = 14 + str_speed_len @@ -196,14 +198,22 @@ aspect_y: ; fixed3.13 3/4 .word 3 << (13 - 2) sec_per_frame: ; float48 0.016666667 - .byte 63 ; exponent/sign + .byte 63 ; exponent/sign - -2 .byte $01 ; BCD digits .byte $66 .byte $66 .byte $66 .byte $67 -total_pixels: ; float48 +us_per_sec: ; float48 1e9 + .byte 68 ; exponent/sign +8 + .byte $10 ; BCD digits + .byte $00 + .byte $00 + .byte $00 + .byte $00 + +total_iters: ; float48 .repeat 6 .byte 0 .endrepeat @@ -927,6 +937,11 @@ next: sta z_buffer_end loop: + inc count_iters + bne low_iters + inc count_iters + 1 +low_iters: + ; iter++ & max-iters break inc iter bne keep_going @@ -1230,6 +1245,7 @@ done: ; clobbers A, X 
.local loop .local done + .local padding ldx #0 loop: cpx #len @@ -1237,11 +1253,23 @@ loop: txa tay lda (strptr),y + bmi padding tay lda char_map,y sta textbuffer + col,x inx jmp loop + +padding: + ldy #32 ; space + lda char_map,y + + cpx #len + beq done + sta textbuffer + col,x + inx + jmp padding + done: .endmacro @@ -1569,18 +1597,19 @@ copy_byte_loop: jsr SETVBV main_loop: - ; count_frames = 0; count_pixels = 0 + ; count_frames = 0; count_iters = 0 lda #0 sta count_frames - sta count_pixels + sta count_iters + sta count_iters + 1 - ; total_sec = 0.0; total_pixels = 0.0 + ; total_sec = 0.0; total_iters = 0.0 jsr ZFR0 ldx #.lobyte(total_sec) ldy #.hibyte(total_sec) jsr FST0R - ldx #.lobyte(total_pixels) - ldy #.hibyte(total_pixels) + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) jsr FST0R jsr clear_screen @@ -1653,38 +1682,32 @@ not_skipped_mask: no_key: ; check if we should update the counters - ; - ; count_pixels >= width? update! - inc count_pixels - lda count_pixels - cmp #width - bmi update_status ; count_frames >= 120? update! 
lda count_frames cmp #120 ; >= 2 seconds - bmi skipped + bpl update_status + jmp skipped update_status: - ; FR0 = (float)count_pixels & clear count_pixels - lda count_pixels - sta FR0 - lda #0 - sta FR0 + 1 - sta count_pixels + ; FR0 = (float)count_iters & clear count_iters + copy16 FR0, count_iters jsr IFP + lda #0 + sta count_iters + sta count_iters + 1 - ; FR1 = total_pixels - ldx #.lobyte(total_pixels) - ldy #.hibyte(total_pixels) + ; FR1 = total_iters + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) jsr FLD1R ; FR0 += FR1 jsr FADD - ; total_pixels = FR0 - ldx #.lobyte(total_pixels) - ldy #.hibyte(total_pixels) + ; total_iters = FR0 + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) jsr FST0R @@ -1714,18 +1737,28 @@ update_status: ldy #.hibyte(total_sec) jsr FST0R - ; FR0 /= total_pixels - ldx #.lobyte(total_pixels) - ldy #.hibyte(total_pixels) + ; FR0 /= total_iters + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) jsr FLD1R jsr FDIV + ; FR0 *= us_per_sec + ldx #.lobyte(us_per_sec) + ldy #.hibyte(us_per_sec) + jsr FLD1R + jsr FMUL + + ; round to integer + jsr FPI + jsr IFP + ; convert to ASCII in INBUFF jsr FASC ; print the first 6 digits - draw_text_indirect speed_start, speed_precision, INBUFF - draw_text speed_start + speed_precision, str_speed_len, str_speed + draw_text speed_start, str_speed_len, str_speed + draw_text_indirect speed_start + str_speed_len, speed_precision, INBUFF skipped: From 7c04862d70b16a8e35392255371cdbaca0340396 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 5 Jan 2025 14:29:27 -0800 Subject: [PATCH 14/32] workaround for rounding us/iter for some reason rounding is giving me wrong results not sure what i'm doing wrong :D just show 6 digits :P ok this gets the us/iter working, and it is more stable but the elapsed time still needs to be added --- mandel.s | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 09af6a5..7466cd9 100644 --- a/mandel.s +++ 
b/mandel.s @@ -156,7 +156,7 @@ str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done -speed_precision = 5 +speed_precision = 6 speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 speed_len = 14 + str_speed_len @@ -197,17 +197,17 @@ aspect_x: ; fixed3.13 5/4 aspect_y: ; fixed3.13 3/4 .word 3 << (13 - 2) -sec_per_frame: ; float48 0.016666667 - .byte 63 ; exponent/sign - -2 +sec_per_frame: ; float48 00 . 01 66 66 66 67 + .byte 63 ; exponent/sign - -1 bytes .byte $01 ; BCD digits .byte $66 .byte $66 .byte $66 .byte $67 -us_per_sec: ; float48 1e9 - .byte 68 ; exponent/sign +8 - .byte $10 ; BCD digits +us_per_sec: ; float48 1e9 01 00 0,0 00 . 00 + .byte 67 ; exponent/sign +3 bytes + .byte $01 ; BCD digits .byte $00 .byte $00 .byte $00 @@ -1749,9 +1749,12 @@ update_status: jsr FLD1R jsr FMUL + ; @fixme ; round to integer - jsr FPI - jsr IFP + ; for some reason this gives bad results? + ;clc + ;jsr FPI + ;jsr IFP ; convert to ASCII in INBUFF jsr FASC From e0cc704d9906a3801b8812e3cb994f71995391bf Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 8 Jan 2025 18:34:46 -0800 Subject: [PATCH 15/32] Fix drawing terminator, round usec --- mandel.s | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mandel.s b/mandel.s index 7466cd9..db072be 100644 --- a/mandel.s +++ b/mandel.s @@ -1253,11 +1253,15 @@ loop: txa tay lda (strptr),y - bmi padding + pha ; save the char for terminator check + and #$7f ; strip the high bit (terminator) tay lda char_map,y sta textbuffer + col,x inx + + pla + bmi padding jmp loop padding: @@ -1749,12 +1753,10 @@ update_status: jsr FLD1R jsr FMUL - ; @fixme - ; round to integer - ; for some reason this gives bad results? 
- ;clc - ;jsr FPI - ;jsr IFP + ; round (down) to integer + jsr FPI + clc + jsr IFP ; convert to ASCII in INBUFF jsr FASC From d182d33b3579668be8db81034f4197f6ee381fa8 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 1 Feb 2025 10:02:01 -0800 Subject: [PATCH 16/32] draw_string --- mandel.s | 141 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 57 deletions(-) diff --git a/mandel.s b/mandel.s index db072be..b7c323b 100644 --- a/mandel.s +++ b/mandel.s @@ -34,7 +34,9 @@ count_frames = $be ; u8 ; free space $bf count_iters = $c0 ; u16 -; free space c2-cb +text_col = $c2 ; u8 +text_row = $c3 ; u8 +; free space c4-cb temp = $cc ; u16 temp2 = $ce ; u16 @@ -140,16 +142,16 @@ KEY_0 = 50 strings: str_self: - .byte "MANDEL-6502" + .byte "MANDEL-6502", 0 str_self_end: str_speed: - .byte "us/iter: " + .byte "us/iter: ", 0 str_speed_end: str_run: - .byte " RUN" + .byte " RUN", 0 str_run_end: str_done: - .byte "DONE" + .byte "DONE", 0 str_done_end: str_self_len = str_self_end - str_self @@ -1241,57 +1243,50 @@ done: rts .endproc -.macro draw_text_indirect col, len, strptr - ; clobbers A, X - .local loop - .local done - .local padding - ldx #0 +; in/out: column in text_col +; in: row in text_row @fixme implement +; in: pointer to string in INBUFF +; clobbers x/y/a/temp +.proc draw_string + drawptr = temp + strptr = INBUFF + + clc + lda #.lobyte(textbuffer) + adc text_col + sta temp + lda #.hibyte(textbuffer) + adc #0 + sta temp + 1 + + ldy #0 loop: - cpx #len - beq done - txa - tay lda (strptr),y - pha ; save the char for terminator check - and #$7f ; strip the high bit (terminator) - tay - lda char_map,y - sta textbuffer + col,x - inx + ; if char's null, terminate c-style + beq done + ; save the char for terminator check + pha + ; strip the high bit (terminator) + and #$7f + tax + lda char_map,x + sta (drawptr),y + iny pla - bmi padding + ; _last_ char has high bit set in atari rom routines + bmi done jmp loop -padding: - ldy #32 ; 
space - lda char_map,y - - cpx #len - beq done - sta textbuffer + col,x - inx - jmp padding - done: -.endmacro + ; move the text column pointer + tya + clc + adc text_col + sta text_col -.macro draw_text col, len, cstr - ; clobbers A, X - .local loop - .local done - ldx #0 -loop: - cpx #len - beq done - ldy cstr,x - lda char_map,y - sta textbuffer + col,x - inx - jmp loop -done: -.endmacro + rts +.endproc .proc vblank_handler inc count_frames @@ -1506,8 +1501,24 @@ zero_byte_loop: .proc status_bar ; Status bar - draw_text 0, str_self_len, str_self - draw_text 40 - str_run_len, str_run_len, str_run + + lda #0 + sta text_col + lda #0 + sta text_row + lda #.lobyte(str_self) + sta INBUFF + lda #.hibyte(str_self) + sta INBUFF + 1 + jsr draw_string + + lda #(40 - str_run_len) + sta text_col + lda #.lobyte(str_run) + sta INBUFF + lda #.hibyte(str_run) + sta INBUFF + 1 + jsr draw_string rts .endproc @@ -1758,13 +1769,19 @@ update_status: clc jsr IFP - ; convert to ASCII in INBUFF + lda #speed_start + sta text_col + lda #0 + sta text_row + lda #.lobyte(str_speed) + sta INBUFF + lda #.hibyte(str_speed) + sta INBUFF + 1 + jsr draw_string + + ; convert to ASCII in INBUFF and print jsr FASC - - ; print the first 6 digits - draw_text speed_start, str_speed_len, str_speed - draw_text_indirect speed_start + str_speed_len, speed_precision, INBUFF - + jsr draw_string skipped: ; sx += fill_level[fill_masks] + 1 @@ -1812,7 +1829,17 @@ fill_loop_done: loop: ; finished - draw_text 40 - str_done_len, str_done_len, str_done + + lda #(40 - str_done_len) + sta text_col + lda #0 + sta text_row + lda #.lobyte(str_done) + sta INBUFF + lda #.hibyte(str_done) + sta INBUFF + 1 + jsr draw_string + jsr keycheck beq loop jmp main_loop From 25da81c64bfddf0a39e09288386ad76e19290be2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 2 Feb 2025 16:40:58 -0800 Subject: [PATCH 17/32] clean up text draw, fix offset by one --- mandel.s | 60 ++++++++++++++++++++++++++++++-------------------------- 1 
file changed, 32 insertions(+), 28 deletions(-) diff --git a/mandel.s b/mandel.s index b7c323b..ed7ae40 100644 --- a/mandel.s +++ b/mandel.s @@ -142,26 +142,32 @@ KEY_0 = 50 strings: str_self: - .byte "MANDEL-6502", 0 + .byte "MANDEL-6502" str_self_end: + .byte 0 str_speed: - .byte "us/iter: ", 0 + .byte "us/iter: " str_speed_end: + .byte 0 str_run: - .byte " RUN", 0 + .byte " RUN" str_run_end: + .byte 0 str_done: - .byte "DONE", 0 + .byte "DONE" str_done_end: + .byte 0 +str_padding: + .byte " " +str_padding_end: + .byte 0 -str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done -speed_precision = 6 +str_padding_len = str_padding_end - str_padding -speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 -speed_len = 14 + str_speed_len +speed_start = 40 - str_done_len - str_speed_len - str_padding_len - 1 char_map: @@ -1288,6 +1294,14 @@ done: rts .endproc +.macro draw_string_const str + lda #.lobyte(str) + sta INBUFF + lda #.hibyte(str) + sta INBUFF + 1 + jsr draw_string +.endmacro + .proc vblank_handler inc count_frames @@ -1506,19 +1520,11 @@ zero_byte_loop: sta text_col lda #0 sta text_row - lda #.lobyte(str_self) - sta INBUFF - lda #.hibyte(str_self) - sta INBUFF + 1 - jsr draw_string + draw_string_const str_self lda #(40 - str_run_len) sta text_col - lda #.lobyte(str_run) - sta INBUFF - lda #.hibyte(str_run) - sta INBUFF + 1 - jsr draw_string + draw_string_const str_run rts .endproc @@ -1773,11 +1779,13 @@ update_status: sta text_col lda #0 sta text_row - lda #.lobyte(str_speed) - sta INBUFF - lda #.hibyte(str_speed) - sta INBUFF + 1 - jsr draw_string + draw_string_const str_speed + + lda text_col + pha + draw_string_const str_padding + pla + sta text_col ; convert to ASCII in INBUFF and print jsr FASC @@ -1834,11 +1842,7 @@ loop: sta text_col lda #0 sta text_row - lda #.lobyte(str_done) - sta INBUFF - lda #.hibyte(str_done) - sta INBUFF + 1 - jsr 
draw_string + draw_string_const str_done jsr keycheck beq loop From 26d612b6f3640e4ee6e15eb9817f0037ee91ed80 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Fri, 21 Feb 2025 19:42:10 -0800 Subject: [PATCH 18/32] move 8 scan lines on the bottom to status bar --- mandel.s | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index ed7ae40..81ee0eb 100644 --- a/mandel.s +++ b/mandel.s @@ -80,7 +80,7 @@ framebuffer_bottom = $b000 display_list = $bf00 framebuffer_end = $c000 -height = 184 +height = 176 half_height = height >> 1 width = 160 half_width = width >> 1 @@ -254,6 +254,10 @@ display_list_start: .byte $0e .endrep + ; 8 scan lines, 1 row of 40-column text + .byte $42 + .addr textbuffer + 40 + .byte $41 ; jump and blank .addr display_list display_list_end: From 07db3d00d7ff36afe331c6adbeae2398a53c8173 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 22 Feb 2025 11:23:13 -0800 Subject: [PATCH 19/32] second status bar display with coords/zoom currently using 3.13 precision to output to floats for formatting --- mandel.s | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 160 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 81ee0eb..a89be85 100644 --- a/mandel.s +++ b/mandel.s @@ -169,6 +169,27 @@ str_padding_len = str_padding_end - str_padding speed_start = 40 - str_done_len - str_speed_len - str_padding_len - 1 +col_x = 1 +str_x: + .byte "X:" + .byte 0 +str_x_len = 2 +str_x_space = 12 +str_x_padding = 2 + +col_y = str_x_len + str_x_space + str_x_padding +str_y: + .byte "Y:" + .byte 0 +str_y_len = 2 +str_y_space = 12 +str_y_padding = 2 + +col_zoom = col_y + str_y_len + str_y_space + str_y_padding +str_zoom: + .byte "ZOOM:" + .byte 0 +str_zoom_len = 5 char_map: ; Map ATASCII string values to framebuffer font entries @@ -205,6 +226,17 @@ aspect_x: ; fixed3.13 5/4 aspect_y: ; fixed3.13 3/4 .word 3 << (13 - 2) +fixed3_13_as_float: ; float48 + ; 1 << 13 + ; 8192 + ; 81 92 . 
00 00 00 + .byte 65 ; exponent/sign - +1 byte + .byte $81 + .byte $92 + .byte $00 + .byte $00 + .byte $00 + sec_per_frame: ; float48 00 . 01 66 66 66 67 .byte 63 ; exponent/sign - -1 bytes .byte $01 ; BCD digits @@ -895,6 +927,68 @@ next: .endmacro +; input in FR0, 16 bits signed 3.13 fixed +; output in FR0, Atari float +; clobbers a, x, y, FR0, FR1 +.proc fixed3_13_to_float + ldx #.lobyte(fixed3_13_as_float) + ldy #.hibyte(fixed3_13_as_float) + jsr FLD1R + + ; check sign bit! conversion routine is for unsigned + lda FR0 + 1 + bpl positive + +negative: + neg16 FR0 + jsr IFP + + ; set float sign bit + lda FR0 + ora #$80 + sta FR0 + jmp common + +positive: + jsr IFP + +common: + jsr FDIV + rts + +.endproc + +; input in FR0, Atari float +; output in FR0, 16 bits signed 3.13 fixed +; clobbers a, x, y, FR0, FR1 +.proc float_to_fixed3_13 + ldx #.lobyte(fixed3_13_as_float) + ldy #.hibyte(fixed3_13_as_float) + jsr FLD1R + jsr FMUL + + ; check sign bit! conversion routine is for unsigned + lda FR0 + bcc positive + +negative: + ; clearfloat sign bit + lda FR0 + eor #$80 + sta FR0 + + jsr FPI + neg16 FR0 + jmp common + +positive: + jsr FPI + +common: + rts + +.endproc + .proc mandelbrot ; input: ; cx: position scaled to 6.26 fixed point - -32..+31.9 @@ -1254,7 +1348,7 @@ done: .endproc ; in/out: column in text_col -; in: row in text_row @fixme implement +; in: row in text_row ; in: pointer to string in INBUFF ; clobbers x/y/a/temp .proc draw_string @@ -1269,6 +1363,21 @@ done: adc #0 sta temp + 1 + ldx text_row + beq done_rows +continue_rows: + clc + lda temp + adc #40 + sta temp + lda temp + 1 + adc #0 + sta temp + 1 + dex + bne continue_rows + +done_rows: + ldy #0 loop: lda (strptr),y @@ -1429,7 +1538,7 @@ skip_char: plus: lda zoom - cmp #7 + cmp #8 bpl skip_char inc zoom jmp done @@ -1441,15 +1550,19 @@ minus: jmp done up: sub32 oy, oy, temp + jsr display_coords jmp done down: add32 oy, oy, temp + jsr display_coords jmp done left: sub32 ox, ox, temp + jsr display_coords 
jmp done right: add32 ox, ox, temp + jsr display_coords jmp done number_keys: @@ -1533,6 +1646,49 @@ zero_byte_loop: rts .endproc +.proc display_coords + lda #1 + sta text_row + lda #col_x + sta text_col + draw_string_const str_x + + copy32 FR0, ox + shift_round_16 FR0, 3 + copy16 FR0, FR0 + 2 + jsr fixed3_13_to_float + jsr FASC + jsr draw_string + + lda #col_y + sta text_col + draw_string_const str_y + + copy32 FR0, oy + shift_round_16 FR0, 3 + copy16 FR0, FR0 + 2 + jsr fixed3_13_to_float + jsr FASC + jsr draw_string + + lda #col_zoom + sta text_col + draw_string_const str_zoom + + lda zoom + clc + adc #0 + sta FR0 + lda #0 + sta FR0 + 1 + jsr IFP + jsr FASC + jsr draw_string + + rts + +.endproc + ; input: viewport selector in x ; clobbers: a, x .proc load_viewport @@ -1584,6 +1740,7 @@ zero_byte_loop: sta DMACTL jsr clear_screen + jsr display_coords ; Copy the display list into properly aligned memory ; Can't cross 1024-byte boundaries :D @@ -1639,6 +1796,7 @@ main_loop: jsr clear_screen jsr status_bar + jsr display_coords lda #0 sta fill_level From 6e66145ec6ea2be22f81819c2c797bfa9b1425aa Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 22 Feb 2025 15:37:11 -0800 Subject: [PATCH 20/32] whoops fixes --- mandel.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index a89be85..9840325 100644 --- a/mandel.s +++ b/mandel.s @@ -177,7 +177,7 @@ str_x_len = 2 str_x_space = 12 str_x_padding = 2 -col_y = str_x_len + str_x_space + str_x_padding +col_y = col_x + str_x_len + str_x_space + str_x_padding str_y: .byte "Y:" .byte 0 @@ -1538,7 +1538,7 @@ skip_char: plus: lda zoom - cmp #8 + cmp #7 bpl skip_char inc zoom jmp done From 89b4e459013a74b070f730a4389e2ce90a177b57 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 22 Feb 2025 20:24:04 -0800 Subject: [PATCH 21/32] flip the y coordinate sign --- mandel.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 9840325..3419466 
100644 --- a/mandel.s +++ b/mandel.s @@ -1549,11 +1549,11 @@ minus: dec zoom jmp done up: - sub32 oy, oy, temp + add32 oy, oy, temp jsr display_coords jmp done down: - add32 oy, oy, temp + sub32 oy, oy, temp jsr display_coords jmp done left: @@ -1854,6 +1854,7 @@ not_skipped_mask: zoom_factor cx, sx, aspect_x add32 cx, cx, ox zoom_factor cy, sy, aspect_y + neg32 cy add32 cy, cy, oy jsr mandelbrot jsr pset From 689363d0834c1cc3a04a19999a3ca2485a3a0f30 Mon Sep 17 00:00:00 2001 From: Brooke Date: Sun, 22 Jun 2025 20:00:35 -0700 Subject: [PATCH 22/32] WIP code for elapsed time not finished, doesn't work right --- mandel.s | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 3419466..dc7e323 100644 --- a/mandel.s +++ b/mandel.s @@ -162,12 +162,28 @@ str_padding: str_padding_end: .byte 0 +str_space: + .byte " " + .byte 0 + +str_h: + .byte "h" + .byte 0 +str_m: + .byte "m" + .byte 0 +str_s: + .byte "s" + .byte 0 + str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done str_padding_len = str_padding_end - str_padding -speed_start = 40 - str_done_len - str_speed_len - str_padding_len - 1 +; "3h59m59s" +str_elapsed_spacer = 8 +speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1 col_x = 1 str_x: @@ -1953,6 +1969,55 @@ update_status: ; convert to ASCII in INBUFF and print jsr FASC jsr draw_string + + ; elapsed time + ; FR0 = total_sec + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) + jsr FLD0R + ; FR0 -> integer -> temp + jsr FPI + lda FR0 + sta temp + lda FR0 + 1 + sta temp + 1 + + .macro countdown divisor, string + .scope + ; count the hours + ldx #0 + countdown_loop: + lda temp + 1 + cmp #.hibyte(divisor) + bmi countdown_done + lda temp + cmp #.lobyte(divisor) + bmi countdown_done + sec + lda temp + sbc #.lobyte(divisor) + sta temp + lda temp + 1 + sbc #.hibyte(divisor) + sta 
temp + 1 + inx + jmp countdown_loop + countdown_done: + stx FR0 + ldx #0 + stx FR0 + 1 + jsr IFP + jsr FASC + jsr draw_string + draw_string_const string + .endscope + .endmacro + + draw_string_const str_space + countdown 3600, str_h + countdown 60, str_m + countdown 1, str_s + skipped: ; sx += fill_level[fill_masks] + 1 From f7082ab371b0454c32ed2b120cdac5f28e02a587 Mon Sep 17 00:00:00 2001 From: Brooke Date: Sun, 22 Jun 2025 22:21:26 -0700 Subject: [PATCH 23/32] wip subtraction method, still not working --- mandel.s | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/mandel.s b/mandel.s index dc7e323..73a8ebb 100644 --- a/mandel.s +++ b/mandel.s @@ -221,8 +221,12 @@ char_map: .endrepeat hex_chars: +digits_zero: .byte "0123456789abcdef" +digits_space: + .byte " 123456789abcdef" + aspect: ; aspect ratio! ; pixels at 320w are 5:6 (narrow) @@ -1982,7 +1986,11 @@ update_status: lda FR0 + 1 sta temp + 1 - .macro countdown divisor, string + ;jsr IFP + ;jsr FASC + ;jsr draw_string + + .macro countdown divisor, digits .scope ; count the hours ldx #0 @@ -2003,20 +2011,27 @@ update_status: inx jmp countdown_loop countdown_done: - stx FR0 - ldx #0 - stx FR0 + 1 - jsr IFP - jsr FASC + lda digits,x + eor #$80 + sta temp2 + lda #.lobyte(temp2) + sta INBUFF + lda #.hibyte(temp2) + sta INBUFF + 1 jsr draw_string - draw_string_const string .endscope .endmacro draw_string_const str_space - countdown 3600, str_h - countdown 60, str_m - countdown 1, str_s + countdown 36000, digits_space + countdown 3600, digits_zero + draw_string_const str_h + countdown 600, digits_zero + countdown 60, digits_zero + draw_string_const str_m + countdown 10, digits_zero + countdown 1, digits_zero + draw_string_const str_s skipped: From 5cf64970c8a9ca8efe7821a9f425cb9282c92e29 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 22 Jun 2025 23:10:43 -0700 Subject: [PATCH 24/32] Ah that's better used the appropriate instruction for comparison --- 
mandel.s | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mandel.s b/mandel.s index 73a8ebb..6837e00 100644 --- a/mandel.s +++ b/mandel.s @@ -398,6 +398,11 @@ viewport_oy: .dword ($fffe0000 & $3fffffff) << 2 .dword $ff000000 +elapsed_work: + .dword 0 +elapsed_digit: + .byte 0 + ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 clc ; 2 cyc @@ -1979,12 +1984,12 @@ update_status: ldx #.lobyte(total_sec) ldy #.hibyte(total_sec) jsr FLD0R - ; FR0 -> integer -> temp + ; FR0 -> integer -> elapsed_work jsr FPI lda FR0 - sta temp + sta elapsed_work lda FR0 + 1 - sta temp + 1 + sta elapsed_work + 1 ;jsr IFP ;jsr FASC @@ -1995,28 +2000,28 @@ update_status: ; count the hours ldx #0 countdown_loop: - lda temp + 1 + lda elapsed_work + 1 cmp #.hibyte(divisor) - bmi countdown_done - lda temp + bcc countdown_done + lda elapsed_work cmp #.lobyte(divisor) - bmi countdown_done + bcc countdown_done sec - lda temp + lda elapsed_work sbc #.lobyte(divisor) - sta temp - lda temp + 1 + sta elapsed_work + lda elapsed_work + 1 sbc #.hibyte(divisor) - sta temp + 1 + sta elapsed_work + 1 inx jmp countdown_loop countdown_done: lda digits,x eor #$80 - sta temp2 - lda #.lobyte(temp2) + sta elapsed_digit + lda #.lobyte(elapsed_digit) sta INBUFF - lda #.hibyte(temp2) + lda #.hibyte(elapsed_digit) sta INBUFF + 1 jsr draw_string .endscope From 4bac47a4fdad2fd56f9023eaae9a4be83fe9fced Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 23 Jun 2025 00:31:53 -0700 Subject: [PATCH 25/32] fix at 256 seconds --- mandel.s | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mandel.s b/mandel.s index 6837e00..714f6b6 100644 --- a/mandel.s +++ b/mandel.s @@ -2002,10 +2002,14 @@ update_status: countdown_loop: lda elapsed_work + 1 cmp #.hibyte(divisor) + beq countdown_lobyte bcc countdown_done + bcs countdown_inc + countdown_lobyte: lda elapsed_work cmp #.lobyte(divisor) bcc countdown_done + countdown_inc: sec lda elapsed_work sbc #.lobyte(divisor) 
From fd954da47e75a99b0033ec11f0dd145e1d8ab544 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 23 Jun 2025 08:17:39 -0700 Subject: [PATCH 26/32] Create map file for convenience export a symbol and it'll appear in mandel.map --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bd14c7d..711adcd 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ all : mandel.xex mandel.xex : mandel.o tables.o atari-asm-xex.cfg - ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o %.o : %.s ca65 -o $@ $< @@ -15,4 +15,6 @@ clean : rm -f tables.s rm -f *.o rm -f *.xex + rm -f mandel.map + From fab2760394b9b74f07488bc345997fbd9e566389 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 28 Jun 2025 13:43:43 -0700 Subject: [PATCH 27/32] refactor countdown as a procedure call --- mandel.s | 104 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 58 insertions(+), 46 deletions(-) diff --git a/mandel.s b/mandel.s index 714f6b6..cad4e5e 100644 --- a/mandel.s +++ b/mandel.s @@ -1991,55 +1991,25 @@ update_status: lda FR0 + 1 sta elapsed_work + 1 - ;jsr IFP - ;jsr FASC - ;jsr draw_string - - .macro countdown divisor, digits - .scope - ; count the hours - ldx #0 - countdown_loop: - lda elapsed_work + 1 - cmp #.hibyte(divisor) - beq countdown_lobyte - bcc countdown_done - bcs countdown_inc - countdown_lobyte: - lda elapsed_work - cmp #.lobyte(divisor) - bcc countdown_done - countdown_inc: - sec - lda elapsed_work - sbc #.lobyte(divisor) - sta elapsed_work - lda elapsed_work + 1 - sbc #.hibyte(divisor) - sta elapsed_work + 1 - inx - jmp countdown_loop - countdown_done: - lda digits,x - eor #$80 - sta elapsed_digit - lda #.lobyte(elapsed_digit) - sta INBUFF - lda #.hibyte(elapsed_digit) - sta INBUFF + 1 - jsr draw_string - .endscope - .endmacro - draw_string_const str_space - countdown 36000, digits_space - countdown 3600, digits_zero + + 
.macro do_countdown divisor, digits + ldx #.lobyte(divisor) + ldy #.hibyte(divisor) + lda #.lobyte(digits) + sta INBUFF + lda #.hibyte(digits) + sta INBUFF + 1 + jsr countdown + .endmacro + do_countdown 36000, digits_space + do_countdown 3600, digits_zero draw_string_const str_h - countdown 600, digits_zero - countdown 60, digits_zero + do_countdown 600, digits_zero + do_countdown 60, digits_zero draw_string_const str_m - countdown 10, digits_zero - countdown 1, digits_zero + do_countdown 10, digits_zero + do_countdown 1, digits_zero draw_string_const str_s skipped: @@ -2101,3 +2071,45 @@ loop: jmp main_loop .endproc + +; digit string in INBUFF +; divisor X/Y +; clobbers temp, calls draw_string +.proc countdown + divisor = temp + stx divisor + sty divisor + 1 + + ; count the hours + ldy #0 +countdown_loop: + lda elapsed_work + 1 + cmp divisor + 1 + beq countdown_lobyte + bcc countdown_done + bcs countdown_inc +countdown_lobyte: + lda elapsed_work + cmp divisor + bcc countdown_done +countdown_inc: + sec + lda elapsed_work + sbc divisor + sta elapsed_work + lda elapsed_work + 1 + sbc divisor + 1 + sta elapsed_work + 1 + iny + jmp countdown_loop +countdown_done: + lda (INBUFF),y + eor #$80 + sta elapsed_digit + lda #.lobyte(elapsed_digit) + sta INBUFF + lda #.hibyte(elapsed_digit) + sta INBUFF + 1 + jsr draw_string + rts +.endproc From 96e0356e578f7c94dd2c5764ecbf59f70dbe29d9 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 3 Jul 2025 18:41:24 -0700 Subject: [PATCH 28/32] WIP input handling for coords experimental output via 32-bits mult, loses precision in conversion --- mandel.s | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index cad4e5e..4cc0f39 100644 --- a/mandel.s +++ b/mandel.s @@ -126,6 +126,10 @@ KEY_7 = 51 KEY_8 = 53 KEY_9 = 48 KEY_0 = 50 +KEY_PERIOD = 34 +KEY_E = 42 +KEY_X = 22 +KEY_Y = 43 .struct float48 exponent .byte @@ -257,6 +261,28 @@ 
fixed3_13_as_float: ; float48 .byte $00 .byte $00 +u65536_as_float: ; float48 + ; 1 << 16 + ; 65536 + ; 06 55 36 . 00 00 + .byte 66 ; exponent/sign - +2 bytes + .byte $06 + .byte $55 + .byte $36 + .byte $00 + .byte $00 + +fixed6_26_as_float: ; float48 + ; 1 << 26 + ; 67108864 + ; 67 10 88 64 . 00 + .byte 67 ; exponent/sign - +3 bytes + .byte $67 + .byte $10 + .byte $88 + .byte $64 + .byte $00 + sec_per_frame: ; float48 00 . 01 66 66 66 67 .byte 63 ; exponent/sign - -1 bytes .byte $01 ; BCD digits @@ -403,6 +429,13 @@ elapsed_work: elapsed_digit: .byte 0 +input_col: + .byte 0 +input_row: + .byte 0 +input_max: + .byte 0 + ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 clc ; 2 cyc @@ -983,6 +1016,66 @@ common: .endproc +; input in FR0, 32 bits signed 6.26 fixed +; output in FR0, Atari float +; clobbers a, x, y, FR0, FR1 +.proc fixed6_26_to_float + ; check sign bit! conversion routine is for unsigned + lda FR0 + 3 + and #$80 + sta temp + + beq positive + neg32 FR0 +positive: + + ; save low word + lda FR0 + pha + lda FR0 + 1 + pha + + ; convert high word + sta FR0 + 2 + sta FR1 + lda FR0 + 3 + sta FR0 + 1 + jsr IFP + + lda temp + beq positive2 + ; set float sign bit + lda FR0 + ora #$80 + sta FR0 +positive2: + + ; high word to FR1 + ldx #.lobyte(u65536_as_float) + ldy #.hibyte(u65536_as_float) + jsr FLD1R + jsr FMUL + jsr FMOVE + + ; convert low word + pla + lda temp + 1 + pla + lda temp + jsr IFP + + ; combine + jsr FADD + + ; scale + ldx #.lobyte(fixed6_26_as_float) + ldy #.hibyte(fixed6_26_as_float) + jsr FLD1R + jsr FDIV + + rts +.endproc + ; input in FR0, Atari float ; output in FR0, 16 bits signed 3.13 fixed ; clobbers a, x, y, FR0, FR1 @@ -1603,7 +1696,7 @@ number_keys: beq five cpy #KEY_6 beq six - jmp skip_char + jmp letter_keys one: ldx #0 @@ -1622,7 +1715,21 @@ five: jmp load_key_viewport six: ldx #5 - ; fall through + jmp load_key_viewport + +letter_keys: + cpy #KEY_X + bne not_x + jsr input_x + jmp load_key_viewport +not_x: + cpy #KEY_Y + bne 
not_y + jsr input_y + jmp load_key_viewport +not_y: + jmp skip_char + load_key_viewport: jsr load_viewport ; fall through @@ -1632,6 +1739,23 @@ done: .endproc +.proc input_x + ldx #col_x + ldy #1 + jsr input_number + + + rts +.endproc + +.proc input_y + rts +.endproc + +.proc input_number + rts +.endproc + .proc clear_screen ; zero the range from framebuffer_top to display_list lda #.lobyte(framebuffer_top) @@ -1679,9 +1803,7 @@ zero_byte_loop: draw_string_const str_x copy32 FR0, ox - shift_round_16 FR0, 3 - copy16 FR0, FR0 + 2 - jsr fixed3_13_to_float + jsr fixed6_26_to_float jsr FASC jsr draw_string @@ -1690,9 +1812,7 @@ zero_byte_loop: draw_string_const str_y copy32 FR0, oy - shift_round_16 FR0, 3 - copy16 FR0, FR0 + 2 - jsr fixed3_13_to_float + jsr fixed6_26_to_float jsr FASC jsr draw_string From f2a6af0995512835bae7e6ecd094e15ef5115fc3 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 3 Jul 2025 18:43:10 -0700 Subject: [PATCH 29/32] Replace the not-enough-precision 32 bit to float impl keep the proc though to encapsulate it but uses the older logic of rounding down to 3.13 first --- mandel.s | 79 +++----------------------------------------------------- 1 file changed, 4 insertions(+), 75 deletions(-) diff --git a/mandel.s b/mandel.s index 4cc0f39..a97d6e3 100644 --- a/mandel.s +++ b/mandel.s @@ -261,28 +261,6 @@ fixed3_13_as_float: ; float48 .byte $00 .byte $00 -u65536_as_float: ; float48 - ; 1 << 16 - ; 65536 - ; 06 55 36 . 00 00 - .byte 66 ; exponent/sign - +2 bytes - .byte $06 - .byte $55 - .byte $36 - .byte $00 - .byte $00 - -fixed6_26_as_float: ; float48 - ; 1 << 26 - ; 67108864 - ; 67 10 88 64 . 00 - .byte 67 ; exponent/sign - +3 bytes - .byte $67 - .byte $10 - .byte $88 - .byte $64 - .byte $00 - sec_per_frame: ; float48 00 . 01 66 66 66 67 .byte 63 ; exponent/sign - -1 bytes .byte $01 ; BCD digits @@ -1016,63 +994,14 @@ common: .endproc +; rounds to 16-bit first! 
; input in FR0, 32 bits signed 6.26 fixed ; output in FR0, Atari float ; clobbers a, x, y, FR0, FR1 .proc fixed6_26_to_float - ; check sign bit! conversion routine is for unsigned - lda FR0 + 3 - and #$80 - sta temp - - beq positive - neg32 FR0 -positive: - - ; save low word - lda FR0 - pha - lda FR0 + 1 - pha - - ; convert high word - sta FR0 + 2 - sta FR1 - lda FR0 + 3 - sta FR0 + 1 - jsr IFP - - lda temp - beq positive2 - ; set float sign bit - lda FR0 - ora #$80 - sta FR0 -positive2: - - ; high word to FR1 - ldx #.lobyte(u65536_as_float) - ldy #.hibyte(u65536_as_float) - jsr FLD1R - jsr FMUL - jsr FMOVE - - ; convert low word - pla - lda temp + 1 - pla - lda temp - jsr IFP - - ; combine - jsr FADD - - ; scale - ldx #.lobyte(fixed6_26_as_float) - ldy #.hibyte(fixed6_26_as_float) - jsr FLD1R - jsr FDIV - + shift_round_16 FR0, 3 + copy16 FR0, FR0 + 2 + jsr fixed3_13_to_float rts .endproc From b46e6fb343e425985f53f868f1dbe4e3f6c0e384 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 1 Sep 2025 12:28:33 -0700 Subject: [PATCH 30/32] fix typo on stub x/y inputs was accidentally falling through to the load a viewport from a keypress thingy which was not needed here --- mandel.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index a97d6e3..5edcbce 100644 --- a/mandel.s +++ b/mandel.s @@ -1650,12 +1650,12 @@ letter_keys: cpy #KEY_X bne not_x jsr input_x - jmp load_key_viewport + jmp done not_x: cpy #KEY_Y bne not_y jsr input_y - jmp load_key_viewport + jmp done not_y: jmp skip_char From 29cd3d968fbff768c23959fab2f6addcaff8ccfe Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 6 Sep 2025 19:53:25 -0700 Subject: [PATCH 31/32] Shaves 3 seconds off initial view runtime on XE :D Instead of relying solely on the JMP thunks added to imul16_func and sqr16_func, three call sites within the mandelbrot iteration function are patched directly to jsr to the XE versions, saving like 15 cycles per iter Ok so it's not a lot, but 
every second counts. ;) with XE code disabled: 1539 us/iter 5m13s with old XE code: 1417 us/iter 4m48s with new XE code: 1406 us/iter 4m45s --- mandel.s | 147 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 82 insertions(+), 65 deletions(-) diff --git a/mandel.s b/mandel.s index 5edcbce..b0f9c28 100644 --- a/mandel.s +++ b/mandel.s @@ -485,6 +485,7 @@ input_max: .endmacro ; 6 * bytes cycles +; 4 * bytes bytes .macro copy bytes, dest, arg .repeat bytes, byte ; 6 * bytes cycles lda arg + byte ; 3 cyc @@ -493,6 +494,7 @@ input_max: .endmacro ; 12 cycles +; 8 bytes .macro copy16 dest, arg copy 2, dest, arg .endmacro @@ -538,6 +540,8 @@ input_max: ; input: arg1, arg2 as fixed4.12 ; output: dest as fixed8.24 +; patch point jsr at 16 bytes in +imul16_patch_offset = 16 .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc @@ -547,6 +551,8 @@ input_max: ; input: arg as fixed4.12 ; output: dest as fixed8.24 +; patch point jsr at 8 bytes in +sqr16_patch_offset = 8 .macro sqr16 dest, arg copy16 FR0, arg ; 12 cyc jsr sqr16_func ; ? 
cyc @@ -692,71 +698,6 @@ bank_switch_table: .endif .endmacro -.proc imul8xe_init - - bank_switch 0 - lda #0 - sta EXTENDED_RAM - bank_switch 1 - lda #1 - sta EXTENDED_RAM - bank_switch 0 - lda EXTENDED_RAM - beq init - - ; no bank switching available, we just overwrite the value in base ram - rts - -init: - - ; patch imul16_func into a forwarding thunk to imul16xe_func - lda #$4c ; 'jmp' opcode - sta imul16_func - lda #.lobyte(imul16xe_func) - sta imul16_func + 1 - lda #.hibyte(imul16xe_func) - sta imul16_func + 2 - - ; ditto for sqr16_func -> sqr16xe_func - lda #$4c ; 'jmp' opcode - sta sqr16_func - lda #.lobyte(sqr16xe_func) - sta sqr16_func + 1 - lda #.hibyte(sqr16xe_func) - sta sqr16_func + 2 - - ; create the lookup table - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - sta ptr - lda #$40 - sta ptr + 1 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -.endproc ; Initialize a 16 KB chunk of the table ; input: multipliers in temp @@ -1156,12 +1097,15 @@ keep_going: shift_round_16 zy, 3 ; zx_2 = zx * zx +fixup_sqr16_1: sqr16 zx_2, zx + 2 ; zy_2 = zy * zy +fixup_sqr16_2: sqr16 zy_2, zy + 2 ; zx_zy = zx * zy +fixup_imul16_1: imul16 zx_zy, zx + 2, zy + 2 ; dist = zx_2 + zy_2 @@ -2162,3 +2106,76 @@ countdown_done: jsr draw_string rts .endproc + +.proc imul8xe_init + + bank_switch 0 + lda #0 + sta EXTENDED_RAM + bank_switch 1 + lda #1 + sta EXTENDED_RAM + bank_switch 0 + lda EXTENDED_RAM + beq init + + ; no bank switching available, we just overwrite the value in base ram + rts + +init: + + ; patch imul16_func into a forwarding thunk to imul16xe_func + lda #$4c ; 'jmp' opcode + sta imul16_func + lda #.lobyte(imul16xe_func) + sta imul16_func + 1 + 
sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1 + lda #.hibyte(imul16xe_func) + sta imul16_func + 2 + sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2 + + ; ditto for sqr16_func -> sqr16xe_func + lda #$4c ; 'jmp' opcode + sta sqr16_func + lda #.lobyte(sqr16xe_func) + sta sqr16_func + 1 + sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1 + sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1 + lda #.hibyte(sqr16xe_func) + sta sqr16_func + 2 + sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2 + sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2 + + + ; create the lookup table + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + sta ptr + lda #$40 + sta ptr + 1 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + + rts +.endproc From 6479cf530c1c584f33b96f2b19885d02415863bb Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 16 Sep 2025 21:29:40 -0700 Subject: [PATCH 32/32] update some timings --- mandel.s | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mandel.s b/mandel.s index b0f9c28..b52f24a 100644 --- a/mandel.s +++ b/mandel.s @@ -461,7 +461,7 @@ input_max: sub 4, dest, arg1, arg2 .endmacro -; 3 + 5 * bytes cycles +; 3 + 5 * (bytes - 1) cycles .macro shl bytes, arg asl arg ; 3 cyc .repeat bytes-1, i @@ -469,17 +469,17 @@ input_max: .endrepeat .endmacro -; 13 cycles +; 8 cycles .macro shl16 arg shl 2, arg .endmacro -; 18 cycles +; 13 cycles .macro shl24 arg shl 3, arg .endmacro -; 23 cycles +; 18 cycles .macro shl32 arg shl 4, arg .endmacro @@ -529,11 +529,11 @@ input_max: neg 4, arg .endmacro -; 11-27 + 23 * shift cycles -; 103-119 cycles for shift=4 +; 11-27 + 18 * shift cycles +; 
65-81 cycles for shift=3 .macro shift_round_16 arg, shift .repeat shift - shl32 arg ; 23 cycles + shl32 arg ; 18 cycles .endrepeat round16 arg ; 11-27 cycles .endmacro