From 0d086a179cf8e91b839f306bb597ef9e6125f6b2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:20:53 -0800 Subject: [PATCH 01/10] wip --- mandel.s | 108 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/mandel.s b/mandel.s index fc30532..50213ad 100644 --- a/mandel.s +++ b/mandel.s @@ -1,43 +1,42 @@ ; Our zero-page vars -sx = $80 ; i16: screen pixel x -sy = $82 ; i16: screen pixel y -ox = $84 ; fixed4.12: center point x -oy = $86 ; fixed4.12: center point y -cx = $88 ; fixed4.12: c_x -cy = $8a ; fixed4.12: c_y -zx = $8c ; fixed4.12: z_x -zy = $8e ; fixed4.12: z_y +ox = $80 ; fixed8.24: center point x +oy = $84 ; fixed8.24: center point y +cx = $88 ; fixed8.24: c_x +cy = $8c ; fixed8.24: c_y -zx_2 = $90 ; fixed4.12: z_x^2 -zy_2 = $92 ; fixed4.12: z_y^2 -zx_zy = $94 ; fixed4.12: z_x * z_y -dist = $96 ; fixed4.12: z_x^2 + z_y^2 +zx = $90 ; fixed8.24: z_x +zy = $94 ; fixed8.24: z_y +zx_2 = $98 ; fixed8.24: z_x^2 +zy_2 = $9c ; fixed8.24: z_y^2 -iter = $a0 ; u8: iteration count +zx_zy = $a0 ; fixed8.24: z_x * z_y +dist = $a4 ; fixed8.24: z_x^2 + z_y^2 +sx = $a8 ; i16: screen pixel x +sy = $aa ; i16: screen pixel y +z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not +z_buffer_start = $ad ; u8: index into z_buffer +z_buffer_end = $ae ; u8: index into z_buffer +iter = $af ; u8: iteration count -zoom = $a1 ; u8: zoom shift level -count_frames = $a2 ; u8 -count_pixels = $a3 ; u8 -total_ms = $a4 ; float48 -total_pixels = $aa ; float48 +ptr = $b0 ; u16 +pixel_ptr = $b2 ; u16 +zoom = $b4 ; u8: zoom shift level +fill_level = $b5 ; u8 +pixel_color = $b6 ; u8 +pixel_mask = $b7 ; u8 +pixel_shift = $b8 ; u8 +pixel_offset = $b9 ; u8 +palette_offset = $ba ; u8 +chroma_offset = $bb ; u8 +palette_ticks = $bc ; u8 +chroma_ticks = $bd ; u8 +count_frames = $be ; u8 +count_pixels = $bf ; u8 -z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not -z_buffer_start = $b1 ; 
u8: index into z_buffer -z_buffer_end = $b2 ; u8: index into z_buffer -temp = $b4 ; u16 -temp2 = $b6 ; u16 -pixel_ptr = $b8 ; u16 -pixel_color = $ba ; u8 -pixel_mask = $bb ; u8 -pixel_shift = $bc ; u8 -pixel_offset = $bd ; u8 -fill_level = $be ; u8 -palette_offset = $bf ; u8 - -palette_ticks = $c0 ; u8 -chroma_ticks = $c1 ; u8 -chroma_offset = $c2 ; u8 -ptr = $c4 ; u16 +total_pixels = $c0 ; float48 +total_ms = $c6 ; float48 +temp = $cc ; u16 +temp2 = $ce ; u16 palette_delay = 23 chroma_delay = 137 @@ -884,12 +883,41 @@ next: ; zx_zy = 0 ; dist = 0 ; iter = 0 +; lda #00 +; ldx #(iter - zx + 1) +;initloop: +; sta zx - 1,x +; dex +; bne initloop +; sta z_buffer_start +; sta z_buffer_end + lda #00 - ldx #(iter - zx + 1) -initloop: - sta zx - 1,x - dex - bne initloop + sta zx + sta zx + 1 + sta zx + 2 + sta zx + 3 + sta zy + sta zy + 1 + sta zy + 2 + sta zy + 3 + sta zx_2 + sta zx_2 + 1 + sta zx_2 + 2 + sta zx_2 + 3 + sta zy_2 + sta zy_2 + 1 + sta zy_2 + 2 + sta zy_2 + 3 + sta zx_zy + sta zx_zy + 1 + sta zx_zy + 2 + sta zx_zy + 3 + sta dist + sta dist + 1 + sta dist + 2 + sta dist + 3 + sta iter sta z_buffer_start sta z_buffer_end From 4a1e35699adcce1af0f60ea51573e8a215975c66 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:19:45 -0800 Subject: [PATCH 02/10] wip --- mandel.s | 71 ++++++++++++++++++++++++++++++++++++++------------------ todo.md | 2 +- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/mandel.s b/mandel.s index 50213ad..622ff62 100644 --- a/mandel.s +++ b/mandel.s @@ -433,6 +433,13 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro imul16 dest, arg1, arg2 + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; ? 
cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + .macro sqr16_round dest, arg, shift ;imul16_round dest, arg, arg, shift copy16 FR0, arg ; 12 cyc @@ -441,6 +448,12 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro sqr16 dest, arg + copy16 FR0, arg ; 12 cyc + jsr sqr16_func ; ? cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + ; clobbers a, x .macro sqr8 dest, arg ldx arg @@ -870,8 +883,8 @@ next: .proc mandelbrot ; input: - ; cx: position scaled to 4.12 fixed point - -8..+7.9 - ; cy: position scaled to 4.12 + ; cx: position scaled to 8.24 fixed point - -128..+127.9 + ; cy: position scaled to 8.24 ; ; output: ; iter: iteration count at escape or 0 @@ -909,10 +922,6 @@ next: sta zy_2 + 1 sta zy_2 + 2 sta zy_2 + 3 - sta zx_zy - sta zx_zy + 1 - sta zx_zy + 2 - sta zx_zy + 3 sta dist sta dist + 1 sta dist + 2 @@ -929,6 +938,8 @@ loop: keep_going: .macro quick_exit arg, max + ; arg: fixed8.24 + ; max: integer .local positive .local negative .local nope_out @@ -936,51 +947,61 @@ keep_going: .local all_done ; check sign bit - lda arg + 1 + lda arg + 3 bmi negative positive: - cmp #((max) << 4) + cmp #max bmi all_done ; 'less than' jmp exit_path negative: - cmp #(256 - ((max) << 4)) + cmp #(256 - max) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: jmp exit_path - + first_equal: + ; following bytes all 0 shows it's really 'equal' + lda arg + 2 + bne all_done + lda arg + 1 + bne all_done lda arg - beq nope_out ; 2nd byte 0 shows it's really 'equal' + bne all_done + jmp exit_path all_done: .endmacro - ; 4.12: (-8 .. +7.9) + ; 8.24: (-128 .. 127.9) / (-8 .. 
+7.9) ; zx = zx_2 - zy_2 + cx - sub16 zx, zx_2, zy_2 - add16 zx, zx, cx + sub32 zx, zx_2, zy_2 + add32 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy - add16 zy, zx_zy, zx_zy - add16 zy, zy, cy + add32 zy, zx_zy, zx_zy + add32 zy, zy, cy quick_exit zy, 2 + ; convert 8.24 -> 4.12 + shift_round_16 zx, 4 + shift_round_16 zy, 4 + ; zx_2 = zx * zx - sqr16_round zx_2, zx, 4 + sqr16 zx_2, zx + 2 ; zy_2 = zy * zy - sqr16_round zy_2, zy, 4 + sqr16 zy_2, zy + 2 ; zx_zy = zx * zy - imul16_round zx_zy, zx, zy, 4 + imul16 zx_zy, zx + 2, zy + 2 ; dist = zx_2 + zy_2 - add16 dist, zx_2, zy_2 + add32 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer @@ -1090,13 +1111,17 @@ enough: .endmacro .macro zoom_factor dest, src, zoom, aspect + ; output: dest: fixed8.24 + ; input: src: fixed4.12 + ; input: zoom: u8 ??? + ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16_round dest, dest, aspect, 4 + imul16 dest, dest, aspect .endmacro .proc pset @@ -1567,9 +1592,9 @@ not_skipped_mask: ; run the fractal! 
zoom_factor cx, sx, zoom, aspect_x - add16 cx, cx, ox + add32 cx, cx, ox zoom_factor cy, sy, zoom, aspect_y - add16 cy, cy, oy + add32 cy, cy, oy jsr mandelbrot jsr pset diff --git a/todo.md b/todo.md index 6fb0282..29217cd 100644 --- a/todo.md +++ b/todo.md @@ -3,7 +3,7 @@ things to try: * skip add on the top-byte multiply in sqr8/mul8 * should save a few cycles, suggestion by jamey -* perform the zx += zx^s + cx in 32-bit space, before rounding +* perform the zx_next = zx^s + cx in 32-bit space, before rounding * should improve precision on max zoom, might cost a few cycles * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D From 7184b8e03f2748efd532277995afe5fa7d4a3cf6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:33:20 -0800 Subject: [PATCH 03/10] wip --- mandel.s | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 622ff62..86a6b48 100644 --- a/mandel.s +++ b/mandel.s @@ -292,16 +292,16 @@ viewport_zoom: .byte 6 viewport_ox: - .word $0000 - .word $f110 - .word $f110 - .word $e400 + .dword $00000000 + .dword $ff110000 + .dword $ff110000 + .dword $fe400000 viewport_oy: - .word $0000 - .word $fb60 - .word $fbe0 - .word $0000 + .dword $00000000 + .dword $ffb60000 + .dword $ffbe0000 + .dword $00000000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -1459,17 +1459,32 @@ zero_byte_loop: txa asl a + asl a + tax lda viewport_ox,x sta ox lda viewport_oy,x sta oy + inx lda viewport_ox,x sta ox + 1 lda viewport_oy,x sta oy + 1 + inx + lda viewport_ox,x + sta ox + 2 + lda viewport_oy,x + sta oy + 2 + + inx + lda viewport_ox,x + sta ox + 3 + lda viewport_oy,x + sta oy + 3 + rts .endproc From 13257309dc3a6493e05575404f5deddd09e9192d Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 08:34:02 -0800 Subject: [PATCH 04/10] init fix --- mandel.s | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mandel.s 
b/mandel.s index 86a6b48..76816c2 100644 --- a/mandel.s +++ b/mandel.s @@ -922,6 +922,10 @@ next: sta zy_2 + 1 sta zy_2 + 2 sta zy_2 + 3 + sta zx_zy + sta zx_zy + 1 + sta zx_zy + 2 + sta zx_zy + 3 sta dist sta dist + 1 sta dist + 2 From 2fcb30b76a66819ab96ec3353b8ce4978f723675 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 08:56:59 -0800 Subject: [PATCH 05/10] wip --- mandel.s | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mandel.s b/mandel.s index 76816c2..0400003 100644 --- a/mandel.s +++ b/mandel.s @@ -980,7 +980,7 @@ keep_going: all_done: .endmacro - ; 8.24: (-128 .. 127.9) / (-8 .. +7.9) + ; 8.24: (-128 .. 127.9) ; zx = zx_2 - zy_2 + cx sub32 zx, zx_2, zy_2 add32 zx, zx, cx @@ -991,7 +991,7 @@ keep_going: add32 zy, zy, cy quick_exit zy, 2 - ; convert 8.24 -> 4.12 + ; convert 8.24 -> 4.12: (-8 .. +7.9) shift_round_16 zx, 4 shift_round_16 zy, 4 @@ -1042,10 +1042,10 @@ z_buffer_loop: ; Compare the previously stored z values ldy #0 - z_compare zx - z_compare zx + 1 - z_compare zy - z_compare zy + 1 + z_compare zx + 2 + z_compare zx + 3 + z_compare zy + 2 + z_compare zy + 3 cpy #4 bne z_no_matches @@ -1060,10 +1060,10 @@ z_no_matches: z_nothing_to_read: ; Store and expand - z_store zx - z_store zx + 1 - z_store zy - z_store zy + 1 + z_store zx + 2 + z_store zx + 3 + z_store zy + 2 + z_store zy + 3 z_advance stx z_buffer_end From d2f41f964435b3803ce694a70bf38687fd467caa Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:02:42 -0800 Subject: [PATCH 06/10] wip --- mandel.s | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/mandel.s b/mandel.s index 0400003..8b63941 100644 --- a/mandel.s +++ b/mandel.s @@ -425,14 +425,8 @@ viewport_oy: round16 arg ; 11-27 cycles .endmacro -.macro imul16_round dest, arg1, arg2, shift - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; ? 
cyc - shift_round_16 FR2, shift ; 103-119 cycles for shift=4 - copy16 dest, FR2 + 2 ; 12 cyc -.endmacro - +; input: arg1, arg2 as fixed4.12 +; output: dest as fixed8.24 .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc @@ -440,20 +434,16 @@ viewport_oy: copy32 dest, FR2 ; 24 cyc .endmacro -.macro sqr16_round dest, arg, shift - ;imul16_round dest, arg, arg, shift - copy16 FR0, arg ; 12 cyc - jsr sqr16_func ; ? cyc - shift_round_16 FR2, shift ; 103-119 cycles for shift=4 - copy16 dest, FR2 + 2 ; 12 cyc -.endmacro - +; input: arg as fixed4.12 +; output: dest as fixed8.24 .macro sqr16 dest, arg copy16 FR0, arg ; 12 cyc jsr sqr16_func ; ? cyc copy32 dest, FR2 ; 24 cyc .endmacro +; input: arg as u8 +; output: dest as u16 ; clobbers a, x .macro sqr8 dest, arg ldx arg From 1e0f577e099b3d7787d6e6d4fce1813ccd6b489c Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:09:11 -0800 Subject: [PATCH 07/10] wip --- mandel.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mandel.s b/mandel.s index 8b63941..6977582 100644 --- a/mandel.s +++ b/mandel.s @@ -453,6 +453,8 @@ viewport_oy: sta dest + 1 .endmacro +; input: arg as u8 +; input/output: dest as u16 ; clobbers a, x .macro sqr8_add16 dest, arg ldx arg From 81bf7f3c434646f0374c35f20131050bd314d1b2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:53:22 -0800 Subject: [PATCH 08/10] tweak --- mandel.s | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mandel.s b/mandel.s index 6977582..4ab6c19 100644 --- a/mandel.s +++ b/mandel.s @@ -1106,10 +1106,9 @@ cont: enough: .endmacro -.macro zoom_factor dest, src, zoom, aspect +.macro zoom_factor dest, src, aspect ; output: dest: fixed8.24 ; input: src: fixed4.12 - ; input: zoom: u8 ??? ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src @@ -1602,9 +1601,9 @@ skipped_mask: not_skipped_mask: ; run the fractal! 
- zoom_factor cx, sx, zoom, aspect_x + zoom_factor cx, sx, aspect_x add32 cx, cx, ox - zoom_factor cy, sy, zoom, aspect_y + zoom_factor cy, sy, aspect_y add32 cy, cy, oy jsr mandelbrot jsr pset From 2e8893fd7892429bc07bd1d653ef1319be7d2d7b Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 13:54:53 -0800 Subject: [PATCH 09/10] haha fuck me --- mandel.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 4ab6c19..04edec5 100644 --- a/mandel.s +++ b/mandel.s @@ -320,7 +320,7 @@ viewport_oy: ; 38 cycles .macro add32 dest, arg1, arg2 - add 4, dest, arg2, dest + add 4, dest, arg1, arg2 .endmacro ; 8 cycles From cc83c76706519cce3fff61ce46df9589d31025d6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 14:16:43 -0800 Subject: [PATCH 10/10] update docs for 32-bit intermediates --- readme.md | 4 ++-- todo.md | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/readme.md b/readme.md index f297d60..d60644c 100644 --- a/readme.md +++ b/readme.md @@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication -The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. +The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. Iterations are capped at 255. @@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e ## Todo -See ideas in `todo.md`. \ No newline at end of file +See ideas in `todo.md`. 
diff --git a/todo.md b/todo.md index 29217cd..284d653 100644 --- a/todo.md +++ b/todo.md @@ -3,13 +3,11 @@ things to try: * skip add on the top-byte multiply in sqr8/mul8 * should save a few cycles, suggestion by jamey -* perform the zx_next = zx^s + cx in 32-bit space, before rounding - * should improve precision on max zoom, might cost a few cycles - * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * try 3.13 fixed point instead of 4.12 for more precision * can we get away without the extra bit? + * probably yes, since the exit comparison would then be done in 6.26 space * y-axis mirror optimization