From aee587388de88e35e8f3b345898bd4abc9acf3ed Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:01:45 -0800 Subject: [PATCH 01/17] eliminate mul_hibyte512 table This costs an extra half cycle on average, assuming uniform distribution of multiplication inputs. I don't think a half cycle is worth an extra 256-byte table. --- mandel.s | 30 ++++++++++++++++-------------- tables.js | 24 +++++++----------------- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/mandel.s b/mandel.s index fc30532..ec1b086 100644 --- a/mandel.s +++ b/mandel.s @@ -129,9 +129,8 @@ KEY_0 = 50 mantissa .byte 5 .endstruct -.import mul_lobyte256 -.import mul_hibyte256 -.import mul_hibyte512 +.import mul_lobyte +.import mul_hibyte .import sqr_lobyte .import sqr_hibyte @@ -548,22 +547,25 @@ bank_switch_table: clc ; 2 cyc adc mul_factor_x ; 3 cyc tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc + lda mul_hibyte,x ; 4 cyc + bcc next ; 2 cyc + ; carry is set so we get to add 1 for free, but need to add 0x80 + adc #$7f ; 2 cyc + clc ; 2 cyc + ; stash the sum temporarily so we can use it as an operand to add + stx mul_product_lo ; 3 cyc + adc mul_product_lo ; 3 cyc next: + sec ; 2 cyc sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc + lda mul_lobyte,x ; 4 cyc ; - a^2/2 ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc ; + x & a & 1: @@ -582,10 +584,10 @@ bank_switch_table: ; - x^2/2 small_product: sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc .endscope .endif diff --git a/tables.js b/tables.js index 50cbef9..f4802ce 100644 --- a/tables.js +++ b/tables.js @@ -11,32 +11,22 @@ function db(func) { return lines.join('\n'); } -let squares = []; -for (let i = 0; i < 512; i++) { - squares.push(Math.trunc((i * i + 1) / 2)); -} - console.log( `.segment "TABLES" -.export mul_lobyte256 -.export mul_hibyte256 -.export mul_hibyte512 +.export mul_lobyte +.export mul_hibyte .export sqr_lobyte .export sqr_hibyte -; (i * i + 1) / 2 for the multiplier +; (i * i) / 2 for the multiplier .align 256 -mul_lobyte256: -${db((i) => squares[i] & 0xff)} +mul_lobyte: +${db((i) => ((i * i) >> 1) & 0xff)} .align 256 -mul_hibyte256: -${db((i) => (squares[i] >> 8) & 0xff)} - -.align 256 -mul_hibyte512: -${db((i) => (squares[i + 256] >> 8) & 0xff)} +mul_hibyte: +${db((i) => ((i * i) >> 9) & 0xff)} ; (i * i) for the plain squares .align 256 From f06aed0c0080b45fdd92544afddcbebea6d74efa Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:22:31 -0800 Subject: [PATCH 02/17] set results from both 8-bit squares first Since the results from the lo and hi squares don't overlap or overflow, they can be written directly to the final output location without doing any addition. Then only the multiplication that goes in the middle needs any adds. 
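Concretely (writing the 16-bit input as 256*h + l, with h and l its two unsigned bytes after any sign handling): (256*h + l)^2 = 65536*h^2 + 512*h*l + l^2. l^2 lands entirely in bytes 0-1 of the 32-bit result and h^2 entirely in bytes 2-3, so both squares can be stored straight into place; only the doubled cross term 512*h*l, which straddles bytes 1-3, needs the add-with-carry chain.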
--- mandel.s | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/mandel.s b/mandel.s index ec1b086..a63d96f 100644 --- a/mandel.s +++ b/mandel.s @@ -450,18 +450,6 @@ viewport_oy: sta dest + 1 .endmacro -; clobbers a, x -.macro sqr8_add16 dest, arg - ldx arg - clc - lda sqr_lobyte,x - adc dest - sta dest - lda sqr_hibyte,x - adc dest + 1 - sta dest + 1 -.endmacro - .segment "TABLES" ; lookup table for top byte -> PORTB value for bank-switch .align 256 @@ -794,9 +782,7 @@ arg2_pos: ; h*h*256*256 + h*l*256 + h*l*256 + l*l sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 + sqr8 result + 2, arg + 1 imul8 inter, arg + 1, arg, xe add16 result + 1, result + 1, inter @@ -804,8 +790,6 @@ arg2_pos: add16 result + 1, result + 1, inter add_carry result + 3 - sqr8_add16 result + 2, arg + 1 - rts ; 6 cyc .endscope .endmacro From 0f49760aa53b76f16fadf66b236b00df3d4fdd4c Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:26:24 -0800 Subject: [PATCH 03/17] unify tables for squaring and multiplication --- mandel.s | 10 ++++++---- tables.js | 11 ----------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/mandel.s b/mandel.s index a63d96f..299db98 100644 --- a/mandel.s +++ b/mandel.s @@ -131,8 +131,6 @@ KEY_0 = 50 .import mul_lobyte .import mul_hibyte -.import sqr_lobyte -.import sqr_hibyte .data @@ -444,9 +442,13 @@ viewport_oy: ; clobbers a, x .macro sqr8 dest, arg ldx arg - lda sqr_lobyte,x + txa + lsr + lda mul_lobyte,x + rol sta dest - lda sqr_hibyte,x + lda mul_hibyte,x + rol sta dest + 1 .endmacro diff --git a/tables.js b/tables.js index f4802ce..176e4df 100644 --- a/tables.js +++ b/tables.js @@ -16,8 +16,6 @@ console.log( .export mul_lobyte .export mul_hibyte -.export sqr_lobyte -.export sqr_hibyte ; (i * i) / 2 for the multiplier .align 256 @@ -28,13 +26,4 @@ ${db((i) => ((i * i) >> 1) & 0xff)} mul_hibyte: ${db((i) => ((i * i) >> 9) & 0xff)} -; (i * i) for the plain squares -.align 256 -sqr_lobyte: -${db((i) => (i * i) & 0xff)} - -.align 256 -sqr_hibyte: -${db((i) => ((i * i) >> 8) & 0xff)} - `); From 61eb1aaf21fdac377e6f04db117aa855ad73b940 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 05:11:26 -0800 Subject: [PATCH 04/17] notes --- todo.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/todo.md b/todo.md index 1281de7..6fb0282 100644 --- a/todo.md +++ b/todo.md @@ -1,5 +1,11 @@ things to try: +* skip add on the top-byte multiply in sqr8/mul8 + * should save a few cycles, suggestion by jamey + +* perform the zx += zx^s + cx in 32-bit space, before rounding + * should improve precision on max zoom, might cost a few cycles + * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * try 3.13 fixed point instead of 4.12 for more precision From 0d086a179cf8e91b839f306bb597ef9e6125f6b2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:20:53 -0800 Subject: [PATCH 05/17] wip --- mandel.s | 108 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/mandel.s b/mandel.s index fc30532..50213ad 100644 --- a/mandel.s +++ b/mandel.s @@ -1,43 +1,42 @@ ; Our zero-page vars -sx = $80 ; i16: screen pixel x -sy = $82 ; i16: screen pixel y -ox = $84 ; fixed4.12: center point x -oy = $86 ; fixed4.12: center point y -cx = $88 ; fixed4.12: c_x -cy = $8a ; fixed4.12: c_y -zx = $8c ; fixed4.12: z_x -zy = $8e ; fixed4.12: z_y +ox = $80 ; fixed8.24: center point x +oy = $84 ; fixed8.24: center point y 
+cx = $88 ; fixed8.24: c_x +cy = $8c ; fixed8.24: c_y -zx_2 = $90 ; fixed4.12: z_x^2 -zy_2 = $92 ; fixed4.12: z_y^2 -zx_zy = $94 ; fixed4.12: z_x * z_y -dist = $96 ; fixed4.12: z_x^2 + z_y^2 +zx = $90 ; fixed8.24: z_x +zy = $94 ; fixed8.24: z_y +zx_2 = $98 ; fixed8.24: z_x^2 +zy_2 = $9c ; fixed8.24: z_y^2 -iter = $a0 ; u8: iteration count +zx_zy = $a0 ; fixed8.24: z_x * z_y +dist = $a4 ; fixed8.24: z_x^2 + z_y^2 +sx = $a8 ; i16: screen pixel x +sy = $aa ; i16: screen pixel y +z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not +z_buffer_start = $ad ; u8: index into z_buffer +z_buffer_end = $ae ; u8: index into z_buffer +iter = $af ; u8: iteration count -zoom = $a1 ; u8: zoom shift level -count_frames = $a2 ; u8 -count_pixels = $a3 ; u8 -total_ms = $a4 ; float48 -total_pixels = $aa ; float48 +ptr = $b0 ; u16 +pixel_ptr = $b2 ; u16 +zoom = $b4 ; u8: zoom shift level +fill_level = $b5 ; u8 +pixel_color = $b6 ; u8 +pixel_mask = $b7 ; u8 +pixel_shift = $b8 ; u8 +pixel_offset = $b9 ; u8 +palette_offset = $ba ; u8 +chroma_offset = $bb ; u8 +palette_ticks = $bc ; u8 +chroma_ticks = $bd ; u8 +count_frames = $be ; u8 +count_pixels = $bf ; u8 -z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not -z_buffer_start = $b1 ; u8: index into z_buffer -z_buffer_end = $b2 ; u8: index into z_buffer -temp = $b4 ; u16 -temp2 = $b6 ; u16 -pixel_ptr = $b8 ; u16 -pixel_color = $ba ; u8 -pixel_mask = $bb ; u8 -pixel_shift = $bc ; u8 -pixel_offset = $bd ; u8 -fill_level = $be ; u8 -palette_offset = $bf ; u8 - -palette_ticks = $c0 ; u8 -chroma_ticks = $c1 ; u8 -chroma_offset = $c2 ; u8 -ptr = $c4 ; u16 +total_pixels = $c0 ; float48 +total_ms = $c6 ; float48 +temp = $cc ; u16 +temp2 = $ce ; u16 palette_delay = 23 chroma_delay = 137 @@ -884,12 +883,41 @@ next: ; zx_zy = 0 ; dist = 0 ; iter = 0 +; lda #00 +; ldx #(iter - zx + 1) +;initloop: +; sta zx - 1,x +; dex +; bne initloop +; sta z_buffer_start +; sta z_buffer_end + lda #00 - ldx #(iter - zx + 1) -initloop: - sta zx - 1,x - dex - bne initloop + sta zx + sta zx + 1 + sta zx + 2 + sta zx + 3 + sta zy + sta zy + 1 + sta zy + 2 + sta zy + 3 + sta zx_2 + sta zx_2 + 1 + sta zx_2 + 2 + sta zx_2 + 3 + sta zy_2 + sta zy_2 + 1 + sta zy_2 + 2 + sta zy_2 + 3 + sta zx_zy + sta zx_zy + 1 + sta zx_zy + 2 + sta zx_zy + 3 + sta dist + sta dist + 1 + sta dist + 2 + sta dist + 3 + sta iter sta z_buffer_start sta z_buffer_end From 4a1e35699adcce1af0f60ea51573e8a215975c66 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:19:45 -0800 Subject: [PATCH 06/17] wip --- mandel.s | 71 ++++++++++++++++++++++++++++++++++++++------------------ todo.md | 2 +- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/mandel.s b/mandel.s index 50213ad..622ff62 100644 --- a/mandel.s +++ b/mandel.s @@ -433,6 +433,13 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro imul16 dest, arg1, arg2 + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; ? cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + .macro sqr16_round dest, arg, shift ;imul16_round dest, arg, arg, shift copy16 FR0, arg ; 12 cyc @@ -441,6 +448,12 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro sqr16 dest, arg + copy16 FR0, arg ; 12 cyc + jsr sqr16_func ; ? 
cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + ; clobbers a, x .macro sqr8 dest, arg ldx arg @@ -870,8 +883,8 @@ next: .proc mandelbrot ; input: - ; cx: position scaled to 4.12 fixed point - -8..+7.9 - ; cy: position scaled to 4.12 + ; cx: position scaled to 8.24 fixed point - -128..+127.9 + ; cy: position scaled to 8.24 ; ; output: ; iter: iteration count at escape or 0 @@ -909,10 +922,6 @@ next: sta zy_2 + 1 sta zy_2 + 2 sta zy_2 + 3 - sta zx_zy - sta zx_zy + 1 - sta zx_zy + 2 - sta zx_zy + 3 sta dist sta dist + 1 sta dist + 2 @@ -929,6 +938,8 @@ loop: keep_going: .macro quick_exit arg, max + ; arg: fixed8.24 + ; max: integer .local positive .local negative .local nope_out @@ -936,51 +947,61 @@ keep_going: .local all_done ; check sign bit - lda arg + 1 + lda arg + 3 bmi negative positive: - cmp #((max) << 4) + cmp #max bmi all_done ; 'less than' jmp exit_path negative: - cmp #(256 - ((max) << 4)) + cmp #(256 - max) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: jmp exit_path - + first_equal: + ; following bytes all 0 shows it's really 'equal' + lda arg + 2 + bne all_done + lda arg + 1 + bne all_done lda arg - beq nope_out ; 2nd byte 0 shows it's really 'equal' + bne all_done + jmp exit_path all_done: .endmacro - ; 4.12: (-8 .. +7.9) + ; 8.24: (-128 .. 127.9) / (-8 .. +7.9) ; zx = zx_2 - zy_2 + cx - sub16 zx, zx_2, zy_2 - add16 zx, zx, cx + sub32 zx, zx_2, zy_2 + add32 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy - add16 zy, zx_zy, zx_zy - add16 zy, zy, cy + add32 zy, zx_zy, zx_zy + add32 zy, zy, cy quick_exit zy, 2 + ; convert 8.24 -> 4.12 + shift_round_16 zx, 4 + shift_round_16 zy, 4 + ; zx_2 = zx * zx - sqr16_round zx_2, zx, 4 + sqr16 zx_2, zx + 2 ; zy_2 = zy * zy - sqr16_round zy_2, zy, 4 + sqr16 zy_2, zy + 2 ; zx_zy = zx * zy - imul16_round zx_zy, zx, zy, 4 + imul16 zx_zy, zx + 2, zy + 2 ; dist = zx_2 + zy_2 - add16 dist, zx_2, zy_2 + add32 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer @@ -1090,13 +1111,17 @@ enough: .endmacro .macro zoom_factor dest, src, zoom, aspect + ; output: dest: fixed8.24 + ; input: src: fixed4.12 + ; input: zoom: u8 ??? + ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16_round dest, dest, aspect, 4 + imul16 dest, dest, aspect .endmacro .proc pset @@ -1567,9 +1592,9 @@ not_skipped_mask: ; run the fractal! 
zoom_factor cx, sx, zoom, aspect_x - add16 cx, cx, ox + add32 cx, cx, ox zoom_factor cy, sy, zoom, aspect_y - add16 cy, cy, oy + add32 cy, cy, oy jsr mandelbrot jsr pset diff --git a/todo.md b/todo.md index 6fb0282..29217cd 100644 --- a/todo.md +++ b/todo.md @@ -3,7 +3,7 @@ things to try: * skip add on the top-byte multiply in sqr8/mul8 * should save a few cycles, suggestion by jamey -* perform the zx += zx^s + cx in 32-bit space, before rounding +* perform the zx_next = zx^s + cx in 32-bit space, before rounding * should improve precision on max zoom, might cost a few cycles * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D From 7184b8e03f2748efd532277995afe5fa7d4a3cf6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:33:20 -0800 Subject: [PATCH 07/17] wip --- mandel.s | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 622ff62..86a6b48 100644 --- a/mandel.s +++ b/mandel.s @@ -292,16 +292,16 @@ viewport_zoom: .byte 6 viewport_ox: - .word $0000 - .word $f110 - .word $f110 - .word $e400 + .dword $00000000 + .dword $ff110000 + .dword $ff110000 + .dword $fe400000 viewport_oy: - .word $0000 - .word $fb60 - .word $fbe0 - .word $0000 + .dword $00000000 + .dword $ffb60000 + .dword $ffbe0000 + .dword $00000000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -1459,17 +1459,32 @@ zero_byte_loop: txa asl a + asl a + tax lda viewport_ox,x sta ox lda viewport_oy,x sta oy + inx lda viewport_ox,x sta ox + 1 lda viewport_oy,x sta oy + 1 + inx + lda viewport_ox,x + sta ox + 2 + lda viewport_oy,x + sta oy + 2 + + inx + lda viewport_ox,x + sta ox + 3 + lda viewport_oy,x + sta oy + 3 + rts .endproc From 13257309dc3a6493e05575404f5deddd09e9192d Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 08:34:02 -0800 Subject: [PATCH 08/17] init fix --- mandel.s | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mandel.s b/mandel.s index 86a6b48..76816c2 100644 --- a/mandel.s +++ b/mandel.s @@ -922,6 +922,10 @@ next: sta zy_2 + 1 sta zy_2 + 2 sta zy_2 + 3 + sta zx_zy + sta zx_zy + 1 + sta zx_zy + 2 + sta zx_zy + 3 sta dist sta dist + 1 sta dist + 2 From 2fcb30b76a66819ab96ec3353b8ce4978f723675 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 08:56:59 -0800 Subject: [PATCH 09/17] wip --- mandel.s | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mandel.s b/mandel.s index 76816c2..0400003 100644 --- a/mandel.s +++ b/mandel.s @@ -980,7 +980,7 @@ keep_going: all_done: .endmacro - ; 8.24: (-128 .. 127.9) / (-8 .. +7.9) + ; 8.24: (-128 .. 127.9) ; zx = zx_2 - zy_2 + cx sub32 zx, zx_2, zy_2 add32 zx, zx, cx @@ -991,7 +991,7 @@ keep_going: add32 zy, zy, cy quick_exit zy, 2 - ; convert 8.24 -> 4.12 + ; convert 8.24 -> 4.12: (-8 .. 
+7.9) shift_round_16 zx, 4 shift_round_16 zy, 4 @@ -1042,10 +1042,10 @@ z_buffer_loop: ; Compare the previously stored z values ldy #0 - z_compare zx - z_compare zx + 1 - z_compare zy - z_compare zy + 1 + z_compare zx + 2 + z_compare zx + 3 + z_compare zy + 2 + z_compare zy + 3 cpy #4 bne z_no_matches @@ -1060,10 +1060,10 @@ z_no_matches: z_nothing_to_read: ; Store and expand - z_store zx - z_store zx + 1 - z_store zy - z_store zy + 1 + z_store zx + 2 + z_store zx + 3 + z_store zy + 2 + z_store zy + 3 z_advance stx z_buffer_end From d2f41f964435b3803ce694a70bf38687fd467caa Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:02:42 -0800 Subject: [PATCH 10/17] wip --- mandel.s | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/mandel.s b/mandel.s index 0400003..8b63941 100644 --- a/mandel.s +++ b/mandel.s @@ -425,14 +425,8 @@ viewport_oy: round16 arg ; 11-27 cycles .endmacro -.macro imul16_round dest, arg1, arg2, shift - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; ? cyc - shift_round_16 FR2, shift ; 103-119 cycles for shift=4 - copy16 dest, FR2 + 2 ; 12 cyc -.endmacro - +; input: arg1, arg2 as fixed4.12 +; output: dest as fixed8.24 .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc @@ -440,20 +434,16 @@ viewport_oy: copy32 dest, FR2 ; 24 cyc .endmacro -.macro sqr16_round dest, arg, shift - ;imul16_round dest, arg, arg, shift - copy16 FR0, arg ; 12 cyc - jsr sqr16_func ; ? cyc - shift_round_16 FR2, shift ; 103-119 cycles for shift=4 - copy16 dest, FR2 + 2 ; 12 cyc -.endmacro - +; input: arg as fixed4.12 +; output: dest as fixed8.24 .macro sqr16 dest, arg copy16 FR0, arg ; 12 cyc jsr sqr16_func ; ? cyc copy32 dest, FR2 ; 24 cyc .endmacro +; input: arg as u8 +; output: dest as u16 ; clobbers a, x .macro sqr8 dest, arg ldx arg From 1e0f577e099b3d7787d6e6d4fce1813ccd6b489c Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:09:11 -0800 Subject: [PATCH 11/17] wip --- mandel.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mandel.s b/mandel.s index 8b63941..6977582 100644 --- a/mandel.s +++ b/mandel.s @@ -453,6 +453,8 @@ viewport_oy: sta dest + 1 .endmacro +; input: arg as u8 +; input/output: dest as u16 ; clobbers a, x .macro sqr8_add16 dest, arg ldx arg From 81bf7f3c434646f0374c35f20131050bd314d1b2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:53:22 -0800 Subject: [PATCH 12/17] tweak --- mandel.s | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mandel.s b/mandel.s index 6977582..4ab6c19 100644 --- a/mandel.s +++ b/mandel.s @@ -1106,10 +1106,9 @@ cont: enough: .endmacro -.macro zoom_factor dest, src, zoom, aspect +.macro zoom_factor dest, src, aspect ; output: dest: fixed8.24 ; input: src: fixed4.12 - ; input: zoom: u8 ??? ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src @@ -1602,9 +1601,9 @@ skipped_mask: not_skipped_mask: ; run the fractal! 
- zoom_factor cx, sx, zoom, aspect_x + zoom_factor cx, sx, aspect_x add32 cx, cx, ox - zoom_factor cy, sy, zoom, aspect_y + zoom_factor cy, sy, aspect_y add32 cy, cy, oy jsr mandelbrot jsr pset From 2e8893fd7892429bc07bd1d653ef1319be7d2d7b Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 13:54:53 -0800 Subject: [PATCH 13/17] haha fuck me --- mandel.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 4ab6c19..04edec5 100644 --- a/mandel.s +++ b/mandel.s @@ -320,7 +320,7 @@ viewport_oy: ; 38 cycles .macro add32 dest, arg1, arg2 - add 4, dest, arg2, dest + add 4, dest, arg1, arg2 .endmacro ; 8 cycles From cc83c76706519cce3fff61ce46df9589d31025d6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 14:16:43 -0800 Subject: [PATCH 14/17] update docs for 32-bit intermediates --- readme.md | 4 ++-- todo.md | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/readme.md b/readme.md index f297d60..d60644c 100644 --- a/readme.md +++ b/readme.md @@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication -The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. +The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. Iterations are capped at 255. @@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e ## Todo -See ideas in `todo.md`. \ No newline at end of file +See ideas in `todo.md`. diff --git a/todo.md b/todo.md index 29217cd..284d653 100644 --- a/todo.md +++ b/todo.md @@ -3,13 +3,11 @@ things to try: * skip add on the top-byte multiply in sqr8/mul8 * should save a few cycles, suggestion by jamey -* perform the zx_next = zx^s + cx in 32-bit space, before rounding - * should improve precision on max zoom, might cost a few cycles - * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * try 3.13 fixed point instead of 4.12 for more precision * can we get away without the extra bit? 
+ * since exit compare space would be 6.26 i think so * y-axis mirror optimization From 7985ea9a399554340a76f8cfc340bb566d86a952 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 14:45:38 -0800 Subject: [PATCH 15/17] fix panning for 32-bi --- mandel.s | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/mandel.s b/mandel.s index 04edec5..fe86366 100644 --- a/mandel.s +++ b/mandel.s @@ -1341,12 +1341,15 @@ skip_luma: cpy #KEY_MINUS beq minus - ; temp = $0010 << (8 - zoom) - lda #$10 - sta temp + ; temp+temp2 = $00010000 << (8 - zoom) lda #$00 + sta temp sta temp + 1 - scale_zoom temp + lda #$01 + sta temp + 2 + lda #$00 + sta temp + 3 + scale_zoom temp + 2 cpy #KEY_UP beq up @@ -1356,14 +1359,7 @@ skip_luma: beq left cpy #KEY_RIGHT beq right - cpy #KEY_1 - beq one - cpy #KEY_2 - beq two - cpy #KEY_3 - beq three - cpy #KEY_4 - beq four + jmp number_keys skip_char: lda #0 @@ -1382,17 +1378,28 @@ minus: dec zoom jmp done up: - sub16 oy, oy, temp + sub32 oy, oy, temp jmp done down: - add16 oy, oy, temp + add32 oy, oy, temp jmp done left: - sub16 ox, ox, temp + sub32 ox, ox, temp jmp done right: - add16 ox, ox, temp + add32 ox, ox, temp jmp done + +number_keys: + cpy #KEY_1 + beq one + cpy #KEY_2 + beq two + cpy #KEY_3 + beq three + cpy #KEY_4 + beq four + one: ldx #0 jmp load_key_viewport From d8601bb856ac0858ea7a06f4c60f162f1664c52a Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 15:03:43 -0800 Subject: [PATCH 16/17] fix fix --- mandel.s | 1 + 1 file changed, 1 insertion(+) diff --git a/mandel.s b/mandel.s index fe86366..b8985b3 100644 --- a/mandel.s +++ b/mandel.s @@ -1399,6 +1399,7 @@ number_keys: beq three cpy #KEY_4 beq four + jmp skip_char one: ldx #0 From 3553ce986f6721f8c6d446368cb6c6f55186713b Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:55:22 -0800 Subject: [PATCH 17/17] shave some cycles off 16-bit squaring with shift instead of add also fix the comments about how many cycles shift takes --- mandel.s | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mandel.s b/mandel.s index 299db98..b0c2b42 100644 --- a/mandel.s +++ b/mandel.s @@ -348,7 +348,7 @@ viewport_oy: sub 4, dest, arg1, arg2 .endmacro -; 3 + 5 * bytes cycles +; 3 + 5 * (bytes - 1) cycles .macro shl bytes, arg asl arg ; 3 cyc .repeat bytes-1, i @@ -356,17 +356,17 @@ viewport_oy: .endrepeat .endmacro -; 13 cycles +; 8 cycles .macro shl16 arg shl 2, arg .endmacro -; 18 cycles +; 13 cycles .macro shl24 arg shl 3, arg .endmacro -; 23 cycles +; 18 cycles .macro shl32 arg shl 4, arg .endmacro @@ -787,7 +787,7 @@ arg2_pos: sqr8 result + 2, arg + 1 imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter + shl16 inter add_carry result + 3 add16 result + 1, result + 1, inter add_carry result + 3
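Rough cycle accounting for this last change, taking the macro comments above as the reference: the doubled cross term was previously folded in with two add16s at 2 + 9*2 = 20 cycles each, while shl16 is 8 cycles, so doubling inter with a shift and keeping a single add16 should shave roughly a dozen cycles off every 16-bit squaring.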