From 582ddf497f3c4f1aeae39201b2490dff14ff7f16 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 4 Jan 2025 10:53:51 -0800 Subject: [PATCH] apply jamey's suggestion of skipping add for high byte muls rather than saving 0 into the high bytes, then adding the high-byte multiplication later, write it directly in place. this saves a few cycles on every iteration, and it adds up nicely. View 1 overview render times: 130XE: 10.050 ms/px - 4m56s 800XL: 10.906 ms/px - 5m21s --- mandel.s | 29 ++++------------------------- todo.md | 3 --- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/mandel.s b/mandel.s index 210799a..526953a 100644 --- a/mandel.s +++ b/mandel.s @@ -464,20 +464,6 @@ viewport_oy: sta dest + 1 .endmacro -; input: arg as u8 -; input/output: dest as u16 -; clobbers a, x -.macro sqr8_add16 dest, arg - ldx arg - clc - lda sqr_lobyte,x - adc dest - sta dest - lda sqr_hibyte,x - adc dest + 1 - sta dest + 1 -.endmacro - .segment "TABLES" ; lookup table for top byte -> PORTB value for bank-switch .align 256 @@ -760,9 +746,8 @@ inner_loop: ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 imul8 result, arg1, arg2, xe - lda #0 - sta result + 2 - sta result + 3 + + imul8 result + 2, arg1 + 1, arg2 + 1, xe imul8 inter, arg1 + 1, arg2, xe add16 result + 1, result + 1, inter @@ -772,9 +757,6 @@ inner_loop: add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1, xe - add16 result + 2, result + 2, inter - ; In case of negative inputs, adjust high word ; https://stackoverflow.com/a/28827013 lda arg1 + 1 @@ -807,9 +789,8 @@ arg2_pos: ; h*h*256*256 + h*l*256 + h*l*256 + l*l sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 + + sqr8 result + 2, arg + 1 imul8 inter, arg + 1, arg, xe add16 result + 1, result + 1, inter @@ -817,8 +798,6 @@ arg2_pos: add16 result + 1, result + 1, inter add_carry result + 3 - sqr8_add16 result + 2, arg + 1 - rts ; 6 cyc .endscope .endmacro diff --git a/todo.md b/todo.md index 1d46281..a78a2d5 100644 --- a/todo.md +++ b/todo.md @@ -1,8 +1,5 @@ things to try: -* skip add on the top-byte multiply in sqr8/mul8 - * should save a few cycles, suggestion by jamey - * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * y-axis mirror optimization