From aee587388de88e35e8f3b345898bd4abc9acf3ed Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:01:45 -0800 Subject: [PATCH 1/4] eliminate mul_hibyte512 table This costs an extra half cycle on average, assuming uniform distribution of multiplication inputs. I don't think a half cycle is worth an extra 256-byte table. --- mandel.s | 30 ++++++++++++++++-------------- tables.js | 24 +++++++----------------- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/mandel.s b/mandel.s index fc30532..ec1b086 100644 --- a/mandel.s +++ b/mandel.s @@ -129,9 +129,8 @@ KEY_0 = 50 mantissa .byte 5 .endstruct -.import mul_lobyte256 -.import mul_hibyte256 -.import mul_hibyte512 +.import mul_lobyte +.import mul_hibyte .import sqr_lobyte .import sqr_hibyte @@ -548,22 +547,25 @@ bank_switch_table: clc ; 2 cyc adc mul_factor_x ; 3 cyc tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc + lda mul_hibyte,x ; 4 cyc + bcc next ; 2 cyc + ; carry is set so we get to add 1 for free, but need to add 0x80 + adc #$7f ; 2 cyc + clc ; 2 cyc + ; stash the sum temporarily so we can use it as an operand to add + stx mul_product_lo ; 3 cyc + adc mul_product_lo ; 3 cyc next: + sec ; 2 cyc sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc + lda mul_lobyte,x ; 4 cyc ; - a^2/2 ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc ; + x & a & 1: @@ -582,10 +584,10 @@ bank_switch_table: ; - x^2/2 small_product: sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc .endscope .endif diff --git a/tables.js b/tables.js index 50cbef9..f4802ce 100644 --- a/tables.js +++ b/tables.js @@ -11,32 +11,22 @@ function db(func) { return lines.join('\n'); } -let squares = []; -for (let i = 0; i < 512; i++) { - squares.push(Math.trunc((i * i + 1) / 2)); -} - console.log( `.segment "TABLES" -.export mul_lobyte256 -.export mul_hibyte256 -.export mul_hibyte512 +.export mul_lobyte +.export mul_hibyte .export sqr_lobyte .export sqr_hibyte -; (i * i + 1) / 2 for the multiplier +; (i * i) / 2 for the multiplier .align 256 -mul_lobyte256: -${db((i) => squares[i] & 0xff)} +mul_lobyte: +${db((i) => ((i * i) >> 1) & 0xff)} .align 256 -mul_hibyte256: -${db((i) => (squares[i] >> 8) & 0xff)} - -.align 256 -mul_hibyte512: -${db((i) => (squares[i + 256] >> 8) & 0xff)} +mul_hibyte: +${db((i) => ((i * i) >> 9) & 0xff)} ; (i * i) for the plain squares .align 256 From f06aed0c0080b45fdd92544afddcbebea6d74efa Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:22:31 -0800 Subject: [PATCH 2/4] set results from both 8-bit squares first Since the results from the lo and hi squares don't overlap or overflow, they can be written directly to the final output location without doing any addition. Then only the multiplication that goes in the middle needs any adds. --- mandel.s | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/mandel.s b/mandel.s index ec1b086..a63d96f 100644 --- a/mandel.s +++ b/mandel.s @@ -450,18 +450,6 @@ viewport_oy: sta dest + 1 .endmacro -; clobbers a, x -.macro sqr8_add16 dest, arg - ldx arg - clc - lda sqr_lobyte,x - adc dest - sta dest - lda sqr_hibyte,x - adc dest + 1 - sta dest + 1 -.endmacro - .segment "TABLES" ; lookup table for top byte -> PORTB value for bank-switch .align 256 @@ -794,9 +782,7 @@ arg2_pos: ; h*h*256*256 + h*l*256 + h*l*256 + l*l sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 + sqr8 result + 2, arg + 1 imul8 inter, arg + 1, arg, xe add16 result + 1, result + 1, inter @@ -804,8 +790,6 @@ arg2_pos: add16 result + 1, result + 1, inter add_carry result + 3 - sqr8_add16 result + 2, arg + 1 - rts ; 6 cyc .endscope .endmacro From 0f49760aa53b76f16fadf66b236b00df3d4fdd4c Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:26:24 -0800 Subject: [PATCH 3/4] unify tables for squaring and multiplication --- mandel.s | 10 ++++++---- tables.js | 11 ----------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/mandel.s b/mandel.s index a63d96f..299db98 100644 --- a/mandel.s +++ b/mandel.s @@ -131,8 +131,6 @@ KEY_0 = 50 .import mul_lobyte .import mul_hibyte -.import sqr_lobyte -.import sqr_hibyte .data @@ -444,9 +442,13 @@ viewport_oy: ; clobbers a, x .macro sqr8 dest, arg ldx arg - lda sqr_lobyte,x + txa + lsr + lda mul_lobyte,x + rol sta dest - lda sqr_hibyte,x + lda mul_hibyte,x + rol sta dest + 1 .endmacro diff --git a/tables.js b/tables.js index f4802ce..176e4df 100644 --- a/tables.js +++ b/tables.js @@ -16,8 +16,6 @@ console.log( .export mul_lobyte .export mul_hibyte -.export sqr_lobyte -.export sqr_hibyte ; (i * i) / 2 for the multiplier .align 256 @@ -28,13 +26,4 @@ ${db((i) => ((i * i) >> 1) & 0xff)} mul_hibyte: ${db((i) => ((i * i) >> 9) & 0xff)} -; (i * i) for the plain squares -.align 256 -sqr_lobyte: -${db((i) => (i * i) & 0xff)} - -.align 256 -sqr_hibyte: -${db((i) => ((i * i) >> 8) & 0xff)} - `); From 3553ce986f6721f8c6d446368cb6c6f55186713b Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:55:22 -0800 Subject: [PATCH 4/4] shave some cycles off 16-bit squaring with shift instead of add also fix the comments about how many cycles shift takes --- mandel.s | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mandel.s b/mandel.s index 299db98..b0c2b42 100644 --- a/mandel.s +++ b/mandel.s @@ -348,7 +348,7 @@ viewport_oy: sub 4, dest, arg1, arg2 .endmacro -; 3 + 5 * bytes cycles +; 3 + 5 * (bytes - 1) cycles .macro shl bytes, arg asl arg ; 3 cyc .repeat bytes-1, i @@ -356,17 +356,17 @@ viewport_oy: .endrepeat .endmacro -; 13 cycles +; 8 cycles .macro shl16 arg shl 2, arg .endmacro -; 18 cycles +; 13 cycles .macro shl24 arg shl 3, arg .endmacro -; 23 cycles +; 18 cycles .macro shl32 arg shl 4, arg .endmacro @@ -787,7 +787,7 @@ arg2_pos: sqr8 result + 2, arg + 1 imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter + shl16 inter add_carry result + 3 add16 result + 1, result + 1, inter add_carry result + 3