From aee587388de88e35e8f3b345898bd4abc9acf3ed Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Tue, 31 Dec 2024 02:01:45 -0800 Subject: [PATCH] eliminate mul_hibyte512 table This costs an extra half cycle on average, assuming uniform distribution of multiplication inputs. I don't think a half cycle is worth an extra 256-byte table. --- mandel.s | 30 ++++++++++++++++-------------- tables.js | 24 +++++++----------------- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/mandel.s b/mandel.s index fc30532..ec1b086 100644 --- a/mandel.s +++ b/mandel.s @@ -129,9 +129,8 @@ KEY_0 = 50 mantissa .byte 5 .endstruct -.import mul_lobyte256 -.import mul_hibyte256 -.import mul_hibyte512 +.import mul_lobyte +.import mul_hibyte .import sqr_lobyte .import sqr_hibyte @@ -548,22 +547,25 @@ bank_switch_table: clc ; 2 cyc adc mul_factor_x ; 3 cyc tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc + lda mul_hibyte,x ; 4 cyc + bcc next ; 2 cyc + ; carry is set so we get to add 1 for free, but need to add 0x80 + adc #$7f ; 2 cyc + clc ; 2 cyc + ; stash the sum temporarily so we can use it as an operand to add + stx mul_product_lo ; 3 cyc + adc mul_product_lo ; 3 cyc next: + sec ; 2 cyc sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc + lda mul_lobyte,x ; 4 cyc ; - a^2/2 ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc ; + x & a & 1: @@ -582,10 +584,10 @@ bank_switch_table: ; - x^2/2 small_product: sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc .endscope .endif diff --git a/tables.js b/tables.js index 50cbef9..f4802ce 100644 --- a/tables.js +++ b/tables.js @@ -11,32 +11,22 @@ function db(func) { return lines.join('\n'); } -let squares = []; -for (let i = 0; i < 512; i++) { - squares.push(Math.trunc((i * i + 1) / 2)); -} - console.log( `.segment "TABLES" -.export mul_lobyte256 -.export mul_hibyte256 -.export mul_hibyte512 +.export mul_lobyte +.export mul_hibyte .export sqr_lobyte .export sqr_hibyte -; (i * i + 1) / 2 for the multiplier +; (i * i) / 2 for the multiplier .align 256 -mul_lobyte256: -${db((i) => squares[i] & 0xff)} +mul_lobyte: +${db((i) => ((i * i) >> 1) & 0xff)} .align 256 -mul_hibyte256: -${db((i) => (squares[i] >> 8) & 0xff)} - -.align 256 -mul_hibyte512: -${db((i) => (squares[i + 256] >> 8) & 0xff)} +mul_hibyte: +${db((i) => ((i * i) >> 9) & 0xff)} ; (i * i) for the plain squares .align 256