eliminate mul_hibyte512 table

This costs an extra half cycle on average, assuming a uniform distribution
of multiplication inputs. I don't think saving half a cycle is worth an extra
256-byte table.
This commit is contained in:
Jamey Sharp 2024-12-31 02:01:45 -08:00
parent b56dc1e98b
commit aee587388d
2 changed files with 23 additions and 31 deletions

View file

@ -129,9 +129,8 @@ KEY_0 = 50
mantissa .byte 5 mantissa .byte 5
.endstruct .endstruct
.import mul_lobyte256 .import mul_lobyte
.import mul_hibyte256 .import mul_hibyte
.import mul_hibyte512
.import sqr_lobyte .import sqr_lobyte
.import sqr_hibyte .import sqr_hibyte
@ -548,22 +547,25 @@ bank_switch_table:
clc ; 2 cyc clc ; 2 cyc
adc mul_factor_x ; 3 cyc adc mul_factor_x ; 3 cyc
tax ; 2 cyc tax ; 2 cyc
bcc under256 ; 2 cyc lda mul_hibyte,x ; 4 cyc
lda mul_hibyte512,x ; 4 cyc bcc next ; 2 cyc
bcs next ; 2 cyc ; carry is set so we get to add 1 for free, but need to add 0x80
under256: adc #$7f ; 2 cyc
lda mul_hibyte256,x ; 4 cyc clc ; 2 cyc
sec ; 2 cyc ; stash the sum temporarily so we can use it as an operand to add
stx mul_product_lo ; 3 cyc
adc mul_product_lo ; 3 cyc
next: next:
sec ; 2 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc lda mul_lobyte,x ; 4 cyc
; - a^2/2 ; - a^2/2
ldx mul_factor_a ; 3 cyc ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
; + x & a & 1: ; + x & a & 1:
@ -582,10 +584,10 @@ bank_switch_table:
; - x^2/2 ; - x^2/2
small_product: small_product:
sec ; 2 cyc sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
.endscope .endscope
.endif .endif

View file

@ -11,32 +11,22 @@ function db(func) {
return lines.join('\n'); return lines.join('\n');
} }
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log( console.log(
`.segment "TABLES" `.segment "TABLES"
.export mul_lobyte256 .export mul_lobyte
.export mul_hibyte256 .export mul_hibyte
.export mul_hibyte512
.export sqr_lobyte .export sqr_lobyte
.export sqr_hibyte .export sqr_hibyte
; (i * i + 1) / 2 for the multiplier ; (i * i) / 2 for the multiplier
.align 256 .align 256
mul_lobyte256: mul_lobyte:
${db((i) => squares[i] & 0xff)} ${db((i) => ((i * i) >> 1) & 0xff)}
.align 256 .align 256
mul_hibyte256: mul_hibyte:
${db((i) => (squares[i] >> 8) & 0xff)} ${db((i) => ((i * i) >> 9) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares ; (i * i) for the plain squares
.align 256 .align 256