Compare commits

...

4 commits

Author SHA1 Message Date
3553ce986f shave some cycles off 16-bit squaring by using a shift instead of an add
also fix the comments about how many cycles a shift takes
2024-12-31 15:29:40 -08:00
0f49760aa5 unify tables for squaring and multiplication 2024-12-31 02:26:24 -08:00
f06aed0c00 set results from both 8-bit squares first
Since the results from the lo and hi squares don't overlap or overflow,
they can be written directly to the final output location without doing
any addition. Then only the multiplication that goes in the middle needs
any adds.
2024-12-31 02:22:31 -08:00
aee587388d eliminate mul_hibyte512 table
This costs an extra half cycle on average, assuming uniform distribution
of multiplication inputs. I don't think a half cycle is worth an extra
256-byte table.
2024-12-31 02:01:45 -08:00
2 changed files with 35 additions and 68 deletions

View file

@ -129,11 +129,8 @@ KEY_0 = 50
mantissa .byte 5
.endstruct
.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512
.import sqr_lobyte
.import sqr_hibyte
.import mul_lobyte
.import mul_hibyte
.data
@ -351,7 +348,7 @@ viewport_oy:
sub 4, dest, arg1, arg2
.endmacro
; 3 + 5 * bytes cycles
; 3 + 5 * (bytes - 1) cycles
.macro shl bytes, arg
asl arg ; 3 cyc
.repeat bytes-1, i
@ -359,17 +356,17 @@ viewport_oy:
.endrepeat
.endmacro
; 13 cycles
; 8 cycles
.macro shl16 arg
shl 2, arg
.endmacro
; 18 cycles
; 13 cycles
.macro shl24 arg
shl 3, arg
.endmacro
; 23 cycles
; 18 cycles
.macro shl32 arg
shl 4, arg
.endmacro
@ -445,21 +442,13 @@ viewport_oy:
; clobbers a, x
.macro sqr8 dest, arg
ldx arg
lda sqr_lobyte,x
txa
lsr
lda mul_lobyte,x
rol
sta dest
lda sqr_hibyte,x
sta dest + 1
.endmacro
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
clc
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1
lda mul_hibyte,x
rol
sta dest + 1
.endmacro
@ -548,22 +537,25 @@ bank_switch_table:
clc ; 2 cyc
adc mul_factor_x ; 3 cyc
tax ; 2 cyc
bcc under256 ; 2 cyc
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
under256:
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
lda mul_hibyte,x ; 4 cyc
bcc next ; 2 cyc
; carry is set so we get to add 1 for free, but need to add 0x80
adc #$7f ; 2 cyc
clc ; 2 cyc
; stash the sum temporarily so we can use it as an operand to add
stx mul_product_lo ; 3 cyc
adc mul_product_lo ; 3 cyc
next:
sec ; 2 cyc
sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc
lda mul_lobyte,x ; 4 cyc
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc
sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc
; + x & a & 1:
@ -582,10 +574,10 @@ bank_switch_table:
; - x^2/2
small_product:
sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc
sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc
.endscope
.endif
@ -792,18 +784,14 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg
lda #0
sta result + 2
sta result + 3
sqr8 result + 2, arg + 1
imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter
shl16 inter
add_carry result + 3
add16 result + 1, result + 1, inter
add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc
.endscope
.endmacro

View file

@ -11,40 +11,19 @@ function db(func) {
return lines.join('\n');
}
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log(
`.segment "TABLES"
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
.export mul_lobyte
.export mul_hibyte
; (i * i + 1) / 2 for the multiplier
; (i * i) / 2 for the multiplier
.align 256
mul_lobyte256:
${db((i) => squares[i] & 0xff)}
mul_lobyte:
${db((i) => ((i * i) >> 1) & 0xff)}
.align 256
mul_hibyte256:
${db((i) => (squares[i] >> 8) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
mul_hibyte:
${db((i) => ((i * i) >> 9) & 0xff)}
`);