Compare commits

...

4 commits

Author SHA1 Message Date
3553ce986f shave some cycles off 16-bit squaring with shift instead of add
also fix the comments about how many cycles shift takes
2024-12-31 15:29:40 -08:00
0f49760aa5 unify tables for squaring and multiplication 2024-12-31 02:26:24 -08:00
f06aed0c00 set results from both 8-bit squares first
Since the results from the lo and hi squares don't overlap or overflow,
they can be written directly to the final output location without doing
any addition. Then only the multiplication that goes in the middle needs
any adds.
2024-12-31 02:22:31 -08:00
aee587388d eliminate mul_hibyte512 table
This costs an extra half cycle on average, assuming uniform distribution
of multiplication inputs. I don't think a half cycle is worth an extra
256-byte table.
2024-12-31 02:01:45 -08:00
2 changed files with 35 additions and 68 deletions

View file

@@ -129,11 +129,8 @@ KEY_0 = 50
mantissa .byte 5 mantissa .byte 5
.endstruct .endstruct
.import mul_lobyte256 .import mul_lobyte
.import mul_hibyte256 .import mul_hibyte
.import mul_hibyte512
.import sqr_lobyte
.import sqr_hibyte
.data .data
@@ -351,7 +348,7 @@ viewport_oy:
sub 4, dest, arg1, arg2 sub 4, dest, arg1, arg2
.endmacro .endmacro
; 3 + 5 * bytes cycles ; 3 + 5 * (bytes - 1) cycles
.macro shl bytes, arg .macro shl bytes, arg
asl arg ; 3 cyc asl arg ; 3 cyc
.repeat bytes-1, i .repeat bytes-1, i
@@ -359,17 +356,17 @@ viewport_oy:
.endrepeat .endrepeat
.endmacro .endmacro
; 13 cycles ; 8 cycles
.macro shl16 arg .macro shl16 arg
shl 2, arg shl 2, arg
.endmacro .endmacro
; 18 cycles ; 13 cycles
.macro shl24 arg .macro shl24 arg
shl 3, arg shl 3, arg
.endmacro .endmacro
; 23 cycles ; 18 cycles
.macro shl32 arg .macro shl32 arg
shl 4, arg shl 4, arg
.endmacro .endmacro
@@ -445,21 +442,13 @@ viewport_oy:
; clobbers a, x ; clobbers a, x
.macro sqr8 dest, arg .macro sqr8 dest, arg
ldx arg ldx arg
lda sqr_lobyte,x txa
lsr
lda mul_lobyte,x
rol
sta dest sta dest
lda sqr_hibyte,x lda mul_hibyte,x
sta dest + 1 rol
.endmacro
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
clc
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1
sta dest + 1 sta dest + 1
.endmacro .endmacro
@@ -548,22 +537,25 @@ bank_switch_table:
clc ; 2 cyc clc ; 2 cyc
adc mul_factor_x ; 3 cyc adc mul_factor_x ; 3 cyc
tax ; 2 cyc tax ; 2 cyc
bcc under256 ; 2 cyc lda mul_hibyte,x ; 4 cyc
lda mul_hibyte512,x ; 4 cyc bcc next ; 2 cyc
bcs next ; 2 cyc ; carry is set so we get to add 1 for free, but need to add 0x80
under256: adc #$7f ; 2 cyc
lda mul_hibyte256,x ; 4 cyc clc ; 2 cyc
sec ; 2 cyc ; stash the sum temporarily so we can use it as an operand to add
stx mul_product_lo ; 3 cyc
adc mul_product_lo ; 3 cyc
next: next:
sec ; 2 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc lda mul_lobyte,x ; 4 cyc
; - a^2/2 ; - a^2/2
ldx mul_factor_a ; 3 cyc ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
; + x & a & 1: ; + x & a & 1:
@@ -582,10 +574,10 @@ bank_switch_table:
; - x^2/2 ; - x^2/2
small_product: small_product:
sec ; 2 cyc sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
.endscope .endscope
.endif .endif
@@ -792,18 +784,14 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l ; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg sqr8 result, arg
lda #0 sqr8 result + 2, arg + 1
sta result + 2
sta result + 3
imul8 inter, arg + 1, arg, xe imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter shl16 inter
add_carry result + 3 add_carry result + 3
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc rts ; 6 cyc
.endscope .endscope
.endmacro .endmacro

View file

@@ -11,40 +11,19 @@ function db(func) {
return lines.join('\n'); return lines.join('\n');
} }
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log( console.log(
`.segment "TABLES" `.segment "TABLES"
.export mul_lobyte256 .export mul_lobyte
.export mul_hibyte256 .export mul_hibyte
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i + 1) / 2 for the multiplier ; (i * i) / 2 for the multiplier
.align 256 .align 256
mul_lobyte256: mul_lobyte:
${db((i) => squares[i] & 0xff)} ${db((i) => ((i * i) >> 1) & 0xff)}
.align 256 .align 256
mul_hibyte256: mul_hibyte:
${db((i) => (squares[i] >> 8) & 0xff)} ${db((i) => ((i * i) >> 9) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`); `);