set results from both 8-bit squares first

Since the results from the lo and hi squares don't overlap or overflow
(l*l fills the low two bytes of the result and h*h fills the high two),
they can be written directly to the final output location without doing
any addition. Then only the cross-term multiplication (h*l*256, added
twice) that goes in the middle needs any adds.
This commit is contained in:
Jamey Sharp 2024-12-31 02:22:31 -08:00
parent aee587388d
commit f06aed0c00

View file

@ -450,18 +450,6 @@ viewport_oy:
sta dest + 1 sta dest + 1
.endmacro .endmacro
; sqr8_add16 dest, arg
; Adds a 16-bit table entry into the 16-bit value stored at dest (low
; byte) and dest + 1 (high byte). The entry is selected by loading arg
; into x and reading sqr_lobyte,x / sqr_hibyte,x.
; NOTE(review): the tables are not visible here; from their names they
; presumably hold the low and high bytes of x*x -- confirm.
; The carry out of the high-byte add is left in the carry flag and is
; not propagated to any byte above dest + 1.
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg ; x = table index taken from arg
clc ; start the low-byte add with no carry in
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1 ; also adds the carry out of the low-byte add
sta dest + 1
.endmacro
.segment "TABLES" .segment "TABLES"
; lookup table for top byte -> PORTB value for bank-switch ; lookup table for top byte -> PORTB value for bank-switch
.align 256 .align 256
@ -794,9 +782,7 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l ; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg sqr8 result, arg
lda #0 sqr8 result + 2, arg + 1
sta result + 2
sta result + 3
imul8 inter, arg + 1, arg, xe imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
@ -804,8 +790,6 @@ arg2_pos:
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc rts ; 6 cyc
.endscope .endscope
.endmacro .endmacro