refactoring and start on squares

This commit is contained in:
Brooke Vibber 2024-12-29 17:37:06 -08:00
parent 8ad996981a
commit f903272335
2 changed files with 143 additions and 153 deletions

220
mandel.s
View file

@ -374,12 +374,88 @@ viewport_oy:
copy16 dest, FR2 + 2 ; 12 cyc copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; Adapted from https://everything2.com/title/Fast+6502+multiplication ; clobbers a, x
.macro imul8 dest, arg1, arg2 .macro sqr8 dest, arg
.local under256 ldx arg
.local next lda sqr_lobyte,x
.local small_product sta dest
; circa 92 cycles? this doesn't seem right lda sqr_hibyte,x
sta dest + 1
.endmacro
; PORTB values for bank switching, indexed by a whole byte: only the top
; two bits of the index matter; they are moved down to bits 2-3 and
; merged with the fixed pattern $e1.
; The table is not page-aligned (the .align 256 is left disabled), so an
; indexed read that crosses a page boundary costs one extra cycle.
bank_switch_table:
    .repeat 256, idx
        .byte $e1 | ((idx >> 6) << 2)
    .endrepeat
; bank_switch bank
; Store (bank << 2) | $e1 to PORTB, i.e. put `bank` into bits 2-3 of the
; fixed pattern $e1.
;   bank: constant expression; presumably 0-3 (larger values spill into
;         bit 4 and up -- confirm against callers)
;   clobbers: A
; The parameter is parenthesized because macro arguments are substituted
; as raw tokens and << binds tighter than + or -, so an argument such as
; `FIRST_BANK + 1` would otherwise shift only its last term.
.macro bank_switch bank
    lda #(((bank) << 2) | $e1)
    sta PORTB
.endmacro
.macro imul8 dest, arg1, arg2, xe
.if xe
; using 64KB lookup table
; 58-77 cycles
; clobbers x, y, dest to dest + 3
.scope
; Table-lookup multiply: fetch a 16-bit entry selected by arg1 (minus its
; low bit) and arg2 from banked memory at $4000-$7fff, then add arg2 once
; more if the low bit of arg1 was set. A is overwritten as well as x and y.
; The 16-bit result is written to dest..dest+1.
output = dest
; ptr is dereferenced with (ptr),y below; on the 6502 that addressing mode
; takes a zero-page pointer, so dest + 2 has to be a zero-page address.
ptr = dest + 2 ; scratch space assumed
; bottom 14 bits except the LSB are the per-bank table index
; add $4000 for the bank pointer
; pointer low byte: arg1 with bit 0 cleared (entries start on even addresses)
lda arg1 ; 3 cyc
and #$fe ; 2 cyc
sta ptr ; 3 cyc
; pointer high byte: low 6 bits of arg2 plus $40, i.e. $40-$7f
lda arg2 ; 3 cyc
and #$3f ; 2 cyc
clc ; 2 cyc
adc #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; top 2 bits are the table bank selector
; (bank_switch_table maps the whole byte to the PORTB value)
ldx arg2 ; 3 cyc
lda bank_switch_table,x ; 4 cyc
sta PORTB ; 4 cyc
; copy the entry into output
; low byte first, then high byte (y = 0, then 1)
; NOTE(review): the entry is presumably the precomputed product
; (arg1 & $fe) * arg2; the code that fills the table is not visible here.
ldy #0 ; 2 cyc
lda (ptr),y ; 5 cyc
sta output ; 3 cyc
iny ; 2 cyc
lda (ptr),y ; 5 cyc
sta output+1 ; 3 cyc
; note: we are not restoring memory to save 6 cycles!
; this means those 16kb have to be switched back to base RAM
; if we need to use them anywhere else
;;; restore memory
;;lda #$81 ; 2 cyc - disabled
;;sta PORTB ; 4 cyc - disabled
; check that 1 bit we skipped to fit into space
lda arg1 ; 3 cyc
and #1 ; 2 cyc
beq done ; 2 cyc (3 when the branch is taken)
; add the second param one last time for the skipped bit
; 16-bit add: output += arg2, with the carry rippled into the high byte
clc ; 2 cyc
lda arg2 ; 3 cyc
adc output ; 3 cyc
sta output ; 3 cyc
lda #0 ; 2 cyc
adc output+1 ; 3 cyc
sta output+1 ; 3 cyc
done:
.endscope
.else
; Using base 48k RAM compatibility mode
; Small table of half squares
; Adapted from https://everything2.com/title/Fast+6502+multiplication
; 81-92 cycles ; 81-92 cycles
.scope .scope
mul_factor_a = arg1 mul_factor_a = arg1
@ -433,78 +509,7 @@ viewport_oy:
sbc mul_hibyte256,x ; 4 cyc sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
.endscope .endscope
.endmacro .endif
; PORTB values for bank switching, indexed by a whole byte: only the top
; two bits of the index matter; they are moved down to bits 2-3 and
; merged with the fixed pattern $e1.
; The table is not page-aligned (the .align 256 is left disabled), so an
; indexed read that crosses a page boundary costs one extra cycle.
bank_switch_table:
    .repeat 256, idx
        .byte $e1 | ((idx >> 6) << 2)
    .endrepeat
; bank_switch bank
; Store (bank << 2) | $e1 to PORTB, i.e. put `bank` into bits 2-3 of the
; fixed pattern $e1.
;   bank: constant expression; presumably 0-3 (larger values spill into
;         bit 4 and up -- confirm against callers)
;   clobbers: A
; The parameter is parenthesized because macro arguments are substituted
; as raw tokens and << binds tighter than + or -, so an argument such as
; `FIRST_BANK + 1` would otherwise shift only its last term.
.macro bank_switch bank
    lda #(((bank) << 2) | $e1)
    sta PORTB
.endmacro
; imul8xe dest, arg1, arg2
; Table-lookup multiply: fetch a 16-bit entry selected by arg1 (minus its
; low bit) and arg2 from banked memory at $4000-$7fff, then add arg2 once
; more if the low bit of arg1 was set. The 16-bit result is written to
; dest..dest+1; dest+2..dest+3 are used as the table pointer.
; 58-77 cycles
; clobbers x, y, dest to dest + 3
; (A is overwritten too)
.macro imul8xe dest, arg1, arg2
.local done
.local output
.local ptr
output = dest
; ptr is dereferenced with (ptr),y below; on the 6502 that addressing mode
; takes a zero-page pointer, so dest + 2 has to be a zero-page address.
ptr = dest + 2 ; scratch space assumed
; bottom 14 bits except the LSB are the per-bank table index
; add $4000 for the bank pointer
; pointer low byte: arg1 with bit 0 cleared (entries start on even addresses)
lda arg1 ; 3 cyc
and #$fe ; 2 cyc
sta ptr ; 3 cyc
; pointer high byte: low 6 bits of arg2 plus $40, i.e. $40-$7f
lda arg2 ; 3 cyc
and #$3f ; 2 cyc
clc ; 2 cyc
adc #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; top 2 bits are the table bank selector
; (bank_switch_table maps the whole byte to the PORTB value)
ldx arg2 ; 3 cyc
lda bank_switch_table,x ; 4 cyc
sta PORTB ; 4 cyc
; copy the entry into output
; low byte first, then high byte (y = 0, then 1)
; NOTE(review): the entry is presumably the precomputed product
; (arg1 & $fe) * arg2; the code that fills the table is not visible here.
ldy #0 ; 2 cyc
lda (ptr),y ; 5 cyc
sta output ; 3 cyc
iny ; 2 cyc
lda (ptr),y ; 5 cyc
sta output+1 ; 3 cyc
; note: we are not restoring memory to save 6 cycles!
; this means those 16kb have to be switched back to base RAM
; if we need to use them anywhere else
;;; restore memory
;;lda #$81 ; 2 cyc - disabled
;;sta PORTB ; 4 cyc - disabled
; check that 1 bit we skipped to fit into space
lda arg1 ; 3 cyc
and #1 ; 2 cyc
beq done ; 2 cyc (3 when the branch is taken)
; add the second param one last time for the skipped bit
; 16-bit add: output += arg2, with the carry rippled into the high byte
clc ; 2 cyc
lda arg2 ; 3 cyc
adc output ; 3 cyc
sta output ; 3 cyc
lda #0 ; 2 cyc
adc output+1 ; 3 cyc
sta output+1 ; 3 cyc
done:
.endmacro .endmacro
.proc imul8xe_init .proc imul8xe_init
@ -632,7 +637,13 @@ inner_loop:
.endproc .endproc
.proc imul16_func .macro imul16_impl xe
.local arg1
.local arg2
.local result
.local inter
.local arg1_pos
.local arg2_pos
arg1 = FR0 ; 16-bit arg (clobbered) arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result result = FR2 ; 32-bit result
@ -643,20 +654,20 @@ inner_loop:
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2 imul8 result, arg1, arg2, xe
lda #0 lda #0
sta result + 2 sta result + 2
sta result + 3 sta result + 3
imul8 inter, arg1 + 1, arg2 imul8 inter, arg1 + 1, arg2, xe
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
imul8 inter, arg1, arg2 + 1 imul8 inter, arg1, arg2 + 1, xe
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1 imul8 inter, arg1 + 1, arg2 + 1, xe
add16 result + 2, result + 2, inter add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word ; In case of negative inputs, adjust high word
@ -671,47 +682,14 @@ arg1_pos:
arg2_pos: arg2_pos:
rts ; 6 cyc rts ; 6 cyc
.endmacro
.proc imul16_func
imul16_impl 0
.endproc .endproc
.proc imul16xe_func .proc imul16xe_func
arg1 = FR0 ; 16-bit arg (clobbered) imul16_impl 1
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
inter = temp2
; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8xe result, arg1, arg2
lda #0
sta result + 2
sta result + 3
imul8xe inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter
add_carry result + 3
imul8xe inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter
add_carry result + 3
imul8xe inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc
.endproc .endproc
.macro round16 arg .macro round16 arg

View file

@ -22,7 +22,10 @@ console.log(
.export mul_lobyte256 .export mul_lobyte256
.export mul_hibyte256 .export mul_hibyte256
.export mul_hibyte512 .export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i + 1) / 2 for the multiplier
.align 256 .align 256
mul_lobyte256: mul_lobyte256:
${db((i) => squares[i] & 0xff)} ${db((i) => squares[i] & 0xff)}
@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
mul_hibyte512: mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)} ${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`); `);