forked from brooke/mandel-6502
Faster imul16 routine
Improves runtime from 16.24 ms/px to 14.44 ms/px. This uses a routine found on Everything2 (https://everything2.com/title/Fast+6502+multiplication), which uses a lookup table of squares to do 8-bit imuls; these are then composed into a 16-bit imul.
This commit is contained in:
parent
29630c8887
commit
5637783529
5 changed files with 183 additions and 81 deletions
176
mandel.s
176
mandel.s
|
|
@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
|
|||
z_buffer_start = $b1 ; u8: index into z_buffer
|
||||
z_buffer_end = $b2 ; u8: index into z_buffer
|
||||
temp = $b4 ; u16
|
||||
|
||||
pixel_ptr = $b6 ; u16
|
||||
pixel_color = $b8 ; u8
|
||||
pixel_mask = $b9 ; u8
|
||||
pixel_shift = $ba ; u8
|
||||
pixel_offset = $bb ; u8
|
||||
fill_level = $bc ; u8
|
||||
palette_offset = $bd ; u8
|
||||
temp2 = $b6 ; u16
|
||||
pixel_ptr = $b8 ; u16
|
||||
pixel_color = $ba ; u8
|
||||
pixel_mask = $bb ; u8
|
||||
pixel_shift = $bc ; u8
|
||||
pixel_offset = $bd ; u8
|
||||
fill_level = $be ; u8
|
||||
palette_offset = $bf ; u8
|
||||
|
||||
; FP registers in zero page
|
||||
FR0 = $d4 ; float48
|
||||
|
|
@ -107,6 +107,10 @@ KEY_RIGHT = $87
|
|||
mantissa .byte 6
|
||||
.endstruct
|
||||
|
||||
.import mul_lobyte256
|
||||
.import mul_hibyte256
|
||||
.import mul_hibyte512
|
||||
|
||||
.data
|
||||
|
||||
strings:
|
||||
|
|
@ -257,6 +261,12 @@ fill_masks:
|
|||
add 4, dest, arg2, dest
|
||||
.endmacro
|
||||
|
||||
; add_carry dest
; Adds the current carry flag into the byte at dest (dest = dest + 0 + C).
; Does not clear or set carry first, so it must directly follow the
; operation whose carry-out is to be propagated.
; Clobbers A and the flags.
.macro add_carry dest
|
||||
lda dest
|
||||
adc #0 ; adds only the incoming carry bit
|
||||
sta dest
|
||||
.endmacro
|
||||
|
||||
; 2 + 9 * byte cycles
|
||||
.macro sub bytes, dest, arg1, arg2
|
||||
sec ; 2 cyc
|
||||
|
|
@ -334,65 +344,15 @@ fill_masks:
|
|||
neg 4, arg
|
||||
.endmacro
|
||||
|
||||
; inner loop for imul16
|
||||
; bitnum < 8: 25 or 41 cycles
|
||||
; bitnum >= 8: 30 or 46 cycles
|
||||
.macro bitmul16 arg1, arg2, result, bitnum
|
||||
.local zero
|
||||
.local one
|
||||
.local next
|
||||
|
||||
; does 16-bit adds
|
||||
; arg1 and arg2 are treated as unsigned
|
||||
; negative signed inputs must be flipped first
|
||||
|
||||
; 7 cycles up to the branch
|
||||
|
||||
; check if arg1 has 0 or 1 bit in this place
|
||||
; 5 cycles either way
|
||||
.if bitnum < 8
|
||||
lda arg1 ; 3 cyc
|
||||
and #(1 << (bitnum)) ; 2 cyc
|
||||
.else
|
||||
lda arg1 + 1 ; 3 cyc
|
||||
and #(1 << ((bitnum) - 8)) ; 2 cyc
|
||||
.endif
|
||||
bne one ; 2 cyc
|
||||
|
||||
zero: ; 18 cyc, 23 cyc
|
||||
lsr result + 3 ; 5 cyc
|
||||
jmp next ; 3 cyc
|
||||
|
||||
one: ; 32 cyc, 37 cyc
|
||||
; 16-bit add on the top bits
|
||||
clc ; 2 cyc
|
||||
lda result + 2 ; 3 cyc
|
||||
adc arg2 ; 3 cyc
|
||||
sta result + 2 ; 3 cyc
|
||||
lda result + 3 ; 3 cyc
|
||||
adc arg2 + 1 ; 3 cyc
|
||||
ror a ; 2 cyc - get a jump on the shift
|
||||
sta result + 3 ; 3 cyc
|
||||
next:
|
||||
ror result + 2 ; 5 cyc
|
||||
ror result + 1 ; 5 cyc
|
||||
.if bitnum >= 8
|
||||
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
|
||||
; when it's all uninitialized data
|
||||
ror result ; 5 cyc
|
||||
.endif
|
||||
|
||||
.endmacro
|
||||
|
||||
; 5 to 25 cycles
|
||||
.macro check_sign arg
|
||||
; Check sign bit and flip argument to positive,
|
||||
; keeping a count of sign bits in the X register.
|
||||
; keeping a count of sign bits in the Y register.
|
||||
.local positive
|
||||
lda arg + 1 ; 3 cyc
|
||||
bpl positive ; 2 cyc
|
||||
neg16 arg ; 18 cyc
|
||||
inx ; 2 cyc
|
||||
iny ; 2 cyc
|
||||
positive:
|
||||
.endmacro
|
||||
|
||||
|
|
@ -419,35 +379,93 @@ positive:
|
|||
copy16 dest, FR2 + 2 ; 12 cyc
|
||||
.endmacro
|
||||
|
||||
; min 470 cycles
|
||||
; max 780 cycles
|
||||
; Adapted from https://everything2.com/title/Fast+6502+multiplication
|
||||
; imul8 dest, arg1, arg2
; 8-bit x 8-bit multiply using lookup tables instead of shift-and-add.
; Computes (a + x)^2/2 - a^2/2 - x^2/2, plus a 1-bit correction for
; odd * odd inputs, where a = byte at arg1 and x = byte at arg2.
;
; dest: receives the 16-bit product (low byte at dest, high byte at dest + 1)
; arg1: memory location of the first 8-bit factor
; arg2: memory location of the second 8-bit factor
;
; Reads the imported tables mul_lobyte256, mul_hibyte256 and mul_hibyte512.
; Their contents are not visible here; presumably the low/high bytes of
; n^2/2, with mul_hibyte512 covering sums of 256 and above -- TODO confirm
; against the table generator.
; NOTE(review): factors appear to be treated as unsigned bytes -- confirm.
; Clobbers A, X and the flags; Y is not touched.
.macro imul8 dest, arg1, arg2
|
||||
.local under256
|
||||
.local next
|
||||
.local small_product
|
||||
.scope
|
||||
mul_factor_a = arg1
|
||||
mul_factor_x = arg2
|
||||
mul_product_lo = dest
|
||||
mul_product_hi = dest + 1
|
||||
|
||||
lda mul_factor_a ; setup: 6 cycles
|
||||
;ldx mul_factor_x
|
||||
|
||||
clc ; (a + x)^2/2: 23 cycles
|
||||
adc mul_factor_x
|
||||
tax ; X = low 8 bits of a + x, used as the table index
|
||||
bcc under256 ; carry clear: the sum fit in 8 bits
|
||||
lda mul_hibyte512,x ; sum overflowed 8 bits: take the high byte from the 512 table
|
||||
bcs next ; carry is still set from the adc, so this always branches
|
||||
under256:
|
||||
lda mul_hibyte256,x
|
||||
sec ; set carry so the sbc below starts without a borrow
|
||||
next:
|
||||
sta mul_product_hi
|
||||
lda mul_lobyte256,x ; the same low-byte table serves both sum ranges
|
||||
|
||||
ldx mul_factor_a ; - a^2/2: 20 cycles
|
||||
sbc mul_lobyte256,x ; carry is set on both paths into here
|
||||
sta mul_product_lo
|
||||
lda mul_product_hi
|
||||
sbc mul_hibyte256,x ; borrow from the low byte carries into the high byte
|
||||
sta mul_product_hi
|
||||
|
||||
ldx mul_factor_x ; + x & a & 1: 22 cycles
|
||||
txa ; (this is a kludge to correct a
|
||||
and mul_factor_a ; roundoff error that makes odd * odd too low)
|
||||
and #1
|
||||
|
||||
clc
|
||||
adc mul_product_lo ; A = low byte plus correction; kept in A for the sbc below
|
||||
bcc small_product
|
||||
inc mul_product_hi ; propagate the carry from the low-byte add
|
||||
small_product:
|
||||
sec ; - x^2/2: 25 cycles
|
||||
sbc mul_lobyte256,x ; X still holds the second factor
|
||||
sta mul_product_lo
|
||||
lda mul_product_hi
|
||||
sbc mul_hibyte256,x
|
||||
sta mul_product_hi
|
||||
|
||||
.endscope
|
||||
.endmacro
|
||||
|
||||
.proc imul16_func
|
||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
||||
result = FR2 ; 32-bit result
|
||||
inter = temp2
|
||||
|
||||
ldx #0 ; 2 cyc
|
||||
; counts the number of sign bits in X
|
||||
ldy #0 ; 2 cyc
|
||||
; counts the number of sign bits in Y
|
||||
check_sign arg1 ; 5 to 25 cyc
|
||||
check_sign arg2 ; 5 to 25 cyc
|
||||
|
||||
; zero out the 32-bit temp's top 16 bits
|
||||
lda #0 ; 2 cyc
|
||||
sta result + 2 ; 3 cyc
|
||||
sta result + 3 ; 3 cyc
|
||||
; the bottom two bytes will get cleared by the shifts
|
||||
|
||||
; unrolled loop for maximum speed, at the cost
|
||||
; of a larger routine
|
||||
; 440 to 696 cycles
|
||||
.repeat 16, bitnum
|
||||
; bitnum < 8: 25 or 41 cycles
|
||||
; bitnum >= 8: 30 or 46 cycles
|
||||
bitmul16 arg1, arg2, result, bitnum
|
||||
.endrepeat
|
||||
; h1l1 * h2l2
|
||||
; (h1*256 + l1) * (h2*256 + l2)
|
||||
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
||||
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
||||
|
||||
imul8 result, arg1, arg2
|
||||
lda #0
|
||||
sta result + 2
|
||||
sta result + 3
|
||||
|
||||
imul8 inter, arg1 + 1, arg2
|
||||
add16 result + 1, result + 1, inter
|
||||
add_carry result + 3
|
||||
|
||||
imul8 inter, arg1, arg2 + 1
|
||||
add16 result + 1, result + 1, inter
|
||||
add_carry result + 3
|
||||
|
||||
imul8 inter, arg1 + 1, arg2 + 1
|
||||
add16 result + 2, result + 2, inter
|
||||
|
||||
; In case of mixed input signs, return a negative result.
|
||||
cpx #1 ; 2 cyc
|
||||
cpy #1 ; 2 cyc
|
||||
bne positive_result ; 2 cyc
|
||||
neg32 result ; 34 cyc
|
||||
positive_result:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue