Compare commits

..

3 commits

Author SHA1 Message Date
05133aabdd slightly faster handling of signed mul
previously we were flipping the inputs if negative, and then the
output if exactly one of the inputs was negative

turns out you can just treat the whole thing as an unsigned mul
and then subtract each term from the high word if the other term
is negative.

https://stackoverflow.com/a/28827013

this saves a handful of cycles, reducing our runtime to a mere
14.211 ms/px \o/
2024-12-15 20:17:45 -08:00
7f2bc43cff squares 2024-12-14 18:56:26 -08:00
5637783529 Faster imul16 routine
Improves runtime from 16.24 ms/px to 14.44 ms/px

This uses a routine found on Everything2:
https://everything2.com/title/Fast+6502+multiplication

which uses a lookup table of squares to do 8-bit imuls,
which are then composed into a 16-bit imul
2024-12-14 18:53:31 -08:00
3 changed files with 22 additions and 117 deletions

127
mandel.s
View file

@ -26,7 +26,6 @@ z_buffer_start = $b1 ; u8: index into z_buffer
z_buffer_end = $b2 ; u8: index into z_buffer z_buffer_end = $b2 ; u8: index into z_buffer
temp = $b4 ; u16 temp = $b4 ; u16
temp2 = $b6 ; u16 temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16 pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8 pixel_color = $ba ; u8
pixel_mask = $bb ; u8 pixel_mask = $bb ; u8
@ -345,68 +344,6 @@ fill_masks:
neg 4, arg neg 4, arg
.endmacro .endmacro
; inner loop for imul16
; Emits one step of a shift-and-add multiply: tests bit `bitnum` of arg1,
; adds arg2 into the top 16 bits of result when that bit is set, then
; shifts result right by one bit.
;   arg1   - 16-bit value whose bit `bitnum` is tested (only read here)
;   arg2   - 16-bit value added into result + 2 / result + 3 (only read here)
;   result - 32-bit accumulator, modified in place
;   bitnum - assembly-time constant bit index; values < 8 test the low byte
;            of arg1, other values test the high byte
; Uses A and the carry flag as scratch.
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
.local zero
.local one
.local next
; does 16-bit adds
; arg1 and arg2 are treated as unsigned
; negative signed inputs must be flipped first
; 7 cycles up to the branch
; check if arg1 has 0 or 1 bit in this place
; 5 cycles either way
.if bitnum < 8
lda arg1 ; 3 cyc
and #(1 << (bitnum)) ; 2 cyc
.else
lda arg1 + 1 ; 3 cyc
and #(1 << ((bitnum) - 8)) ; 2 cyc
.endif
bne one ; 2 cyc
zero: ; 18 cyc, 23 cyc
; bit is clear: nothing to add, just start the right shift at the top byte
; (lsr shifts a 0 into bit 7 and leaves the old bit 0 in carry for the rors)
lsr result + 3 ; 5 cyc
jmp next ; 3 cyc
one: ; 32 cyc, 37 cyc
; 16-bit add on the top bits
clc ; 2 cyc
lda result + 2 ; 3 cyc
adc arg2 ; 3 cyc
sta result + 2 ; 3 cyc
lda result + 3 ; 3 cyc
adc arg2 + 1 ; 3 cyc
; rotating A here shifts the carry out of the add into bit 7 of the top
; byte, so the overflow bit of the 16-bit sum is kept rather than lost
ror a ; 2 cyc - get a jump on the shift
sta result + 3 ; 3 cyc
next:
; both paths arrive with the top byte already shifted and its old bit 0 in
; carry; propagate the shift down through the lower bytes
ror result + 2 ; 5 cyc
ror result + 1 ; 5 cyc
.if bitnum >= 8
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
; when it's all uninitialized data
; (when expanded for bitnum 0..15, the eight rotations emitted for
; bitnum >= 8 shift a fresh bit into every position of this byte)
ror result ; 5 cyc
.endif
.endmacro
; Makes a signed 16-bit argument non-negative and records whether it was
; negative.
;   arg - 16-bit value; passed to neg16 when the sign bit of its high byte
;         (arg + 1) is set
; Y is incremented once for a negative argument and left alone otherwise,
; so the caller must initialize Y before the first use.
; A is loaded with the high byte of arg.
; NOTE(review): neg16 is defined outside this view; presumably it negates
; arg in place -- confirm.
; 5 to 25 cycles
.macro check_sign arg
; Check sign bit and flip argument to positive,
; keeping a count of sign bits in the Y register.
.local positive
lda arg + 1 ; 3 cyc
bpl positive ; 2 cyc
neg16 arg ; 18 cyc
iny ; 2 cyc
positive:
.endmacro
; 518 - 828 cyc ; 518 - 828 cyc
.macro imul16 dest, arg1, arg2 .macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc copy16 FR0, arg1 ; 12 cyc
@ -430,42 +367,6 @@ positive:
copy16 dest, FR2 + 2 ; 12 cyc copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; Original signed 16-bit x 16-bit -> 32-bit multiply, using an unrolled
; shift-and-add loop.
; Inputs:  FR0 (arg1) and FR1 (arg2), 16-bit signed values. Negative inputs
;          are flipped by check_sign, so both may be clobbered.
; Output:  FR2 (result), 32-bit signed product.
; Uses A and Y as scratch.
; min 470 cycles
; max 780 cycles
.proc imul16_func_orig
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
ldy #0 ; 2 cyc
; counts the number of sign bits in Y
; (Y ends up as 0, 1 or 2: the number of negative inputs)
check_sign arg1 ; 5 to 25 cyc
check_sign arg2 ; 5 to 25 cyc
; zero out the 32-bit temp's top 16 bits
lda #0 ; 2 cyc
sta result + 2 ; 3 cyc
sta result + 3 ; 3 cyc
; the bottom two bytes will get cleared by the shifts
; unrolled loop for maximum speed, at the cost
; of a larger routine
; 440 to 696 cycles
.repeat 16, bitnum
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
bitmul16 arg1, arg2, result, bitnum
.endrepeat
; In case of mixed input signs, return a negative result.
; Y == 1 means exactly one input was negative; with 0 or 2 negative inputs
; the unsigned product already has the right sign.
cpy #1 ; 2 cyc
bne positive_result ; 2 cyc
neg32 result ; 34 cyc
positive_result:
rts ; 6 cyc
.endproc
; Adapted from https://everything2.com/title/Fast+6502+multiplication ; Adapted from https://everything2.com/title/Fast+6502+multiplication
.macro imul8 dest, arg1, arg2 .macro imul8 dest, arg1, arg2
.local under256 .local under256
@ -512,6 +413,7 @@ positive_result:
small_product: small_product:
sec ; - x^2/2: 25 cycles sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi lda mul_product_hi
sbc mul_hibyte256,x sbc mul_hibyte256,x
sta mul_product_hi sta mul_product_hi
@ -524,27 +426,19 @@ positive_result:
result = FR2 ; 32-bit result result = FR2 ; 32-bit result
inter = temp2 inter = temp2
ldy #0 ; 2 cyc
; counts the number of sign bits in Y
check_sign arg1 ; 5 to 25 cyc
check_sign arg2 ; 5 to 25 cyc
; h1l1 * h2l2 ; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2) ; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2
lda #0 lda #0
sta result + 0
sta result + 1
sta result + 2 sta result + 2
sta result + 3 sta result + 3
imul8 inter, arg1, arg2
add16 result, result, inter
imul8 inter, arg1 + 1, arg2 imul8 inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1, arg2 + 1 imul8 inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
@ -553,11 +447,16 @@ positive_result:
imul8 inter, arg1 + 1, arg2 + 1 imul8 inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter add16 result + 2, result + 2, inter
; In case of mixed input signs, return a negative result. ; In case of negative inputs, adjust high word
cpy #1 ; 2 cyc ; https://stackoverflow.com/a/28827013
bne positive_result ; 2 cyc lda arg1 + 1
neg32 result ; 34 cyc bpl arg1_pos
positive_result: sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc rts ; 6 cyc
.endproc .endproc

View file

@ -37,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
(done)
## Deps and build instructions ## Deps and build instructions

View file

@ -11,6 +11,11 @@ function db(func) {
return lines.join('\n'); return lines.join('\n');
} }
// Table of i*i / 2 for i = 0..511, with odd squares rounded up
// (adding 1 before halving and truncating yields ceil(i*i / 2)).
// It has 512 entries so the table emitters can read both squares[i] and
// squares[i + 256] for i in 0..255.
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log( console.log(
`.segment "TABLES" `.segment "TABLES"
@ -20,14 +25,14 @@ console.log(
.align 256 .align 256
mul_lobyte256: mul_lobyte256:
${db((x) => Math.round(x * x / 2) & 0xff)} ${db((i) => squares[i] & 0xff)}
.align 256 .align 256
mul_hibyte256: mul_hibyte256:
${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)} ${db((i) => (squares[i] >> 8) & 0xff)}
.align 256 .align 256
mul_hibyte512: mul_hibyte512:
${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)} ${db((i) => (squares[i + 256] >> 8) & 0xff)}
`); `);