Compare commits

..

3 commits

Author SHA1 Message Date
0631886466 whee 2024-11-11 12:10:08 -08:00
97948dc814 Merge branch 'fastmul' into fastmul2 2024-11-11 11:45:58 -08:00
f10bb4fe18 WIP alternate imul16
not working at present
2023-02-11 16:03:18 -08:00
3 changed files with 117 additions and 22 deletions

127
mandel.s
View file

@ -26,6 +26,7 @@ z_buffer_start = $b1 ; u8: index into z_buffer
z_buffer_end = $b2 ; u8: index into z_buffer z_buffer_end = $b2 ; u8: index into z_buffer
temp = $b4 ; u16 temp = $b4 ; u16
temp2 = $b6 ; u16 temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16 pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8 pixel_color = $ba ; u8
pixel_mask = $bb ; u8 pixel_mask = $bb ; u8
@ -344,6 +345,68 @@ fill_masks:
neg 4, arg neg 4, arg
.endmacro .endmacro
; bitmul16 arg1, arg2, result, bitnum
; inner loop for imul16: one step of a shift-and-add multiply.
; Tests bit `bitnum` of the 16-bit value at arg1. If the bit is set, the
; 16-bit value at arg2 is added into result + 2 / result + 3. In both cases
; the result bytes are then shifted right by one bit; on the add path the
; carry out of the add becomes the new top bit of result + 3.
; Clobbers A and the flags.
; Writes result + 1 .. result + 3, plus result + 0 when bitnum >= 8.
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
; NOTE(review): adding up the per-instruction counts below gives
; 5 + 3 (taken bne) + 32 = 40 for the bit-set path when bitnum < 8, and 45
; when bitnum >= 8; the 41/46 figures may allow for a page-crossing
; branch - confirm.
.macro bitmul16 arg1, arg2, result, bitnum
.local zero
.local one
.local next
; does 16-bit adds
; arg1 and arg2 are treated as unsigned
; negative signed inputs must be flipped first
; 7 cycles up to the branch
; check if arg1 has 0 or 1 bit in this place
; 5 cycles either way
; bits 0-7 are tested in the byte at arg1, bits 8-15 in the byte at arg1 + 1
.if bitnum < 8
lda arg1 ; 3 cyc
and #(1 << (bitnum)) ; 2 cyc
.else
lda arg1 + 1 ; 3 cyc
and #(1 << ((bitnum) - 8)) ; 2 cyc
.endif
bne one ; 2 cyc
; fall-through path: the tested bit was clear, so nothing is added
zero: ; 18 cyc, 23 cyc
; shift a 0 into the top of result + 3; its old bit 0 goes to carry
lsr result + 3 ; 5 cyc
jmp next ; 3 cyc
one: ; 32 cyc, 37 cyc
; 16-bit add on the top bits
clc ; 2 cyc
lda result + 2 ; 3 cyc
adc arg2 ; 3 cyc
sta result + 2 ; 3 cyc
lda result + 3 ; 3 cyc
adc arg2 + 1 ; 3 cyc
; the carry out of the add is rotated into bit 7 of the high byte here,
; and bit 0 of the sum is left in carry for the rotates at `next`
ror a ; 2 cyc - get a jump on the shift
sta result + 3 ; 3 cyc
next:
; both paths arrive with carry = the bit shifted out of result + 3
ror result + 2 ; 5 cyc
ror result + 1 ; 5 cyc
.if bitnum >= 8
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
; when it's all uninitialized data
ror result ; 5 cyc
.endif
.endmacro
; check_sign arg
; Make the 16-bit value at `arg` positive in place, and keep a running
; count in the Y register of how many arguments had their sign bit set.
;   in:  arg - 16-bit value; its sign is read from the byte at arg + 1
;        Y   - running count (the caller initialises it)
;   out: Y   - incremented by one if arg was negative
; Clobbers A and the flags.
; 5 to 25 cycles
.macro check_sign arg
    .local done
    lda arg + 1         ; 3 cyc - load the byte that carries the sign bit
    bpl done            ; 2 cyc - sign bit clear: leave arg untouched
    neg16 arg           ; 18 cyc - flip the argument to positive
    iny                 ; 2 cyc - one more sign bit counted
done:
.endmacro
; 518 - 828 cyc ; 518 - 828 cyc
.macro imul16 dest, arg1, arg2 .macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc copy16 FR0, arg1 ; 12 cyc
@ -367,6 +430,42 @@ fill_masks:
copy16 dest, FR2 + 2 ; 12 cyc copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; imul16_func_orig
; 16-bit by 16-bit multiply producing a 32-bit result, built from sixteen
; unrolled bitmul16 steps (one per bit of arg1).
; Negative inputs are flipped positive first; if exactly one input was
; negative, the 32-bit result is negated before returning.
;   in:  FR0, FR1 - the two 16-bit arguments (both clobbered)
;   out: FR2      - 32-bit result
; Uses A and Y.
; min 470 cycles
; max 780 cycles
.proc imul16_func_orig
    arg1 = FR0              ; 16-bit arg (clobbered)
    arg2 = FR1              ; 16-bit arg (clobbered)
    result = FR2            ; 32-bit result

    ; Y tallies how many of the inputs had their sign bit set.
    ldy #0                  ; 2 cyc
    check_sign arg1         ; 5 to 25 cyc
    check_sign arg2         ; 5 to 25 cyc

    ; Only the top 16 bits of the 32-bit temp are cleared here;
    ; the bottom two bytes get cleared by the shifts.
    lda #0                  ; 2 cyc
    sta result + 2          ; 3 cyc
    sta result + 3          ; 3 cyc

    ; Fully unrolled for maximum speed, at the cost of a larger routine.
    ; 440 to 696 cycles in total:
    ;   bitnum < 8:  25 or 41 cycles per step
    ;   bitnum >= 8: 30 or 46 cycles per step
    .repeat 16, bitnum
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; Mixed input signs (exactly one sign bit counted) give a negative result.
    cpy #1                  ; 2 cyc
    bne keep_sign           ; 2 cyc
    neg32 result            ; 34 cyc
keep_sign:
    rts                     ; 6 cyc
.endproc
; Adapted from https://everything2.com/title/Fast+6502+multiplication ; Adapted from https://everything2.com/title/Fast+6502+multiplication
.macro imul8 dest, arg1, arg2 .macro imul8 dest, arg1, arg2
.local under256 .local under256
@ -413,7 +512,6 @@ fill_masks:
small_product: small_product:
sec ; - x^2/2: 25 cycles sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi lda mul_product_hi
sbc mul_hibyte256,x sbc mul_hibyte256,x
sta mul_product_hi sta mul_product_hi
@ -426,19 +524,27 @@ fill_masks:
result = FR2 ; 32-bit result result = FR2 ; 32-bit result
inter = temp2 inter = temp2
ldy #0 ; 2 cyc
; counts the number of sign bits in Y
check_sign arg1 ; 5 to 25 cyc
check_sign arg2 ; 5 to 25 cyc
; h1l1 * h2l2 ; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2) ; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2
lda #0 lda #0
sta result + 0
sta result + 1
sta result + 2 sta result + 2
sta result + 3 sta result + 3
imul8 inter, arg1, arg2
add16 result, result, inter
imul8 inter, arg1 + 1, arg2 imul8 inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1, arg2 + 1 imul8 inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
@ -447,16 +553,11 @@ fill_masks:
imul8 inter, arg1 + 1, arg2 + 1 imul8 inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word ; In case of mixed input signs, return a negative result.
; https://stackoverflow.com/a/28827013 cpy #1 ; 2 cyc
lda arg1 + 1 bne positive_result ; 2 cyc
bpl arg1_pos neg32 result ; 34 cyc
sub16 result + 2, result + 2, arg2 positive_result:
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc rts ; 6 cyc
.endproc .endproc

View file

@ -37,7 +37,6 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
(done)
## Deps and build instructions ## Deps and build instructions

View file

@ -11,11 +11,6 @@ function db(func) {
return lines.join('\n'); return lines.join('\n');
} }
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log( console.log(
`.segment "TABLES" `.segment "TABLES"
@ -25,14 +20,14 @@ console.log(
.align 256 .align 256
mul_lobyte256: mul_lobyte256:
${db((i) => squares[i] & 0xff)} ${db((x) => Math.round(x * x / 2) & 0xff)}
.align 256 .align 256
mul_hibyte256: mul_hibyte256:
${db((i) => (squares[i] >> 8) & 0xff)} ${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
.align 256 .align 256
mul_hibyte512: mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)} ${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
`); `);