Compare commits
3 commits
Author | SHA1 | Date | |
---|---|---|---|
05133aabdd | |||
7f2bc43cff | |||
5637783529 |
3 changed files with 22 additions and 117 deletions
127
mandel.s
127
mandel.s
|
@ -26,7 +26,6 @@ z_buffer_start = $b1 ; u8: index into z_buffer
|
||||||
z_buffer_end = $b2 ; u8: index into z_buffer
|
z_buffer_end = $b2 ; u8: index into z_buffer
|
||||||
temp = $b4 ; u16
|
temp = $b4 ; u16
|
||||||
temp2 = $b6 ; u16
|
temp2 = $b6 ; u16
|
||||||
|
|
||||||
pixel_ptr = $b8 ; u16
|
pixel_ptr = $b8 ; u16
|
||||||
pixel_color = $ba ; u8
|
pixel_color = $ba ; u8
|
||||||
pixel_mask = $bb ; u8
|
pixel_mask = $bb ; u8
|
||||||
|
@ -345,68 +344,6 @@ fill_masks:
|
||||||
neg 4, arg
|
neg 4, arg
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; inner loop for imul16
|
|
||||||
; bitnum < 8: 25 or 41 cycles
|
|
||||||
; bitnum >= 8: 30 or 46 cycles
|
|
||||||
.macro bitmul16 arg1, arg2, result, bitnum
|
|
||||||
.local zero
|
|
||||||
.local one
|
|
||||||
.local next
|
|
||||||
|
|
||||||
; does 16-bit adds
|
|
||||||
; arg1 and arg2 are treated as unsigned
|
|
||||||
; negative signed inputs must be flipped first
|
|
||||||
|
|
||||||
; 7 cycles up to the branch
|
|
||||||
|
|
||||||
; check if arg1 has 0 or 1 bit in this place
|
|
||||||
; 5 cycles either way
|
|
||||||
.if bitnum < 8
|
|
||||||
lda arg1 ; 3 cyc
|
|
||||||
and #(1 << (bitnum)) ; 2 cyc
|
|
||||||
.else
|
|
||||||
lda arg1 + 1 ; 3 cyc
|
|
||||||
and #(1 << ((bitnum) - 8)) ; 2 cyc
|
|
||||||
.endif
|
|
||||||
bne one ; 2 cyc
|
|
||||||
|
|
||||||
zero: ; 18 cyc, 23 cyc
|
|
||||||
lsr result + 3 ; 5 cyc
|
|
||||||
jmp next ; 3 cyc
|
|
||||||
|
|
||||||
one: ; 32 cyc, 37 cyc
|
|
||||||
; 16-bit add on the top bits
|
|
||||||
clc ; 2 cyc
|
|
||||||
lda result + 2 ; 3 cyc
|
|
||||||
adc arg2 ; 3 cyc
|
|
||||||
sta result + 2 ; 3 cyc
|
|
||||||
lda result + 3 ; 3 cyc
|
|
||||||
adc arg2 + 1 ; 3 cyc
|
|
||||||
ror a ; 2 cyc - get a jump on the shift
|
|
||||||
sta result + 3 ; 3 cyc
|
|
||||||
next:
|
|
||||||
ror result + 2 ; 5 cyc
|
|
||||||
ror result + 1 ; 5 cyc
|
|
||||||
.if bitnum >= 8
|
|
||||||
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
|
|
||||||
; when it's all uninitialized data
|
|
||||||
ror result ; 5 cyc
|
|
||||||
.endif
|
|
||||||
|
|
||||||
.endmacro
|
|
||||||
|
|
||||||
; 5 to 25 cycles
|
|
||||||
.macro check_sign arg
|
|
||||||
; Check sign bit and flip argument to positive,
|
|
||||||
; keeping a count of sign bits in the Y register.
|
|
||||||
.local positive
|
|
||||||
lda arg + 1 ; 3 cyc
|
|
||||||
bpl positive ; 2 cyc
|
|
||||||
neg16 arg ; 18 cyc
|
|
||||||
iny ; 2 cyc
|
|
||||||
positive:
|
|
||||||
.endmacro
|
|
||||||
|
|
||||||
; 518 - 828 cyc
|
; 518 - 828 cyc
|
||||||
.macro imul16 dest, arg1, arg2
|
.macro imul16 dest, arg1, arg2
|
||||||
copy16 FR0, arg1 ; 12 cyc
|
copy16 FR0, arg1 ; 12 cyc
|
||||||
|
@ -430,42 +367,6 @@ positive:
|
||||||
copy16 dest, FR2 + 2 ; 12 cyc
|
copy16 dest, FR2 + 2 ; 12 cyc
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; min 470 cycles
|
|
||||||
; max 780 cycles
|
|
||||||
.proc imul16_func_orig
|
|
||||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
|
||||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
|
||||||
result = FR2 ; 32-bit result
|
|
||||||
|
|
||||||
ldy #0 ; 2 cyc
|
|
||||||
; counts the number of sign bits in Y
|
|
||||||
check_sign arg1 ; 5 to 25 cyc
|
|
||||||
check_sign arg2 ; 5 to 25 cyc
|
|
||||||
|
|
||||||
; zero out the 32-bit temp's top 16 bits
|
|
||||||
lda #0 ; 2 cyc
|
|
||||||
sta result + 2 ; 3 cyc
|
|
||||||
sta result + 3 ; 3 cyc
|
|
||||||
; the bottom two bytes will get cleared by the shifts
|
|
||||||
|
|
||||||
; unrolled loop for maximum speed, at the cost
|
|
||||||
; of a larger routine
|
|
||||||
; 440 to 696 cycles
|
|
||||||
.repeat 16, bitnum
|
|
||||||
; bitnum < 8: 25 or 41 cycles
|
|
||||||
; bitnum >= 8: 30 or 46 cycles
|
|
||||||
bitmul16 arg1, arg2, result, bitnum
|
|
||||||
.endrepeat
|
|
||||||
|
|
||||||
; In case of mixed input signs, return a negative result.
|
|
||||||
cpy #1 ; 2 cyc
|
|
||||||
bne positive_result ; 2 cyc
|
|
||||||
neg32 result ; 34 cyc
|
|
||||||
positive_result:
|
|
||||||
|
|
||||||
rts ; 6 cyc
|
|
||||||
.endproc
|
|
||||||
|
|
||||||
; Adapted from https://everything2.com/title/Fast+6502+multiplication
|
; Adapted from https://everything2.com/title/Fast+6502+multiplication
|
||||||
.macro imul8 dest, arg1, arg2
|
.macro imul8 dest, arg1, arg2
|
||||||
.local under256
|
.local under256
|
||||||
|
@ -512,6 +413,7 @@ positive_result:
|
||||||
small_product:
|
small_product:
|
||||||
sec ; - x^2/2: 25 cycles
|
sec ; - x^2/2: 25 cycles
|
||||||
sbc mul_lobyte256,x
|
sbc mul_lobyte256,x
|
||||||
|
sta mul_product_lo
|
||||||
lda mul_product_hi
|
lda mul_product_hi
|
||||||
sbc mul_hibyte256,x
|
sbc mul_hibyte256,x
|
||||||
sta mul_product_hi
|
sta mul_product_hi
|
||||||
|
@ -524,27 +426,19 @@ positive_result:
|
||||||
result = FR2 ; 32-bit result
|
result = FR2 ; 32-bit result
|
||||||
inter = temp2
|
inter = temp2
|
||||||
|
|
||||||
ldy #0 ; 2 cyc
|
|
||||||
; counts the number of sign bits in Y
|
|
||||||
check_sign arg1 ; 5 to 25 cyc
|
|
||||||
check_sign arg2 ; 5 to 25 cyc
|
|
||||||
|
|
||||||
; h1l1 * h2l2
|
; h1l1 * h2l2
|
||||||
; (h1*256 + l1) * (h2*256 + l2)
|
; (h1*256 + l1) * (h2*256 + l2)
|
||||||
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
||||||
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
||||||
|
|
||||||
|
imul8 result, arg1, arg2
|
||||||
lda #0
|
lda #0
|
||||||
sta result + 0
|
|
||||||
sta result + 1
|
|
||||||
sta result + 2
|
sta result + 2
|
||||||
sta result + 3
|
sta result + 3
|
||||||
|
|
||||||
imul8 inter, arg1, arg2
|
|
||||||
add16 result, result, inter
|
|
||||||
|
|
||||||
imul8 inter, arg1 + 1, arg2
|
imul8 inter, arg1 + 1, arg2
|
||||||
add16 result + 1, result + 1, inter
|
add16 result + 1, result + 1, inter
|
||||||
|
add_carry result + 3
|
||||||
|
|
||||||
imul8 inter, arg1, arg2 + 1
|
imul8 inter, arg1, arg2 + 1
|
||||||
add16 result + 1, result + 1, inter
|
add16 result + 1, result + 1, inter
|
||||||
|
@ -553,11 +447,16 @@ positive_result:
|
||||||
imul8 inter, arg1 + 1, arg2 + 1
|
imul8 inter, arg1 + 1, arg2 + 1
|
||||||
add16 result + 2, result + 2, inter
|
add16 result + 2, result + 2, inter
|
||||||
|
|
||||||
; In case of mixed input signs, return a negative result.
|
; In case of negative inputs, adjust high word
|
||||||
cpy #1 ; 2 cyc
|
; https://stackoverflow.com/a/28827013
|
||||||
bne positive_result ; 2 cyc
|
lda arg1 + 1
|
||||||
neg32 result ; 34 cyc
|
bpl arg1_pos
|
||||||
positive_result:
|
sub16 result + 2, result + 2, arg2
|
||||||
|
arg1_pos:
|
||||||
|
lda arg2 + 1
|
||||||
|
bpl arg2_pos
|
||||||
|
sub16 result + 2, result + 2, arg1
|
||||||
|
arg2_pos:
|
||||||
|
|
||||||
rts ; 6 cyc
|
rts ; 6 cyc
|
||||||
.endproc
|
.endproc
|
||||||
|
|
|
@ -37,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
|
||||||
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
|
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
|
||||||
|
|
||||||
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
|
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
|
||||||
|
(done)
|
||||||
|
|
||||||
## Deps and build instructions
|
## Deps and build instructions
|
||||||
|
|
||||||
|
|
11
tables.js
11
tables.js
|
@ -11,6 +11,11 @@ function db(func) {
|
||||||
return lines.join('\n');
|
return lines.join('\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let squares = [];
|
||||||
|
for (let i = 0; i < 512; i++) {
|
||||||
|
squares.push(Math.trunc((i * i + 1) / 2));
|
||||||
|
}
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`.segment "TABLES"
|
`.segment "TABLES"
|
||||||
|
|
||||||
|
@ -20,14 +25,14 @@ console.log(
|
||||||
|
|
||||||
.align 256
|
.align 256
|
||||||
mul_lobyte256:
|
mul_lobyte256:
|
||||||
${db((x) => Math.round(x * x / 2) & 0xff)}
|
${db((i) => squares[i] & 0xff)}
|
||||||
|
|
||||||
.align 256
|
.align 256
|
||||||
mul_hibyte256:
|
mul_hibyte256:
|
||||||
${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
|
${db((i) => (squares[i] >> 8) & 0xff)}
|
||||||
|
|
||||||
.align 256
|
.align 256
|
||||||
mul_hibyte512:
|
mul_hibyte512:
|
||||||
${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
|
${db((i) => (squares[i + 256] >> 8) & 0xff)}
|
||||||
|
|
||||||
`);
|
`);
|
||||||
|
|
Loading…
Reference in a new issue