Compare commits
No commits in common. "7f2bc43cff173e7dffd9a5629bb9bcb56f374259" and "29630c88872c8b22a2b357983f1cd0fc86bb197d" have entirely different histories.
7f2bc43cff
...
29630c8887
6 changed files with 81 additions and 184 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,4 +1,3 @@
|
||||||
*.o
|
*.o
|
||||||
*.xex
|
*.xex
|
||||||
tables.s
|
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
8
Makefile
8
Makefile
|
@ -2,17 +2,13 @@
|
||||||
|
|
||||||
all : mandel.xex
|
all : mandel.xex
|
||||||
|
|
||||||
mandel.xex : mandel.o tables.o
|
%.xex : %.o
|
||||||
ld65 -C ./atari-asm-xex.cfg -o $@ $+
|
ld65 -C atari-asm-xex.cfg -o $@ $<
|
||||||
|
|
||||||
%.o : %.s
|
%.o : %.s
|
||||||
ca65 -o $@ $<
|
ca65 -o $@ $<
|
||||||
|
|
||||||
tables.s : tables.js
|
|
||||||
node tables.js > tables.s
|
|
||||||
|
|
||||||
clean :
|
clean :
|
||||||
rm -f tables.s
|
|
||||||
rm -f *.o
|
rm -f *.o
|
||||||
rm -f *.xex
|
rm -f *.xex
|
||||||
|
|
||||||
|
|
174
mandel.s
174
mandel.s
|
@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
|
||||||
z_buffer_start = $b1 ; u8: index into z_buffer
|
z_buffer_start = $b1 ; u8: index into z_buffer
|
||||||
z_buffer_end = $b2 ; u8: index into z_buffer
|
z_buffer_end = $b2 ; u8: index into z_buffer
|
||||||
temp = $b4 ; u16
|
temp = $b4 ; u16
|
||||||
temp2 = $b6 ; u16
|
|
||||||
pixel_ptr = $b8 ; u16
|
pixel_ptr = $b6 ; u16
|
||||||
pixel_color = $ba ; u8
|
pixel_color = $b8 ; u8
|
||||||
pixel_mask = $bb ; u8
|
pixel_mask = $b9 ; u8
|
||||||
pixel_shift = $bc ; u8
|
pixel_shift = $ba ; u8
|
||||||
pixel_offset = $bd ; u8
|
pixel_offset = $bb ; u8
|
||||||
fill_level = $be ; u8
|
fill_level = $bc ; u8
|
||||||
palette_offset = $bf ; u8
|
palette_offset = $bd ; u8
|
||||||
|
|
||||||
; FP registers in zero page
|
; FP registers in zero page
|
||||||
FR0 = $d4 ; float48
|
FR0 = $d4 ; float48
|
||||||
|
@ -107,10 +107,6 @@ KEY_RIGHT = $87
|
||||||
mantissa .byte 6
|
mantissa .byte 6
|
||||||
.endstruct
|
.endstruct
|
||||||
|
|
||||||
.import mul_lobyte256
|
|
||||||
.import mul_hibyte256
|
|
||||||
.import mul_hibyte512
|
|
||||||
|
|
||||||
.data
|
.data
|
||||||
|
|
||||||
strings:
|
strings:
|
||||||
|
@ -261,12 +257,6 @@ fill_masks:
|
||||||
add 4, dest, arg2, dest
|
add 4, dest, arg2, dest
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
.macro add_carry dest
|
|
||||||
lda dest
|
|
||||||
adc #0
|
|
||||||
sta dest
|
|
||||||
.endmacro
|
|
||||||
|
|
||||||
; 2 + 9 * byte cycles
|
; 2 + 9 * byte cycles
|
||||||
.macro sub bytes, dest, arg1, arg2
|
.macro sub bytes, dest, arg1, arg2
|
||||||
sec ; 2 cyc
|
sec ; 2 cyc
|
||||||
|
@ -344,15 +334,65 @@ fill_masks:
|
||||||
neg 4, arg
|
neg 4, arg
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; inner loop for imul16
|
||||||
|
; bitnum < 8: 25 or 41 cycles
|
||||||
|
; bitnum >= 8: 30 or 46 cycles
|
||||||
|
.macro bitmul16 arg1, arg2, result, bitnum
|
||||||
|
.local zero
|
||||||
|
.local one
|
||||||
|
.local next
|
||||||
|
|
||||||
|
; does 16-bit adds
|
||||||
|
; arg1 and arg2 are treated as unsigned
|
||||||
|
; negative signed inputs must be flipped first
|
||||||
|
|
||||||
|
; 7 cycles up to the branch
|
||||||
|
|
||||||
|
; check if arg1 has 0 or 1 bit in this place
|
||||||
|
; 5 cycles either way
|
||||||
|
.if bitnum < 8
|
||||||
|
lda arg1 ; 3 cyc
|
||||||
|
and #(1 << (bitnum)) ; 2 cyc
|
||||||
|
.else
|
||||||
|
lda arg1 + 1 ; 3 cyc
|
||||||
|
and #(1 << ((bitnum) - 8)) ; 2 cyc
|
||||||
|
.endif
|
||||||
|
bne one ; 2 cyc
|
||||||
|
|
||||||
|
zero: ; 18 cyc, 23 cyc
|
||||||
|
lsr result + 3 ; 5 cyc
|
||||||
|
jmp next ; 3 cyc
|
||||||
|
|
||||||
|
one: ; 32 cyc, 37 cyc
|
||||||
|
; 16-bit add on the top bits
|
||||||
|
clc ; 2 cyc
|
||||||
|
lda result + 2 ; 3 cyc
|
||||||
|
adc arg2 ; 3 cyc
|
||||||
|
sta result + 2 ; 3 cyc
|
||||||
|
lda result + 3 ; 3 cyc
|
||||||
|
adc arg2 + 1 ; 3 cyc
|
||||||
|
ror a ; 2 cyc - get a jump on the shift
|
||||||
|
sta result + 3 ; 3 cyc
|
||||||
|
next:
|
||||||
|
ror result + 2 ; 5 cyc
|
||||||
|
ror result + 1 ; 5 cyc
|
||||||
|
.if bitnum >= 8
|
||||||
|
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
|
||||||
|
; when it's all uninitialized data
|
||||||
|
ror result ; 5 cyc
|
||||||
|
.endif
|
||||||
|
|
||||||
|
.endmacro
|
||||||
|
|
||||||
; 5 to 25 cycles
|
; 5 to 25 cycles
|
||||||
.macro check_sign arg
|
.macro check_sign arg
|
||||||
; Check sign bit and flip argument to positive,
|
; Check sign bit and flip argument to positive,
|
||||||
; keeping a count of sign bits in the Y register.
|
; keeping a count of sign bits in the X register.
|
||||||
.local positive
|
.local positive
|
||||||
lda arg + 1 ; 3 cyc
|
lda arg + 1 ; 3 cyc
|
||||||
bpl positive ; 2 cyc
|
bpl positive ; 2 cyc
|
||||||
neg16 arg ; 18 cyc
|
neg16 arg ; 18 cyc
|
||||||
iny ; 2 cyc
|
inx ; 2 cyc
|
||||||
positive:
|
positive:
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
@ -379,93 +419,35 @@ positive:
|
||||||
copy16 dest, FR2 + 2 ; 12 cyc
|
copy16 dest, FR2 + 2 ; 12 cyc
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; Adapted from https://everything2.com/title/Fast+6502+multiplication
|
; min 470 cycles
|
||||||
.macro imul8 dest, arg1, arg2
|
; max 780 cycles
|
||||||
.local under256
|
|
||||||
.local next
|
|
||||||
.local small_product
|
|
||||||
.scope
|
|
||||||
mul_factor_a = arg1
|
|
||||||
mul_factor_x = arg2
|
|
||||||
mul_product_lo = dest
|
|
||||||
mul_product_hi = dest + 1
|
|
||||||
|
|
||||||
lda mul_factor_a ; setup: 6 cycles
|
|
||||||
;ldx mul_factor_x
|
|
||||||
|
|
||||||
clc ; (a + x)^2/2: 23 cycles
|
|
||||||
adc mul_factor_x
|
|
||||||
tax
|
|
||||||
bcc under256
|
|
||||||
lda mul_hibyte512,x
|
|
||||||
bcs next
|
|
||||||
under256:
|
|
||||||
lda mul_hibyte256,x
|
|
||||||
sec
|
|
||||||
next:
|
|
||||||
sta mul_product_hi
|
|
||||||
lda mul_lobyte256,x
|
|
||||||
|
|
||||||
ldx mul_factor_a ; - a^2/2: 20 cycles
|
|
||||||
sbc mul_lobyte256,x
|
|
||||||
sta mul_product_lo
|
|
||||||
lda mul_product_hi
|
|
||||||
sbc mul_hibyte256,x
|
|
||||||
sta mul_product_hi
|
|
||||||
|
|
||||||
ldx mul_factor_x ; + x & a & 1: 22 cycles
|
|
||||||
txa ; (this is a kludge to correct a
|
|
||||||
and mul_factor_a ; roundoff error that makes odd * odd too low)
|
|
||||||
and #1
|
|
||||||
|
|
||||||
clc
|
|
||||||
adc mul_product_lo
|
|
||||||
bcc small_product
|
|
||||||
inc mul_product_hi
|
|
||||||
small_product:
|
|
||||||
sec ; - x^2/2: 25 cycles
|
|
||||||
sbc mul_lobyte256,x
|
|
||||||
sta mul_product_lo
|
|
||||||
lda mul_product_hi
|
|
||||||
sbc mul_hibyte256,x
|
|
||||||
sta mul_product_hi
|
|
||||||
.endscope
|
|
||||||
.endmacro
|
|
||||||
|
|
||||||
.proc imul16_func
|
.proc imul16_func
|
||||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
arg1 = FR0 ; 16-bit arg (clobbered)
|
||||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
arg2 = FR1 ; 16-bit arg (clobbered)
|
||||||
result = FR2 ; 32-bit result
|
result = FR2 ; 32-bit result
|
||||||
inter = temp2
|
|
||||||
|
|
||||||
ldy #0 ; 2 cyc
|
ldx #0 ; 2 cyc
|
||||||
; counts the number of sign bits in Y
|
; counts the number of sign bits in X
|
||||||
check_sign arg1 ; 5 to 25 cyc
|
check_sign arg1 ; 5 to 25 cyc
|
||||||
check_sign arg2 ; 5 to 25 cyc
|
check_sign arg2 ; 5 to 25 cyc
|
||||||
|
|
||||||
; h1l1 * h2l2
|
; zero out the 32-bit temp's top 16 bits
|
||||||
; (h1*256 + l1) * (h2*256 + l2)
|
lda #0 ; 2 cyc
|
||||||
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
sta result + 2 ; 3 cyc
|
||||||
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
sta result + 3 ; 3 cyc
|
||||||
|
; the bottom two bytes will get cleared by the shifts
|
||||||
|
|
||||||
imul8 result, arg1, arg2
|
; unrolled loop for maximum speed, at the cost
|
||||||
lda #0
|
; of a larger routine
|
||||||
sta result + 2
|
; 440 to 696 cycles
|
||||||
sta result + 3
|
.repeat 16, bitnum
|
||||||
|
; bitnum < 8: 25 or 41 cycles
|
||||||
imul8 inter, arg1 + 1, arg2
|
; bitnum >= 8: 30 or 46 cycles
|
||||||
add16 result + 1, result + 1, inter
|
bitmul16 arg1, arg2, result, bitnum
|
||||||
add_carry result + 3
|
.endrepeat
|
||||||
|
|
||||||
imul8 inter, arg1, arg2 + 1
|
|
||||||
add16 result + 1, result + 1, inter
|
|
||||||
add_carry result + 3
|
|
||||||
|
|
||||||
imul8 inter, arg1 + 1, arg2 + 1
|
|
||||||
add16 result + 2, result + 2, inter
|
|
||||||
|
|
||||||
; In case of mixed input signs, return a negative result.
|
; In case of mixed input signs, return a negative result.
|
||||||
cpy #1 ; 2 cyc
|
cpx #1 ; 2 cyc
|
||||||
bne positive_result ; 2 cyc
|
bne positive_result ; 2 cyc
|
||||||
neg32 result ; 34 cyc
|
neg32 result ; 34 cyc
|
||||||
positive_result:
|
positive_result:
|
||||||
|
|
|
@ -37,7 +37,6 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
|
||||||
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
|
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
|
||||||
|
|
||||||
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
|
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
|
||||||
(done)
|
|
||||||
|
|
||||||
## Deps and build instructions
|
## Deps and build instructions
|
||||||
|
|
||||||
|
|
38
tables.js
38
tables.js
|
@ -1,38 +0,0 @@
|
||||||
function db(func) {
|
|
||||||
let lines = [];
|
|
||||||
for (let i = 0; i < 256; i += 16) {
|
|
||||||
let items = [];
|
|
||||||
for (let j = 0; j < 16; j++) {
|
|
||||||
let x = i + j;
|
|
||||||
items.push(func(x));
|
|
||||||
}
|
|
||||||
lines.push(' .byte ' + items.join(', '));
|
|
||||||
}
|
|
||||||
return lines.join('\n');
|
|
||||||
}
|
|
||||||
|
|
||||||
let squares = [];
|
|
||||||
for (let i = 0; i < 512; i++) {
|
|
||||||
squares.push(Math.trunc((i * i + 1) / 2));
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(
|
|
||||||
`.segment "TABLES"
|
|
||||||
|
|
||||||
.export mul_lobyte256
|
|
||||||
.export mul_hibyte256
|
|
||||||
.export mul_hibyte512
|
|
||||||
|
|
||||||
.align 256
|
|
||||||
mul_lobyte256:
|
|
||||||
${db((i) => squares[i] & 0xff)}
|
|
||||||
|
|
||||||
.align 256
|
|
||||||
mul_hibyte256:
|
|
||||||
${db((i) => (squares[i] >> 8) & 0xff)}
|
|
||||||
|
|
||||||
.align 256
|
|
||||||
mul_hibyte512:
|
|
||||||
${db((i) => (squares[i + 256] >> 8) & 0xff)}
|
|
||||||
|
|
||||||
`);
|
|
41
testme.js
41
testme.js
|
@ -1,41 +0,0 @@
|
||||||
// ax = (a + x)^2/2 - a^2/2 - x^2/2
|
|
||||||
|
|
||||||
function half_square(x) {
|
|
||||||
return Math.round(x * x / 2) & 0xffff >>> 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
function mul8(a, b) {
|
|
||||||
let result = half_square(a + b) & 0xffff;
|
|
||||||
result = (result - half_square(a)) & 0xffff;
|
|
||||||
result = (result - half_square(b)) & 0xffff;
|
|
||||||
result = (result + (b & a & 1)) & 0xffff;
|
|
||||||
return result >>> 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
function mul16(a, b) {
|
|
||||||
let ah = (a & 0xff00) >>> 8;
|
|
||||||
let al = (a & 0x00ff) >>> 0;
|
|
||||||
let bh = (b & 0xff00) >>> 8;
|
|
||||||
let bl = (b & 0x00ff) >>> 0;
|
|
||||||
let result = (mul8(al, bl) & 0xffff) >>> 0;
|
|
||||||
result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
|
|
||||||
result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
|
|
||||||
result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
let max = 65536;
|
|
||||||
//let max = 256;
|
|
||||||
//let max = 128;
|
|
||||||
//let max = 8;
|
|
||||||
|
|
||||||
for (let a = 0; a < max; a++) {
|
|
||||||
for (let b = 0; b < max; b++) {
|
|
||||||
let expected = Math.imul(a, b) >>> 0;
|
|
||||||
//let actual = mul8(a, b);
|
|
||||||
let actual = mul16(a, b);
|
|
||||||
if (expected !== actual) {
|
|
||||||
console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in a new issue