Compare commits

...

3 commits

Author SHA1 Message Date
05133aabdd slightly faster handling of signed mul
previously we were flipping the inputs if negative, and then the
output if both inputs were negative

turns out you can just treat the whole thing as an unsigned mul
and then subtract each term from the high word if the other term
is negative.

https://stackoverflow.com/a/28827013

this saves a handful of cycles, reducing our runtime to a mere
14.211 ms/px \o/
2024-12-15 20:17:45 -08:00
7f2bc43cff squares 2024-12-14 18:56:26 -08:00
5637783529 Faster imul16 routine
Improves runtime from 16.24 ms/px to 14.44 ms/px

This uses a routine found on Everything2:
https://everything2.com/title/Fast+6502+multiplication

which uses a lookup table of squares to do 8-bit imuls,
which are then composed into a 16-bit imul
2024-12-14 18:53:31 -08:00
6 changed files with 188 additions and 97 deletions

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
*.o *.o
*.xex *.xex
tables.s
.DS_Store .DS_Store

View file

@ -2,13 +2,17 @@
all : mandel.xex all : mandel.xex
%.xex : %.o mandel.xex : mandel.o tables.o
ld65 -C atari-asm-xex.cfg -o $@ $< ld65 -C ./atari-asm-xex.cfg -o $@ $+
%.o : %.s %.o : %.s
ca65 -o $@ $< ca65 -o $@ $<
tables.s : tables.js
node tables.js > tables.s
clean : clean :
rm -f tables.s
rm -f *.o rm -f *.o
rm -f *.xex rm -f *.xex

194
mandel.s
View file

@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $b1 ; u8: index into z_buffer z_buffer_start = $b1 ; u8: index into z_buffer
z_buffer_end = $b2 ; u8: index into z_buffer z_buffer_end = $b2 ; u8: index into z_buffer
temp = $b4 ; u16 temp = $b4 ; u16
temp2 = $b6 ; u16
pixel_ptr = $b6 ; u16 pixel_ptr = $b8 ; u16
pixel_color = $b8 ; u8 pixel_color = $ba ; u8
pixel_mask = $b9 ; u8 pixel_mask = $bb ; u8
pixel_shift = $ba ; u8 pixel_shift = $bc ; u8
pixel_offset = $bb ; u8 pixel_offset = $bd ; u8
fill_level = $bc ; u8 fill_level = $be ; u8
palette_offset = $bd ; u8 palette_offset = $bf ; u8
; FP registers in zero page ; FP registers in zero page
FR0 = $d4 ; float48 FR0 = $d4 ; float48
@ -107,6 +107,10 @@ KEY_RIGHT = $87
mantissa .byte 6 mantissa .byte 6
.endstruct .endstruct
.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512
.data .data
strings: strings:
@ -257,6 +261,12 @@ fill_masks:
add 4, dest, arg2, dest add 4, dest, arg2, dest
.endmacro .endmacro
.macro add_carry dest
lda dest
adc #0
sta dest
.endmacro
; 2 + 9 * byte cycles ; 2 + 9 * byte cycles
.macro sub bytes, dest, arg1, arg2 .macro sub bytes, dest, arg1, arg2
sec ; 2 cyc sec ; 2 cyc
@ -334,68 +344,6 @@ fill_masks:
neg 4, arg neg 4, arg
.endmacro .endmacro
; inner loop for imul16
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
.local zero
.local one
.local next
; does 16-bit adds
; arg1 and arg2 are treated as unsigned
; negative signed inputs must be flipped first
; 7 cycles up to the branch
; check if arg1 has 0 or 1 bit in this place
; 5 cycles either way
.if bitnum < 8
lda arg1 ; 3 cyc
and #(1 << (bitnum)) ; 2 cyc
.else
lda arg1 + 1 ; 3 cyc
and #(1 << ((bitnum) - 8)) ; 2 cyc
.endif
bne one ; 2 cyc
zero: ; 18 cyc, 23 cyc
lsr result + 3 ; 5 cyc
jmp next ; 3 cyc
one: ; 32 cyc, 37 cyc
; 16-bit add on the top bits
clc ; 2 cyc
lda result + 2 ; 3 cyc
adc arg2 ; 3 cyc
sta result + 2 ; 3 cyc
lda result + 3 ; 3 cyc
adc arg2 + 1 ; 3 cyc
ror a ; 2 cyc - get a jump on the shift
sta result + 3 ; 3 cyc
next:
ror result + 2 ; 5 cyc
ror result + 1 ; 5 cyc
.if bitnum >= 8
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
; when it's all uninitialized data
ror result ; 5 cyc
.endif
.endmacro
; 5 to 25 cycles
.macro check_sign arg
; Check sign bit and flip argument to positive,
; keeping a count of sign bits in the X register.
.local positive
lda arg + 1 ; 3 cyc
bpl positive ; 2 cyc
neg16 arg ; 18 cyc
inx ; 2 cyc
positive:
.endmacro
; 518 - 828 cyc ; 518 - 828 cyc
.macro imul16 dest, arg1, arg2 .macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc copy16 FR0, arg1 ; 12 cyc
@ -419,38 +367,96 @@ positive:
copy16 dest, FR2 + 2 ; 12 cyc copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; min 470 cycles ; Adapted from https://everything2.com/title/Fast+6502+multiplication
; max 780 cycles .macro imul8 dest, arg1, arg2
.local under256
.local next
.local small_product
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; setup: 6 cycles
;ldx mul_factor_x
clc ; (a + x)^2/2: 23 cycles
adc mul_factor_x
tax
bcc under256
lda mul_hibyte512,x
bcs next
under256:
lda mul_hibyte256,x
sec
next:
sta mul_product_hi
lda mul_lobyte256,x
ldx mul_factor_a ; - a^2/2: 20 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
ldx mul_factor_x ; + x & a & 1: 22 cycles
txa ; (this is a kludge to correct a
and mul_factor_a ; roundoff error that makes odd * odd too low)
and #1
clc
adc mul_product_lo
bcc small_product
inc mul_product_hi
small_product:
sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
.endscope
.endmacro
.proc imul16_func .proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered) arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result result = FR2 ; 32-bit result
inter = temp2
ldx #0 ; 2 cyc ; h1l1 * h2l2
; counts the number of sign bits in X ; (h1*256 + l1) * (h2*256 + l2)
check_sign arg1 ; 5 to 25 cyc ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
check_sign arg2 ; 5 to 25 cyc ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
; zero out the 32-bit temp's top 16 bits imul8 result, arg1, arg2
lda #0 ; 2 cyc lda #0
sta result + 2 ; 3 cyc sta result + 2
sta result + 3 ; 3 cyc sta result + 3
; the bottom two bytes will get cleared by the shifts
; unrolled loop for maximum speed, at the cost imul8 inter, arg1 + 1, arg2
; of a larger routine add16 result + 1, result + 1, inter
; 440 to 696 cycles add_carry result + 3
.repeat 16, bitnum
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
bitmul16 arg1, arg2, result, bitnum
.endrepeat
; In case of mixed input signs, return a negative result. imul8 inter, arg1, arg2 + 1
cpx #1 ; 2 cyc add16 result + 1, result + 1, inter
bne positive_result ; 2 cyc add_carry result + 3
neg32 result ; 34 cyc
positive_result: imul8 inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc rts ; 6 cyc
.endproc .endproc

View file

@ -37,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
(done)
## Deps and build instructions ## Deps and build instructions

38
tables.js Normal file
View file

@ -0,0 +1,38 @@
/**
 * Render a 256-entry table as assembler `.byte` directives, 16 values per line.
 *
 * @param {(index: number) => number} func - Called once for each index
 *   0..255, in ascending order; its return value is emitted at that position.
 * @returns {string} Sixteen `.byte` lines joined with newlines (no trailing newline).
 */
function db(func) {
  const ROW_WIDTH = 16;
  const ROW_COUNT = 256 / ROW_WIDTH;
  return Array.from({ length: ROW_COUNT }, (_, row) => {
    const base = row * ROW_WIDTH;
    const values = Array.from({ length: ROW_WIDTH }, (_, col) => func(base + col));
    return ' .byte ' + values.join(', ');
  }).join('\n');
}
// squares[i] = ceil(i * i / 2) for i in 0..511.
const squares = Array.from({ length: 512 }, (_, i) => Math.trunc((i * i + 1) / 2));

// Low byte of entries 0..255.
const lowBytes = db((i) => squares[i] & 0xff);
// Bits 8-15 of entries 0..255.
const highBytes = db((i) => (squares[i] >> 8) & 0xff);
// Bits 8-15 of entries 256..511 (anything above bit 15 is dropped by the mask).
const highBytesUpper = db((i) => (squares[i + 256] >> 8) & 0xff);

// Emit the assembler source for the three page-aligned lookup tables.
console.log(
`.segment "TABLES"
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.align 256
mul_lobyte256:
${lowBytes}
.align 256
mul_hibyte256:
${highBytes}
.align 256
mul_hibyte512:
${highBytesUpper}
`);

41
testme.js Normal file
View file

@ -0,0 +1,41 @@
// a*x = (a + x)^2/2 - a^2/2 - x^2/2

/**
 * Half of x squared, rounded to the nearest integer (exact halves round up),
 * then truncated to 16 bits.
 *
 * @param {number} x - Non-negative integer operand.
 * @returns {number} `Math.round(x * x / 2)` masked to the range 0..0xffff.
 */
function half_square(x) {
  // The mask already yields a non-negative value, so no `>>> 0` is needed.
  // (The former `& 0xffff >>> 0` parsed as `& (0xffff >>> 0)`, since `>>>`
  // binds tighter than `&`, making the shift a no-op on the constant.)
  return Math.round((x * x) / 2) & 0xffff;
}
/**
 * Multiply two small unsigned integers using only half-square values:
 * a*b = hs(a + b) - hs(a) - hs(b), plus a one-unit correction when both
 * operands are odd (each rounded-up half-square then over-subtracts by 1/2).
 *
 * @param {number} a - First operand.
 * @param {number} b - Second operand.
 * @returns {number} The result wrapped to 16 bits, as an unsigned value.
 */
function mul8(a, b) {
  const WORD_MASK = 0xffff;
  const sumTerm = half_square(a + b) & WORD_MASK;
  const aTerm = half_square(a);
  const bTerm = half_square(b);
  const oddCorrection = a & b & 1;
  // Wrapping once at the end is equivalent to wrapping after every step,
  // because all arithmetic here is modulo 2^16.
  return ((sumTerm - aTerm - bTerm + oddCorrection) & WORD_MASK) >>> 0;
}
/**
 * Multiply two 16-bit values by composing four 8-bit partial products:
 * (ah*256 + al) * (bh*256 + bl)
 *   = al*bl + (ah*bl << 8) + (al*bh << 8) + (ah*bh << 16).
 *
 * @param {number} a - First operand; only its low 16 bits are used.
 * @param {number} b - Second operand; only its low 16 bits are used.
 * @returns {number} The product wrapped to 32 bits, as an unsigned value.
 */
function mul16(a, b) {
  const aHigh = (a >>> 8) & 0xff;
  const aLow = a & 0xff;
  const bHigh = (b >>> 8) & 0xff;
  const bLow = b & 0xff;

  // Each row: [partial product, left shift, mask applied after adding it].
  const steps = [
    [mul8(aLow, bLow), 0, 0xffff],
    [mul8(aHigh, bLow), 8, 0x00ffffff],
    [mul8(aLow, bHigh), 8, 0x01ffffff],
    [mul8(aHigh, bHigh), 16, 0xffffffff],
  ];

  return steps.reduce(
    (sum, [partial, shift, mask]) => ((sum + (partial << shift)) & mask) >>> 0,
    0,
  );
}
// Exhaustive check of mul16 against Math.imul over every operand pair
// below `limit`. Smaller limits (256, 128, 8) give a quicker run; to check
// mul8 on its own, swap the callee and use a limit of at most 256.
const limit = 65536;
for (let a = 0; a < limit; a++) {
  for (let b = 0; b < limit; b++) {
    const expected = Math.imul(a, b) >>> 0;
    const actual = mul16(a, b);
    if (expected === actual) {
      continue;
    }
    console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
  }
}