Faster imul16 routine

Improves runtime from 16.24 ms/px to 14.44 ms/px

This uses a routine found on Everything2:
https://everything2.com/title/Fast+6502+multiplication

which uses a lookup table of squares to do 8-bit imuls
that are then composed into a 16-bit imul.
Brooke Vibber authored and committed on 2023-02-11 12:24:48 -08:00
parent 29630c8887, commit 5637783529
5 changed files with 183 additions and 81 deletions
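
For reference, the table trick rests on the quarter-square identity a*x = ((a + x)^2 - a^2 - x^2) / 2. With half-squares rounded up for odd inputs (as tables.js generates them), the only correction needed is +1 when both factors are odd, which is the "kludge" noted in the new imul8 macro. A rough JavaScript sketch of a single 8-bit product, mirroring the mul8 check in testme.js below (halfsq and mul8_sketch are illustrative names, not part of the commit):

// halfsq[i] = round(i*i / 2), the same values tables.js splits into lo/hi byte tables
let halfsq = [];
for (let i = 0; i < 512; i++) {
    halfsq.push(Math.trunc((i * i + 1) / 2));
}

function mul8_sketch(a, x) {
    // a*x = halfsq(a+x) - halfsq(a) - halfsq(x), plus 1 when both a and x are odd
    return halfsq[a + x] - halfsq[a] - halfsq[x] + (a & x & 1);
}

// e.g. 7 * 5: halfsq[12]=72, halfsq[7]=25, halfsq[5]=13 -> 72 - 25 - 13 + 1 = 35
console.log(mul8_sketch(7, 5)); // 35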

.gitignore
@@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store

Makefile
@@ -2,13 +2,17 @@
 all : mandel.xex
 
-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 
 %.o : %.s
 	ca65 -o $@ $<
 
+tables.s : tables.js
+	node tables.js > tables.s
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex

mandel.s

@@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start = $b1 ; u8: index into z_buffer
 z_buffer_end = $b2 ; u8: index into z_buffer
 temp = $b4 ; u16
-pixel_ptr = $b6 ; u16
-pixel_color = $b8 ; u8
-pixel_mask = $b9 ; u8
-pixel_shift = $ba ; u8
-pixel_offset = $bb ; u8
-fill_level = $bc ; u8
-palette_offset = $bd ; u8
+temp2 = $b6 ; u16
+pixel_ptr = $b8 ; u16
+pixel_color = $ba ; u8
+pixel_mask = $bb ; u8
+pixel_shift = $bc ; u8
+pixel_offset = $bd ; u8
+fill_level = $be ; u8
+palette_offset = $bf ; u8
 
 ; FP registers in zero page
 FR0 = $d4 ; float48
@@ -107,6 +107,10 @@ KEY_RIGHT = $87
     mantissa .byte 6
 .endstruct
 
+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data
 strings:
@@ -257,6 +261,12 @@ fill_masks:
     add 4, dest, arg2, dest
 .endmacro
 
+.macro add_carry dest
+    lda dest
+    adc #0
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
     sec ; 2 cyc
@@ -334,65 +344,15 @@ fill_masks:
     neg 4, arg
 .endmacro
 
-; inner loop for imul16
-; bitnum < 8: 25 or 41 cycles
-; bitnum >= 8: 30 or 46 cycles
-.macro bitmul16 arg1, arg2, result, bitnum
-    .local zero
-    .local one
-    .local next
-    ; does 16-bit adds
-    ; arg1 and arg2 are treated as unsigned
-    ; negative signed inputs must be flipped first
-    ; 7 cycles up to the branch
-    ; check if arg1 has 0 or 1 bit in this place
-    ; 5 cycles either way
-    .if bitnum < 8
-        lda arg1 ; 3 cyc
-        and #(1 << (bitnum)) ; 2 cyc
-    .else
-        lda arg1 + 1 ; 3 cyc
-        and #(1 << ((bitnum) - 8)) ; 2 cyc
-    .endif
-    bne one ; 2 cyc
-zero: ; 18 cyc, 23 cyc
-    lsr result + 3 ; 5 cyc
-    jmp next ; 3 cyc
-one: ; 32 cyc, 37 cyc
-    ; 16-bit add on the top bits
-    clc ; 2 cyc
-    lda result + 2 ; 3 cyc
-    adc arg2 ; 3 cyc
-    sta result + 2 ; 3 cyc
-    lda result + 3 ; 3 cyc
-    adc arg2 + 1 ; 3 cyc
-    ror a ; 2 cyc - get a jump on the shift
-    sta result + 3 ; 3 cyc
-next:
-    ror result + 2 ; 5 cyc
-    ror result + 1 ; 5 cyc
-    .if bitnum >= 8
-        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
-        ; when it's all uninitialized data
-        ror result ; 5 cyc
-    .endif
-.endmacro
-
 ; 5 to 25 cycles
 .macro check_sign arg
     ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register.
     .local positive
     lda arg + 1 ; 3 cyc
     bpl positive ; 2 cyc
     neg16 arg ; 18 cyc
-    inx ; 2 cyc
+    iny ; 2 cyc
 positive:
 .endmacro
@@ -419,35 +379,93 @@ positive:
     copy16 dest, FR2 + 2 ; 12 cyc
 .endmacro
 
-; min 470 cycles
-; max 780 cycles
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2
+    .local under256
+    .local next
+    .local small_product
+.scope
+    mul_factor_a = arg1
+    mul_factor_x = arg2
+    mul_product_lo = dest
+    mul_product_hi = dest + 1
+
+    lda mul_factor_a ; setup: 6 cycles
+    ;ldx mul_factor_x
+
+    clc ; (a + x)^2/2: 23 cycles
+    adc mul_factor_x
+    tax
+    bcc under256
+    lda mul_hibyte512,x
+    bcs next
+under256:
+    lda mul_hibyte256,x
+    sec
+next:
+    sta mul_product_hi
+    lda mul_lobyte256,x
+
+    ldx mul_factor_a ; - a^2/2: 20 cycles
+    sbc mul_lobyte256,x
+    sta mul_product_lo
+    lda mul_product_hi
+    sbc mul_hibyte256,x
+    sta mul_product_hi
+
+    ldx mul_factor_x ; + x & a & 1: 22 cycles
+    txa              ; (this is a kludge to correct a
+    and mul_factor_a ; roundoff error that makes odd * odd too low)
+    and #1
+    clc
+    adc mul_product_lo
+    bcc small_product
+    inc mul_product_hi
+small_product:
+
+    sec ; - x^2/2: 25 cycles
+    sbc mul_lobyte256,x
+    sta mul_product_lo
+    lda mul_product_hi
+    sbc mul_hibyte256,x
+    sta mul_product_hi
+.endscope
+.endmacro
 
 .proc imul16_func
     arg1 = FR0 ; 16-bit arg (clobbered)
     arg2 = FR1 ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
+    inter = temp2
 
-    ldx #0 ; 2 cyc
-    ; counts the number of sign bits in X
+    ldy #0 ; 2 cyc
+    ; counts the number of sign bits in Y
     check_sign arg1 ; 5 to 25 cyc
     check_sign arg2 ; 5 to 25 cyc
 
-    ; zero out the 32-bit temp's top 16 bits
-    lda #0 ; 2 cyc
-    sta result + 2 ; 3 cyc
-    sta result + 3 ; 3 cyc
-    ; the bottom two bytes will get cleared by the shifts
-
-    ; unrolled loop for maximum speed, at the cost
-    ; of a larger routine
-    ; 440 to 696 cycles
-    .repeat 16, bitnum
-        ; bitnum < 8: 25 or 41 cycles
-        ; bitnum >= 8: 30 or 46 cycles
-        bitmul16 arg1, arg2, result, bitnum
-    .endrepeat
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+
+    imul8 result, arg1, arg2
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter
 
     ; In case of mixed input signs, return a negative result.
-    cpx #1 ; 2 cyc
+    cpy #1 ; 2 cyc
     bne positive_result ; 2 cyc
     neg32 result ; 34 cyc
 positive_result:
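
The rewritten imul16_func follows the expansion in its comments: writing arg1 = h1*256 + l1 and arg2 = h2*256 + l2, the product is l1*l2 + (h1*l2 + l1*h2)*256 + h1*h2*65536. Each imul8 leaves a 16-bit partial product in place; the two middle partials are added one byte up (add16 on result + 1, with add_carry rippling any carry into result + 3), and the high partial two bytes up. A small JavaScript sketch of that composition (imul16_sketch is an illustrative name; plain * stands in for the table-based imul8):

function imul16_sketch(a, b) {
    let h1 = a >>> 8, l1 = a & 0xff;
    let h2 = b >>> 8, l2 = b & 0xff;
    let result = l1 * l2;       // 16-bit partial in result bytes 0-1
    result += h1 * l2 * 256;    // added at byte offset 1; carries reach byte 3
    result += l1 * h2 * 256;
    result += h1 * h2 * 65536;  // added at byte offset 2
    return result >>> 0;
}

console.log(imul16_sketch(1234, 5678)); // 7006652, i.e. 1234 * 5678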

tables.js (new file)

@@ -0,0 +1,38 @@
function db(func) {
    let lines = [];
    for (let i = 0; i < 256; i += 16) {
        let items = [];
        for (let j = 0; j < 16; j++) {
            let x = i + j;
            items.push(func(x));
        }
        lines.push(' .byte ' + items.join(', '));
    }
    return lines.join('\n');
}

let squares = [];
for (let i = 0; i < 512; i++) {
    squares.push(Math.trunc((i * i + 1) / 2));
}

console.log(
`.segment "TABLES"
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.align 256
mul_lobyte256:
${db((i) => squares[i] & 0xff)}
.align 256
mul_hibyte256:
${db((i) => (squares[i] >> 8) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
`);
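
Why there is no mul_lobyte512 table: the stored half-square satisfies halfsq(i + 256) = halfsq(i) + 256*i + 32768, so crossing into the 256..511 range never changes the low byte. That is what lets imul8 index mul_lobyte256 with the wrapped 8-bit sum in both the carry and no-carry branches, while only the high byte needs the separate mul_hibyte512 table. A quick JavaScript check (halfsq here just means the rounded half-square values that tables.js stores):

let squares = [];
for (let i = 0; i < 512; i++) {
    squares.push(Math.trunc((i * i + 1) / 2));
}
for (let i = 0; i < 256; i++) {
    // low byte of the half-square is the same with or without the +256 offset
    if ((squares[i + 256] & 0xff) !== (squares[i] & 0xff)) {
        console.log(`low-byte mismatch at ${i}`);
    }
}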

testme.js (new file)

@@ -0,0 +1,41 @@
// ax = (a + x)^2/2 - a^2/2 - x^2/2
function half_square(x) {
    return Math.round(x * x / 2) & 0xffff >>> 0;
}

function mul8(a, b) {
    let result = half_square(a + b) & 0xffff;
    result = (result - half_square(a)) & 0xffff;
    result = (result - half_square(b)) & 0xffff;
    result = (result + (b & a & 1)) & 0xffff;
    return result >>> 0;
}

function mul16(a, b) {
    let ah = (a & 0xff00) >>> 8;
    let al = (a & 0x00ff) >>> 0;
    let bh = (b & 0xff00) >>> 8;
    let bl = (b & 0x00ff) >>> 0;
    let result = (mul8(al, bl) & 0xffff) >>> 0;
    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
    return result;
}

let max = 65536;
//let max = 256;
//let max = 128;
//let max = 8;
for (let a = 0; a < max; a++) {
    for (let b = 0; b < max; b++) {
        let expected = Math.imul(a, b) >>> 0;
        //let actual = mul8(a, b);
        let actual = mul16(a, b);
        if (expected !== actual) {
            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
        }
    }
}