Merge branch 'fastmul' into fastmul2
This commit is contained in:
commit
97948dc814
5 changed files with 196 additions and 15 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,3 +1,4 @@
|
|||
*.o
|
||||
*.xex
|
||||
tables.s
|
||||
.DS_Store
|
||||
|
|
8
Makefile
8
Makefile
|
@ -2,13 +2,17 @@
|
|||
|
||||
all : mandel.xex
|
||||
|
||||
%.xex : %.o
|
||||
ld65 -C atari-asm-xex.cfg -o $@ $<
|
||||
mandel.xex : mandel.o tables.o
|
||||
ld65 -C ./atari-asm-xex.cfg -o $@ $+
|
||||
|
||||
%.o : %.s
|
||||
ca65 -o $@ $<
|
||||
|
||||
tables.s : tables.js
|
||||
node tables.js > tables.s
|
||||
|
||||
clean :
|
||||
rm -f tables.s
|
||||
rm -f *.o
|
||||
rm -f *.xex
|
||||
|
||||
|
|
128
mandel.s
128
mandel.s
|
@ -25,14 +25,15 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
|
|||
z_buffer_start = $b1 ; u8: index into z_buffer
|
||||
z_buffer_end = $b2 ; u8: index into z_buffer
|
||||
temp = $b4 ; u16
|
||||
temp2 = $b6 ; u16
|
||||
|
||||
pixel_ptr = $b6 ; u16
|
||||
pixel_color = $b8 ; u8
|
||||
pixel_mask = $b9 ; u8
|
||||
pixel_shift = $ba ; u8
|
||||
pixel_offset = $bb ; u8
|
||||
fill_level = $bc ; u8
|
||||
palette_offset = $bd ; u8
|
||||
pixel_ptr = $b8 ; u16
|
||||
pixel_color = $ba ; u8
|
||||
pixel_mask = $bb ; u8
|
||||
pixel_shift = $bc ; u8
|
||||
pixel_offset = $bd ; u8
|
||||
fill_level = $be ; u8
|
||||
palette_offset = $bf ; u8
|
||||
|
||||
; FP registers in zero page
|
||||
FR0 = $d4 ; float48
|
||||
|
@ -107,6 +108,10 @@ KEY_RIGHT = $87
|
|||
mantissa .byte 6
|
||||
.endstruct
|
||||
|
||||
.import mul_lobyte256
|
||||
.import mul_hibyte256
|
||||
.import mul_hibyte512
|
||||
|
||||
.data
|
||||
|
||||
strings:
|
||||
|
@ -257,6 +262,12 @@ fill_masks:
|
|||
add 4, dest, arg2, dest
|
||||
.endmacro
|
||||
|
||||
.macro add_carry dest
|
||||
lda dest
|
||||
adc #0
|
||||
sta dest
|
||||
.endmacro
|
||||
|
||||
; 2 + 9 * byte cycles
|
||||
.macro sub bytes, dest, arg1, arg2
|
||||
sec ; 2 cyc
|
||||
|
@ -387,12 +398,12 @@ next:
|
|||
; 5 to 25 cycles
|
||||
.macro check_sign arg
|
||||
; Check sign bit and flip argument to postive,
|
||||
; keeping a count of sign bits in the X register.
|
||||
; keeping a count of sign bits in the Y register.
|
||||
.local positive
|
||||
lda arg + 1 ; 3 cyc
|
||||
bpl positive ; 2 cyc
|
||||
neg16 arg ; 18 cyc
|
||||
inx ; 2 cyc
|
||||
iny ; 2 cyc
|
||||
positive:
|
||||
.endmacro
|
||||
|
||||
|
@ -421,13 +432,13 @@ positive:
|
|||
|
||||
; min 470 cycles
|
||||
; max 780 cycles
|
||||
.proc imul16_func
|
||||
.proc imul16_func_orig
|
||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
||||
result = FR2 ; 32-bit result
|
||||
|
||||
ldx #0 ; 2 cyc
|
||||
; counts the number of sign bits in X
|
||||
ldy #0 ; 2 cyc
|
||||
; counts the number of sign bits in Y
|
||||
check_sign arg1 ; 5 to 25 cyc
|
||||
check_sign arg2 ; 5 to 25 cyc
|
||||
|
||||
|
@ -447,7 +458,98 @@ positive:
|
|||
.endrepeat
|
||||
|
||||
; In case of mixed input signs, return a negative result.
|
||||
cpx #1 ; 2 cyc
|
||||
cpy #1 ; 2 cyc
|
||||
bne positive_result ; 2 cyc
|
||||
neg32 result ; 34 cyc
|
||||
positive_result:
|
||||
|
||||
rts ; 6 cyc
|
||||
.endproc
|
||||
|
||||
; Adapted from https://everything2.com/title/Fast+6502+multiplication
|
||||
.macro imul8 dest, arg1, arg2
|
||||
.local under256
|
||||
.local next
|
||||
.local small_product
|
||||
.scope
|
||||
mul_factor_a = arg1
|
||||
mul_factor_x = arg2
|
||||
mul_product_lo = dest
|
||||
mul_product_hi = dest + 1
|
||||
|
||||
lda mul_factor_a ; setup: 6 cycles
|
||||
;ldx mul_factor_x
|
||||
|
||||
clc ; (a + x)^2/2: 23 cycles
|
||||
adc mul_factor_x
|
||||
tax
|
||||
bcc under256
|
||||
lda mul_hibyte512,x
|
||||
bcs next
|
||||
under256:
|
||||
lda mul_hibyte256,x
|
||||
sec
|
||||
next:
|
||||
sta mul_product_hi
|
||||
lda mul_lobyte256,x
|
||||
|
||||
ldx mul_factor_a ; - a^2/2: 20 cycles
|
||||
sbc mul_lobyte256,x
|
||||
sta mul_product_lo
|
||||
lda mul_product_hi
|
||||
sbc mul_hibyte256,x
|
||||
sta mul_product_hi
|
||||
|
||||
ldx mul_factor_x ; + x & a & 1: 22 cycles
|
||||
txa ; (this is a kludge to correct a
|
||||
and mul_factor_a ; roundoff error that makes odd * odd too low)
|
||||
and #1
|
||||
|
||||
clc
|
||||
adc mul_product_lo
|
||||
bcc small_product
|
||||
inc mul_product_hi
|
||||
small_product:
|
||||
sec ; - x^2/2: 25 cycles
|
||||
sbc mul_lobyte256,x
|
||||
lda mul_product_hi
|
||||
sbc mul_hibyte256,x
|
||||
sta mul_product_hi
|
||||
.endscope
|
||||
.endmacro
|
||||
|
||||
.proc imul16_func
|
||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
||||
result = FR2 ; 32-bit result
|
||||
inter = temp2
|
||||
|
||||
ldy #0 ; 2 cyc
|
||||
; counts the number of sign bits in Y
|
||||
check_sign arg1 ; 5 to 25 cyc
|
||||
check_sign arg2 ; 5 to 25 cyc
|
||||
|
||||
lda #0
|
||||
sta result + 0
|
||||
sta result + 1
|
||||
sta result + 2
|
||||
sta result + 3
|
||||
|
||||
imul8 inter, arg1, arg2
|
||||
add16 result, result, inter
|
||||
|
||||
imul8 inter, arg1 + 1, arg2
|
||||
add16 result + 1, result + 1, inter
|
||||
|
||||
imul8 inter, arg1, arg2 + 1
|
||||
add16 result + 1, result + 1, inter
|
||||
add_carry result + 3
|
||||
|
||||
imul8 inter, arg1 + 1, arg2 + 1
|
||||
add16 result + 2, result + 2, inter
|
||||
|
||||
; In case of mixed input signs, return a negative result.
|
||||
cpy #1 ; 2 cyc
|
||||
bne positive_result ; 2 cyc
|
||||
neg32 result ; 34 cyc
|
||||
positive_result:
|
||||
|
|
33
tables.js
Normal file
33
tables.js
Normal file
|
@ -0,0 +1,33 @@
|
|||
function db(func) {
|
||||
let lines = [];
|
||||
for (let i = 0; i < 256; i += 16) {
|
||||
let items = [];
|
||||
for (let j = 0; j < 16; j++) {
|
||||
let x = i + j;
|
||||
items.push(func(x));
|
||||
}
|
||||
lines.push(' .byte ' + items.join(', '));
|
||||
}
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
console.log(
|
||||
`.segment "TABLES"
|
||||
|
||||
.export mul_lobyte256
|
||||
.export mul_hibyte256
|
||||
.export mul_hibyte512
|
||||
|
||||
.align 256
|
||||
mul_lobyte256:
|
||||
${db((x) => Math.round(x * x / 2) & 0xff)}
|
||||
|
||||
.align 256
|
||||
mul_hibyte256:
|
||||
${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
|
||||
|
||||
.align 256
|
||||
mul_hibyte512:
|
||||
${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
|
||||
|
||||
`);
|
41
testme.js
Normal file
41
testme.js
Normal file
|
@ -0,0 +1,41 @@
|
|||
// ax = (a + x)2/2 - a2/2 - x2/2
|
||||
|
||||
function half_square(x) {
|
||||
return Math.round(x * x / 2) & 0xffff >>> 0;
|
||||
}
|
||||
|
||||
function mul8(a, b) {
|
||||
let result = half_square(a + b) & 0xffff;
|
||||
result = (result - half_square(a)) & 0xffff;
|
||||
result = (result - half_square(b)) & 0xffff;
|
||||
result = (result + (b & a & 1)) & 0xffff;
|
||||
return result >>> 0;
|
||||
}
|
||||
|
||||
function mul16(a, b) {
|
||||
let ah = (a & 0xff00) >>> 8;
|
||||
let al = (a & 0x00ff) >>> 0;
|
||||
let bh = (b & 0xff00) >>> 8;
|
||||
let bl = (b & 0x00ff) >>> 0;
|
||||
let result = (mul8(al, bl) & 0xffff) >>> 0;
|
||||
result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
|
||||
result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
|
||||
result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
|
||||
return result;
|
||||
}
|
||||
|
||||
let max = 65536;
|
||||
//let max = 256;
|
||||
//let max = 128;
|
||||
//let max = 8;
|
||||
|
||||
for (let a = 0; a < max; a++) {
|
||||
for (let b = 0; b < max; b++) {
|
||||
let expected = Math.imul(a, b) >>> 0;
|
||||
//let actual = mul8(a, b);
|
||||
let actual = mul16(a, b);
|
||||
if (expected !== actual) {
|
||||
console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue