diff --git a/.gitignore b/.gitignore index 8d2f7ce..771e47a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.xex +tables.s .DS_Store diff --git a/Makefile b/Makefile index 25148b4..008bf8c 100644 --- a/Makefile +++ b/Makefile @@ -2,13 +2,17 @@ all : mandel.xex -%.xex : %.o - ld65 -C atari-asm-xex.cfg -o $@ $< +mandel.xex : mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ $+ %.o : %.s ca65 -o $@ $< +tables.s : tables.js + node tables.js > tables.s + clean : + rm -f tables.s rm -f *.o rm -f *.xex diff --git a/mandel.s b/mandel.s index 097b700..71bc6c2 100644 --- a/mandel.s +++ b/mandel.s @@ -22,11 +22,12 @@ total_ms = $a4 ; float48 total_pixels = $aa ; float48 temp = $b0 ; u16 -pixel_ptr = $b2 ; u16 -pixel_color = $b4 ; u8 -pixel_mask = $b5 ; u8 -pixel_shift = $b6 ; u8 -pixel_offset = $b7 ; u8 +temp2 = $b2 ; u16 +pixel_ptr = $b4 ; u16 +pixel_color = $b6 ; u8 +pixel_mask = $b7 ; u8 +pixel_shift = $b8 ; u8 +pixel_offset = $b9 ; u8 ; FP registers in zero page @@ -83,6 +84,10 @@ SETVBV = $E45C mantissa .byte 6 .endstruct +.import mul_lobyte256 +.import mul_hibyte256 +.import mul_hibyte512 + .data strings: @@ -206,6 +211,12 @@ color_map: add 4, dest, arg2, dest .endmacro +.macro add_carry dest + lda dest + adc #0 + sta dest +.endmacro + ; 2 + 9 * byte cycles .macro sub bytes, dest, arg1, arg2 sec ; 2 cyc @@ -336,12 +347,12 @@ next: ; 5 to 25 cycles .macro check_sign arg ; Check sign bit and flip argument to postive, - ; keeping a count of sign bits in the X register. + ; keeping a count of sign bits in the Y register. .local positive lda arg + 1 ; 3 cyc bpl positive ; 2 cyc neg16 arg ; 18 cyc - inx ; 2 cyc + iny ; 2 cyc positive: .endmacro @@ -370,13 +381,13 @@ positive: ; min 470 cycles ; max 780 cycles -.proc imul16_func +.proc imul16_func_orig arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result - ldx #0 ; 2 cyc - ; counts the number of sign bits in X + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y check_sign arg1 ; 5 to 25 cyc check_sign arg2 ; 5 to 25 cyc @@ -396,7 +407,98 @@ positive: .endrepeat ; In case of mixed input signs, return a negative result. - cpx #1 ; 2 cyc + cpy #1 ; 2 cyc + bne positive_result ; 2 cyc + neg32 result ; 34 cyc +positive_result: + + rts ; 6 cyc +.endproc + +; Adapted from https://everything2.com/title/Fast+6502+multiplication +.macro imul8 dest, arg1, arg2 + .local under256 + .local next + .local small_product + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; setup: 6 cycles + ;ldx mul_factor_x + + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next + under256: + lda mul_hibyte256,x + sec + next: + sta mul_product_hi + lda mul_lobyte256,x + + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 + + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi + small_product: + sec ; - x^2/2: 25 cycles + sbc mul_lobyte256,x + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + .endscope +.endmacro + +.proc imul16_func + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y + check_sign arg1 ; 5 to 25 cyc + check_sign arg2 ; 5 to 25 cyc + + lda #0 + sta result + 0 + sta result + 1 + sta result + 2 + sta result + 3 + + imul8 inter, arg1, arg2 + add16 result, result, inter + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter + + ; In case of mixed input signs, return a negative result. + cpy #1 ; 2 cyc bne positive_result ; 2 cyc neg32 result ; 34 cyc positive_result: diff --git a/tables.js b/tables.js new file mode 100644 index 0000000..5afc3c0 --- /dev/null +++ b/tables.js @@ -0,0 +1,33 @@ +function db(func) { + let lines = []; + for (let i = 0; i < 256; i += 16) { + let items = []; + for (let j = 0; j < 16; j++) { + let x = i + j; + items.push(func(x)); + } + lines.push(' .byte ' + items.join(', ')); + } + return lines.join('\n'); +} + +console.log( +`.segment "TABLES" + +.export mul_lobyte256 +.export mul_hibyte256 +.export mul_hibyte512 + +.align 256 +mul_lobyte256: +${db((x) => Math.round(x * x / 2) & 0xff)} + +.align 256 +mul_hibyte256: +${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)} + +.align 256 +mul_hibyte512: +${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)} + +`); diff --git a/testme.js b/testme.js new file mode 100644 index 0000000..e12e706 --- /dev/null +++ b/testme.js @@ -0,0 +1,41 @@ +// ax = (a + x)2/2 - a2/2 - x2/2 + +function half_square(x) { + return Math.round(x * x / 2) & 0xffff >>> 0; +} + +function mul8(a, b) { + let result = half_square(a + b) & 0xffff; + result = (result - half_square(a)) & 0xffff; + result = (result - half_square(b)) & 0xffff; + result = (result + (b & a & 1)) & 0xffff; + return result >>> 0; +} + +function mul16(a, b) { + let ah = (a & 0xff00) >>> 8; + let al = (a & 0x00ff) >>> 0; + let bh = (b & 0xff00) >>> 8; + let bl = (b & 0x00ff) >>> 0; + let result = (mul8(al, bl) & 0xffff) >>> 0; + result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; + result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; + result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; + return result; +} + +let max = 65536; +//let max = 256; +//let max = 128; +//let max = 8; + +for (let a = 0; a < max; a++) { + for (let b = 0; b < max; b++) { + let expected = Math.imul(a, b) >>> 0; + //let actual = mul8(a, b); + let actual = mul16(a, b); + if (expected !== actual) { + console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`); + } + } +} \ No newline at end of file