From 5637783529e71dbbdc568d853d5df5616f25970c Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sat, 11 Feb 2023 12:24:48 -0800 Subject: [PATCH] Faster imul16 routine Improves runtime from 16.24 ms/px to 14.44 ms/px This uses a routine found on Everything2: https://everything2.com/title/Fast+6502+multiplication which uses a lookup table of squares to do 8-bit imuls, which are then composed into a 16-bit imul --- .gitignore | 1 + Makefile | 8 ++- mandel.s | 176 +++++++++++++++++++++++++++++------------------------ tables.js | 38 ++++++++++++ testme.js | 41 +++++++++++++ 5 files changed, 183 insertions(+), 81 deletions(-) create mode 100644 tables.js create mode 100644 testme.js diff --git a/.gitignore b/.gitignore index 8d2f7ce..771e47a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.xex +tables.s .DS_Store diff --git a/Makefile b/Makefile index 25148b4..008bf8c 100644 --- a/Makefile +++ b/Makefile @@ -2,13 +2,17 @@ all : mandel.xex -%.xex : %.o - ld65 -C atari-asm-xex.cfg -o $@ $< +mandel.xex : mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ $+ %.o : %.s ca65 -o $@ $< +tables.s : tables.js + node tables.js > tables.s + clean : + rm -f tables.s rm -f *.o rm -f *.xex diff --git a/mandel.s b/mandel.s index 3db6a77..1244a02 100644 --- a/mandel.s +++ b/mandel.s @@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not z_buffer_start = $b1 ; u8: index into z_buffer z_buffer_end = $b2 ; u8: index into z_buffer temp = $b4 ; u16 - -pixel_ptr = $b6 ; u16 -pixel_color = $b8 ; u8 -pixel_mask = $b9 ; u8 -pixel_shift = $ba ; u8 -pixel_offset = $bb ; u8 -fill_level = $bc ; u8 -palette_offset = $bd ; u8 +temp2 = $b6 ; u16 +pixel_ptr = $b8 ; u16 +pixel_color = $ba ; u8 +pixel_mask = $bb ; u8 +pixel_shift = $bc ; u8 +pixel_offset = $bd ; u8 +fill_level = $be ; u8 +palette_offset = $bf ; u8 ; FP registers in zero page FR0 = $d4 ; float48 @@ -107,6 +107,10 @@ KEY_RIGHT = $87 mantissa .byte 6 .endstruct +.import mul_lobyte256 
+.import mul_hibyte256 +.import mul_hibyte512 + .data strings: @@ -257,6 +261,12 @@ fill_masks: add 4, dest, arg2, dest .endmacro +.macro add_carry dest + lda dest + adc #0 + sta dest +.endmacro + ; 2 + 9 * byte cycles .macro sub bytes, dest, arg1, arg2 sec ; 2 cyc @@ -334,65 +344,15 @@ fill_masks: neg 4, arg .endmacro -; inner loop for imul16 -; bitnum < 8: 25 or 41 cycles -; bitnum >= 8: 30 or 46 cycles -.macro bitmul16 arg1, arg2, result, bitnum - .local zero - .local one - .local next - - ; does 16-bit adds - ; arg1 and arg2 are treated as unsigned - ; negative signed inputs must be flipped first - - ; 7 cycles up to the branch - - ; check if arg1 has 0 or 1 bit in this place - ; 5 cycles either way - .if bitnum < 8 - lda arg1 ; 3 cyc - and #(1 << (bitnum)) ; 2 cyc - .else - lda arg1 + 1 ; 3 cyc - and #(1 << ((bitnum) - 8)) ; 2 cyc - .endif - bne one ; 2 cyc - -zero: ; 18 cyc, 23 cyc - lsr result + 3 ; 5 cyc - jmp next ; 3 cyc - -one: ; 32 cyc, 37 cyc - ; 16-bit add on the top bits - clc ; 2 cyc - lda result + 2 ; 3 cyc - adc arg2 ; 3 cyc - sta result + 2 ; 3 cyc - lda result + 3 ; 3 cyc - adc arg2 + 1 ; 3 cyc - ror a ; 2 cyc - get a jump on the shift - sta result + 3 ; 3 cyc -next: - ror result + 2 ; 5 cyc - ror result + 1 ; 5 cyc - .if bitnum >= 8 - ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte - ; when it's all uninitialized data - ror result ; 5 cyc - .endif - -.endmacro - ; 5 to 25 cycles .macro check_sign arg ; Check sign bit and flip argument to postive, - ; keeping a count of sign bits in the X register. + ; keeping a count of sign bits in the Y register. 
.local positive lda arg + 1 ; 3 cyc bpl positive ; 2 cyc neg16 arg ; 18 cyc - inx ; 2 cyc + iny ; 2 cyc positive: .endmacro @@ -419,35 +379,93 @@ positive: copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; min 470 cycles -; max 780 cycles +; Adapted from https://everything2.com/title/Fast+6502+multiplication +.macro imul8 dest, arg1, arg2 + .local under256 + .local next + .local small_product + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; setup: 6 cycles + ;ldx mul_factor_x + + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next + under256: + lda mul_hibyte256,x + sec + next: + sta mul_product_hi + lda mul_lobyte256,x + + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 + + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi + small_product: + sec ; - x^2/2: 25 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + .endscope +.endmacro + .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result + inter = temp2 - ldx #0 ; 2 cyc - ; counts the number of sign bits in X + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y check_sign arg1 ; 5 to 25 cyc check_sign arg2 ; 5 to 25 cyc - - ; zero out the 32-bit temp's top 16 bits - lda #0 ; 2 cyc - sta result + 2 ; 3 cyc - sta result + 3 ; 3 cyc - ; the bottom two bytes will get cleared by the shifts - ; unrolled loop for maximum speed, at the cost - ; of a larger routine - ; 440 to 696 cycles - .repeat 16, bitnum - ; bitnum < 8: 25 or 41 cycles - ; bitnum >= 8: 30 or 46 cycles - bitmul16 arg1, arg2, result, 
bitnum - .endrepeat + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + + imul8 result, arg1, arg2 + lda #0 + sta result + 2 + sta result + 3 + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter ; In case of mixed input signs, return a negative result. - cpx #1 ; 2 cyc + cpy #1 ; 2 cyc bne positive_result ; 2 cyc neg32 result ; 34 cyc positive_result: diff --git a/tables.js b/tables.js new file mode 100644 index 0000000..c772f81 --- /dev/null +++ b/tables.js @@ -0,0 +1,38 @@ +function db(func) { + let lines = []; + for (let i = 0; i < 256; i += 16) { + let items = []; + for (let j = 0; j < 16; j++) { + let x = i + j; + items.push(func(x)); + } + lines.push(' .byte ' + items.join(', ')); + } + return lines.join('\n'); +} + +let squares = []; +for (let i = 0; i < 512; i++) { + squares.push(Math.trunc((i * i + 1) / 2)); +} + +console.log( +`.segment "TABLES" + +.export mul_lobyte256 +.export mul_hibyte256 +.export mul_hibyte512 + +.align 256 +mul_lobyte256: +${db((i) => squares[i] & 0xff)} + +.align 256 +mul_hibyte256: +${db((i) => (squares[i] >> 8) & 0xff)} + +.align 256 +mul_hibyte512: +${db((i) => (squares[i + 256] >> 8) & 0xff)} + +`); diff --git a/testme.js b/testme.js new file mode 100644 index 0000000..e12e706 --- /dev/null +++ b/testme.js @@ -0,0 +1,41 @@ +// ax = (a + x)^2/2 - a^2/2 - x^2/2 + +function half_square(x) { + return Math.round(x * x / 2) & 0xffff >>> 0; +} + +function mul8(a, b) { + let result = half_square(a + b) & 0xffff; + result = (result - half_square(a)) & 0xffff; + result = (result - half_square(b)) & 0xffff; + result = (result + (b & a & 1)) & 0xffff; + return result >>> 0; +} + +function mul16(a, b) { + let ah = (a & 0xff00) >>> 8; + let al = (a 
& 0x00ff) >>> 0; + let bh = (b & 0xff00) >>> 8; + let bl = (b & 0x00ff) >>> 0; + let result = (mul8(al, bl) & 0xffff) >>> 0; + result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; + result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; + result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; + return result; +} + +let max = 65536; +//let max = 256; +//let max = 128; +//let max = 8; + +for (let a = 0; a < max; a++) { + for (let b = 0; b < max; b++) { + let expected = Math.imul(a, b) >>> 0; + //let actual = mul8(a, b); + let actual = mul16(a, b); + if (expected !== actual) { + console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`); + } + } +} \ No newline at end of file