diff --git a/.gitignore b/.gitignore index 771e47a..8d2f7ce 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ *.o *.xex -tables.s .DS_Store diff --git a/Makefile b/Makefile index 008bf8c..25148b4 100644 --- a/Makefile +++ b/Makefile @@ -2,17 +2,13 @@ all : mandel.xex -mandel.xex : mandel.o tables.o - ld65 -C ./atari-asm-xex.cfg -o $@ $+ +%.xex : %.o + ld65 -C atari-asm-xex.cfg -o $@ $< %.o : %.s ca65 -o $@ $< -tables.s : tables.js - node tables.js > tables.s - clean : - rm -f tables.s rm -f *.o rm -f *.xex diff --git a/mandel.s b/mandel.s index 1244a02..3db6a77 100644 --- a/mandel.s +++ b/mandel.s @@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not z_buffer_start = $b1 ; u8: index into z_buffer z_buffer_end = $b2 ; u8: index into z_buffer temp = $b4 ; u16 -temp2 = $b6 ; u16 -pixel_ptr = $b8 ; u16 -pixel_color = $ba ; u8 -pixel_mask = $bb ; u8 -pixel_shift = $bc ; u8 -pixel_offset = $bd ; u8 -fill_level = $be ; u8 -palette_offset = $bf ; u8 + +pixel_ptr = $b6 ; u16 +pixel_color = $b8 ; u8 +pixel_mask = $b9 ; u8 +pixel_shift = $ba ; u8 +pixel_offset = $bb ; u8 +fill_level = $bc ; u8 +palette_offset = $bd ; u8 ; FP registers in zero page FR0 = $d4 ; float48 @@ -107,10 +107,6 @@ KEY_RIGHT = $87 mantissa .byte 6 .endstruct -.import mul_lobyte256 -.import mul_hibyte256 -.import mul_hibyte512 - .data strings: @@ -261,12 +257,6 @@ fill_masks: add 4, dest, arg2, dest .endmacro -.macro add_carry dest - lda dest - adc #0 - sta dest -.endmacro - ; 2 + 9 * byte cycles .macro sub bytes, dest, arg1, arg2 sec ; 2 cyc @@ -344,15 +334,65 @@ fill_masks: neg 4, arg .endmacro +; inner loop for imul16 +; bitnum < 8: 25 or 41 cycles +; bitnum >= 8: 30 or 46 cycles +.macro bitmul16 arg1, arg2, result, bitnum + .local zero + .local one + .local next + + ; does 16-bit adds + ; arg1 and arg2 are treated as unsigned + ; negative signed inputs must be flipped first + + ; 7 cycles up to the branch + + ; check if arg1 has 0 or 1 bit in this place + ; 5 cycles either way + .if bitnum < 8 + lda arg1 ; 3 cyc + and #(1 << (bitnum)) ; 2 cyc + .else + lda arg1 + 1 ; 3 cyc + and #(1 << ((bitnum) - 8)) ; 2 cyc + .endif + bne one ; 2 cyc + +zero: ; 18 cyc, 23 cyc + lsr result + 3 ; 5 cyc + jmp next ; 3 cyc + +one: ; 32 cyc, 37 cyc + ; 16-bit add on the top bits + clc ; 2 cyc + lda result + 2 ; 3 cyc + adc arg2 ; 3 cyc + sta result + 2 ; 3 cyc + lda result + 3 ; 3 cyc + adc arg2 + 1 ; 3 cyc + ror a ; 2 cyc - get a jump on the shift + sta result + 3 ; 3 cyc +next: + ror result + 2 ; 5 cyc + ror result + 1 ; 5 cyc + .if bitnum >= 8 + ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte + ; when it's all uninitialized data + ror result ; 5 cyc + .endif + +.endmacro + ; 5 to 25 cycles .macro check_sign arg ; Check sign bit and flip argument to postive, - ; keeping a count of sign bits in the Y register. + ; keeping a count of sign bits in the X register. .local positive lda arg + 1 ; 3 cyc bpl positive ; 2 cyc neg16 arg ; 18 cyc - iny ; 2 cyc + inx ; 2 cyc positive: .endmacro @@ -379,93 +419,35 @@ positive: copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; Adapted from https://everything2.com/title/Fast+6502+multiplication -.macro imul8 dest, arg1, arg2 - .local under256 - .local next - .local small_product - .scope - mul_factor_a = arg1 - mul_factor_x = arg2 - mul_product_lo = dest - mul_product_hi = dest + 1 - - lda mul_factor_a ; setup: 6 cycles - ;ldx mul_factor_x - - clc ; (a + x)^2/2: 23 cycles - adc mul_factor_x - tax - bcc under256 - lda mul_hibyte512,x - bcs next - under256: - lda mul_hibyte256,x - sec - next: - sta mul_product_hi - lda mul_lobyte256,x - - ldx mul_factor_a ; - a^2/2: 20 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi - - ldx mul_factor_x ; + x & a & 1: 22 cycles - txa ; (this is a kludge to correct a - and mul_factor_a ; roundoff error that makes odd * odd too low) - and #1 - - clc - adc mul_product_lo - bcc small_product - inc mul_product_hi - small_product: - sec ; - x^2/2: 25 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi - .endscope -.endmacro - +; min 470 cycles +; max 780 cycles .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result - inter = temp2 - ldy #0 ; 2 cyc - ; counts the number of sign bits in Y + ldx #0 ; 2 cyc + ; counts the number of sign bits in X check_sign arg1 ; 5 to 25 cyc check_sign arg2 ; 5 to 25 cyc + + ; zero out the 32-bit temp's top 16 bits + lda #0 ; 2 cyc + sta result + 2 ; 3 cyc + sta result + 3 ; 3 cyc + ; the bottom two bytes will get cleared by the shifts - ; h1l1 * h2l2 - ; (h1*256 + l1) * (h2*256 + l2) - ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) - ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - - imul8 result, arg1, arg2 - lda #0 - sta result + 2 - sta result + 3 - - imul8 inter, arg1 + 1, arg2 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8 inter, arg1, arg2 + 1 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8 inter, arg1 + 1, arg2 + 1 - add16 result + 2, result + 2, inter + ; unrolled loop for maximum speed, at the cost + ; of a larger routine + ; 440 to 696 cycles + .repeat 16, bitnum + ; bitnum < 8: 25 or 41 cycles + ; bitnum >= 8: 30 or 46 cycles + bitmul16 arg1, arg2, result, bitnum + .endrepeat ; In case of mixed input signs, return a negative result. - cpy #1 ; 2 cyc + cpx #1 ; 2 cyc bne positive_result ; 2 cyc neg32 result ; 34 cyc positive_result: diff --git a/readme.md b/readme.md index 873793f..6b57378 100644 --- a/readme.md +++ b/readme.md @@ -37,7 +37,6 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. -(done) ## Deps and build instructions diff --git a/tables.js b/tables.js deleted file mode 100644 index c772f81..0000000 --- a/tables.js +++ /dev/null @@ -1,38 +0,0 @@ -function db(func) { - let lines = []; - for (let i = 0; i < 256; i += 16) { - let items = []; - for (let j = 0; j < 16; j++) { - let x = i + j; - items.push(func(x)); - } - lines.push(' .byte ' + items.join(', ')); - } - return lines.join('\n'); -} - -let squares = []; -for (let i = 0; i < 512; i++) { - squares.push(Math.trunc((i * i + 1) / 2)); -} - -console.log( -`.segment "TABLES" - -.export mul_lobyte256 -.export mul_hibyte256 -.export mul_hibyte512 - -.align 256 -mul_lobyte256: -${db((i) => squares[i] & 0xff)} - -.align 256 -mul_hibyte256: -${db((i) => (squares[i] >> 8) & 0xff)} - -.align 256 -mul_hibyte512: -${db((i) => (squares[i + 256] >> 8) & 0xff)} - -`); diff --git a/testme.js b/testme.js deleted file mode 100644 index e12e706..0000000 --- a/testme.js +++ /dev/null @@ -1,41 +0,0 @@ -// ax = (a + x)2/2 - a2/2 - x2/2 - -function half_square(x) { - return Math.round(x * x / 2) & 0xffff >>> 0; -} - -function mul8(a, b) { - let result = half_square(a + b) & 0xffff; - result = (result - half_square(a)) & 0xffff; - result = (result - half_square(b)) & 0xffff; - result = (result + (b & a & 1)) & 0xffff; - return result >>> 0; -} - -function mul16(a, b) { - let ah = (a & 0xff00) >>> 8; - let al = (a & 0x00ff) >>> 0; - let bh = (b & 0xff00) >>> 8; - let bl = (b & 0x00ff) >>> 0; - let result = (mul8(al, bl) & 0xffff) >>> 0; - result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; - result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; - result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; - return result; -} - -let max = 65536; -//let max = 256; -//let max = 128; -//let max = 8; - -for (let a = 0; a < max; a++) { - for (let b = 0; b < max; b++) { - let expected = Math.imul(a, b) >>> 0; - //let actual = mul8(a, b); - let actual = mul16(a, b); - if (expected !== actual) { - console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`); - } - } -} \ No newline at end of file