6 changed files with 82 additions and 185 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,3 @@
 *.o
 *.xex
-tables.s
 .DS_Store
--- a/8
+++ b/8
@ -2,17 +2,13 @@

 all : mandel.xex

-mandel.xex : mandel.o tables.o
-	ld65 -C ./atari-asm-xex.cfg -o $@ $+
+%.xex : %.o
+	ld65 -C atari-asm-xex.cfg -o $@ $<

 %.o : %.s
 	ca65 -o $@ $<

-tables.s : tables.js
-	node tables.js > tables.s
-
 clean :
-	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex

--- a/mandel.s
+++ b/mandel.s
@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $b1 ; u8: index into z_buffer
 z_buffer_end    = $b2 ; u8: index into z_buffer
 temp            = $b4 ; u16
-temp2           = $b6 ; u16
-pixel_ptr       = $b8 ; u16
-pixel_color     = $ba ; u8
-pixel_mask      = $bb ; u8
-pixel_shift     = $bc ; u8
-pixel_offset    = $bd ; u8
-fill_level      = $be ; u8
-palette_offset  = $bf ; u8
+
+pixel_ptr       = $b6 ; u16
+pixel_color     = $b8 ; u8
+pixel_mask      = $b9 ; u8
+pixel_shift     = $ba ; u8
+pixel_offset    = $bb ; u8
+fill_level      = $bc ; u8
+palette_offset  = $bd ; u8

 ; FP registers in zero page
 FR0    = $d4 ; float48
@ -107,10 +107,6 @@ KEY_RIGHT = $87
    mantissa .byte 6
 .endstruct

-.import mul_lobyte256
-.import mul_hibyte256
-.import mul_hibyte512
-
 .data

 strings:
@ -261,12 +257,6 @@ fill_masks:
    add 4, dest, arg2, dest
 .endmacro

-.macro add_carry dest
-    lda dest
-    adc #0
-    sta dest
-.endmacro
-
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
@ -344,15 +334,65 @@ fill_masks:
    neg 4, arg
 .endmacro

+; bitmul16: one unrolled shift-and-add step of imul16, driven by bit `bitnum` of arg1
+; bitnum < 8: 25 or 41 cycles (tested bit clear / set)
+; bitnum >= 8: 30 or 46 cycles (tested bit clear / set)
+.macro bitmul16 arg1, arg2, result, bitnum
+    .local zero ; path when the tested bit of arg1 is 0
+    .local one  ; path when the tested bit of arg1 is 1
+    .local next ; shared tail: rotate the lower result bytes

+    ; does 16-bit adds of arg2 into result + 2 / result + 3, then shifts result right one bit
+    ; arg1 and arg2 are treated as unsigned
+    ; negative signed inputs must be flipped first; clobbers A

+    ; 7 cycles up to the branch (counting the branch as not taken)

+    ; check if arg1 has 0 or 1 bit in this place (low byte for bits 0-7, high byte for 8-15)
+    ; 5 cycles either way
+    .if bitnum < 8
+        lda arg1                 ; 3 cyc
+        and #(1 << (bitnum))       ; 2 cyc
+    .else
+        lda arg1 + 1             ; 3 cyc
+        and #(1 << ((bitnum) - 8)) ; 2 cyc
+    .endif
+    bne one ; 2 cyc not taken; 3 taken (+1 if the branch crosses a page)

+zero: ; 18 cyc (bitnum < 8), 23 cyc (bitnum >= 8), including the rotates at next
+    lsr result + 3 ; 5 cyc - nothing to add: shift a 0 into the top bit
+    jmp next       ; 3 cyc

+one: ; 32 cyc (bitnum < 8), 37 cyc (bitnum >= 8), including the rotates at next
+    ; 16-bit add of arg2 on the top bits; the carry out is kept for the shift
+    clc            ; 2 cyc
+    lda result + 2 ; 3 cyc
+    adc arg2       ; 3 cyc
+    sta result + 2 ; 3 cyc
+    lda result + 3 ; 3 cyc
+    adc arg2 + 1   ; 3 cyc
+    ror a          ; 2 cyc - get a jump on the shift (carry from the add becomes bit 7)
+    sta result + 3 ; 3 cyc
+next:
+    ror result + 2 ; 5 cyc - carry in is the bit shifted out of result + 3
+    ror result + 1 ; 5 cyc
+    .if bitnum >= 8
+        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
+        ; when it's all uninitialized data (i.e. during the bitnum < 8 steps)
+        ror result ; 5 cyc
+    .endif

+.endmacro
+
 ; 5 to 25 cycles
 .macro check_sign arg
    ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the Y register.
+    ; keeping a count of sign bits in the X register.
    .local positive
    lda arg + 1   ; 3 cyc
    bpl positive  ; 2 cyc
    neg16 arg     ; 18 cyc
-    iny           ; 2 cyc
+    inx           ; 2 cyc
 positive:
 .endmacro

@ -379,93 +419,35 @@ positive:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro

-; Adapted from https://everything2.com/title/Fast+6502+multiplication
-.macro imul8 dest, arg1, arg2
-    .local under256
-    .local next
-    .local small_product
-    .scope
-        mul_factor_a   = arg1
-        mul_factor_x   = arg2
-        mul_product_lo = dest
-        mul_product_hi = dest + 1
-
-        lda mul_factor_a      ; setup: 6 cycles
-        ;ldx mul_factor_x
-
-        clc                   ; (a + x)^2/2: 23 cycles
-        adc mul_factor_x
-        tax
-        bcc under256
-        lda mul_hibyte512,x
-        bcs next
-    under256:
-        lda mul_hibyte256,x
-        sec
-    next:
-        sta mul_product_hi
-        lda mul_lobyte256,x
-
-        ldx mul_factor_a      ; - a^2/2: 20 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
-
-        ldx mul_factor_x      ; + x & a & 1: 22 cycles
-        txa                   ; (this is a kludge to correct a
-        and mul_factor_a      ; roundoff error that makes odd * odd too low)
-        and #1
-
-        clc
-        adc mul_product_lo
-        bcc small_product
-        inc mul_product_hi
-    small_product:
-        sec                   ; - x^2/2: 25 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
-    .endscope
-.endmacro
-
+; min 470 cycles
+; max 780 cycles
 .proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
-    inter = temp2

-    ldy #0          ; 2 cyc
-    ; counts the number of sign bits in Y
+    ldx #0          ; 2 cyc
+    ; counts the number of sign bits in X
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
    
-    ; h1l1 * h2l2
-    ; (h1*256 + l1) * (h2*256 + l2)
-    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
-    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+    ; zero out the 32-bit temp's top 16 bits
+    lda #0          ; 2 cyc
+    sta result + 2  ; 3 cyc
+    sta result + 3  ; 3 cyc
+    ; the bottom two bytes will get cleared by the shifts

-    imul8 result, arg1, arg2
-    lda #0
-    sta result + 2
-    sta result + 3
-
-    imul8 inter, arg1 + 1, arg2
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8 inter, arg1, arg2 + 1
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8 inter, arg1 + 1, arg2 + 1
-    add16 result + 2, result + 2, inter
+    ; unrolled loop for maximum speed, at the cost
+    ; of a larger routine
+    ; 440 to 696 cycles
+    .repeat 16, bitnum
+        ; bitnum < 8: 25 or 41 cycles
+        ; bitnum >= 8: 30 or 46 cycles
+        bitmul16 arg1, arg2, result, bitnum
+    .endrepeat

    ; In case of mixed input signs, return a negative result.
-    cpy #1              ; 2 cyc
+    cpx #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
 positive_result:
--- a/readme.md
+++ b/readme.md
@ -37,7 +37,6 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
 Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.

 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
-(done)

 ## Deps and build instructions

--- a/tables.js
+++ b/tables.js
@ -1,38 +0,0 @@
-function db(func) {
-    let lines = [];
-    for (let i = 0; i < 256; i += 16) {
-        let items = [];
-        for (let j = 0; j < 16; j++) {
-            let x = i + j;
-            items.push(func(x));
-        }
-        lines.push('    .byte ' + items.join(', '));
-    }
-    return lines.join('\n');
-}
-
-let squares = [];
-for (let i = 0; i < 512; i++) {
-    squares.push(Math.trunc((i * i + 1) / 2));
-}
-
-console.log(
-`.segment "TABLES"
-
-.export mul_lobyte256
-.export mul_hibyte256
-.export mul_hibyte512
-
-.align 256
-mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
-
-.align 256
-mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
-
-.align 256
-mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
-
-`);
--- a/testme.js
+++ b/testme.js
@ -1,41 +0,0 @@
-// ax = (a + x)2/2 - a2/2 - x2/2 
-
-function half_square(x) {
-    return Math.round(x * x / 2) & 0xffff >>> 0;
-}
-
-function mul8(a, b) {
-    let result = half_square(a + b) & 0xffff;
-    result = (result - half_square(a)) & 0xffff;
-    result = (result - half_square(b)) & 0xffff;
-    result = (result + (b & a & 1)) & 0xffff;
-    return result >>> 0;
-}
-
-function mul16(a, b) {
-    let ah = (a & 0xff00) >>> 8;
-    let al = (a & 0x00ff) >>> 0;
-    let bh = (b & 0xff00) >>> 8;
-    let bl = (b & 0x00ff) >>> 0;
-    let result = (mul8(al, bl) & 0xffff) >>> 0;
-    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
-    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
-    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
-    return result;
-}
-
-let max = 65536;
-//let max = 256;
-//let max = 128;
-//let max = 8;
-
-for (let a = 0; a < max; a++) {
-    for (let b = 0; b < max; b++) {
-        let expected = Math.imul(a, b) >>> 0;
-        //let actual = mul8(a, b);
-        let actual = mul16(a, b);
-        if (expected !== actual) {
-            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
-        }
-    }
-}