slightly faster handling of signed mul

previously we were flipping the inputs if negative, and then the output if exactly one input was negative. turns out you can just treat the whole thing as an unsigned mul and then subtract each term from the high word if the other term is negative: https://stackoverflow.com/a/28827013 -- this saves a handful of cycles, reducing our runtime to a mere 14.211 ms/px \o/
squares
2024-12-15 20:17:45 -08:00 · 2024-12-14 18:56:26 -08:00 · 2024-12-14 18:53:31 -08:00
6 changed files with 188 additions and 97 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
--- a/8
+++ b/8
@ -2,13 +2,17 @@

 all : mandel.xex

-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+

 %.o : %.s
 	ca65 -o $@ $<

+tables.s : tables.js
+	node tables.js > tables.s
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex

--- a/mandel.s
+++ b/mandel.s
@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $b1 ; u8: index into z_buffer
 z_buffer_end    = $b2 ; u8: index into z_buffer
 temp            = $b4 ; u16
-
-pixel_ptr       = $b6 ; u16
-pixel_color     = $b8 ; u8
-pixel_mask      = $b9 ; u8
-pixel_shift     = $ba ; u8
-pixel_offset    = $bb ; u8
-fill_level      = $bc ; u8
-palette_offset  = $bd ; u8
+temp2           = $b6 ; u16
+pixel_ptr       = $b8 ; u16
+pixel_color     = $ba ; u8
+pixel_mask      = $bb ; u8
+pixel_shift     = $bc ; u8
+pixel_offset    = $bd ; u8
+fill_level      = $be ; u8
+palette_offset  = $bf ; u8

 ; FP registers in zero page
 FR0    = $d4 ; float48
@ -107,6 +107,10 @@ KEY_RIGHT = $87
    mantissa .byte 6
 .endstruct

+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data

 strings:
@ -257,6 +261,12 @@ fill_masks:
    add 4, dest, arg2, dest
 .endmacro

+.macro add_carry dest ; dest = dest + carry flag, for the single byte at dest (clobbers A)
+    lda dest
+    adc #0             ; the operand is zero, so only the current carry flag is added
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
@ -334,68 +344,6 @@ fill_masks:
    neg 4, arg
 .endmacro

-; inner loop for imul16
-; bitnum < 8: 25 or 41 cycles
-; bitnum >= 8: 30 or 46 cycles
-.macro bitmul16 arg1, arg2, result, bitnum
-    .local zero
-    .local one
-    .local next
-
-    ; does 16-bit adds
-    ; arg1 and arg2 are treated as unsigned
-    ; negative signed inputs must be flipped first
-
-    ; 7 cycles up to the branch
-
-    ; check if arg1 has 0 or 1 bit in this place
-    ; 5 cycles either way
-    .if bitnum < 8
-        lda arg1                 ; 3 cyc
-        and #(1 << (bitnum))       ; 2 cyc
-    .else
-        lda arg1 + 1             ; 3 cyc
-        and #(1 << ((bitnum) - 8)) ; 2 cyc
-    .endif
-    bne one ; 2 cyc
-
-zero: ; 18 cyc, 23 cyc
-    lsr result + 3 ; 5 cyc
-    jmp next       ; 3 cyc
-
-one: ; 32 cyc, 37 cyc
-    ; 16-bit add on the top bits
-    clc            ; 2 cyc
-    lda result + 2 ; 3 cyc
-    adc arg2       ; 3 cyc
-    sta result + 2 ; 3 cyc
-    lda result + 3 ; 3 cyc
-    adc arg2 + 1   ; 3 cyc
-    ror a          ; 2 cyc - get a jump on the shift
-    sta result + 3 ; 3 cyc
-next:
-    ror result + 2 ; 5 cyc
-    ror result + 1 ; 5 cyc
-    .if bitnum >= 8
-        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
-        ; when it's all uninitialized data
-        ror result ; 5 cyc
-    .endif
-
-.endmacro
-
-; 5 to 25 cycles
-.macro check_sign arg
-    ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
-    .local positive
-    lda arg + 1   ; 3 cyc
-    bpl positive  ; 2 cyc
-    neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
-positive:
-.endmacro
-
 ; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
@ -419,38 +367,96 @@ positive:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro

-; min 470 cycles
-; max 780 cycles
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2 ; dest (2 bytes, low byte first) = arg1 * arg2, unsigned 8x8 -> 16 via half-square tables; clobbers A and X
+    .local under256
+    .local next
+    .local small_product
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
+
+        lda mul_factor_a      ; setup: 6 cycles
+        ;ldx mul_factor_x
+
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x      ; A = low 8 bits of a + x, carry = bit 8 of the sum
+        tax                   ; X = table index for the sum
+        bcc under256
+        lda mul_hibyte512,x   ; sum >= 256: high byte comes from the 256..511 table
+        bcs next              ; carry is still set from the adc, so this always branches
+    under256:
+        lda mul_hibyte256,x   ; sum < 256: high byte comes from the 0..255 table
+        sec                   ; both paths reach `next` with carry set (no borrow for the sbc below)
+    next:
+        sta mul_product_hi
+        lda mul_lobyte256,x   ; the same low-byte table is used on both paths
+
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x   ; borrow from the low-byte subtract carries into the high byte
+        sta mul_product_hi
+
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1                ; A = 1 only when both factors are odd
+
+        clc
+        adc mul_product_lo    ; A = low byte plus the correction (kept in A, not stored yet)
+        bcc small_product
+        inc mul_product_hi    ; carry out of the low byte
+    small_product:
+        sec                   ; - x^2/2: 25 cycles
+        sbc mul_lobyte256,x   ; X still holds mul_factor_x from the ldx above
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+    .endscope
+.endmacro
+
 .proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
+    inter = temp2 ; 16-bit scratch holding one 8x8 partial product at a time

-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
-    check_sign arg1 ; 5 to 25 cyc
-    check_sign arg2 ; 5 to 25 cyc
-    
-    ; zero out the 32-bit temp's top 16 bits
-    lda #0          ; 2 cyc
-    sta result + 2  ; 3 cyc
-    sta result + 3  ; 3 cyc
-    ; the bottom two bytes will get cleared by the shifts
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2

-    ; unrolled loop for maximum speed, at the cost
-    ; of a larger routine
-    ; 440 to 696 cycles
-    .repeat 16, bitnum
-        ; bitnum < 8: 25 or 41 cycles
-        ; bitnum >= 8: 30 or 46 cycles
-        bitmul16 arg1, arg2, result, bitnum
-    .endrepeat
+    imul8 result, arg1, arg2 ; result bytes 0-1 = l1*l2
+    lda #0
+    sta result + 2 ; clear the upper word before accumulating into it
+    sta result + 3

-    ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
-    bne positive_result ; 2 cyc
-    neg32 result        ; 34 cyc
-positive_result:
+    imul8 inter, arg1 + 1, arg2 ; inter = h1*l2
+    add16 result + 1, result + 1, inter ; presumably result bytes 1-2 += inter (add16 is defined outside this view)
+    add_carry result + 3 ; assumes add16 leaves its carry-out in C - TODO confirm
+
+    imul8 inter, arg1, arg2 + 1 ; inter = l1*h2
+    add16 result + 1, result + 1, inter ; same one-byte offset as the h1*l2 term
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1 ; inter = h1*h2
+    add16 result + 2, result + 2, inter ; accumulated at a two-byte offset (the 256*256 term)
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1 ; only tests the sign bit; the added code never writes arg1 or arg2
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2 ; arg1 negative: high word -= arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1 ; arg2 negative: high word -= arg1
+arg2_pos:

    rts ; 6 cyc
 .endproc
--- a/readme.md
+++ b/readme.md
@ -37,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
 Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.

 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
+(done)

 ## Deps and build instructions

--- a/tables.js
+++ b/tables.js
@ -0,0 +1,38 @@
+function db(func) { // Format func(0) .. func(255) as sixteen `.byte` lines of sixteen values each.
+    let lines = [];
+    for (let i = 0; i < 256; i += 16) { // i = index of the first entry on this output line
+        let items = [];
+        for (let j = 0; j < 16; j++) {
+            let x = i + j; // table index, 0..255
+            items.push(func(x));
+        }
+        lines.push('    .byte ' + items.join(', '));
+    }
+    return lines.join('\n'); // lines joined by newlines, no trailing newline
+}
+
+let squares = []; // squares[i] = i*i/2 for i = 0..511, rounded up when i is odd
+for (let i = 0; i < 512; i++) {
+    squares.push(Math.trunc((i * i + 1) / 2)); // the +1 rounds odd squares up; even squares are unaffected
+}
+
+console.log( // write the generated assembly source to stdout
+`.segment "TABLES"
+
+.export mul_lobyte256
+.export mul_hibyte256
+.export mul_hibyte512
+
+.align 256
+mul_lobyte256:
+${db((i) => squares[i] & 0xff)}
+
+.align 256
+mul_hibyte256:
+${db((i) => (squares[i] >> 8) & 0xff)}
+
+.align 256
+mul_hibyte512:
+${db((i) => (squares[i + 256] >> 8) & 0xff)}
+
+`);
--- a/testme.js
+++ b/testme.js
@ -0,0 +1,41 @@
+// ax = (a + x)^2/2 - a^2/2 - x^2/2
+
+function half_square(x) { // x*x/2 rounded to an integer (halves round up), reduced to 16 bits
+    return Math.round(x * x / 2) & 0xffff >>> 0; // `>>>` binds tighter than `&`: this parses as Math.round(...) & (0xffff >>> 0)
+}
+
+function mul8(a, b) { // a*b modulo 65536, built from half-squares: (a+b)^2/2 - a^2/2 - b^2/2
+    let result = half_square(a + b) & 0xffff;
+    result = (result - half_square(a)) & 0xffff; // every step wraps to 16 bits
+    result = (result - half_square(b)) & 0xffff;
+    result = (result + (b & a & 1)) & 0xffff; // +1 when both are odd: both subtracted half-squares were rounded up
+    return result >>> 0;
+}
+
+function mul16(a, b) { // 16x16 multiply assembled from four mul8 partial products; returns an unsigned 32-bit value
+    let ah = (a & 0xff00) >>> 8; // high byte of a
+    let al = (a & 0x00ff) >>> 0; // low byte of a
+    let bh = (b & 0xff00) >>> 8; // high byte of b
+    let bl = (b & 0x00ff) >>> 0; // low byte of b
+    let result = (mul8(al, bl) & 0xffff) >>> 0; // al*bl
+    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; // + ah*bl * 256
+    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; // + al*bh * 256
+    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; // + ah*bh * 65536; `<< 16` may go negative, the final `>>> 0` makes it unsigned again
+    return result;
+}
+
+let max = 65536; // exclusive upper bound for both operands
+//let max = 256;
+//let max = 128;
+//let max = 8;
+
+for (let a = 0; a < max; a++) { // exhaustive check: max * max operand pairs
+    for (let b = 0; b < max; b++) {
+        let expected = Math.imul(a, b) >>> 0; // reference: low 32 bits of a*b, viewed as unsigned
+        //let actual = mul8(a, b);
+        let actual = mul16(a, b);
+        if (expected !== actual) { // only mismatches are printed
+            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
+        }
+    }
+}