Faster imul16 routine

Improves runtime from 16.24 ms/px to 14.44 ms/px. This uses a routine found on Everything2 (https://everything2.com/title/Fast+6502+multiplication) that performs 8-bit multiplies with a lookup table of squares; four of those 8-bit multiplies are then composed into a 16-bit imul.
2023-02-11 12:24:48 -08:00 · 2023-02-11 12:24:48 -08:00 · 5637783529
commit 5637783529
parent 29630c8887
5 changed files with 183 additions and 81 deletions
--- a/mandel.s
+++ b/mandel.s
@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $b1 ; u8: index into z_buffer
 z_buffer_end    = $b2 ; u8: index into z_buffer
 temp            = $b4 ; u16
-
-pixel_ptr       = $b6 ; u16
-pixel_color     = $b8 ; u8
-pixel_mask      = $b9 ; u8
-pixel_shift     = $ba ; u8
-pixel_offset    = $bb ; u8
-fill_level      = $bc ; u8
-palette_offset  = $bd ; u8
+temp2           = $b6 ; u16
+pixel_ptr       = $b8 ; u16
+pixel_color     = $ba ; u8
+pixel_mask      = $bb ; u8
+pixel_shift     = $bc ; u8
+pixel_offset    = $bd ; u8
+fill_level      = $be ; u8
+palette_offset  = $bf ; u8

 ; FP registers in zero page
 FR0    = $d4 ; float48
@ -107,6 +107,10 @@ KEY_RIGHT = $87
    mantissa .byte 6
 .endstruct

+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data

 strings:
@ -257,6 +261,12 @@ fill_masks:
    add 4, dest, arg2, dest
 .endmacro

+.macro add_carry dest ; dest (one byte) += current carry flag; clobbers A
+    lda dest
+    adc #0            ; adds only the incoming carry; carry out is set if dest wraps past $ff
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
@ -334,65 +344,15 @@ fill_masks:
    neg 4, arg
 .endmacro

-; inner loop for imul16
-; bitnum < 8: 25 or 41 cycles
-; bitnum >= 8: 30 or 46 cycles
-.macro bitmul16 arg1, arg2, result, bitnum
-    .local zero
-    .local one
-    .local next
-
-    ; does 16-bit adds
-    ; arg1 and arg2 are treated as unsigned
-    ; negative signed inputs must be flipped first
-
-    ; 7 cycles up to the branch
-
-    ; check if arg1 has 0 or 1 bit in this place
-    ; 5 cycles either way
-    .if bitnum < 8
-        lda arg1                 ; 3 cyc
-        and #(1 << (bitnum))       ; 2 cyc
-    .else
-        lda arg1 + 1             ; 3 cyc
-        and #(1 << ((bitnum) - 8)) ; 2 cyc
-    .endif
-    bne one ; 2 cyc
-
-zero: ; 18 cyc, 23 cyc
-    lsr result + 3 ; 5 cyc
-    jmp next       ; 3 cyc
-
-one: ; 32 cyc, 37 cyc
-    ; 16-bit add on the top bits
-    clc            ; 2 cyc
-    lda result + 2 ; 3 cyc
-    adc arg2       ; 3 cyc
-    sta result + 2 ; 3 cyc
-    lda result + 3 ; 3 cyc
-    adc arg2 + 1   ; 3 cyc
-    ror a          ; 2 cyc - get a jump on the shift
-    sta result + 3 ; 3 cyc
-next:
-    ror result + 2 ; 5 cyc
-    ror result + 1 ; 5 cyc
-    .if bitnum >= 8
-        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
-        ; when it's all uninitialized data
-        ror result ; 5 cyc
-    .endif
-
-.endmacro
-
 ; 5 to 25 cycles
 .macro check_sign arg
     ; Check sign bit and flip argument to positive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register.
     .local positive
     lda arg + 1   ; 3 cyc - high byte of the 16-bit arg holds the sign bit
     bpl positive  ; 2 cyc - bit 7 clear: already non-negative, leave arg and the count alone
     neg16 arg     ; 18 cyc - negate arg in place
-    inx           ; 2 cyc
+    iny           ; 2 cyc - one more negative input seen
 positive:
 .endmacro

@ -419,35 +379,93 @@ positive:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro

-; min 470 cycles
-; max 780 cycles
+; imul8 dest, arg1, arg2: 8-bit by 8-bit multiply built from square-table lookups, (a + x)^2/2 - a^2/2 - x^2/2. Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2 ; dest: 2 bytes written (lo, hi); arg1, arg2: 1-byte operands, only read
+    .local under256
+    .local next
+    .local small_product
+    .scope                    ; A and X are used as scratch; no instruction in here touches Y
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
+
+        lda mul_factor_a      ; setup: 6 cycles (NOTE(review): figure looks like it still counts the disabled ldx below - confirm)
+        ;ldx mul_factor_x     ; disabled: the second factor is read from memory by the adc below
+
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x      ; A = low 8 bits of a + x, carry = bit 8 of the sum
+        tax                   ; X = table index for the sum
+        bcc under256          ; no carry: the sum fits in 8 bits
+        lda mul_hibyte512,x   ; carry set: sum is 256 + X, so the high byte comes from the second table
+        bcs next              ; lda leaves carry untouched, so this always branches and carry stays set for the sbc below
+    under256:
+        lda mul_hibyte256,x
+        sec                   ; match the other path: enter the subtraction with no borrow
+    next:
+        sta mul_product_hi    ; written before the factors are re-read below, so dest must not overlap arg1/arg2
+        lda mul_lobyte256,x   ; both paths take the low byte from the same table
+
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x   ; 16-bit subtract, low byte first
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x   ; high byte, with the borrow carried over from the low byte
+        sta mul_product_hi
+
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1                ; A = 1 only when both factors are odd, otherwise 0
+
+        clc
+        adc mul_product_lo    ; add the correction to the low byte...
+        bcc small_product
+        inc mul_product_hi    ; ...and propagate its carry into the high byte
+    small_product:            ; A = corrected low byte (not stored yet), X = second factor
+        sec                   ; - x^2/2: 25 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi    ; 16-bit result is now complete in dest / dest + 1
+    .endscope
+.endmacro
+
 .proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
+    inter = temp2

-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
-    
-    ; zero out the 32-bit temp's top 16 bits
-    lda #0          ; 2 cyc
-    sta result + 2  ; 3 cyc
-    sta result + 3  ; 3 cyc
-    ; the bottom two bytes will get cleared by the shifts

-    ; unrolled loop for maximum speed, at the cost
-    ; of a larger routine
-    ; 440 to 696 cycles
-    .repeat 16, bitnum
-        ; bitnum < 8: 25 or 41 cycles
-        ; bitnum >= 8: 30 or 46 cycles
-        bitmul16 arg1, arg2, result, bitnum
-    .endrepeat
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+
+    imul8 result, arg1, arg2
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter

    ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
+    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
 positive_result: