slightly faster handling of signed mul

Previously we were flipping the inputs if negative, and then flipping the output if exactly one of the inputs was negative. It turns out you can just treat the whole thing as an unsigned mul and then subtract each term from the high word if the other term is negative (see https://stackoverflow.com/a/28827013). This saves a handful of cycles, reducing our runtime to a mere 14.211 ms/px \o/
squares
2024-12-15 20:17:45 -08:00 · 2024-12-14 18:56:26 -08:00 · 2024-12-14 18:53:31 -08:00
3 changed files with 22 additions and 117 deletions
--- a/mandel.s
+++ b/mandel.s
@ -26,7 +26,6 @@ z_buffer_start  = $b1 ; u8: index into z_buffer
 z_buffer_end    = $b2 ; u8: index into z_buffer
 temp            = $b4 ; u16
 temp2           = $b6 ; u16
 pixel_ptr       = $b8 ; u16
 pixel_color     = $ba ; u8
 pixel_mask      = $bb ; u8
@ -345,68 +344,6 @@ fill_masks:
    neg 4, arg
 .endmacro
 ; inner loop for imul16: one unrolled step of a shift-and-add multiply.
 ;   arg1   - 16-bit multiplier; only bit `bitnum` of it is examined here
 ;   arg2   - 16-bit multiplicand, added into the top 16 bits of result
 ;   result - 32-bit accumulator, shifted right by one bit on every step
 ;   bitnum - constant bit index; selects the byte and mask at assembly time
 ; Overwrites A and the processor flags.
 ; bitnum < 8: 25 or 41 cycles
 ; bitnum >= 8: 30 or 46 cycles
 .macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next
    ; does 16-bit adds
    ; arg1 and arg2 are treated as unsigned
    ; negative signed inputs must be flipped first
    ; 7 cycles up to the branch
    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        ; bit lives in the low byte of arg1
        lda arg1                 ; 3 cyc
        and #(1 << (bitnum))       ; 2 cyc
    .else
        ; bit lives in the high byte of arg1
        lda arg1 + 1             ; 3 cyc
        and #(1 << ((bitnum) - 8)) ; 2 cyc
    .endif
    bne one ; 2 cyc
 zero: ; 18 cyc, 23 cyc
    ; bit clear: nothing to add, just start the right shift
    ; by moving a 0 into the top bit of the result
    lsr result + 3 ; 5 cyc
    jmp next       ; 3 cyc
 one: ; 32 cyc, 37 cyc
    ; 16-bit add on the top bits
    clc            ; 2 cyc
    lda result + 2 ; 3 cyc
    adc arg2       ; 3 cyc
    sta result + 2 ; 3 cyc
    lda result + 3 ; 3 cyc
    adc arg2 + 1   ; 3 cyc
    ; the carry out of the add becomes the new top bit of the result
    ror a          ; 2 cyc - get a jump on the shift
    sta result + 3 ; 3 cyc
 next:
    ; finish the right shift through the lower bytes; the bit that fell
    ; out of result + 3 on either path arrives here in the carry
    ror result + 2 ; 5 cyc
    ror result + 1 ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ror result ; 5 cyc
    .endif
 .endmacro
 ; Make a 16-bit signed value non-negative in place, counting negatives in Y.
 ;   arg - 16-bit value; its sign is read from the high byte at arg + 1
 ; Y is incremented once if arg was negative; the macro never initializes Y,
 ; so the caller has to set it up beforehand.
 ; Overwrites A.
 ; 5 to 25 cycles
 ; NOTE(review): the 5-cycle minimum counts bpl as 2 cycles even though the
 ; non-negative path takes the branch; confirm whether it should read 6.
 .macro check_sign arg
    ; Check sign bit and flip argument to positive,
    ; keeping a count of sign bits in the Y register.
    .local positive
    lda arg + 1   ; 3 cyc
    bpl positive  ; 2 cyc
    ; negative: replace arg with its negation and record the sign
    neg16 arg     ; 18 cyc
    iny           ; 2 cyc
 positive:
 .endmacro
 ; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
@ -430,42 +367,6 @@ positive:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 ; Signed 16-bit x 16-bit multiply using an unrolled shift-and-add loop.
 ;   in:  FR0 = arg1, FR1 = arg2 (negative inputs are negated in place)
 ;   out: FR2 = 32-bit product
 ; Overwrites A and Y.
 ; The inputs are first made non-negative, multiplied as unsigned values,
 ; and the product is negated afterwards when exactly one input was negative.
 ; min 470 cycles
 ; max 780 cycles
 .proc imul16_func_orig
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
    ldy #0          ; 2 cyc
    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
    ; zero out the 32-bit temp's top 16 bits
    lda #0          ; 2 cyc
    sta result + 2  ; 3 cyc
    sta result + 3  ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts
    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    ; 440 to 696 cycles
    .repeat 16, bitnum
        ; one expansion per bit of arg1, lowest bit first
        ; bitnum < 8: 25 or 41 cycles
        ; bitnum >= 8: 30 or 46 cycles
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat
    ; In case of mixed input signs, return a negative result.
    ; Y = 0 (both non-negative) or Y = 2 (both negative) leaves the
    ; product as it is; only Y = 1 negates it.
    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
 positive_result:
    rts ; 6 cyc
 .endproc
 ; Adapted from https://everything2.com/title/Fast+6502+multiplication
 .macro imul8 dest, arg1, arg2
    .local under256
@ -512,6 +413,7 @@ positive_result:
    small_product:
        sec                   ; - x^2/2: 25 cycles
        sbc mul_lobyte256,x
        sta mul_product_lo
        lda mul_product_hi
        sbc mul_hibyte256,x
        sta mul_product_hi
@ -524,27 +426,19 @@ positive_result:
    result = FR2 ; 32-bit result
    inter = temp2
    ldy #0          ; 2 cyc
    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
    ; h1l1 * h2l2
    ; (h1*256 + l1) * (h2*256 + l2)
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
    imul8 result, arg1, arg2
    lda #0
    sta result + 0
    sta result + 1
    sta result + 2
    sta result + 3
    imul8 inter, arg1, arg2
    add16 result, result, inter
    imul8 inter, arg1 + 1, arg2
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8 inter, arg1, arg2 + 1
    add16 result + 1, result + 1, inter
@ -553,11 +447,16 @@ positive_result:
    imul8 inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter
-    ; In case of mixed input signs, return a negative result.
+    ; In case of negative inputs, adjust high word
-    cpy #1              ; 2 cyc
+    ; https://stackoverflow.com/a/28827013
-    bne positive_result ; 2 cyc
+    lda arg1 + 1
-    neg32 result        ; 34 cyc
+    bpl arg1_pos
-positive_result:
+    sub16 result + 2, result + 2, arg2
 arg1_pos:
    lda arg2 + 1
    bpl arg2_pos
    sub16 result + 2, result + 2, arg1
 arg2_pos:
    rts ; 6 cyc
 .endproc
--- a/readme.md
+++ b/readme.md
@ -37,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
 Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
 (done)
 ## Deps and build instructions
--- a/tables.js
+++ b/tables.js
@ -11,6 +11,11 @@ function db(func) {
    return lines.join('\n');
 }
 // Lookup table of half-squares: squares[i] is i*i/2 rounded up to a whole
 // number, for every i from 0 through 511.
 let squares = Array.from(
    { length: 512 },
    (_, i) => Math.trunc((i * i + 1) / 2)
 );
 console.log(
 `.segment "TABLES"
@ -20,14 +25,14 @@ console.log(
 .align 256
 mul_lobyte256:
-${db((x) => Math.round(x * x / 2) & 0xff)}
+${db((i) => squares[i] & 0xff)}
 .align 256
 mul_hibyte256:
-${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
+${db((i) => (squares[i] >> 8) & 0xff)}
 .align 256
 mul_hibyte512:
-${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
+${db((i) => (squares[i + 256] >> 8) & 0xff)}
 `);