whee

Merge branch 'fastmul' into fastmul2
WIP alternate imul16
2024-11-11 12:10:08 -08:00 · 2024-11-11 11:45:58 -08:00 · 2023-02-11 16:03:18 -08:00
3 changed files with 117 additions and 22 deletions
--- a/mandel.s
+++ b/mandel.s
@ -26,6 +26,7 @@ z_buffer_start  = $b1 ; u8: index into z_buffer
 z_buffer_end    = $b2 ; u8: index into z_buffer
 temp            = $b4 ; u16
 temp2           = $b6 ; u16
 pixel_ptr       = $b8 ; u16
 pixel_color     = $ba ; u8
 pixel_mask      = $bb ; u8
@ -344,6 +345,68 @@ fill_masks:
    neg 4, arg
 .endmacro
 ; inner loop for imul16
 ;
 ; bitmul16 arg1, arg2, result, bitnum
 ;   One step of an unrolled shift-and-add multiply: tests bit `bitnum` of
 ;   arg1; when it is set, adds arg2 into the top 16 bits of result
 ;   (result + 2 / result + 3); then shifts result right by one bit.
 ;     arg1, arg2 - 16-bit operands, low byte first (arg + 1 is the high byte)
 ;     result     - 32-bit accumulator, low byte first (result + 3 is the top)
 ;     bitnum     - assembly-time bit index into arg1; values below 8 select
 ;                  the low byte, all others the high byte
 ;   Uses A and the carry flag; the macro body has no X or Y instructions.
 ;
 ; bitnum < 8: 25 or 41 cycles
 ; bitnum >= 8: 30 or 46 cycles
 ; NOTE(review): the per-path annotations below add up to 7 + 18 = 25 and
 ; 7 + 32 = 39 for bitnum < 8, so the "41"/"46" figures do not match the
 ; line-by-line counts - confirm which is intended.
 .macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next
    ; does 16-bit adds
    ; arg1 and arg2 are treated as unsigned
    ; negative signed inputs must be flipped first
    ; 7 cycles up to the branch
    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                 ; 3 cyc
        and #(1 << (bitnum))       ; 2 cyc
    .else
        lda arg1 + 1             ; 3 cyc
        and #(1 << ((bitnum) - 8)) ; 2 cyc
    .endif
    bne one ; 2 cyc
 zero: ; 18 cyc, 23 cyc
    ; nothing to add: shift a 0 into the top bit; the old bit 0 of the top
    ; byte lands in carry for the ror chain at `next`
    lsr result + 3 ; 5 cyc
    jmp next       ; 3 cyc
 one: ; 32 cyc, 37 cyc
    ; 16-bit add on the top bits
    clc            ; 2 cyc
    lda result + 2 ; 3 cyc
    adc arg2       ; 3 cyc
    sta result + 2 ; 3 cyc
    lda result + 3 ; 3 cyc
    adc arg2 + 1   ; 3 cyc
    ; the carry out of the add becomes bit 7 of the top byte, so the 17th
    ; bit of the sum is kept rather than lost
    ror a          ; 2 cyc - get a jump on the shift
    sta result + 3 ; 3 cyc
 next:
    ; both paths arrive with carry = the bit shifted out of result + 3;
    ; keep rotating it down through the lower bytes
    ror result + 2 ; 5 cyc
    ror result + 1 ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ror result ; 5 cyc
    .endif
 .endmacro
 ; check_sign arg
 ;   If the 16-bit value at arg has its sign bit set (bit 7 of arg + 1),
 ;   applies neg16 to it in place and increments Y; otherwise leaves both
 ;   untouched. The caller is expected to initialise Y before the first use.
 ;   Loads A with the high byte of arg; neg16 is defined elsewhere in the
 ;   file - presumably a two's-complement negate, verify its clobbers there.
 ; 5 to 25 cycles
 .macro check_sign arg
    ; Check sign bit and flip argument to positive,
    ; keeping a count of sign bits in the Y register.
    .local positive
    lda arg + 1   ; 3 cyc - sets N from the high byte
    bpl positive  ; 2 cyc - already non-negative: nothing to do
    neg16 arg     ; 18 cyc
    iny           ; 2 cyc - one more negative input seen
 positive:
 .endmacro
 ; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
@ -367,6 +430,42 @@ fill_masks:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 ; imul16_func_orig
 ;   Signed 16 x 16 -> 32-bit multiply using 16 unrolled bitmul16 steps.
 ;   In:   FR0 = arg1, FR1 = arg2 (16-bit each)
 ;   Out:  FR2 = 32-bit product
 ;   Uses: A and Y; a negative input is negated in place by check_sign,
 ;         so FR0 / FR1 do not survive the call unchanged.
 ; min 470 cycles
 ; max 780 cycles
 .proc imul16_func_orig
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
    ldy #0          ; 2 cyc
    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
    ; zero out the 32-bit temp's top 16 bits
    lda #0          ; 2 cyc
    sta result + 2  ; 3 cyc
    sta result + 3  ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts
    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    ; 440 to 696 cycles
    .repeat 16, bitnum
        ; bitnum < 8: 25 or 41 cycles
        ; bitnum >= 8: 30 or 46 cycles
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat
    ; In case of mixed input signs, return a negative result.
    ; Y is 0, 1 or 2 here; only exactly one negative input (Y = 1) flips
    ; the sign - two negatives give a positive product.
    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
 positive_result:
    rts ; 6 cyc
 .endproc
 ; Adapted from https://everything2.com/title/Fast+6502+multiplication
 .macro imul8 dest, arg1, arg2
    .local under256
@ -413,7 +512,6 @@ fill_masks:
    small_product:
        sec                   ; - x^2/2: 25 cycles
        sbc mul_lobyte256,x
        sta mul_product_lo
        lda mul_product_hi
        sbc mul_hibyte256,x
        sta mul_product_hi
@ -426,19 +524,27 @@ fill_masks:
    result = FR2 ; 32-bit result
    inter = temp2
    ldy #0          ; 2 cyc
    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
    ; h1l1 * h2l2
    ; (h1*256 + l1) * (h2*256 + l2)
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
    imul8 result, arg1, arg2
    lda #0
    sta result + 0
    sta result + 1
    sta result + 2
    sta result + 3
    imul8 inter, arg1, arg2
    add16 result, result, inter
    imul8 inter, arg1 + 1, arg2
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8 inter, arg1, arg2 + 1
    add16 result + 1, result + 1, inter
@ -447,16 +553,11 @@ fill_masks:
    imul8 inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter
-    ; In case of negative inputs, adjust high word
+    ; In case of mixed input signs, return a negative result.
-    ; https://stackoverflow.com/a/28827013
+    cpy #1              ; 2 cyc
-    lda arg1 + 1
+    bne positive_result ; 2 cyc
-    bpl arg1_pos
+    neg32 result        ; 34 cyc
-    sub16 result + 2, result + 2, arg2
+positive_result:
 arg1_pos:
    lda arg2 + 1
    bpl arg2_pos
    sub16 result + 2, result + 2, arg1
 arg2_pos:
    rts ; 6 cyc
 .endproc
--- a/readme.md
+++ b/readme.md
@ -37,7 +37,6 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
 Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
 (done)
 ## Deps and build instructions
--- a/tables.js
+++ b/tables.js
@ -11,11 +11,6 @@ function db(func) {
    return lines.join('\n');
 }
 // squares[i] = trunc((i*i + 1) / 2) for i = 0..511, i.e. i*i/2 rounded up
 // to a whole number when i is odd.
 // NOTE(review): presumably the half-square table behind the
 // mul_lobyte256 / mul_hibyte256 / mul_hibyte512 lookups - confirm.
 let squares = [];
 for (let i = 0; i < 512; i++) {
    squares.push(Math.trunc((i * i + 1) / 2));
 }
 console.log(
 `.segment "TABLES"
@ -25,14 +20,14 @@ console.log(
 .align 256
 mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
+${db((x) => Math.round(x * x / 2) & 0xff)}
 .align 256
 mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
+${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
 .align 256
 mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
+${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
 `);
Author	SHA1	Message	Date
Brooke Vibber	0631886466	whee	2024-11-11 12:10:08 -08:00
Brooke Vibber	97948dc814	Merge branch 'fastmul' into fastmul2	2024-11-11 11:45:58 -08:00
Brion Vibber	f10bb4fe18	WIP alternate imul16 not working at present	2023-02-11 16:03:18 -08:00