refactoring and start on squares

2024-12-29 17:37:06 -08:00 · 2024-12-29 17:37:06 -08:00 · f903272335
commit f903272335
parent 8ad996981a
2 changed files with 150 additions and 160 deletions
--- a/mandel.s
+++ b/mandel.s
@@ -374,65 +374,13 @@ viewport_oy:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro

-; Adapted from https://everything2.com/title/Fast+6502+multiplication
-.macro imul8 dest, arg1, arg2
-    .local under256
-    .local next
-    .local small_product
-    ; circa 92 cycles? this doesn't seem right
-    ; 81-92 cycles
-    .scope
-        mul_factor_a   = arg1
-        mul_factor_x   = arg2
-        mul_product_lo = dest
-        mul_product_hi = dest + 1
-
-        lda mul_factor_a      ; 3 cyc
-
-        ; (a + x)^2/2
-        clc                   ; 2 cyc         
-        adc mul_factor_x      ; 3 cyc
-        tax                   ; 2 cyc
-        bcc under256          ; 2 cyc
-        lda mul_hibyte512,x   ; 4 cyc
-        bcs next              ; 2 cyc
-    under256:
-        lda mul_hibyte256,x   ; 4 cyc
-        sec                   ; 2 cyc
-    next:
-        sta mul_product_hi    ; 3 cyc
-        lda mul_lobyte256,x   ; 4 cyc
-
-        ; - a^2/2
-        ldx mul_factor_a      ; 3 cyc
-        sbc mul_lobyte256,x   ; 4 cyc
-        sta mul_product_lo    ; 3 cyc
-        lda mul_product_hi    ; 3 cyc
-        sbc mul_hibyte256,x   ; 4 cyc
-        sta mul_product_hi    ; 3 cyc
-
-        ; + x & a & 1:
-        ; (this is a kludge to correct a
-        ; roundoff error that makes odd * odd too low)
-        ldx mul_factor_x      ; 3 cyc
-        txa                   ; 2 cyc
-        and mul_factor_a      ; 3 cyc
-        and #1                ; 2 cyc
-
-        clc                   ; 2 cyc
-        adc mul_product_lo    ; 3 cyc
-        bcc small_product     ; 2 cyc
-        inc mul_product_hi    ; 5 cyc
-
-        ; - x^2/2
-    small_product:
-        sec                   ; 2 cyc
-        sbc mul_lobyte256,x   ; 4 cyc
-        sta mul_product_lo    ; 3 cyc
-        lda mul_product_hi    ; 3 cyc
-        sbc mul_hibyte256,x   ; 4 cyc
-        sta mul_product_hi    ; 3 cyc
-    .endscope
+; sqr8: dest (16-bit, low byte first) = arg * arg, looked up in sqr_lobyte/sqr_hibyte; clobbers a, x
+.macro sqr8 dest, arg
+    ldx arg              ; x = byte to square, used as the table index
+    lda sqr_lobyte,x     ; low byte of arg * arg
+    sta dest
+    lda sqr_hibyte,x     ; high byte of arg * arg
+    sta dest + 1
 .endmacro

 ; lookup table for top byte -> PORTB value for bank-switch
@@ -447,64 +395,121 @@ bank_switch_table:
    sta PORTB
 .endmacro

+.macro imul8 dest, arg1, arg2, xe ; dest (16-bit) = arg1 * arg2 for two bytes; xe != 0 assembles the banked-table version, else the half-square version
+    .if xe
+        ; using 64KB lookup table
+        ; 58-77 cycles
+        ; clobbers a, x, y, dest to dest + 3; PORTB is left selecting the table bank
+        .scope
+            output = dest ; 16-bit product, low byte first
+            ptr = dest + 2 ; table pointer; dest + 2..3 assumed free scratch, and in zero page for (ptr),y

-; 58-77 cycles
-; clobbers x, y, dest to dest + 3
-.macro imul8xe dest, arg1, arg2
-.local done
-.local output
-.local ptr
-
-    output = dest
-    ptr = dest + 2 ; scratch space assumed
-
-    ; bottom 14 bits except the LSB are the per-bank table index
-    ; add $4000 for the bank pointer
-    lda arg1     ; 3 cyc
-    and #$fe     ; 2 cyc
-    sta ptr      ; 3 cyc
-    lda arg2     ; 3 cyc
-    and #$3f     ; 2 cyc
-    clc          ; 2 cyc
-    adc #$40     ; 2 cyc
-    sta ptr + 1  ; 3 cyc
-    
-    ; top 2 bits are the table bank selector
-    ldx arg2                ; 3 cyc
-    lda bank_switch_table,x ; 4 cyc
-    sta PORTB               ; 4 cyc
+            ; bottom 14 bits except the LSB are the per-bank table index
+            ; add $4000 for the bank pointer
+            lda arg1     ; 3 cyc
+            and #$fe     ; 2 cyc
+            sta ptr      ; 3 cyc
+            lda arg2     ; 3 cyc
+            and #$3f     ; 2 cyc
+            clc          ; 2 cyc
+            adc #$40     ; 2 cyc
+            sta ptr + 1  ; 3 cyc
+            
+            ; top 2 bits are the table bank selector
+            ldx arg2                ; 3 cyc
+            lda bank_switch_table,x ; 4 cyc
+            sta PORTB               ; 4 cyc


-    ; copy the entry into output
-    ldy #0       ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output   ; 3 cyc
-    iny          ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output+1 ; 3 cyc
+            ; copy the entry into output
+            ldy #0       ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output   ; 3 cyc
+            iny          ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output+1 ; 3 cyc

-    ; note: we are not restoring memory to save 6 cycles!
-    ; this means those 16kb have to be switched back to base RAM
-    ; if we need to use them anywhere else
-    ;;; restore memory
-    ;;lda #$81     ; 2 cyc - disabled
-    ;;sta PORTB    ; 4 cyc - disabled
+            ; note: we are not restoring memory to save 6 cycles!
+            ; this means those 16kb have to be switched back to base RAM
+            ; if we need to use them anywhere else
+            ;;; restore memory
+            ;;lda #$81     ; 2 cyc - disabled
+            ;;sta PORTB    ; 4 cyc - disabled

-    ; check that 1 bit we skipped to fit into space
-    lda arg1     ; 3 cyc
-    and #1       ; 2 cyc
-    beq done     ; 2 cyc
+            ; check that 1 bit we skipped to fit into space
+            lda arg1     ; 3 cyc
+            and #1       ; 2 cyc
+            beq done     ; 2 cyc

-    ; add the second param one last time for the skipped bit
-    clc          ; 2 cyc
-    lda arg2     ; 3 cyc
-    adc output   ; 3 cyc
-    sta output   ; 3 cyc
-    lda #0       ; 2 cyc
-    adc output+1 ; 3 cyc
-    sta output+1 ; 3 cyc
+            ; add the second param one last time for the skipped bit
+            clc          ; 2 cyc
+            lda arg2     ; 3 cyc
+            adc output   ; 3 cyc
+            sta output   ; 3 cyc
+            lda #0       ; 2 cyc
+            adc output+1 ; 3 cyc
+            sta output+1 ; 3 cyc

-done:
+        done:
+        .endscope
+    .else
+        ; Using base 48k RAM compatibility mode
+        ; Small table of half squares
+        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
+        ; 81-92 cycles; clobbers a, x, dest to dest + 1
+        .scope
+            mul_factor_a   = arg1
+            mul_factor_x   = arg2
+            mul_product_lo = dest
+            mul_product_hi = dest + 1
+
+            lda mul_factor_a      ; 3 cyc
+
+            ; (a + x)^2/2
+            clc                   ; 2 cyc         
+            adc mul_factor_x      ; 3 cyc
+            tax                   ; 2 cyc
+            bcc under256          ; 2 cyc
+            lda mul_hibyte512,x   ; 4 cyc - sum carried past 255
+            bcs next              ; 2 cyc - carry is still set here, so always taken
+        under256:
+            lda mul_hibyte256,x   ; 4 cyc
+            sec                   ; 2 cyc - both paths reach next with carry set (no borrow for sbc)
+        next:
+            sta mul_product_hi    ; 3 cyc
+            lda mul_lobyte256,x   ; 4 cyc
+
+            ; - a^2/2
+            ldx mul_factor_a      ; 3 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+
+            ; + x & a & 1:
+            ; (this is a kludge to correct a
+            ; roundoff error that makes odd * odd too low)
+            ldx mul_factor_x      ; 3 cyc
+            txa                   ; 2 cyc
+            and mul_factor_a      ; 3 cyc
+            and #1                ; 2 cyc
+
+            clc                   ; 2 cyc
+            adc mul_product_lo    ; 3 cyc
+            bcc small_product     ; 2 cyc
+            inc mul_product_hi    ; 5 cyc
+
+            ; - x^2/2
+        small_product:
+            sec                   ; 2 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+        .endscope
+    .endif
 .endmacro

 .proc imul8xe_init
@@ -632,7 +637,13 @@ inner_loop:

 .endproc

-.proc imul16_func
+.macro imul16_impl xe
+    .local arg1
+    .local arg2
+    .local result
+    .local inter
+    .local arg1_pos
+    .local arg2_pos
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
@@ -643,20 +654,20 @@ inner_loop:
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2

-    imul8 result, arg1, arg2
+    imul8 result, arg1, arg2, xe
    lda #0
    sta result + 2
    sta result + 3

-    imul8 inter, arg1 + 1, arg2
+    imul8 inter, arg1 + 1, arg2, xe
    add16 result + 1, result + 1, inter
    add_carry result + 3

-    imul8 inter, arg1, arg2 + 1
+    imul8 inter, arg1, arg2 + 1, xe
    add16 result + 1, result + 1, inter
    add_carry result + 3

-    imul8 inter, arg1 + 1, arg2 + 1
+    imul8 inter, arg1 + 1, arg2 + 1, xe
    add16 result + 2, result + 2, inter

    ; In case of negative inputs, adjust high word
@@ -671,47 +682,14 @@ arg1_pos:
 arg2_pos:

    rts ; 6 cyc
+.endmacro
+
+.proc imul16_func ; signed 16 x 16 -> 32 bit multiply: FR2 = FR0 * FR1 (FR0 and FR1 are clobbered)
+    imul16_impl 0 ; xe = 0: imul8 expands to its base-RAM half-square version
 .endproc

 .proc imul16xe_func
-    arg1 = FR0   ; 16-bit arg (clobbered)
-    arg2 = FR1   ; 16-bit arg (clobbered)
-    result = FR2 ; 32-bit result
-    inter = temp2
-
-    ; h1l1 * h2l2
-    ; (h1*256 + l1) * (h2*256 + l2)
-    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
-    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
-
-    imul8xe result, arg1, arg2
-    lda #0
-    sta result + 2
-    sta result + 3
-
-    imul8xe inter, arg1 + 1, arg2
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8xe inter, arg1, arg2 + 1
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8xe inter, arg1 + 1, arg2 + 1
-    add16 result + 2, result + 2, inter
-
-    ; In case of negative inputs, adjust high word
-    ; https://stackoverflow.com/a/28827013
-    lda arg1 + 1
-    bpl arg1_pos
-    sub16 result + 2, result + 2, arg2
-arg1_pos:
-    lda arg2 + 1
-    bpl arg2_pos
-    sub16 result + 2, result + 2, arg1
-arg2_pos:
-
-    rts ; 6 cyc
+    imul16_impl 1 ; xe = 1: same 16 x 16 -> 32 bit multiply, with imul8 expanding to its 64KB banked-table version
 .endproc

 .macro round16 arg
--- a/tables.js
+++ b/tables.js
@@ -22,7 +22,10 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
+.export sqr_lobyte
+.export sqr_hibyte

+; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}

+; (i * i) for the plain squares
+.align 256
+sqr_lobyte:
+${db((i) => (i * i) & 0xff)}
+
+.align 256
+sqr_hibyte:
+${db((i) => ((i * i) >> 8) & 0xff)}
+
 `);