shave some cycles off 16-bit squaring with shift instead of add

also fix the comments about how many cycles shift takes
unify tables for squaring and multiplication
2024-12-31 15:29:40 -08:00 · 2024-12-31 02:26:24 -08:00 · 2024-12-31 02:22:31 -08:00 · 2024-12-31 02:01:45 -08:00
2 changed files with 35 additions and 68 deletions
--- a/mandel.s
+++ b/mandel.s
@ -129,11 +129,8 @@ KEY_0     = 50
    mantissa .byte 5
 .endstruct
-.import mul_lobyte256
+.import mul_lobyte
-.import mul_hibyte256
+.import mul_hibyte
 .import mul_hibyte512
 .import sqr_lobyte
 .import sqr_hibyte
 .data
@ -351,7 +348,7 @@ viewport_oy:
    sub 4, dest, arg1, arg2
 .endmacro
-; 3 + 5 * bytes cycles
+; 3 + 5 * (bytes - 1) cycles
 .macro shl bytes, arg
    asl arg              ; 3 cyc
    .repeat bytes-1, i
@ -359,17 +356,17 @@ viewport_oy:
    .endrepeat
 .endmacro
-; 13 cycles
+; 8 cycles
 .macro shl16 arg
    shl 2, arg
 .endmacro
-; 18 cycles
+; 13 cycles
 .macro shl24 arg
    shl 3, arg
 .endmacro
-; 23 cycles
+; 18 cycles
 .macro shl32 arg
    shl 4, arg
 .endmacro
@ -445,21 +442,13 @@ viewport_oy:
 ; clobbers a, x
 .macro sqr8 dest, arg
    ldx arg
-    lda sqr_lobyte,x
+    txa
    lsr
    lda mul_lobyte,x
    rol
    sta dest
-    lda sqr_hibyte,x
+    lda mul_hibyte,x
-    sta dest + 1
+    rol
 .endmacro
 ; clobbers a, x
 .macro sqr8_add16 dest, arg
    ldx arg
    clc
    lda sqr_lobyte,x
    adc dest
    sta dest
    lda sqr_hibyte,x
    adc dest + 1
    sta dest + 1
 .endmacro
@ -548,22 +537,25 @@ bank_switch_table:
            clc                   ; 2 cyc         
            adc mul_factor_x      ; 3 cyc
            tax                   ; 2 cyc
-            bcc under256          ; 2 cyc
+            lda mul_hibyte,x      ; 4 cyc
-            lda mul_hibyte512,x   ; 4 cyc
+            bcc next              ; 2 cyc
-            bcs next              ; 2 cyc
+            ; carry is set so we get to add 1 for free, but need to add 0x80
-        under256:
+            adc #$7f              ; 2 cyc
-            lda mul_hibyte256,x   ; 4 cyc
+            clc                   ; 2 cyc
-            sec                   ; 2 cyc
+            ; stash the sum temporarily so we can use it as an operand to add
            stx mul_product_lo    ; 3 cyc
            adc mul_product_lo    ; 3 cyc
        next:
            sec                   ; 2 cyc
            sta mul_product_hi    ; 3 cyc
-            lda mul_lobyte256,x   ; 4 cyc
+            lda mul_lobyte,x      ; 4 cyc
            ; - a^2/2
            ldx mul_factor_a      ; 3 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
+            sbc mul_lobyte,x      ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
+            sbc mul_hibyte,x      ; 4 cyc
            sta mul_product_hi    ; 3 cyc
            ; + x & a & 1:
@ -582,10 +574,10 @@ bank_switch_table:
            ; - x^2/2
        small_product:
            sec                   ; 2 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
+            sbc mul_lobyte,x      ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
+            sbc mul_hibyte,x      ; 4 cyc
            sta mul_product_hi    ; 3 cyc
        .endscope
    .endif
@ -792,18 +784,14 @@ arg2_pos:
        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
        sqr8 result, arg
-        lda #0
+        sqr8 result + 2, arg + 1
        sta result + 2
        sta result + 3
        imul8 inter, arg + 1, arg, xe
-        add16 result + 1, result + 1, inter
+        shl16 inter
        add_carry result + 3
        add16 result + 1, result + 1, inter
        add_carry result + 3
        sqr8_add16 result + 2, arg + 1
        rts ; 6 cyc
    .endscope
 .endmacro
--- a/tables.js
+++ b/tables.js
@ -11,40 +11,19 @@ function db(func) {
    return lines.join('\n');
 }
 let squares = [];
 for (let i = 0; i < 512; i++) {
    squares.push(Math.trunc((i * i + 1) / 2));
 }
 console.log(
 `.segment "TABLES"
-.export mul_lobyte256
+.export mul_lobyte
-.export mul_hibyte256
+.export mul_hibyte
 .export mul_hibyte512
 .export sqr_lobyte
 .export sqr_hibyte
-; (i * i + 1) / 2 for the multiplier
+; (i * i) / 2 for the multiplier
 .align 256
-mul_lobyte256:
+mul_lobyte:
-${db((i) => squares[i] & 0xff)}
+${db((i) => ((i * i) >> 1) & 0xff)}
 .align 256
-mul_hibyte256:
+mul_hibyte:
-${db((i) => (squares[i] >> 8) & 0xff)}
+${db((i) => ((i * i) >> 9) & 0xff)}
 .align 256
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 ; (i * i) for the plain squares
 .align 256
 sqr_lobyte:
 ${db((i) => (i * i) & 0xff)}
 .align 256
 sqr_hibyte:
 ${db((i) => ((i * i) >> 8) & 0xff)}
 `);
Author	SHA1	Message	Date
Jamey Sharp	3553ce986f	shave some cycles off 16-bit squaring with shift instead of add; also fix the comments about how many cycles shift takes	2024-12-31 15:29:40 -08:00
Jamey Sharp	0f49760aa5	unify tables for squaring and multiplication	2024-12-31 02:26:24 -08:00
Jamey Sharp	f06aed0c00	set results from both 8-bit squares first. Since the results from the lo and hi squares don't overlap or overflow, they can be written directly to the final output location without doing any addition. Then only the multiplication that goes in the middle needs any adds.	2024-12-31 02:22:31 -08:00
Jamey Sharp	aee587388d	eliminate mul_hibyte512 table. This costs an extra half cycle on average, assuming uniform distribution of multiplication inputs. I don't think a half cycle is worth an extra 256-byte table.	2024-12-31 02:01:45 -08:00