shave some cycles off 16-bit squaring with shift instead of add

also fix the comments about how many cycles shift takes
unify tables for squaring and multiplication
2024-12-31 15:29:40 -08:00 · 2024-12-31 02:26:24 -08:00 · 2024-12-31 02:22:31 -08:00 · 2024-12-31 02:01:45 -08:00
4 changed files with 144 additions and 252 deletions
--- a/mandel.s
+++ b/mandel.s
@ -1,42 +1,43 @@
 ; Our zero-page vars
-ox              = $80 ; fixed8.24: center point x
+sx    = $80     ; i16: screen pixel x
-oy              = $84 ; fixed8.24: center point y
+sy    = $82     ; i16: screen pixel y
-cx              = $88 ; fixed8.24: c_x
+ox    = $84     ; fixed4.12: center point x
-cy              = $8c ; fixed8.24: c_y
+oy    = $86     ; fixed4.12: center point y
 cx    = $88     ; fixed4.12: c_x
 cy    = $8a     ; fixed4.12: c_y
 zx    = $8c     ; fixed4.12: z_x
 zy    = $8e     ; fixed4.12: z_y
-zx              = $90 ; fixed8.24: z_x
+zx_2  = $90     ; fixed4.12: z_x^2
-zy              = $94 ; fixed8.24: z_y
+zy_2  = $92     ; fixed4.12: z_y^2
-zx_2            = $98 ; fixed8.24: z_x^2
+zx_zy = $94     ; fixed4.12: z_x * z_y
-zy_2            = $9c ; fixed8.24: z_y^2
+dist  = $96     ; fixed4.12: z_x^2 + z_y^2
-zx_zy           = $a0 ; fixed8.24: z_x * z_y
+iter          = $a0 ; u8: iteration count
 dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
 sx              = $a8 ; i16: screen pixel x
 sy              = $aa ; i16: screen pixel y
 z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $ad ; u8: index into z_buffer
 z_buffer_end    = $ae ; u8: index into z_buffer
 iter            = $af ; u8: iteration count
-ptr             = $b0 ; u16
+zoom          = $a1 ; u8: zoom shift level
-pixel_ptr       = $b2 ; u16
+count_frames  = $a2 ; u8
-zoom            = $b4 ; u8: zoom shift level
+count_pixels  = $a3 ; u8
-fill_level      = $b5 ; u8
+total_ms      = $a4 ; float48
-pixel_color     = $b6 ; u8
+total_pixels  = $aa ; float48
 pixel_mask      = $b7 ; u8
 pixel_shift     = $b8 ; u8
 pixel_offset    = $b9 ; u8
 palette_offset  = $ba ; u8
 chroma_offset   = $bb ; u8
 palette_ticks   = $bc ; u8
 chroma_ticks    = $bd ; u8
 count_frames    = $be ; u8
 count_pixels    = $bf ; u8
-total_pixels    = $c0 ; float48
+z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
-total_ms        = $c6 ; float48
+z_buffer_start  = $b1 ; u8: index into z_buffer
-temp            = $cc ; u16
+z_buffer_end    = $b2 ; u8: index into z_buffer
-temp2           = $ce ; u16
+temp            = $b4 ; u16
 temp2           = $b6 ; u16
 pixel_ptr       = $b8 ; u16
 pixel_color     = $ba ; u8
 pixel_mask      = $bb ; u8
 pixel_shift     = $bc ; u8
 pixel_offset    = $bd ; u8
 fill_level      = $be ; u8
 palette_offset  = $bf ; u8
 palette_ticks = $c0 ; u8
 chroma_ticks  = $c1 ; u8
 chroma_offset = $c2 ; u8
 ptr           = $c4 ; u16
 palette_delay = 23
 chroma_delay = 137
@ -128,11 +129,8 @@ KEY_0     = 50
    mantissa .byte 5
 .endstruct
-.import mul_lobyte256
+.import mul_lobyte
-.import mul_hibyte256
+.import mul_hibyte
 .import mul_hibyte512
 .import sqr_lobyte
 .import sqr_hibyte
 .data
@ -292,16 +290,16 @@ viewport_zoom:
    .byte 6
 viewport_ox:
-    .dword $00000000
+    .word $0000
-    .dword $ff110000
+    .word $f110
-    .dword $ff110000
+    .word $f110
-    .dword $fe400000
+    .word $e400
 viewport_oy:
-    .dword $00000000
+    .word $0000
-    .dword $ffb60000
+    .word $fb60
-    .dword $ffbe0000
+    .word $fbe0
-    .dword $00000000
+    .word $0000
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@ -320,7 +318,7 @@ viewport_oy:
 ; 38 cycles
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg1, arg2
+    add 4, dest, arg2, dest
 .endmacro
 ; 8 cycles
@ -350,7 +348,7 @@ viewport_oy:
    sub 4, dest, arg1, arg2
 .endmacro
-; 3 + 5 * bytes cycles
+; 3 + 5 * (bytes - 1) cycles
 .macro shl bytes, arg
    asl arg              ; 3 cyc
    .repeat bytes-1, i
@ -358,17 +356,17 @@ viewport_oy:
    .endrepeat
 .endmacro
-; 13 cycles
+; 8 cycles
 .macro shl16 arg
    shl 2, arg
 .endmacro
-; 18 cycles
+; 13 cycles
 .macro shl24 arg
    shl 3, arg
 .endmacro
-; 23 cycles
+; 18 cycles
 .macro shl32 arg
    shl 4, arg
 .endmacro
@ -425,45 +423,32 @@ viewport_oy:
    round16 arg ; 11-27 cycles
 .endmacro
-; input: arg1, arg2 as fixed4.12
+.macro imul16_round dest, arg1, arg2, shift
 ; output: dest as fixed8.24
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
    jsr imul16_func   ; ? cyc
-    copy32 dest, FR2  ; 24 cyc
+    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
-; input: arg as fixed4.12
+.macro sqr16_round dest, arg, shift
-; output: dest as fixed8.24
+    ;imul16_round dest, arg, arg, shift
 .macro sqr16 dest, arg
    copy16 FR0, arg   ; 12 cyc
    jsr sqr16_func      ; ? cyc
-    copy32 dest, FR2  ; 24 cyc
+    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 ; input: arg as u8
 ; output: dest as u16
 ; clobbers a, x
 .macro sqr8 dest, arg
    ldx arg
-    lda sqr_lobyte,x
+    txa
    lsr
    lda mul_lobyte,x
    rol
    sta dest
-    lda sqr_hibyte,x
+    lda mul_hibyte,x
-    sta dest + 1
+    rol
 .endmacro
 ; input: arg as u8
 ; input/output: dest as u16
 ; clobbers a, x
 .macro sqr8_add16 dest, arg
    ldx arg
    clc
    lda sqr_lobyte,x
    adc dest
    sta dest
    lda sqr_hibyte,x
    adc dest + 1
    sta dest + 1
 .endmacro
@ -552,22 +537,25 @@ bank_switch_table:
            clc                   ; 2 cyc         
            adc mul_factor_x      ; 3 cyc
            tax                   ; 2 cyc
-            bcc under256          ; 2 cyc
+            lda mul_hibyte,x      ; 4 cyc
-            lda mul_hibyte512,x   ; 4 cyc
+            bcc next              ; 2 cyc
-            bcs next              ; 2 cyc
+            ; carry is set so we get to add 1 for free, but need to add 0x80
-        under256:
+            adc #$7f              ; 2 cyc
-            lda mul_hibyte256,x   ; 4 cyc
+            clc                   ; 2 cyc
-            sec                   ; 2 cyc
+            ; stash the sum temporarily so we can use it as an operand to add
            stx mul_product_lo    ; 3 cyc
            adc mul_product_lo    ; 3 cyc
        next:
            sec                   ; 2 cyc
            sta mul_product_hi    ; 3 cyc
-            lda mul_lobyte256,x   ; 4 cyc
+            lda mul_lobyte,x      ; 4 cyc
            ; - a^2/2
            ldx mul_factor_a      ; 3 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
+            sbc mul_lobyte,x      ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
+            sbc mul_hibyte,x      ; 4 cyc
            sta mul_product_hi    ; 3 cyc
            ; + x & a & 1:
@ -586,10 +574,10 @@ bank_switch_table:
            ; - x^2/2
        small_product:
            sec                   ; 2 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
+            sbc mul_lobyte,x      ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
+            sbc mul_hibyte,x      ; 4 cyc
            sta mul_product_hi    ; 3 cyc
        .endscope
    .endif
@ -796,18 +784,14 @@ arg2_pos:
        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
        sqr8 result, arg
-        lda #0
+        sqr8 result + 2, arg + 1
        sta result + 2
        sta result + 3
        imul8 inter, arg + 1, arg, xe
-        add16 result + 1, result + 1, inter
+        shl16 inter
        add_carry result + 3
        add16 result + 1, result + 1, inter
        add_carry result + 3
        sqr8_add16 result + 2, arg + 1
        rts ; 6 cyc
    .endscope
 .endmacro
@ -875,8 +859,8 @@ next:
 .proc mandelbrot
    ; input:
-    ; cx: position scaled to 8.24 fixed point - -128..+127.9
+    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; cy: position scaled to 8.24
+    ; cy: position scaled to 4.12
    ;
    ; output:
    ; iter: iteration count at escape or 0
@ -888,41 +872,12 @@ next:
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
 ;    lda #00
 ;    ldx #(iter - zx + 1)
 ;initloop:
 ;    sta zx - 1,x
 ;    dex
 ;    bne initloop
 ;    sta z_buffer_start
 ;    sta z_buffer_end
    lda #00
-    sta zx
+    ldx #(iter - zx + 1)
-    sta zx + 1
+initloop:
-    sta zx + 2
+    sta zx - 1,x
-    sta zx + 3
+    dex
-    sta zy
+    bne initloop
    sta zy + 1
    sta zy + 2
    sta zy + 3
    sta zx_2
    sta zx_2 + 1
    sta zx_2 + 2
    sta zx_2 + 3
    sta zy_2
    sta zy_2 + 1
    sta zy_2 + 2
    sta zy_2 + 3
    sta zx_zy
    sta zx_zy + 1
    sta zx_zy + 2
    sta zx_zy + 3
    sta dist
    sta dist + 1
    sta dist + 2
    sta dist + 3
    sta iter
    sta z_buffer_start
    sta z_buffer_end
@ -934,8 +889,6 @@ loop:
 keep_going:
    .macro quick_exit arg, max
        ; arg: fixed8.24
        ; max: integer
        .local positive
        .local negative
        .local nope_out
@ -943,16 +896,16 @@ keep_going:
        .local all_done
        ; check sign bit
-        lda arg + 3
+        lda arg + 1
        bmi negative
    positive:
-        cmp #max
+        cmp #((max) << 4)
        bmi all_done ; 'less than'
        jmp exit_path
    negative:
-        cmp #(256 - max)
+        cmp #(256 - ((max) << 4))
        beq first_equal ; 'equal' on first byte
        bpl all_done    ; 'greater than'
@ -960,44 +913,34 @@ keep_going:
        jmp exit_path
    first_equal:
        ; following bytes all 0 shows it's really 'equal'
        lda arg + 2
        bne all_done
        lda arg + 1
        bne all_done
        lda arg
-        bne all_done
+        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
        jmp exit_path
    all_done:
    .endmacro
-    ; 8.24: (-128 .. 127.9)
+    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2  - zy_2  + cx
-    sub32 zx, zx_2, zy_2
+    sub16 zx, zx_2, zy_2
-    add32 zx, zx, cx
+    add16 zx, zx, cx
    quick_exit zx, 2
    ; zy = zx_zy + zx_zy + cy
-    add32 zy, zx_zy, zx_zy
+    add16 zy, zx_zy, zx_zy
-    add32 zy, zy, cy
+    add16 zy, zy, cy
    quick_exit zy, 2
    ; convert 8.24 -> 4.12: (-8 .. +7.9)
    shift_round_16 zx, 4
    shift_round_16 zy, 4
    ; zx_2 = zx * zx
-    sqr16 zx_2, zx + 2
+    sqr16_round zx_2, zx, 4
    ; zy_2 = zy * zy
-    sqr16 zy_2, zy + 2
+    sqr16_round zy_2, zy, 4
    ; zx_zy = zx * zy
-    imul16 zx_zy, zx + 2, zy + 2
+    imul16_round zx_zy, zx, zy, 4
    ; dist = zx_2 + zy_2
-    add32 dist, zx_2, zy_2
+    add16 dist, zx_2, zy_2
    quick_exit dist, 4
    ; if may be in the lake, look for looping output with a small buffer
@ -1034,10 +977,10 @@ z_buffer_loop:
    ; Compare the previously stored z values
    ldy #0
-    z_compare zx + 2
+    z_compare zx
-    z_compare zx + 3
+    z_compare zx + 1
-    z_compare zy + 2
+    z_compare zy
-    z_compare zy + 3
+    z_compare zy + 1
    cpy #4
    bne z_no_matches
@ -1052,10 +995,10 @@ z_no_matches:
 z_nothing_to_read:
    ; Store and expand
-    z_store zx + 2
+    z_store zx
-    z_store zx + 3
+    z_store zx + 1
-    z_store zy + 2
+    z_store zy
-    z_store zy + 3
+    z_store zy + 1
    z_advance
    stx z_buffer_end
@ -1106,17 +1049,14 @@ cont:
 enough:
 .endmacro
-.macro zoom_factor dest, src, aspect
+.macro zoom_factor dest, src, zoom, aspect
    ; output: dest: fixed4.12 (the 8.24 product is shifted/rounded back to 16 bits)
    ; input: src: fixed4.12
    ; aspect: fixed4.12
    ; clobbers A, X, flags, etc
    copy16 dest, src
    scale_zoom dest
    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
-    imul16 dest, dest, aspect
+    imul16_round dest, dest, aspect, 4
 .endmacro
 .proc pset
@ -1341,15 +1281,12 @@ skip_luma:
    cpy #KEY_MINUS
    beq minus
-    ; temp+temp2 = $00010000 << (8 - zoom)
+    ; temp = $0010 << (8 - zoom)
-    lda #$00
+    lda #$10
    sta temp
    sta temp + 1
    lda #$01
    sta temp + 2
    lda #$00
-    sta temp + 3
+    sta temp + 1
-    scale_zoom temp + 2
+    scale_zoom temp
    cpy #KEY_UP
    beq up
@ -1359,7 +1296,14 @@ skip_luma:
    beq left
    cpy #KEY_RIGHT
    beq right
-    jmp number_keys
+    cpy #KEY_1
    beq one
    cpy #KEY_2
    beq two
    cpy #KEY_3
    beq three
    cpy #KEY_4
    beq four
 skip_char:
    lda #0
@ -1378,29 +1322,17 @@ minus:
    dec zoom
    jmp done
 up:
-    sub32 oy, oy, temp
+    sub16 oy, oy, temp 
    jmp done
 down:
-    add32 oy, oy, temp
+    add16 oy, oy, temp
    jmp done
 left:
-    sub32 ox, ox, temp
+    sub16 ox, ox, temp
    jmp done
 right:
-    add32 ox, ox, temp
+    add16 ox, ox, temp
    jmp done
 number_keys:
    cpy #KEY_1
    beq one
    cpy #KEY_2
    beq two
    cpy #KEY_3
    beq three
    cpy #KEY_4
    beq four
    jmp skip_char
 one:
    ldx #0
    jmp load_key_viewport
@ -1462,32 +1394,17 @@ zero_byte_loop:
    txa
    asl a
    asl a
    tax
    lda viewport_ox,x
    sta ox
    lda viewport_oy,x
    sta oy
    inx
    lda viewport_ox,x
    sta ox + 1
    lda viewport_oy,x
    sta oy + 1
    inx
    lda viewport_ox,x
    sta ox + 2
    lda viewport_oy,x
    sta oy + 2
    inx
    lda viewport_ox,x
    sta ox + 3
    lda viewport_oy,x
    sta oy + 3
    rts
 .endproc
@ -1609,10 +1526,10 @@ skipped_mask:
 not_skipped_mask:
    ; run the fractal!
-    zoom_factor cx, sx, aspect_x
+    zoom_factor cx, sx, zoom, aspect_x
-    add32 cx, cx, ox
+    add16 cx, cx, ox
-    zoom_factor cy, sy, aspect_y
+    zoom_factor cy, sy, zoom, aspect_y
-    add32 cy, cy, oy
+    add16 cy, cy, oy
    jsr mandelbrot
    jsr pset
--- a/readme.md
+++ b/readme.md
@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
-The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
 Iterations are capped at 255.
--- a/tables.js
+++ b/tables.js
@ -11,40 +11,19 @@ function db(func) {
    return lines.join('\n');
 }
 let squares = [];
 for (let i = 0; i < 512; i++) {
    squares.push(Math.trunc((i * i + 1) / 2));
 }
 console.log(
 `.segment "TABLES"
-.export mul_lobyte256
+.export mul_lobyte
-.export mul_hibyte256
+.export mul_hibyte
 .export mul_hibyte512
 .export sqr_lobyte
 .export sqr_hibyte
-; (i * i + 1) / 2 for the multiplier
+; (i * i) / 2 for the multiplier
 .align 256
-mul_lobyte256:
+mul_lobyte:
-${db((i) => squares[i] & 0xff)}
+${db((i) => ((i * i) >> 1) & 0xff)}
 .align 256
-mul_hibyte256:
+mul_hibyte:
-${db((i) => (squares[i] >> 8) & 0xff)}
+${db((i) => ((i * i) >> 9) & 0xff)}
 .align 256
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 ; (i * i) for the plain squares
 .align 256
 sqr_lobyte:
 ${db((i) => (i * i) & 0xff)}
 .align 256
 sqr_hibyte:
 ${db((i) => ((i * i) >> 8) & 0xff)}
 `);
--- a/todo.md
+++ b/todo.md
@ -1,13 +1,9 @@
 things to try:
 * skip add on the top-byte multiply in sqr8/mul8
  * should save a few cycles, suggestion by jamey
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 * try 3.13 fixed point instead of 4.12 for more precision
  * can we get away without the extra bit?
  * I think so, since the exit comparison would then be done in 6.26 space
 * y-axis mirror optimization
Author	SHA1	Message	Date
Jamey Sharp	3553ce986f	shave some cycles off 16-bit squaring with shift instead of add; also fix the comments about how many cycles shift takes	2024-12-31 15:29:40 -08:00
Jamey Sharp	0f49760aa5	unify tables for squaring and multiplication	2024-12-31 02:26:24 -08:00
Jamey Sharp	f06aed0c00	set results from both 8-bit squares first. Since the results from the lo and hi squares don't overlap or overflow, they can be written directly to the final output location without doing any addition. Then only the multiplication that goes in the middle needs any adds.	2024-12-31 02:22:31 -08:00
Jamey Sharp	aee587388d	eliminate mul_hibyte512 table. This costs an extra half cycle on average, assuming uniform distribution of multiplication inputs. I don't think a half cycle is worth an extra 256-byte table.	2024-12-31 02:01:45 -08:00