fix fix

fix panning for 32-bi
update docs for 32-bit intermediates
2024-12-31 15:03:43 -08:00 · 2024-12-31 14:45:38 -08:00 · 2024-12-31 14:16:43 -08:00 · 2024-12-31 13:54:53 -08:00 · 2024-12-31 09:53:22 -08:00 · 2024-12-31 09:09:11 -08:00
4 changed files with 248 additions and 140 deletions
--- a/mandel.s
+++ b/mandel.s
@ -1,43 +1,42 @@
 ; Our zero-page vars
-sx    = $80     ; i16: screen pixel x
-sy    = $82     ; i16: screen pixel y
-ox    = $84     ; fixed4.12: center point x
-oy    = $86     ; fixed4.12: center point y
-cx    = $88     ; fixed4.12: c_x
-cy    = $8a     ; fixed4.12: c_y
-zx    = $8c     ; fixed4.12: z_x
-zy    = $8e     ; fixed4.12: z_y
+ox              = $80 ; fixed8.24: center point x
+oy              = $84 ; fixed8.24: center point y
+cx              = $88 ; fixed8.24: c_x
+cy              = $8c ; fixed8.24: c_y

-zx_2  = $90     ; fixed4.12: z_x^2
-zy_2  = $92     ; fixed4.12: z_y^2
-zx_zy = $94     ; fixed4.12: z_x * z_y
-dist  = $96     ; fixed4.12: z_x^2 + z_y^2
+zx              = $90 ; fixed8.24: z_x
+zy              = $94 ; fixed8.24: z_y
+zx_2            = $98 ; fixed8.24: z_x^2
+zy_2            = $9c ; fixed8.24: z_y^2

-iter          = $a0 ; u8: iteration count
+zx_zy           = $a0 ; fixed8.24: z_x * z_y
+dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
+sx              = $a8 ; i16: screen pixel x
+sy              = $aa ; i16: screen pixel y
+z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
+z_buffer_start  = $ad ; u8: index into z_buffer
+z_buffer_end    = $ae ; u8: index into z_buffer
+iter            = $af ; u8: iteration count

-zoom          = $a1 ; u8: zoom shift level
-count_frames  = $a2 ; u8
-count_pixels  = $a3 ; u8
-total_ms      = $a4 ; float48
-total_pixels  = $aa ; float48
+ptr             = $b0 ; u16
+pixel_ptr       = $b2 ; u16
+zoom            = $b4 ; u8: zoom shift level
+fill_level      = $b5 ; u8
+pixel_color     = $b6 ; u8
+pixel_mask      = $b7 ; u8
+pixel_shift     = $b8 ; u8
+pixel_offset    = $b9 ; u8
+palette_offset  = $ba ; u8
+chroma_offset   = $bb ; u8
+palette_ticks   = $bc ; u8
+chroma_ticks    = $bd ; u8
+count_frames    = $be ; u8
+count_pixels    = $bf ; u8

-z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
-z_buffer_start  = $b1 ; u8: index into z_buffer
-z_buffer_end    = $b2 ; u8: index into z_buffer
-temp            = $b4 ; u16
-temp2           = $b6 ; u16
-pixel_ptr       = $b8 ; u16
-pixel_color     = $ba ; u8
-pixel_mask      = $bb ; u8
-pixel_shift     = $bc ; u8
-pixel_offset    = $bd ; u8
-fill_level      = $be ; u8
-palette_offset  = $bf ; u8
-
-palette_ticks = $c0 ; u8
-chroma_ticks  = $c1 ; u8
-chroma_offset = $c2 ; u8
-ptr           = $c4 ; u16
+total_pixels    = $c0 ; float48
+total_ms        = $c6 ; float48
+temp            = $cc ; u16
+temp2           = $ce ; u16

 palette_delay = 23
 chroma_delay = 137
@ -129,8 +128,11 @@ KEY_0     = 50
    mantissa .byte 5
 .endstruct

-.import mul_lobyte
-.import mul_hibyte
+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+.import sqr_lobyte
+.import sqr_hibyte

 .data

@ -290,16 +292,16 @@ viewport_zoom:
    .byte 6

 viewport_ox:
-    .word $0000
-    .word $f110
-    .word $f110
-    .word $e400
+    .dword $00000000
+    .dword $ff110000
+    .dword $ff110000
+    .dword $fe400000

 viewport_oy:
-    .word $0000
-    .word $fb60
-    .word $fbe0
-    .word $0000
+    .dword $00000000
+    .dword $ffb60000
+    .dword $ffbe0000
+    .dword $00000000

 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@ -318,7 +320,7 @@ viewport_oy:

 ; 38 cycles
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg2, dest
+    add 4, dest, arg1, arg2
 .endmacro

 ; 8 cycles
@ -348,7 +350,7 @@ viewport_oy:
    sub 4, dest, arg1, arg2
 .endmacro

-; 3 + 5 * (bytes - 1) cycles
+; 3 + 5 * bytes cycles
 .macro shl bytes, arg
    asl arg              ; 3 cyc
    .repeat bytes-1, i
@ -356,17 +358,17 @@ viewport_oy:
    .endrepeat
 .endmacro

-; 8 cycles
+; 13 cycles
 .macro shl16 arg
    shl 2, arg
 .endmacro

-; 13 cycles
+; 18 cycles
 .macro shl24 arg
    shl 3, arg
 .endmacro

-; 18 cycles
+; 23 cycles
 .macro shl32 arg
    shl 4, arg
 .endmacro
@ -423,32 +425,45 @@ viewport_oy:
    round16 arg ; 11-27 cycles
 .endmacro

-.macro imul16_round dest, arg1, arg2, shift
+; input: arg1, arg2 as fixed4.12
+; output: dest as fixed8.24
+.macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
    jsr imul16_func   ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
-    copy16 dest, FR2 + 2  ; 12 cyc
+    copy32 dest, FR2  ; 24 cyc
 .endmacro

-.macro sqr16_round dest, arg, shift
-    ;imul16_round dest, arg, arg, shift
+; input: arg as fixed4.12
+; output: dest as fixed8.24
+.macro sqr16 dest, arg
    copy16 FR0, arg   ; 12 cyc
-    jsr sqr16_func      ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
-    copy16 dest, FR2 + 2  ; 12 cyc
+    jsr sqr16_func    ; ? cyc
+    copy32 dest, FR2  ; 24 cyc
 .endmacro

+; input: arg as u8
+; output: dest as u16
 ; clobbers a, x
 .macro sqr8 dest, arg
    ldx arg
-    txa
-    lsr
-    lda mul_lobyte,x
-    rol
+    lda sqr_lobyte,x
    sta dest
-    lda mul_hibyte,x
-    rol
+    lda sqr_hibyte,x
+    sta dest + 1
+.endmacro
+
+; input: arg as u8
+; input/output: dest as u16
+; clobbers a, x
+.macro sqr8_add16 dest, arg
+    ldx arg
+    clc
+    lda sqr_lobyte,x
+    adc dest
+    sta dest
+    lda sqr_hibyte,x
+    adc dest + 1
    sta dest + 1
 .endmacro

@ -537,25 +552,22 @@ bank_switch_table:
            clc                   ; 2 cyc         
            adc mul_factor_x      ; 3 cyc
            tax                   ; 2 cyc
-            lda mul_hibyte,x      ; 4 cyc
-            bcc next              ; 2 cyc
-            ; carry is set so we get to add 1 for free, but need to add 0x80
-            adc #$7f              ; 2 cyc
-            clc                   ; 2 cyc
-            ; stash the sum temporarily so we can use it as an operand to add
-            stx mul_product_lo    ; 3 cyc
-            adc mul_product_lo    ; 3 cyc
-        next:
+            bcc under256          ; 2 cyc
+            lda mul_hibyte512,x   ; 4 cyc
+            bcs next              ; 2 cyc
+        under256:
+            lda mul_hibyte256,x   ; 4 cyc
            sec                   ; 2 cyc
+        next:
            sta mul_product_hi    ; 3 cyc
-            lda mul_lobyte,x      ; 4 cyc
+            lda mul_lobyte256,x   ; 4 cyc

            ; - a^2/2
            ldx mul_factor_a      ; 3 cyc
-            sbc mul_lobyte,x      ; 4 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte,x      ; 4 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc

            ; + x & a & 1:
@ -574,10 +586,10 @@ bank_switch_table:
            ; - x^2/2
        small_product:
            sec                   ; 2 cyc
-            sbc mul_lobyte,x      ; 4 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte,x      ; 4 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc
        .endscope
    .endif
@ -784,14 +796,18 @@ arg2_pos:
        ; h*h*256*256 + h*l*256 + h*l*256 + l*l

        sqr8 result, arg
-        sqr8 result + 2, arg + 1
+        lda #0
+        sta result + 2
+        sta result + 3

        imul8 inter, arg + 1, arg, xe
-        shl16 inter
+        add16 result + 1, result + 1, inter
        add_carry result + 3
        add16 result + 1, result + 1, inter
        add_carry result + 3

+        sqr8_add16 result + 2, arg + 1
+
        rts ; 6 cyc
    .endscope
 .endmacro
@ -859,8 +875,8 @@ next:

 .proc mandelbrot
    ; input:
-    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; cy: position scaled to 4.12
+    ; cx: position scaled to 8.24 fixed point, range -128..+127.9
+    ; cy: position scaled to 8.24
    ;
    ; output:
    ; iter: iteration count at escape or 0
@ -872,12 +888,41 @@ next:
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
+;    lda #00
+;    ldx #(iter - zx + 1)
+;initloop:
+;    sta zx - 1,x
+;    dex
+;    bne initloop
+;    sta z_buffer_start
+;    sta z_buffer_end
+
    lda #00
-    ldx #(iter - zx + 1)
-initloop:
-    sta zx - 1,x
-    dex
-    bne initloop
+    sta zx
+    sta zx + 1
+    sta zx + 2
+    sta zx + 3
+    sta zy
+    sta zy + 1
+    sta zy + 2
+    sta zy + 3
+    sta zx_2
+    sta zx_2 + 1
+    sta zx_2 + 2
+    sta zx_2 + 3
+    sta zy_2
+    sta zy_2 + 1
+    sta zy_2 + 2
+    sta zy_2 + 3
+    sta zx_zy
+    sta zx_zy + 1
+    sta zx_zy + 2
+    sta zx_zy + 3
+    sta dist
+    sta dist + 1
+    sta dist + 2
+    sta dist + 3
+    sta iter
    sta z_buffer_start
    sta z_buffer_end

@ -889,6 +934,8 @@ loop:
 keep_going:

    .macro quick_exit arg, max
+        ; arg: fixed8.24
+        ; max: integer
        .local positive
        .local negative
        .local nope_out
@ -896,51 +943,61 @@ keep_going:
        .local all_done

        ; check sign bit
-        lda arg + 1
+        lda arg + 3
        bmi negative

    positive:
-        cmp #((max) << 4)
+        cmp #max
        bmi all_done ; 'less than'
        jmp exit_path

    negative:
-        cmp #(256 - ((max) << 4))
+        cmp #(256 - max)
        beq first_equal ; 'equal' on first byte
        bpl all_done    ; 'greater than'

    nope_out:
        jmp exit_path
-    
+
    first_equal:
+        ; following bytes all 0 shows it's really 'equal'
+        lda arg + 2
+        bne all_done
+        lda arg + 1
+        bne all_done
        lda arg
-        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
+        bne all_done
+        jmp exit_path

    all_done:
    .endmacro

-    ; 4.12: (-8 .. +7.9)
+    ; 8.24: (-128 .. 127.9)
    ; zx = zx_2  - zy_2  + cx
-    sub16 zx, zx_2, zy_2
-    add16 zx, zx, cx
+    sub32 zx, zx_2, zy_2
+    add32 zx, zx, cx
    quick_exit zx, 2

    ; zy = zx_zy + zx_zy + cy
-    add16 zy, zx_zy, zx_zy
-    add16 zy, zy, cy
+    add32 zy, zx_zy, zx_zy
+    add32 zy, zy, cy
    quick_exit zy, 2

+    ; convert 8.24 -> 4.12: (-8 .. +7.9)
+    shift_round_16 zx, 4
+    shift_round_16 zy, 4
+
    ; zx_2 = zx * zx
-    sqr16_round zx_2, zx, 4
+    sqr16 zx_2, zx + 2

    ; zy_2 = zy * zy
-    sqr16_round zy_2, zy, 4
+    sqr16 zy_2, zy + 2

    ; zx_zy = zx * zy
-    imul16_round zx_zy, zx, zy, 4
+    imul16 zx_zy, zx + 2, zy + 2

    ; dist = zx_2 + zy_2
-    add16 dist, zx_2, zy_2
+    add32 dist, zx_2, zy_2
    quick_exit dist, 4

    ; if may be in the lake, look for looping output with a small buffer
@ -977,10 +1034,10 @@ z_buffer_loop:

    ; Compare the previously stored z values
    ldy #0
-    z_compare zx
-    z_compare zx + 1
-    z_compare zy
-    z_compare zy + 1
+    z_compare zx + 2
+    z_compare zx + 3
+    z_compare zy + 2
+    z_compare zy + 3

    cpy #4
    bne z_no_matches
@ -995,10 +1052,10 @@ z_no_matches:
 z_nothing_to_read:

    ; Store and expand
-    z_store zx
-    z_store zx + 1
-    z_store zy
-    z_store zy + 1
+    z_store zx + 2
+    z_store zx + 3
+    z_store zy + 2
+    z_store zy + 3
    z_advance
    stx z_buffer_end

@ -1049,14 +1106,17 @@ cont:
 enough:
 .endmacro

-.macro zoom_factor dest, src, zoom, aspect
+.macro zoom_factor dest, src, aspect
+    ; output: dest: fixed8.24
+    ; input: src: i16 screen coordinate (sx/sy at the call sites); presumably scaled to fixed4.12 by scale_zoom - TODO confirm
+    ; aspect: fixed4.12
    ; clobbers A, X, flags, etc
    copy16 dest, src
    scale_zoom dest

    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
-    imul16_round dest, dest, aspect, 4
+    imul16 dest, dest, aspect
 .endmacro

 .proc pset
@ -1281,12 +1341,15 @@ skip_luma:
    cpy #KEY_MINUS
    beq minus

-    ; temp = $0010 << (8 - zoom)
-    lda #$10
-    sta temp
+    ; temp+temp2 = $00010000 << (8 - zoom)
    lda #$00
+    sta temp
    sta temp + 1
-    scale_zoom temp
+    lda #$01
+    sta temp + 2
+    lda #$00
+    sta temp + 3
+    scale_zoom temp + 2

    cpy #KEY_UP
    beq up
@ -1296,14 +1359,7 @@ skip_luma:
    beq left
    cpy #KEY_RIGHT
    beq right
-    cpy #KEY_1
-    beq one
-    cpy #KEY_2
-    beq two
-    cpy #KEY_3
-    beq three
-    cpy #KEY_4
-    beq four
+    jmp number_keys
 
 skip_char:
    lda #0
@ -1322,17 +1378,29 @@ minus:
    dec zoom
    jmp done
 up:
-    sub16 oy, oy, temp 
+    sub32 oy, oy, temp
    jmp done
 down:
-    add16 oy, oy, temp
+    add32 oy, oy, temp
    jmp done
 left:
-    sub16 ox, ox, temp
+    sub32 ox, ox, temp
    jmp done
 right:
-    add16 ox, ox, temp
+    add32 ox, ox, temp
    jmp done
+
+number_keys:
+    cpy #KEY_1
+    beq one
+    cpy #KEY_2
+    beq two
+    cpy #KEY_3
+    beq three
+    cpy #KEY_4
+    beq four
+    jmp skip_char
+
 one:
    ldx #0
    jmp load_key_viewport
@ -1394,17 +1462,32 @@ zero_byte_loop:

    txa
    asl a
+    asl a
+
    tax
    lda viewport_ox,x
    sta ox
    lda viewport_oy,x
    sta oy
+
    inx
    lda viewport_ox,x
    sta ox + 1
    lda viewport_oy,x
    sta oy + 1

+    inx
+    lda viewport_ox,x
+    sta ox + 2
+    lda viewport_oy,x
+    sta oy + 2
+
+    inx
+    lda viewport_ox,x
+    sta ox + 3
+    lda viewport_oy,x
+    sta oy + 3
+
    rts
 .endproc

@ -1526,10 +1609,10 @@ skipped_mask:
 not_skipped_mask:

    ; run the fractal!
-    zoom_factor cx, sx, zoom, aspect_x
-    add16 cx, cx, ox
-    zoom_factor cy, sy, zoom, aspect_y
-    add16 cy, cy, oy
+    zoom_factor cx, sx, aspect_x
+    add32 cx, cx, ox
+    zoom_factor cy, sy, aspect_y
+    add32 cy, cy, oy
    jsr mandelbrot
    jsr pset

--- a/readme.md
+++ b/readme.md
@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication

-The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
+The Mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13 with 6.26 intermediates.

 Iterations are capped at 255.

@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e

 ## Todo

-See ideas in `todo.md`.
+See ideas in `todo.md`.
--- a/tables.js
+++ b/tables.js
@ -11,19 +11,40 @@ function db(func) {
    return lines.join('\n');
 }

+let squares = [];
+for (let i = 0; i < 512; i++) {
+    squares.push(Math.trunc((i * i + 1) / 2));
+}
+
 console.log(
 `.segment "TABLES"

-.export mul_lobyte
-.export mul_hibyte
+.export mul_lobyte256
+.export mul_hibyte256
+.export mul_hibyte512
+.export sqr_lobyte
+.export sqr_hibyte

-; (i * i) / 2 for the multiplier
+; (i * i + 1) / 2 for the multiplier
 .align 256
-mul_lobyte:
-${db((i) => ((i * i) >> 1) & 0xff)}
+mul_lobyte256:
+${db((i) => squares[i] & 0xff)}

 .align 256
-mul_hibyte:
-${db((i) => ((i * i) >> 9) & 0xff)}
+mul_hibyte256:
+${db((i) => (squares[i] >> 8) & 0xff)}
+
+.align 256
+mul_hibyte512:
+${db((i) => (squares[i + 256] >> 8) & 0xff)}
+
+; (i * i) for the plain squares
+.align 256
+sqr_lobyte:
+${db((i) => (i * i) & 0xff)}
+
+.align 256
+sqr_hibyte:
+${db((i) => ((i * i) >> 8) & 0xff)}

 `);
--- a/todo.md
+++ b/todo.md
@ -1,9 +1,13 @@
 things to try:

+* skip add on the top-byte multiply in sqr8/mul8
+  * should save a few cycles, suggestion by jamey
+
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D

 * try 3.13 fixed point instead of 4.12 for more precision
  * can we get away without the extra bit?
+  * I think so, since the exit comparisons would then be done in 6.26 space

 * y-axis mirror optimization
Author	SHA1	Message	Date
Brooke Vibber	d8601bb856	fix fix	2024-12-31 15:03:43 -08:00
Brooke Vibber	7985ea9a39	fix panning for 32-bi	2024-12-31 14:45:38 -08:00
Brooke Vibber	cc83c76706	update docs for 32-bit intermediates	2024-12-31 14:16:43 -08:00
Brooke Vibber	2e8893fd78	haha fuck me	2024-12-31 13:54:53 -08:00
Brooke Vibber	81bf7f3c43	tweak	2024-12-31 09:53:22 -08:00
Brooke Vibber	1e0f577e09	wip	2024-12-31 09:09:11 -08:00
Brooke Vibber	d2f41f9644	wip	2024-12-31 09:02:42 -08:00
Brooke Vibber	2fcb30b76a	wip	2024-12-31 08:56:59 -08:00
Brooke Vibber	13257309dc	init fix	2024-12-31 08:34:02 -08:00
Brooke Vibber	7184b8e03f	wip	2024-12-31 08:24:47 -08:00
Brooke Vibber	4a1e35699a	wip	2024-12-31 08:24:44 -08:00
Brooke Vibber	0d086a179c	wip	2024-12-31 08:23:04 -08:00
Brooke Vibber	61eb1aaf21	notes	2024-12-31 05:11:26 -08:00