diff --git a/mandel.s b/mandel.s
index f4862c6..be1f59e 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,17 +1,17 @@
 ; Our zero-page vars
 sx    = $80     ; i16: screen pixel x
 sy    = $82     ; i16: screen pixel y
-ox    = $84     ; fixed4.12: center point x
-oy    = $86     ; fixed4.12: center point y
-cx    = $84     ; fixed4.12: c_x
-cy    = $86     ; fixed4.12: c_y
-zx    = $88     ; fixed4.12: z_x
-zy    = $8a     ; fixed4.12: z_y
+ox    = $84     ; fixed3.13: center point x
+oy    = $86     ; fixed3.13: center point y
+cx    = $84     ; fixed3.13: c_x (same address as ox)
+cy    = $86     ; fixed3.13: c_y (same address as oy)
+zx    = $88     ; fixed3.13: z_x
+zy    = $8a     ; fixed3.13: z_y
 
-zx_2  = $90     ; fixed8.24: z_x^2
-zy_2  = $94     ; fixed8.24: z_y^2
-zx_zy = $98     ; fixed8.24: z_x * z_y
-dist  = $9c     ; fixed8.24: z_x^2 + z_y^2
+zx_2  = $90     ; fixed6.26: z_x^2
+zy_2  = $94     ; fixed6.26: z_y^2
+zx_zy = $98     ; fixed6.26: z_x * z_y
+dist  = $9c     ; fixed6.26: z_x^2 + z_y^2
 
 iter  = $a0     ; u8: iteration count
 zoom  = $a1     ; u8: zoom shift level
@@ -42,6 +42,8 @@ half_height = height >> 1
 width = 160
 half_width = width >> 1
 stride = width >> 2
+width_ratio_3_13 = (5 << 11) ; 5/4
+height_ratio_3_13 = (3 << 11) ; 3/4
 
 DMACTL = $D400
 DLISTL = $D402
@@ -99,12 +101,18 @@ aspect:
     ; 184h is the equiv of 220.8h at square pixels
     ; 320 / 220.8 = 1.45 display aspect ratio
 aspect_x:
-    .word 5 << (12 - 2)
+    .word width_ratio_3_13   ; 5/4 in fixed3.13
 
 aspect_y:
-    .word 3 << (12 - 2)
+    .word height_ratio_3_13  ; 3/4 in fixed3.13
 
 
+bit_masks:          ; one 2-bit-wide mask per 2-bit position within a byte
+    .byte 3         ; %00000011
+    .byte 3 << 2    ; %00001100
+    .byte 3 << 4    ; %00110000
+    .byte 3 << 6    ; %11000000
+
 display_list_start:
     ; 24 lines overscan
     .repeat 3
@@ -160,7 +168,7 @@ color_map:
 .endmacro
 
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg2, dest
+    add 4, dest, arg1, arg2   ; 4 bytes: dest = arg1 + arg2 (assumes add takes bytes, dest, arg1, arg2)
 .endmacro
 
 ; 2 + 9 * byte cycles
@@ -236,6 +244,21 @@ color_map:
     neg 4, arg
 .endmacro
 
+.macro extend_8_16 dest, src
+    ; sign-extend the byte at src into the 16-bit value at dest; clobbers A, X
+    ; 14-15 cycles, assuming zero-page operands and no page-crossing branch
+    .local positive
+    .local negative
+    ldx #0       ; 2 cyc: high byte for a non-negative value
+    lda src      ; 3 cyc
+    sta dest     ; 3 cyc
+    bpl positive ; 3 cyc taken, 2 cyc not taken
+negative:
+    dex          ; 2 cyc: X = $ff, high byte for a negative value
+positive:
+    stx dest + 1 ; 3 cyc
+.endmacro
+
 ; inner loop for imul16
 ; bitnum < 8: 25 or 41 cycles
 ; bitnum >= 8: 30 or 46 cycles
@@ -254,10 +277,10 @@ color_map:
     ; 5 cycles either way
     .if bitnum < 8
         lda arg1                 ; 3 cyc
-        and #(1 << (bitnum))       ; 2 cyc
+        and #(1 << bitnum)       ; 2 cyc
     .else
         lda arg1 + 1             ; 3 cyc
-        and #(1 << ((bitnum) - 8)) ; 2 cyc
+        and #(1 << (bitnum - 8)) ; 2 cyc
     .endif
     bne one ; 2 cyc
 
@@ -284,6 +307,7 @@ next:
         ror result ; 5 cyc
     .endif
 
+
 .endmacro
 
 ; 5 to 25 cycles
@@ -306,18 +330,11 @@ positive:
     copy32 dest, FR2  ; 24 cyc
 .endmacro
 
-.macro shift_round_16 arg, shift
-    .repeat shift
-        shl32 arg
-    .endrepeat
-    round16 arg
-.endmacro
-
-.macro imul16_round dest, arg1, arg2, shift
+.macro imul16_round dest, arg1, arg2
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
     jsr imul16_func   ; 470-780 cyc
-    shift_round_16 FR2, shift
+    round16 FR2       ; 5-28 cyc; NOTE(review): the pre-round shift was dropped - confirm FR2 + 2 is already in the caller's fixed-point format
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
@@ -421,60 +438,71 @@ next:
     ; dist = 0
     ; iter = 0
     lda #00
-    ldx #(iter - zx + 1)
+    ldx #(iter - zx + 1)   ; immediate byte count covering zx..iter inclusive
 initloop:
-    sta zx - 1,x
+    sta zx - 1,x           ; X counts down to 1, so bias by -1 to reach zx + 0
     dex
     bne initloop
 
 loop:
-    ; iter++ & max-iters break
-    inc iter
-    bne keep_going
+    ; 1939 - 3007 cyc
+
+    ; iter++ & max-iters break = 7 cyc
+    inc iter       ; 5 cyc
+    bne keep_going ; 2 cyc
     rts
 keep_going:
 
-    .macro quick_exit arg
-        .local keep_going
-        lda arg + 1
-        cmp #(4 << 4)
-        bmi keep_going
-        rts
-    keep_going:
-    .endmacro
-
     ; 4.12: (-8 .. +7.9)
-    ; zx = zx_2  - zy_2  + cx
+    ; zx = zx_2  - zy_2  + cx   = 3 * 20 = 60 cyc; NOTE(review): 16-bit reads start at zx_2 + 0 - confirm the rounded word is not at zx_2 + 2 (imul16_round copies from FR2 + 2)
     sub16 zx, zx_2, zy_2
     add16 zx, zx, cx
-    quick_exit zx
 
-    ; zy = zx_zy + zx_zy + cy
-    add16 zy, zx_zy, zx_zy
+    ; zy = zx_zy + zx_zy + cy   = 3 * 20 = 60 cyc
+    add16 zy, zx_zy, zx_zy      ; 2 * zx_zy: must add (sub16 of a value with itself is always 0)
     add16 zy, zy, cy
 
-    ; zx_2 = zx * zx
-    imul16_round zx_2, zx, zx, 4
-    quick_exit dist
+    ; 8.24: (-128 .. +127.9)
+    ; zx_2 = zx * zx            = 518 - 828 cyc
+    imul16 zx_2, zx, zx
 
-    ; zy_2 = zy * zy
-    imul16_round zy_2, zy, zy, 4
-    quick_exit dist
+    ; zy_2 = zy * zy            = 518 - 828 cyc
+    imul16 zy_2, zy, zy
 
-    ; zx_zy = zx * zy
-    imul16_round zx_zy, zx, zy, 4
-    quick_exit dist
+    ; zx_zy = zx * zy           = 518 - 828 cyc
+    imul16 zx_zy, zx, zy
 
-    ; dist = zx_2 + zy_2
-    add16 dist, zx_2, zy_2
-    quick_exit dist
+    ; dist = zx_2 + zy_2        = 38 cyc
+    add32 dist, zx_2, zy_2
+
+    ; if dist >= 4 break, else continue iterating = 7 cyc
+    lda dist + 3  ; 3 cyc: top byte of the 32-bit sum
+    cmp #4        ; 2 cyc; NOTE(review): this is 4.0 only if dist is 8.24 - the zero-page table declares 6.26, where 4.0 is 4 << 2; confirm
+    bmi still_in  ; 2 cyc (3 when taken)
+    rts
+still_in:
+
+    ; shift and round zx_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
+    .repeat 4      ; 60 cyc
+        shl24 zx_2 ; 15 cyc
+    .endrepeat
+    round16 zx_2   ; 5-28 cycles
+
+    ; shift and round zy_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
+    .repeat 4      ; 60 cyc
+        shl24 zy_2 ; 15 cyc
+    .endrepeat
+    round16 zy_2   ; 5-28 cycles
+
+    ; shift and round zx_zy to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
+    .repeat 4       ; 60 cyc
+        shl24 zx_zy ; 15 cyc
+    .endrepeat
+    round16 zx_zy   ; 5-28 cycles
 
     ; if may be in the lake, look for looping output with a small buffer
     ; as an optimization vs running to max iters
-    jmp loop
-
-peace_out:
-    rts
+    jmp loop ; 3 cycles
 
 .endproc
 
@@ -495,7 +523,7 @@ enough:
 
     ; cy = cy * (3 / 4)
     ; cx = cx * (5 / 4)
-    imul16_round dest, dest, aspect, 4
+    imul16_round dest, dest, aspect
 .endmacro
 
 .proc pset
@@ -556,9 +584,6 @@ point:
     ; pixel_mask <<= pixel_shift (shifting in ones)
     and #3
     sta pixel_shift
-    lda #3
-    sec
-    sbc pixel_shift
     tax
 shift_loop:
     beq shift_done
@@ -612,13 +637,9 @@ done:
     sta ox + 1
     sta oy
     sta oy + 1
-
-    ; zoom = 2x
-    lda #1
     sta zoom
 
     ; Disable display DMA
-    lda #0
     sta DMACTL
 
     ; zero the range from framebuffer_top to framebuffer_end