diff --git a/mandel.s b/mandel.s
index 50213ad..622ff62 100644
--- a/mandel.s
+++ b/mandel.s
@@ -433,6 +433,13 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro imul16 dest, arg1, arg2
+    copy16 FR0, arg1  ; 12 cyc; FR0 = first 16-bit operand
+    copy16 FR1, arg2  ; 12 cyc; FR1 = second 16-bit operand
+    jsr imul16_func   ; ? cyc; expected to leave its 32-bit result in FR2 (presumably signed product - confirm)
+    copy32 dest, FR2  ; 24 cyc; dest = all 4 bytes of FR2, no shift or rounding in this macro
+.endmacro
+
 .macro sqr16_round dest, arg, shift
     ;imul16_round dest, arg, arg, shift
     copy16 FR0, arg   ; 12 cyc
@@ -441,6 +448,12 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro sqr16 dest, arg
+    copy16 FR0, arg   ; 12 cyc; FR0 = 16-bit operand
+    jsr sqr16_func    ; ? cyc; expected to leave its 32-bit result in FR2 (presumably arg squared - confirm)
+    copy32 dest, FR2  ; 24 cyc; dest = all 4 bytes of FR2 (sqr16_round keeps only FR2 + 2)
+.endmacro
+
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg
@@ -870,8 +883,8 @@ next:
 
 .proc mandelbrot
     ; input:
-    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; cy: position scaled to 4.12
+    ; cx: position scaled to 8.24 fixed point - -128..+127.9
+    ; cy: position scaled to 8.24
     ;
     ; output:
     ; iter: iteration count at escape or 0
@@ -909,10 +922,6 @@ next:
     sta zy_2 + 1
     sta zy_2 + 2
     sta zy_2 + 3
-    sta zx_zy
-    sta zx_zy + 1
-    sta zx_zy + 2
-    sta zx_zy + 3
     sta dist
     sta dist + 1
     sta dist + 2
@@ -929,6 +938,8 @@ loop:
 keep_going:
 
     .macro quick_exit arg, max
+        ; arg: fixed8.24, 4 bytes; arg + 3 is the signed integer byte
+        ; max: small integer bound; falls through only when -max < arg < max, else jmp exit_path
         .local positive
         .local negative
         .local nope_out
@@ -936,51 +947,61 @@ keep_going:
         .local all_done
 
         ; check sign bit
-        lda arg + 1
+        lda arg + 3  ; top byte: sign bit + integer part
         bmi negative
 
     positive:
-        cmp #((max) << 4)
+        cmp #(max)  ; parenthesized so an expression argument expands safely
         bmi all_done ; 'less than'
         jmp exit_path
 
     negative:
-        cmp #(256 - ((max) << 4))
+        cmp #(256 - (max))  ; two's-complement byte for -max
         beq first_equal ; 'equal' on first byte
         bpl all_done    ; 'greater than'
 
     nope_out:
         jmp exit_path
-    
+
     first_equal:
+        ; top byte == -max: value is exactly -max only if all 3 fraction bytes are 0
+        lda arg + 2
+        bne all_done
+        lda arg + 1
+        bne all_done
         lda arg
-        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
+        bne all_done
+        jmp exit_path
 
     all_done:
     .endmacro
 
-    ; 4.12: (-8 .. +7.9)
+    ; 8.24: (-128 .. 127.9) / (-8 .. +7.9)
     ; zx = zx_2  - zy_2  + cx
-    sub16 zx, zx_2, zy_2
-    add16 zx, zx, cx
+    sub32 zx, zx_2, zy_2
+    add32 zx, zx, cx
     quick_exit zx, 2
 
     ; zy = zx_zy + zx_zy + cy
-    add16 zy, zx_zy, zx_zy
-    add16 zy, zy, cy
+    add32 zy, zx_zy, zx_zy
+    add32 zy, zy, cy
     quick_exit zy, 2
 
+    ; convert 8.24 -> 4.12
+    shift_round_16 zx, 4
+    shift_round_16 zy, 4
+
     ; zx_2 = zx * zx
-    sqr16_round zx_2, zx, 4
+    sqr16 zx_2, zx + 2
 
     ; zy_2 = zy * zy
-    sqr16_round zy_2, zy, 4
+    sqr16 zy_2, zy + 2
 
     ; zx_zy = zx * zy
-    imul16_round zx_zy, zx, zy, 4
+    imul16 zx_zy, zx + 2, zy + 2
 
     ; dist = zx_2 + zy_2
-    add16 dist, zx_2, zy_2
+    add32 dist, zx_2, zy_2
     quick_exit dist, 4
 
     ; if may be in the lake, look for looping output with a small buffer
@@ -1090,13 +1111,17 @@ enough:
 .endmacro
 
 .macro zoom_factor dest, src, zoom, aspect
+    ; output: dest: fixed8.24 (4 bytes, written by imul16)
+    ; input: src: fixed4.12
+    ; input: zoom: u8 ??? (not referenced in this body; scale_zoom only receives dest)
+    ; input: aspect: fixed4.12
     ; clobbers A, X, flags, etc
     copy16 dest, src
     scale_zoom dest
 
     ; cy = cy * (3 / 4)
     ; cx = cx * (5 / 4)
-    imul16_round dest, dest, aspect, 4
+    imul16 dest, dest, aspect  ; low 16 bits of dest * aspect -> 32-bit dest (8.24 if both operands are 4.12)
 .endmacro
 
 .proc pset
@@ -1567,9 +1592,9 @@ not_skipped_mask:
 
     ; run the fractal!
     zoom_factor cx, sx, zoom, aspect_x
-    add16 cx, cx, ox
+    add32 cx, cx, ox
     zoom_factor cy, sy, zoom, aspect_y
-    add16 cy, cy, oy
+    add32 cy, cy, oy
     jsr mandelbrot
     jsr pset
 
diff --git a/todo.md b/todo.md
index 6fb0282..29217cd 100644
--- a/todo.md
+++ b/todo.md
@@ -3,7 +3,7 @@ things to try:
 * skip add on the top-byte multiply in sqr8/mul8
   * should save a few cycles, suggestion by jamey
 
-* perform the zx += zx^s + cx in 32-bit space, before rounding
+* perform the zx_next = zx^2 + cx in 32-bit space, before rounding
   * should improve precision on max zoom, might cost a few cycles
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D