diff --git a/mandel.s b/mandel.s
index 2b1e61d..31097bf 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,14 +1,99 @@
+; Our zero-page vars (multi-byte values are stored low byte first)
+sx    = $80     ; 8 bits: screen pixel x
+sy    = $81     ; 8 bits: screen pixel y
+cx    = $82     ; 16 bits, 4.12 fixed point: x part of the point under test
+cy    = $84     ; 16 bits, 4.12 fixed point: y part of the point under test
+zx    = $86     ; 16 bits, 4.12 fixed point: x part of the iterated value
+zy    = $88     ; 16 bits, 4.12 fixed point: y part of the iterated value
+zx_2  = $8a     ; 32 bits fixed point: receives zx * zx
+zy_2  = $8e     ; 32 bits fixed point: receives zy * zy
+zx_zy = $92     ; 32 bits fixed point: receives zx * zy
+dist  = $96     ; 32 bits fixed point: receives zx_2 + zy_2 for the escape test
+iter  = $9a     ; 8 bits iteration count (zx..iter is indexed as one run by mandelbrot's init loop)
+
+temp  = $a0     ; debug temp area
+
 ; FP registers in zero page
 FR0 = $d4
 FRE = $da
 FR1 = $e0
 FR2 = $e6
-FRX = $ec
 
 .code
 
 .export start
 
+; dest = arg1 + arg2 over `bytes` bytes, low byte first; clobbers A; 2 + 9 * bytes cycles
+.macro add bytes, dest, arg1, arg2
+    clc                     ; no carry into the lowest byte (2 cyc)
+    .repeat bytes, ofs      ; 9 cyc per byte; carry chains into the next byte
+        lda arg1 + ofs
+        adc arg2 + ofs
+        sta dest + ofs
+    .endrepeat
+.endmacro
+
+.macro add16 dest, arg1, arg2
+    add 2, dest, arg1, arg2 ; dest = arg1 + arg2 over 2 bytes: 2 + 9 * 2 = 20 cyc
+.endmacro
+
+.macro add32 dest, arg1, arg2 ; dest = arg1 + arg2 over 4 bytes; clobbers A
+    add 4, dest, arg1, arg2 ; all 4 bytes, both sources: 2 + 9 * 4 = 38 cyc
+.endmacro
+
+; dest = arg1 - arg2 over `bytes` bytes, low byte first; clobbers A; 2 + 9 * bytes cycles
+.macro sub bytes, dest, arg1, arg2
+    sec                     ; carry set = no borrow into the lowest byte (2 cyc)
+    .repeat bytes, ofs      ; 9 cyc per byte; borrow chains into the next byte
+        lda arg1 + ofs
+        sbc arg2 + ofs
+        sta dest + ofs
+    .endrepeat
+.endmacro
+
+.macro sub16 dest, arg1, arg2
+    sub 2, dest, arg1, arg2 ; dest = arg1 - arg2 over 2 bytes: 2 + 9 * 2 = 20 cyc
+.endmacro
+
+.macro sub32 dest, arg1, arg2
+    sub 4, dest, arg1, arg2 ; dest = arg1 - arg2 over 4 bytes: 2 + 9 * 4 = 38 cyc
+.endmacro
+
+.macro shl bytes, arg ; shift the `bytes`-byte value at arg left one bit, in place
+    asl arg                 ; lowest byte: 0 shifted in, old bit 7 goes to carry
+    .repeat bytes-1, byte   ; byte = 0 .. bytes-2, so arg + byte + 1 walks bytes 1 .. bytes-1
+        rol arg + byte + 1  ; pull the carry from the byte below, pass bit 7 upward
+    .endrepeat
+.endmacro
+
+.macro shl16 arg
+    shl 2, arg ; instantiate shl for a 2-byte operand at arg
+.endmacro
+
+.macro shl24 arg
+    shl 3, arg ; instantiate shl for a 3-byte operand at arg
+.endmacro
+
+.macro shl32 arg
+    shl 4, arg ; instantiate shl for a 4-byte operand at arg
+.endmacro
+
+; dest = arg over `bytes` bytes; clobbers A; 6 * bytes cycles
+.macro copy bytes, dest, arg
+    .repeat bytes, ofs      ; one load/store pair per byte
+        lda arg + ofs       ; 3 cyc
+        sta dest + ofs      ; 3 cyc
+    .endrepeat
+.endmacro
+
+.macro copy16 dest, arg
+    copy 2, dest, arg ; dest = arg over 2 bytes: 6 * 2 = 12 cyc
+.endmacro
+
+.macro copy32 dest, arg
+    copy 4, dest, arg ; dest = arg over 4 bytes: 6 * 4 = 24 cyc
+.endmacro
+
 ; 2 + 8 * byte cycles
 .macro neg bytes, arg
     sec ; 2 cyc
@@ -92,9 +177,17 @@ next:
 positive:
 .endmacro
 
+; dest (32 bits) = arg1 * arg2 (16 bits each) via imul16_func; 518 - 828 cyc
+.macro imul16 dest, arg1, arg2
+    copy16 FR0, arg1  ; 12 cyc - FR0 is imul16_func's arg1 (clobbered by it)
+    copy16 FR1, arg2  ; 12 cyc - FR1 is imul16_func's arg2 (clobbered by it)
+    jsr imul16_func   ; 470-780
+    copy32 dest, FR2  ; 24 cyc - FR2 is imul16_func's 32-bit result
+.endmacro
+
 ; min 470 cycles
 ; max 780 cycles
-.proc imul16
+.proc imul16_func
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
@@ -128,7 +221,7 @@ positive_result:
     rts ; 6 cyc
 .endproc
 
-.macro round16_incdec arg
+.macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local zero
     .local one
@@ -178,61 +271,113 @@ next:
 
 
 
-.proc iter
-    ; still working on the fixed-point
-    ; should we just use 16-bit adds?
-    ; does that require extra rounding?
-    ; is the integer precision right?
+.proc mandelbrot
+    ; Iterate z = z^2 + c for c = (cx, cy) until dist >= 4 or iter wraps.
+    ; input:
+    ; cx, cy: position scaled to 4.12 fixed point - -8..+7.9
+    ;
+    ; output:
+    ; iter: iteration count at escape, or 0 if it wrapped without escaping
 
-    ; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)
     ; zx = 0
     ; zy = 0
     ; zx_2 = 0
     ; zy_2 = 0
     ; zx_zy = 0
+    ; dist = 0
+    ; iter = 0
+    lda #00
+    ldx #(iter - zx)  ; immediate: offset of the last byte of the zx..iter run
+initloop:
+    sta zx,x
+    dex
+    bpl initloop      ; bpl, not bne, so the x = 0 pass also clears zx's low byte
 
 loop:
-    ; 1652 - 2651 cyc
+    ; roughly 2000 - 3000 cyc per pass (estimate; the three multiplies dominate)
 
-    ; iters++ = 2 cyc
+    ; iter++ & max-iters break = 7 cyc
+    inc iter       ; 5 cyc
+    bne keep_going ; 2 cyc
+    rts            ; iter wrapped to 0: give up and report 0
+keep_going:
 
     ; 4.12: (-8 .. +7.9)
-    ; zx = zx_2  + zy_2  + cx   = 3 * 20 = 60 cyc
+    ; zx = zx_2  - zy_2  + cx   = 2 * 20 = 40 cyc (4.12 halves live at +2)
+    sub16 zx, zx_2 + 2, zy_2 + 2
+    add16 zx, zx, cx
+
     ; zy = zx_zy + zx_zy + cy   = 3 * 20 = 60 cyc
+    add16 zy, zx_zy + 2, zx_zy + 2
+    add16 zy, zy, cy
 
     ; 8.24: (-128 .. +127.9)
-    ; zx_2 = zx * zx            = 470 - 780 cyc
-    ; zy_2 = zy * zy            = 470 - 780 cyc
-    ; zx_zy = zx * zy           = 470 - 780 cyc
-    ; dist = zx_2 + zy_2        = 38 cyc
-    ; if dist >= 4 break, else continue iterating = 7 cyc
+    ; zx_2 = zx * zx            = 518 - 828 cyc
+    imul16 zx_2, zx, zx
 
-    ; shift and round zx_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
-    ; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
-    ; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
+    ; zy_2 = zy * zy            = 518 - 828 cyc
+    imul16 zy_2, zy, zy
+
+    ; zx_zy = zx * zy           = 518 - 828 cyc
+    imul16 zx_zy, zx, zy
+
+    ; dist = zx_2 + zy_2        = 38 cyc
+    add32 dist, zx_2, zy_2
+
+    ; if dist >= 4 break, else continue iterating = 7 cyc
+    lda dist + 3  ; 3 cyc - top byte = integer part of the 8.24 sum
+    cmp #4        ; 2 cyc
+    bcc still_in  ; 2 cyc - carry clear = unsigned A < 4
+    rts           ; escaped: iter holds the count
+still_in:
+
+    ; shift and round zx_2 to 4.12 = (80 + 5) - (80 + 28) = 85 - 108 cyc
+    .repeat 4      ; 80 cyc: << 4 moves the 4.12 bits into the top 16 bits
+        shl32 zx_2 ; 20 cyc
+    .endrepeat
+    round16 zx_2   ; 5-28 cycles
+
+    ; shift and round zy_2 to 4.12 = (80 + 5) - (80 + 28) = 85 - 108 cyc
+    .repeat 4      ; 80 cyc
+        shl32 zy_2 ; 20 cyc
+    .endrepeat
+    round16 zy_2   ; 5-28 cycles
+
+    ; shift and round zx_zy to 4.12 = (80 + 5) - (80 + 28) = 85 - 108 cyc
+    .repeat 4       ; 80 cyc
+        shl32 zx_zy ; 20 cyc
+    .endrepeat
+    round16 zx_zy   ; 5-28 cycles
 
     ; if may be in the lake, look for looping output with a small buffer
     ; as an optimization vs running to max iters
+    jmp loop ; 3 cycles
 
 .endproc
 
 .proc start
 
 looplong:
-    ; FR0 = 5
-    ; FR1 = -3
-    lda #5
-    sta FR0
-    lda #0
-    sta FR0 + 1
-    lda #$fd
-    sta FR1
+    ; cx = $f7ff, about -0.5 in 4.12 (values are low byte first: high byte goes to cx + 1)
+    lda #$f7
+    sta cx + 1
     lda #$ff
-    sta FR1 + 1
+    sta cx
 
-    jsr imul16
+    ; cy = $1000 = 1.0 in 4.12 (high byte goes to cy + 1)
+    lda #$10
+    sta cy + 1
+    lda #$00
+    sta cy
+
+    jsr mandelbrot
     ; should have 32-bit -15 in FR2
 
+    ; iter holds the result (the FR2 note above is left over from the old imul16 test); save it for debugging
+    lda iter
+    sta temp
+
 loop:
-    jmp loop
+    ; keep looping over so we can work in the debugger
+    jmp looplong
 .endproc