diff --git a/mandel.s b/mandel.s
index 4ac8d4d..fc30532 100644
--- a/mandel.s
+++ b/mandel.s
@@ -262,7 +262,10 @@ palette_chroma_entries = 15
 
 .code
 
-z_buffer_len = 16
+;z_buffer_len = 16 ; 10.863 ms/px
+;z_buffer_len = 12 ; 10.619 ms/px -- NOTE(review): not a power of two; confirm z_buffer_mask (len - 1) is still usable as a mask before enabling
+z_buffer_len = 8 ; 10.612 ms/px (lowest of the four timings listed here)
+;z_buffer_len = 4 ; 12.395 ms/px
 z_buffer_mask = z_buffer_len - 1
 z_buffer:
     ; the last N zx/zy values
@@ -273,11 +276,12 @@ z_buffer:
 
 .export start
 
-max_fill_level = 6
+;max_fill_level = 6 ; previous value, used with all six masks below enabled
+max_fill_level = 3 ; NOTE(review): appears to track the number of enabled fill_masks entries -- keep in sync
 fill_masks:
-    .byte %00011111
-    .byte %00001111
-    .byte %00000111
+;    .byte %00011111
+;    .byte %00001111
+;    .byte %00000111
     .byte %00000011
     .byte %00000001
     .byte %00000000
@@ -310,18 +314,21 @@ viewport_oy:
     .endrepeat
 .endmacro
 
+; 20 cycles; forwards dest, arg1, arg2 to add with a width of 2 bytes
 .macro add16 dest, arg1, arg2
     add 2, dest, arg1, arg2
 .endmacro
 
+; 38 cycles; forwards dest, arg1, arg2 to add with a width of 4 bytes (same operand order as add16)
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg2, dest
+    add 4, dest, arg1, arg2 ; arg1 was previously ignored (dest used instead); same result when arg1 == dest
 .endmacro
 
+; 8 cycles; adds the carry flag into the byte at dest (timings are for a zero-page dest)
 .macro add_carry dest
-    lda dest
-    adc #0
-    sta dest
+    lda dest ; 3 cyc
+    adc #0   ; 2 cyc, A = A + 0 + C
+    sta dest ; 3 cyc
 .endmacro
 
 ; 2 + 9 * byte cycles
@@ -334,29 +341,35 @@ viewport_oy:
     .endrepeat
 .endmacro
 
+; 20 cycles; forwards dest, arg1, arg2 to sub with a width of 2 bytes
 .macro sub16 dest, arg1, arg2
     sub 2, dest, arg1, arg2
 .endmacro
 
+; 38 cycles; forwards dest, arg1, arg2 to sub with a width of 4 bytes
 .macro sub32 dest, arg1, arg2
     sub 4, dest, arg1, arg2
 .endmacro
 
+; 5 * bytes cycles for a zero-page arg: shifts the bytes-wide value at arg left by one bit
 .macro shl bytes, arg
-    asl arg
+    asl arg              ; 5 cyc (zero-page asl; lowest byte, bit 7 goes to carry)
     .repeat bytes-1, i
-        rol arg + 1 + i
+        rol arg + 1 + i  ; 5 cyc
     .endrepeat
 .endmacro
 
+; 10 cycles (asl + 1 rol, 5 cycles each on zero page)
 .macro shl16 arg
     shl 2, arg
 .endmacro
 
+; 15 cycles (asl + 2 rol, 5 cycles each on zero page)
 .macro shl24 arg
     shl 3, arg
 .endmacro
 
+; 20 cycles (asl + 3 rol, 5 cycles each on zero page)
 .macro shl32 arg
     shl 4, arg
 .endmacro
@@ -369,14 +382,17 @@ viewport_oy:
     .endrepeat
 .endmacro
 
+; 12 cycles; forwards dest, arg to copy with a width of 2 bytes
 .macro copy16 dest, arg
     copy 2, dest, arg
 .endmacro
 
+; 24 cycles; forwards dest, arg to copy with a width of 4 bytes
 .macro copy32 dest, arg
     copy 4, dest, arg
 .endmacro
 
+; 36 cycles; forwards dest, arg to copy with a width of 6 bytes
 .macro copyfloat dest, arg
     copy 6, dest, arg
 .endmacro
@@ -401,18 +417,20 @@ viewport_oy:
     neg 4, arg
 .endmacro
 
+; 11-27 + 20 * shift cycles: shl32 applied shift times, then round16
+; 91-107 cycles for shift=4
 .macro shift_round_16 arg, shift
     .repeat shift
-        shl32 arg
+        shl32 arg ; 20 cycles (asl + 3 rol, 5 each on zero page)
     .endrepeat
-    round16 arg
+    round16 arg ; 11-27 cycles
 .endmacro
 
 .macro imul16_round dest, arg1, arg2, shift
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
     jsr imul16_func   ; ? cyc
-    shift_round_16 FR2, shift
+    shift_round_16 FR2, shift ; 91-107 cycles for shift=4 (4 x 20-cycle shl32 + 11-27 for round16)
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
@@ -420,7 +438,7 @@ viewport_oy:
     ;imul16_round dest, arg, arg, shift
     copy16 FR0, arg   ; 12 cyc
     jsr sqr16_func      ; ? cyc
-    shift_round_16 FR2, shift
+    shift_round_16 FR2, shift ; 91-107 cycles for shift=4 (4 x 20-cycle shl32 + 11-27 for round16)
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
@@ -806,6 +824,7 @@ arg2_pos:
     sqr16_impl 1
 .endproc
 
+; 11-27 cycles
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -818,21 +837,28 @@ arg2_pos:
     ;                   round down if negative
     ;          < $8000: round down
 
-    lda arg + 1
-    cmp #$80
-    beq high_half
-    bpl increment
-    bmi next
+    ; $8000 17
+    ; $8001 27
+    ; $8100 21
+    ; $7fff 11
+
+    lda arg + 1    ; 3 cyc
+    cmp #$80       ; 2 cyc
+    beq high_half  ; 2 cyc
+
+    bpl increment  ; 2 cyc
+
+    bmi next       ; 2 cyc
 
 high_half:
-    lda arg
-    beq check_sign
-    bpl increment
-    bmi next
+    lda arg        ; 3 cyc
+    beq check_sign ; 2 cyc
+
+    jmp increment  ; 3 cyc
 
 check_sign:
-    lda arg + 3
-    bmi next
+    lda arg + 3  ; 3 cyc
+    bmi next     ; 2 cyc
 
 increment:       ; 5-10 cyc
     inc arg + 2  ; 5 cyc