diff --git a/mandel.s b/mandel.s
index fc30532..4ac8d4d 100644
--- a/mandel.s
+++ b/mandel.s
@@ -262,10 +262,7 @@ palette_chroma_entries = 15
 
 .code
 
-;z_buffer_len = 16 ; 10.863 ms/px
-;z_buffer_len = 12 ; 10.619 ms/px
-z_buffer_len = 8 ; 10.612 ms/px
-;z_buffer_len = 4 ; 12.395 ms/px
+z_buffer_len = 16 ; NOTE(review): presumably must stay a power of two, since z_buffer_mask is derived as len - 1 -- confirm
 z_buffer_mask = z_buffer_len - 1
 z_buffer:
     ; the last N zx/zy values
@@ -276,12 +273,11 @@ z_buffer:
 
 .export start
 
-;max_fill_level = 6
-max_fill_level = 3
+max_fill_level = 6
 fill_masks:
-;    .byte %00011111
-;    .byte %00001111
-;    .byte %00000111
+    .byte %00011111
+    .byte %00001111
+    .byte %00000111
     .byte %00000011
     .byte %00000001
     .byte %00000000
@@ -314,21 +310,18 @@ viewport_oy:
     .endrepeat
 .endmacro
 
-; 20 cycles
 .macro add16 dest, arg1, arg2
     add 2, dest, arg1, arg2
 .endmacro
 
-; 38 cycles
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg2, dest
+    add 4, dest, arg1, arg2 ; dest = arg1 + arg2; same argument order as add16/sub32 (arg1 was previously ignored)
 .endmacro
 
-; 8 cycles
 .macro add_carry dest
-    lda dest ; 3 cyc
-    adc #0   ; 2 cyc
-    sta dest ; 3 cyc
+    lda dest ; load the byte at dest into A (A is clobbered)
+    adc #0   ; add only the incoming carry flag
+    sta dest ; write the result back; carry out is left in C
 .endmacro
 
 ; 2 + 9 * byte cycles
@@ -341,35 +334,29 @@ viewport_oy:
     .endrepeat
 .endmacro
 
-; 20 cycles
 .macro sub16 dest, arg1, arg2
     sub 2, dest, arg1, arg2
 .endmacro
 
-; 38 cycles
 .macro sub32 dest, arg1, arg2
     sub 4, dest, arg1, arg2
 .endmacro
 
-; 3 + 5 * bytes cycles
 .macro shl bytes, arg
-    asl arg              ; 3 cyc
+    asl arg              ; shift the byte at arg left one bit; its bit 7 goes to carry
     .repeat bytes-1, i
-        rol arg + 1 + i  ; 5 cyc
+        rol arg + 1 + i  ; rotate the carry into each following byte in turn
     .endrepeat
 .endmacro
 
-; 13 cycles
 .macro shl16 arg
     shl 2, arg
 .endmacro
 
-; 18 cycles
 .macro shl24 arg
     shl 3, arg
 .endmacro
 
-; 23 cycles
 .macro shl32 arg
     shl 4, arg
 .endmacro
@@ -382,17 +369,14 @@ viewport_oy:
     .endrepeat
 .endmacro
 
-; 12 cycles
 .macro copy16 dest, arg
     copy 2, dest, arg
 .endmacro
 
-; 24 cycles
 .macro copy32 dest, arg
     copy 4, dest, arg
 .endmacro
 
-; 36 cycles
 .macro copyfloat dest, arg
     copy 6, dest, arg
 .endmacro
@@ -417,20 +401,18 @@ viewport_oy:
     neg 4, arg
 .endmacro
 
-; 11-27 + 23 * shift cycles
-; 103-119 cycles for shift=4
 .macro shift_round_16 arg, shift
     .repeat shift
-        shl32 arg ; 23 cycles
+        shl32 arg   ; one 1-bit left shift of the 4-byte value per repetition
     .endrepeat
-    round16 arg ; 11-27 cycles
+    round16 arg     ; then round the top 16 bits of the 32-bit value in place
 .endmacro
 
 .macro imul16_round dest, arg1, arg2, shift
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
     jsr imul16_func   ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
+    shift_round_16 FR2, shift ; shift FR2 left by `shift` bits, then round its top 16 bits (FR2 + 2) in place
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
@@ -438,7 +420,7 @@ viewport_oy:
     ;imul16_round dest, arg, arg, shift
     copy16 FR0, arg   ; 12 cyc
     jsr sqr16_func      ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
+    shift_round_16 FR2, shift
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
@@ -824,7 +806,6 @@ arg2_pos:
     sqr16_impl 1
 .endproc
 
-; 11-27 cycles
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -837,28 +818,21 @@ arg2_pos:
     ;                   round down if negative
     ;          < $8000: round down
 
-    ; $8000 17
-    ; $8001 27
-    ; $8100 21
-    ; $7fff 11
-
-    lda arg + 1    ; 3 cyc
-    cmp #$80       ; 2 cyc
-    beq high_half  ; 2 cyc
-
-    bpl increment  ; 2 cyc
-
-    bmi next       ; 2 cyc
+    lda arg + 1
+    cmp #$80
+    beq high_half
+    bpl increment
+    bmi next
 
 high_half:
-    lda arg        ; 3 cyc
-    beq check_sign ; 2 cyc
-
-    jmp increment  ; 3 cyc
+    lda arg
+    beq check_sign
+    ; low byte != 0 here, so the fraction is strictly above $8000: always round up
+    bne increment  ; always taken (Z is clear once the beq falls through)
 
 check_sign:
-    lda arg + 3  ; 3 cyc
-    bmi next     ; 2 cyc
+    lda arg + 3
+    bmi next
 
 increment:       ; 5-10 cyc
     inc arg + 2  ; 5 cyc