diff --git a/mandel.s b/mandel.s
index 4444274..9b569e2 100644
--- a/mandel.s
+++ b/mandel.s
@@ -30,18 +30,15 @@ FRX = $ec
 .endmacro
 
 ; inner loop for imul16
-; bitnum < 8: 25 or 41 cycles
-; bitnum >= 8: 30 or 46 cycles
+; 25 or 42 cycles for bitnum < 8, 30 or 47 for bitnum >= 8 (bit clear / bit set)
 .macro bitmul16 arg1, arg2, result, bitnum
-    .local zero
-    .local one
     .local next
 
     ; does 16-bit adds
     ; arg1 must be 0 or positive
     ; arg2 must be 0 or positive
 
-    ; 7 cycles up to the branch
+    clc ; 2 cyc - carry in for the add, and the 0 shifted in if the add is skipped
 
     ; check if arg1 has 0 or 1 bit in this place
     ; 5 cycles either way
@@ -52,29 +49,21 @@ FRX = $ec
         lda arg1 + 1             ; 3 cyc
         and #(1 << (bitnum - 8)) ; 2 cyc
     .endif
-    bne one ; 2 cyc
+    beq next ; 2 cyc, 3 if taken
 
-zero: ; 18 cyc, 23 cyc
-    lsr result + 3 ; 5 cyc
-    ror result + 2 ; 5 cyc
-    ror result + 1 ; 5 cyc
-    .if bitnum >= 8
-        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
-        ; when it's all uninitialized data
-        ror result ; 5 cyc
-    .endif
-    jmp next       ; 3 cyc
-
-one: ; 32 cyc, 37 cyc
     ; 16-bit add on the top bits
-    clc            ; 2 cyc
     lda result + 2 ; 3 cyc
     adc arg2       ; 3 cyc
     sta result + 2 ; 3 cyc
     lda result + 3 ; 3 cyc
     adc arg2 + 1   ; 3 cyc
-    ror a          ; 2 cyc
     sta result + 3 ; 3 cyc
+
+    ; Shift the 32-bit result down by one bit. Carry holds the carry
+    ; out of the add above, or 0 (from clc) when the add was skipped,
+    ; so both paths must enter the shift at the top byte.
+next:
+    ror result + 3 ; 5 cyc
     ror result + 2 ; 5 cyc
     ror result + 1 ; 5 cyc
     .if bitnum >= 8
@@ -82,9 +71,6 @@ one: ; 32 cyc, 37 cyc
         ; when it's all uninitialized data
         ror result ; 5 cyc
     .endif
-
-next:
-
 .endmacro
 
 ; 5 to 25 cycles
@@ -121,8 +107,8 @@ positive:
     ; of a larger routine
     ; 424 to 672 cycles
     .repeat 16, bitnum
-        ; first half: 22 to 40 cycles
-        ; second half: 29 to 47 cycles
+        ; one bitmul16 expansion per bit of arg1, bitnum = 0..15;
+        ; expansions with bitnum >= 8 also rotate the low result byte
         bitmul16 arg1, arg2, result, bitnum
     .endrepeat
 
@@ -167,7 +153,7 @@ loop:
 
 .proc start
 
-looplong:
+loop: ; target of the jmp at the end of start, so the test below repeats forever
     ; FR0 = 5
     ; FR1 = -3
     lda #5
@@ -182,6 +168,5 @@ looplong:
     jsr imul16
     ; should have 32-bit -15 in FR2
 
-loop:
     jmp loop
 .endproc