diff --git a/mandel.s b/mandel.s
index 487ceb1..d8b25e1 100644
--- a/mandel.s
+++ b/mandel.s
@@ -30,16 +30,18 @@ FRX = $ec
 .endmacro
 
 ; inner loop for imul16
-; 24 to 44 cycles
+; bit clear (zero path): 25 cycles if bitnum < 8, 30 if bitnum >= 8
+; bit set (one path): 39 or 44 cycles as tallied below, +1 for the taken bne
 .macro bitmul16 arg1, arg2, result, bitnum
-    .local one
     .local zero
+    .local one
+    .local next
 
     ; does 16-bit adds
     ; arg1 must be 0 or positive
     ; arg2 must be 0 or positive
 
-    clc ; 2 cyc
+    ; 7 cycles up to and including the branch (when it is not taken)
 
     ; check if arg1 has 0 or 1 bit in this place
     ; 5 cycles either way
@@ -50,24 +52,10 @@ FRX = $ec
         lda arg1 + 1             ; 3 cyc
         and #(1 << (bitnum - 8)) ; 2 cyc
     .endif
-    beq zero ; 2 cyc
+    bne one ; 2 cyc
 
-one:
-    ; 16-bit add on the top bits
-    lda result + 2 ; 3 cyc
-    adc arg2       ; 3 cyc
-    sta result + 2 ; 3 cyc
-    lda result + 3 ; 3 cyc
-    adc arg2 + 1   ; 3 cyc
-    ror a          ; 2 cyc - get a jump on the shift
-    sta result + 3 ; 3 cyc
-    jmp oneb       ; 3 cyc
-    
-    ; Shift the 32-bit result down by one bit,
-    ; saving the previous carry.
-zero:
-    ror result + 3 ; 5 cyc
-oneb:
+zero: ; 18 cyc / 23 cyc
+    lsr result + 3 ; 5 cyc
     ror result + 2 ; 5 cyc
     ror result + 1 ; 5 cyc
     .if bitnum >= 8
@@ -75,6 +63,25 @@ oneb:
         ; when it's all uninitialized data
         ror result ; 5 cyc
     .endif
+    jmp next       ; 3 cyc
+
+one: ; 32 cyc / 37 cyc
+    ; 16-bit add on the top bits
+    clc            ; 2 cyc
+    lda result + 2 ; 3 cyc
+    adc arg2       ; 3 cyc
+    sta result + 2 ; 3 cyc
+    lda result + 3 ; 3 cyc
+    adc arg2 + 1   ; 3 cyc
+    ror a          ; 2 cyc - get a jump on the shift
+    sta result + 3 ; 3 cyc
+    ror result + 2 ; 5 cyc
+    ror result + 1 ; 5 cyc
+    .if bitnum >= 8
+        ror result ; 5 cyc
+    .endif
+next:
+
 .endmacro
 
 ; 5 to 25 cycles
@@ -157,7 +164,7 @@ loop:
 
 .proc start
 
-loop:
+looplong:
     ; FR0 = 5
     ; FR1 = -3
     lda #5
@@ -172,5 +179,6 @@ loop:
     jsr imul16
     ; should have 32-bit -15 in FR2
 
+loop:
     jmp loop
 .endproc