diff --git a/mandel.s b/mandel.s
index 386094f..ab7cea5 100644
--- a/mandel.s
+++ b/mandel.s
@@ -131,45 +131,48 @@ minus:
     shr 4, arg
 .endmacro
 
-.macro checkbit arg, bits
-    .if bits < 8
-        lda arg
-        and #(1 << bits)
-    .else
-        lda arg + 1
-        and #(1 << (bits - 8))
-    .endif
-.endmacro
-
-.macro bitmul16 arg1, arg2, res, bits
+.macro bitmul16 arg1, arg2, result, bitnum
     .local next
 
-    checkbit arg2, bits
     clc
+    ; One shift-and-add multiply step for bit 'bitnum' of arg1; clobbers A and flags.
+    ; Test that bit of arg1 (the clc above survives: lda/and/beq leave carry alone).
+    .if bitnum < 8
+        lda arg1                    ; bits 0-7 live in the low byte
+        and #(1 << bitnum)
+    .else
+        lda arg1 + 1                ; bits 8-15 live in the high byte
+        and #(1 << (bitnum - 8))
+    .endif
     beq next
 
     ; 16-bit add on the top bits
-    lda res + 2
-    adc arg1
-    sta res + 2
-    lda res + 3
-    adc arg1 + 1
+    lda result + 2
+    adc arg2                        ; carry is still clear from the clc above
+    sta result + 2
+    lda result + 3
+    adc arg2 + 1                    ; leaves the add's overflow bit in carry
+    sta result + 3
 
 next:
-    ; shift result right one bit
-    ; (shifts in the carry bit)
-    ror a
-    ror res
-    sta res + 1
+    ; Rotate the result right one bit: the carry (the add's overflow bit, or 0
+    ; when the add was skipped) becomes the new top bit of result + 3.
+    ror result + 3
+    ror result + 2
+    ror result + 1
+    .if bitnum >= 8
+        ; For bitnum < 8 only result + 1's stale initial bits would fall into this
+        ; byte, so its ror is skipped there: 5 cycles * 8 bits = 40 cycles saved.
+        ror result
+    .endif
 .endmacro
 
 .proc imul16
     ; 16-bit arg in FR0
     ; 16-bit arg in FR1
     ; 32-bit result in FR2
-    ; clobbers FR1 and FR2
 
-    ; zero out the 32-bit temp
+    ; zero only the top 16 bits of FR2; bitmul16's shifts fill in the rest
     lda #0
     sta FR2 + 2
     sta FR2 + 3
@@ -178,6 +181,8 @@ next:
     .repeat 16, bitnum
         bitmul16 FR0, FR1, FR2, bitnum
     .endrepeat
+
+    rts                             ; 32-bit product is left in FR2
 .endproc
 
 .proc iter