diff --git a/mandel.s b/mandel.s
index 60ffb76..386094f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -141,46 +141,43 @@ minus:
 .endif
 .endmacro
 
-.macro bitmul arg1, arg2, res, bits
+.macro bitmul16 arg1, arg2, res, bits
 .local next
+    checkbit arg2, bits
+    clc
     beq next
-    add32 res, arg1
+
+    ; 16-bit add on the top bits
+    lda res + 2
+    adc arg1
+    sta res + 2
+    lda res + 3
+    adc arg1 + 1
+
 next:
-    shl32 arg1
+    ; shift result right one bit
+    ; (shifts in the carry bit)
+    ror a
+    ror res
+    sta res + 1
 .endmacro
 
 .proc imul16
     ; 16-bit arg in FR0
     ; 16-bit arg in FR1
-    ; 16-bit result in FR0
-
-    ; sign-extend the argument
-    sext16to32 FR0
+    ; 32-bit result in FR2
+    ; clobbers FR1 and FR2
 
     ; zero out the 32-bit temp
     lda #0
-    sta FRX
-    sta FRX+1
-    sta FRX+2
-    sta FRX+3
+    sta FR2 + 2
+    sta FR2 + 3
+    ; the bottom two bytes will get cleared by the shifts
 
-    ; shift and add :D
     .repeat 16, bitnum
-        bitmul FR0, FR1, FRX, bitnum
+        bitmul16 FR0, FR1, FR2, bitnum
     .endrepeat
-
-    ; Re-normalize the ones place
-    shr24 FRX
-    shr24 FRX
-    shr24 FRX
-
-    ; @fixme round the last bit
-
-    ; And copy out our result
-    copy16 FRX+2, FR0
-    ; @fixme could save a few cycles by combining the last two ops
-
 .endproc
 
 .proc iter