diff --git a/.gitignore b/.gitignore
index 8d2f7ce..771e47a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
diff --git a/Makefile b/Makefile
index 25148b4..008bf8c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,17 @@
 
 all : mandel.xex
 
-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o  # link both objects; $+ in the recipe is the full prerequisite list, in order
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 
 %.o : %.s
 	ca65 -o $@ $<
 
+tables.s : tables.js  # generated file: the redirect creates it before node runs, so drop it if node fails
+	node tables.js > $@ || { rm -f $@; exit 1; }
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
 
diff --git a/mandel.s b/mandel.s
index 097b700..023a1ea 100644
--- a/mandel.s
+++ b/mandel.s
@@ -22,11 +22,12 @@ total_ms     = $a4 ; float48
 total_pixels = $aa ; float48
 
 temp         = $b0 ; u16
-pixel_ptr    = $b2 ; u16
-pixel_color  = $b4 ; u8
-pixel_mask   = $b5 ; u8
-pixel_shift  = $b6 ; u8
-pixel_offset = $b7 ; u8
+temp2        = $b2 ; u16
+pixel_ptr    = $b4 ; u16
+pixel_color  = $b6 ; u8
+pixel_mask   = $b7 ; u8
+pixel_shift  = $b8 ; u8
+pixel_offset = $b9 ; u8
 
 
 ; FP registers in zero page
@@ -83,6 +84,10 @@ SETVBV = $E45C
     mantissa .byte 6
 .endstruct
 
+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data
 
 strings:
@@ -206,6 +211,12 @@ color_map:
     add 4, dest, arg2, dest
 .endmacro
 
+.macro add_carry dest   ; dest (one byte) += incoming carry flag; clobbers A
+    lda dest            ; lda does not modify the carry flag
+    adc #0              ; 0 + dest + carry left by the preceding addition
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
     sec ; 2 cyc
@@ -336,12 +347,12 @@ next:
 ; 5 to 25 cycles
 .macro check_sign arg
     ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register (callers load Y with 0 first).
     .local positive
     lda arg + 1   ; 3 cyc
     bpl positive  ; 2 cyc
     neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
+    iny           ; 2 cyc, only reached when arg was negative
 positive:
 .endmacro
 
@@ -370,13 +381,13 @@ positive:
 
 ; min 470 cycles
 ; max 780 cycles
-.proc imul16_func
+.proc imul16_func_orig
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
 
-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
     check_sign arg1 ; 5 to 25 cyc
     check_sign arg2 ; 5 to 25 cyc
     
@@ -396,7 +407,94 @@ positive:
     .endrepeat
 
     ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
+    cpy #1              ; 2 cyc
+    bne positive_result ; 2 cyc
+    neg32 result        ; 34 cyc
+positive_result:
+
+    rts ; 6 cyc
+.endproc
+
+; imul8 dest, arg1, arg2 -- 8x8 -> 16-bit multiply: dest (lo) / dest + 1 (hi) = arg1 * arg2.
+; Clobbers A, X and flags; Y is not touched. Uses the imported mul_lobyte256 / mul_hibyte256 /
+; mul_hibyte512 square tables. Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
+
+        lda mul_factor_a      ; setup: 3 cycles with a zero-page operand (X is loaded later)
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x
+        tax                   ; X = low 8 bits of a + x, carry = bit 8 of the sum
+        bcc under256
+        lda mul_hibyte512,x   ; sum overflowed 8 bits: take the high byte from the 512 table
+        bcs next              ; always taken, carry is still set from the adc
+    under256:
+        lda mul_hibyte256,x
+        sec                   ; both paths arrive at next with carry set (no borrow)
+    next:
+        sta mul_product_hi
+        lda mul_lobyte256,x
+
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1
+        clc
+        adc mul_product_lo    ; A = corrected low byte, kept in A for the final subtraction
+        bcc small_product
+        inc mul_product_hi
+    small_product:
+        sec                   ; - x^2/2: 25 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo    ; store the finished low byte before A is reused for the high byte
+        lda mul_product_hi
+        sbc mul_hibyte256,x   ; borrow from the low-byte subtraction propagates here
+        sta mul_product_hi
+    .endscope
+.endmacro
+
+.proc imul16_func
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
+    check_sign arg1 ; 5 to 25 cyc
+    check_sign arg2 ; 5 to 25 cyc
+
+    lda #0
+    sta result + 0
+    sta result + 1
+    sta result + 2
+    sta result + 3
+
+    imul8 temp, arg1, arg2
+    add16 result, result, temp
+
+    imul8 temp, arg1 + 1, arg2
+    add16 result + 1, result + 1, temp
+
+    imul8 temp, arg1, arg2 + 1
+    add16 result + 1, result + 1, temp
+    add_carry result + 3
+
+    imul8 temp, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, temp
+
+    ; In case of mixed input signs, return a negative result.
+    cpy #1              ; 2 cyc
     bne positive_result ; 2 cyc
     neg32 result        ; 34 cyc
 positive_result: