diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
index fb43089..9f871ca 100644
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@@ -8,6 +8,7 @@ MEMORY {
     ZP:      file = "", define = yes, start = $0082, size = $007E;
     #MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
     MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
+    TABLES:  file = %O, define = yes, start = $a000, size = $c000 - $a000; # $A000-$BFFF. NOTE(review): the commented-out MAIN stopped at $BC20 - confirm nothing (e.g. display list/screen RAM) lives in $BC20-$BFFF
 }
 FILES {
     %O: format = atari;
@@ -22,5 +23,5 @@ SEGMENTS {
     RODATA:   load = MAIN,    type = ro   optional = yes;
     DATA:     load = MAIN,    type = rw   optional = yes;
     BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = MAIN,    type = ro,  optional = yes, align = 256;
+    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256; # loaded into the TABLES memory area; segment start aligned to a 256-byte page
 }
diff --git a/mandel.s b/mandel.s
index 7bfb577..a5bcb35 100644
--- a/mandel.s
+++ b/mandel.s
@@ -113,6 +113,8 @@ KEY_RIGHT = $87
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
+.import sqr_lobyte
+.import sqr_hibyte
 
 .data
 
@@ -701,42 +703,40 @@ arg2_pos:
 .endmacro
 
 .macro sqr16_impl xe
-    .local arg
-    .local result
-    .local inter
-    .local arg_pos
-    arg = FR0    ; 16-bit arg (clobbered)
-    result = FR2 ; 32-bit result
-    inter = temp2
+    .scope       ; anonymous scope: arg/result/inter/arg_pos stay private to each expansion
+        arg = FR0    ; 16-bit arg (clobbered)
+        result = FR2 ; 32-bit result
+        ;inter = temp2
+        inter = FR1  ; scratch for the partial products (previously temp2)
 
-    ; hl * hl
-    ; (h*256 + l) * (h*256 + l)
-    ; h*256*(h*256 + l) + l*(h*256 + l)
-    ; h*h*256*256 + h*l*256 + h*l*256 + l*l
+        lda arg + 1  ; high byte carries the sign bit
+        bpl arg_pos  ; non-negative: square it as-is
+        neg16 arg    ; presumably arg = -arg (macro defined elsewhere); x*x == (-x)*(-x), so squaring |arg| is enough
+    arg_pos:
 
-    sqr8 result, arg
-    lda #0
-    sta result + 2
-    sta result + 3
+        ; hl * hl
+        ; (h*256 + l) * (h*256 + l)
+        ; h*256*(h*256 + l) + l*(h*256 + l)
+        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
 
-    imul8 inter, arg + 1, arg, xe
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
+        sqr8 result, arg ; low 16 bits of result = l*l (sqr8 is defined elsewhere; presumably table-driven - verify)
+        ;imul8 inter, arg, arg, xe
+        lda #0           ; clear the upper 16 bits of the 32-bit result
+        sta result + 2
+        sta result + 3
 
-    sqr8 inter, arg + 1, arg + 1, xe
-    add16 result + 2, result + 2, inter
+        imul8 inter, arg + 1, arg, xe        ; inter = h*l, the cross term
+        add16 result + 1, result + 1, inter  ; result += h*l*256 (first of the two cross terms)
+        add_carry result + 3                 ; presumably folds the carry into the top byte - verify against add_carry
+        add16 result + 1, result + 1, inter  ; result += h*l*256 (second cross term)
+        add_carry result + 3
 
-    ; In case of negative inputs, adjust high word
-    ; https://stackoverflow.com/a/28827013
-    lda arg + 1
-    bpl arg_pos
-    sub16 result + 2, result + 2, arg
-    sub16 result + 2, result + 2, arg
-arg_pos:
+        sqr8 inter, arg + 1                  ; inter = h*h
+        ;imul8 inter, arg + 1, arg + 1, xe
+        add16 result + 2, result + 2, inter  ; result += h*h*256*256; no add_carry, nothing lies above byte 3
 
-    rts ; 6 cyc
+        rts ; 6 cyc
+    .endscope
 .endmacro
 
 .proc imul16_func
@@ -748,11 +748,11 @@ arg_pos:
 .endproc
 
 .proc sqr16_func
-    imul16_impl 0
+    sqr16_impl 0 ; expand the squaring macro with xe = 0 (forwarded to imul8); the macro body ends in rts
 .endproc
 
 .proc sqr16xe_func
-    imul16_impl 1
+    sqr16_impl 1 ; expand the squaring macro with xe = 1 (forwarded to imul8); the macro body ends in rts
 .endproc
 
 .macro round16 arg