diff --git a/mandel.s b/mandel.s
index 2e16b53..9eb6ce1 100644
--- a/mandel.s
+++ b/mandel.s
@@ -374,6 +374,14 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro sqr16_round dest, arg, shift
+    imul16_round dest, arg, arg, shift ; dest = arg squared: arg is passed as both imul16_round operands; the lines below are a disabled path via sqr16_func
+    ;copy16 FR0, arg   ; 12 cyc
+    ;jsr sqr16_func      ; ? cyc
+    ;shift_round_16 FR2, shift
+    ;copy16 dest, FR2 + 2  ; 12 cyc
+.endmacro
+
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg
@@ -537,6 +545,14 @@ init:
     lda #.hibyte(imul16xe_func)
     sta imul16_func + 2
 
+    ; ditto for sqr16_func -> sqr16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+
     ; create the lookup table
     ; go through the input set, in four 16KB chunks
 
@@ -684,6 +700,45 @@ arg2_pos:
     rts ; 6 cyc
 .endmacro
 
+.macro sqr16_impl xe
+    ; Expands to a routine body (ends in rts): result = arg * arg, 16 -> 32 bits.
+    ; xe is forwarded only to imul8; sqr8 takes just (dest, arg).
+    .local arg
+    .local result
+    .local inter
+    .local arg_pos
+    arg = FR0    ; 16-bit arg (read again by the sign fix-up at the end)
+    result = FR2 ; 32-bit result
+    inter = temp2 ; scratch for the partial products
+
+    ; With arg = h*256 + l:
+    ;   arg * arg = h*h*256*256 + 2*(h*l*256) + l*l
+    sqr8 result, arg                     ; l*l term into the low two bytes
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg + 1, arg, xe        ; h*l term
+    add16 result + 1, result + 1, inter  ; first h*l*256, added at byte offset 1
+    add_carry result + 3
+    add16 result + 1, result + 1, inter  ; second h*l*256
+    add_carry result + 3
+
+    sqr8 inter, arg + 1                  ; h*h term (sqr8 has no xe parameter)
+    add16 result + 2, result + 2, inter
+
+    ; In case of negative inputs, adjust high word by subtracting arg twice.
+    ; NOTE(review): presumably relies on the 8-bit products being unsigned - confirm.
+    ; https://stackoverflow.com/a/28827013
+    lda arg + 1
+    bpl arg_pos
+    sub16 result + 2, result + 2, arg
+    sub16 result + 2, result + 2, arg
+arg_pos:
+
+    rts ; 6 cyc
+.endmacro
+
 .proc imul16_func
     imul16_impl 0
 .endproc
@@ -692,6 +747,14 @@ arg2_pos:
     imul16_impl 1
 .endproc
 
+.proc sqr16_func ; init code stores a jmp to sqr16xe_func over the first three bytes of this proc
+    imul16_impl 0 ; NOTE(review): expands imul16_impl, not sqr16_impl - placeholder or copy-paste slip? confirm
+.endproc
+
+.proc sqr16xe_func ; jump target that init code patches into sqr16_func
+    imul16_impl 1 ; NOTE(review): expands imul16_impl, not sqr16_impl - placeholder or copy-paste slip? confirm
+.endproc
+
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -803,10 +866,10 @@ keep_going:
     quick_exit zy, 2
 
     ; zx_2 = zx * zx
-    imul16_round zx_2, zx, zx, 4
+    sqr16_round zx_2, zx, 4
 
     ; zy_2 = zy * zy
-    imul16_round zy_2, zy, zy, 4
+    sqr16_round zy_2, zy, 4
 
     ; zx_zy = zx * zy
     imul16_round zx_zy, zx, zy, 4