7 changed files with 158 additions and 554 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,3 @@
 *.o
 *.xex
-tables.s
 .DS_Store
--- a/.mailmap
+++ b/.mailmap
@ -1,2 +0,0 @@
-Brooke Vibber <bvibber@pobox.com>
-Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>
--- a/8
+++ b/8
@ -2,17 +2,13 @@

 all : mandel.xex

-mandel.xex : mandel.o tables.o
-	ld65 -C ./atari-asm-xex.cfg -o $@ $+
+%.xex : %.o
+	ld65 -C atari-asm-xex.cfg -o $@ $<

 %.o : %.s
 	ca65 -o $@ $<

-tables.s : tables.js
-	node tables.js > tables.s
-
 clean :
-	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex

--- a/mandel.s
+++ b/mandel.s
@ -21,18 +21,13 @@ count_pixels = $a3 ; u8
 total_ms     = $a4 ; float48
 total_pixels = $aa ; float48

-z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
-z_buffer_start  = $b1 ; u8: index into z_buffer
-z_buffer_end    = $b2 ; u8: index into z_buffer
-temp            = $b4 ; u16
-temp2           = $b6 ; u16
-pixel_ptr       = $b8 ; u16
-pixel_color     = $ba ; u8
-pixel_mask      = $bb ; u8
-pixel_shift     = $bc ; u8
-pixel_offset    = $bd ; u8
-fill_level      = $be ; u8
-palette_offset  = $bf ; u8
+temp         = $b0 ; u16
+pixel_ptr    = $b2 ; u16
+pixel_color  = $b4 ; u8
+pixel_mask   = $b5 ; u8
+pixel_shift  = $b6 ; u8
+pixel_offset = $b7 ; u8
+

 ; FP registers in zero page
 FR0    = $d4 ; float48
@ -43,9 +38,6 @@ CIX    = $f2 ; u8 - index into INBUFF
 INBUFF = $f3 ; u16 - pointer to ascii
 FLPTR  = $fc ; u16 - pointer to user buffer float48

-CH1    = $02f2 ; previous character read from keyboard
-CH     = $02fc ; current character read from keyboard
-
 LBUFF  = $0580 ; result buffer for FASC routine

 ; FP ROM routine vectors
@ -77,40 +69,20 @@ stride = width >> 2
 DMACTL = $D400
 DLISTL = $D402
 DLISTH = $D403
-WSYNC  = $D40A

 ; OS shadow registers
 SDLSTL = $230
 SDLSTH = $231

 ; interrupt stuff
-SYSVBV = $E45F
 XITVBV = $E462
 SETVBV = $E45C

-COLOR0 = $2C4
-COLOR1 = $2C5
-COLOR2 = $2C6
-COLOR3 = $2C7
-COLOR4 = $2C8
-
-; Keycodes!
-KEY_PLUS  = $06
-KEY_MINUS = $0e
-KEY_UP    = $8e
-KEY_DOWN  = $8f
-KEY_LEFT  = $86
-KEY_RIGHT = $87
-
 .struct float48
    exponent .byte
    mantissa .byte 6
 .endstruct

-.import mul_lobyte256
-.import mul_hibyte256
-.import mul_hibyte512
-
 .data

 strings:
@ -131,9 +103,8 @@ str_self_len = str_self_end - str_self
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
-speed_precision = 6

-speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
+speed_start = str_self_len + 2
 speed_len = 14 + str_speed_len


@ -150,9 +121,6 @@ char_map:
        .byte 96 + i
    .endrepeat

-hex_chars:
-    .byte "0123456789abcdef"
-
 aspect:
    ; aspect ratio!
    ; pixels at 320w are 5:6 (narrow)
@ -216,33 +184,10 @@ color_map:
        .byte 3
    .endrepeat

-palette:
-    .byte $00
-    .byte $46
-    .byte $78
-    .byte $b4
 .code

-z_buffer_len = 16
-z_buffer_mask = z_buffer_len - 1
-z_buffer:
-    ; the last N zx/zy values
-    .repeat z_buffer_len
-        .word 0
-        .word 0
-    .endrepeat
-
 .export start

-max_fill_level = 6
-fill_masks:
-    .byte %00011111
-    .byte %00001111
-    .byte %00000111
-    .byte %00000011
-    .byte %00000001
-    .byte %00000000
-
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
    clc ; 2 cyc
@ -261,12 +206,6 @@ fill_masks:
    add 4, dest, arg2, dest
 .endmacro

-.macro add_carry dest
-    lda dest
-    adc #0
-    sta dest
-.endmacro
-
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
@ -344,6 +283,68 @@ fill_masks:
    neg 4, arg
 .endmacro

+; inner loop for imul16: handles one bit (bitnum) of arg1 per expansion
+; bitnum < 8: 25 or 41 cycles
+; bitnum >= 8: 30 or 46 cycles
+.macro bitmul16 arg1, arg2, result, bitnum
+    .local zero
+    .local one
+    .local next
+    ; only A and the flags are used as registers; X and Y are not touched
+    ; does 16-bit adds
+    ; arg1 and arg2 are treated as unsigned
+    ; negative signed inputs must be flipped first
+
+    ; 7 cycles up to the branch
+
+    ; check if arg1 has 0 or 1 bit in this place
+    ; 5 cycles either way
+    .if bitnum < 8
+        lda arg1                 ; 3 cyc - bit lives in the low byte
+        and #(1 << (bitnum))       ; 2 cyc
+    .else
+        lda arg1 + 1             ; 3 cyc - bit lives in the high byte
+        and #(1 << ((bitnum) - 8)) ; 2 cyc
+    .endif
+    bne one ; 2 cyc - nonzero after the mask means the bit is set
+
+zero: ; 18 cyc, 23 cyc - bit clear: nothing to add, only shift
+    lsr result + 3 ; 5 cyc - shifts a 0 into the top, low bit goes to carry
+    jmp next       ; 3 cyc
+
+one: ; 32 cyc, 37 cyc - bit set: add arg2 into the top half, then shift
+    ; 16-bit add on the top bits
+    clc            ; 2 cyc
+    lda result + 2 ; 3 cyc
+    adc arg2       ; 3 cyc
+    sta result + 2 ; 3 cyc
+    lda result + 3 ; 3 cyc
+    adc arg2 + 1   ; 3 cyc
+    ror a          ; 2 cyc - get a jump on the shift (add's carry becomes bit 7)
+    sta result + 3 ; 3 cyc
+next: ; both paths: carry holds the bit shifted out of result + 3
+    ror result + 2 ; 5 cyc
+    ror result + 1 ; 5 cyc
+    .if bitnum >= 8
+        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
+        ; when it's all uninitialized data
+        ror result ; 5 cyc
+    .endif
+
+.endmacro
+
+; 5 to 25 cycles
+.macro check_sign arg
+    ; Check sign bit and flip argument to positive,
+    ; keeping a count of sign bits in the X register.
+    .local positive
+    lda arg + 1   ; 3 cyc - sign bit is bit 7 of the high byte
+    bpl positive  ; 2 cyc (a taken branch costs 3) - skip if already non-negative
+    neg16 arg     ; 18 cyc - flip arg to positive
+    inx           ; 2 cyc - one more negative input seen
+positive:
+.endmacro
+
 ; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
@ -367,96 +368,38 @@ fill_masks:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro

-; Adapted from https://everything2.com/title/Fast+6502+multiplication
-.macro imul8 dest, arg1, arg2
-    .local under256
-    .local next
-    .local small_product
-    .scope
-        mul_factor_a   = arg1
-        mul_factor_x   = arg2
-        mul_product_lo = dest
-        mul_product_hi = dest + 1
-
-        lda mul_factor_a      ; setup: 6 cycles
-        ;ldx mul_factor_x
-
-        clc                   ; (a + x)^2/2: 23 cycles
-        adc mul_factor_x
-        tax
-        bcc under256
-        lda mul_hibyte512,x
-        bcs next
-    under256:
-        lda mul_hibyte256,x
-        sec
-    next:
-        sta mul_product_hi
-        lda mul_lobyte256,x
-
-        ldx mul_factor_a      ; - a^2/2: 20 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
-
-        ldx mul_factor_x      ; + x & a & 1: 22 cycles
-        txa                   ; (this is a kludge to correct a
-        and mul_factor_a      ; roundoff error that makes odd * odd too low)
-        and #1
-
-        clc
-        adc mul_product_lo
-        bcc small_product
-        inc mul_product_hi
-    small_product:
-        sec                   ; - x^2/2: 25 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
-    .endscope
-.endmacro
-
+; signed 16-bit x 16-bit -> 32-bit multiply; min 470 cycles
+; max 780 cycles
 .proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
-    inter = temp2

-    ; h1l1 * h2l2
-    ; (h1*256 + l1) * (h2*256 + l2)
-    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
-    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+    ldx #0          ; 2 cyc
+    ; X counts how many of the two args were negative (0, 1 or 2)
+    check_sign arg1 ; 5 to 25 cyc
+    check_sign arg2 ; 5 to 25 cyc
    
-    imul8 result, arg1, arg2
-    lda #0
-    sta result + 2
-    sta result + 3
+    ; zero out the 32-bit temp's top 16 bits
+    lda #0          ; 2 cyc
+    sta result + 2  ; 3 cyc
+    sta result + 3  ; 3 cyc
+    ; the bottom two bytes need no init: the 16 shifts below overwrite them

-    imul8 inter, arg1 + 1, arg2
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
+    ; unrolled loop for maximum speed, at the cost
+    ; of a larger routine
+    ; 440 to 696 cycles
+    .repeat 16, bitnum
+        ; bitnum < 8: 25 or 41 cycles
+        ; bitnum >= 8: 30 or 46 cycles
+        bitmul16 arg1, arg2, result, bitnum
+    .endrepeat

-    imul8 inter, arg1, arg2 + 1
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8 inter, arg1 + 1, arg2 + 1
-    add16 result + 2, result + 2, inter
-
-    ; In case of negative inputs, adjust high word
-    ; https://stackoverflow.com/a/28827013
-    lda arg1 + 1
-    bpl arg1_pos
-    sub16 result + 2, result + 2, arg2
-arg1_pos:
-    lda arg2 + 1
-    bpl arg2_pos
-    sub16 result + 2, result + 2, arg1
-arg2_pos:
+    ; Mixed input signs (X == 1, exactly one arg negative): negate the result.
+    cpx #1              ; 2 cyc
+    bne positive_result ; 2 cyc
+    neg32 result        ; 34 cyc
+positive_result:

    rts ; 6 cyc
 .endproc
@ -519,14 +462,12 @@ initloop:
    sta zx - 1,x
    dex
    bne initloop
-    sta z_buffer_start
-    sta z_buffer_end

 loop:
    ; iter++ & max-iters break
    inc iter
    bne keep_going
-    jmp exit_path
+    rts
 keep_going:

    .macro quick_exit arg, max
@ -543,7 +484,7 @@ keep_going:
    positive:
        cmp #((max) << 4)
        bmi all_done ; 'less than'
-        jmp exit_path
+        rts

    negative:
        cmp #(256 - ((max) << 4))
@ -551,7 +492,7 @@ keep_going:
        bpl all_done    ; 'greater than'

    nope_out:
-        jmp exit_path
+        rts
    
    first_equal:
        lda arg
@ -586,100 +527,19 @@ keep_going:

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
-    lda z_buffer_active
-    beq skip_z_buffer
-
-    ldx z_buffer_start
-    cpx z_buffer_end
-    beq z_nothing_to_read
-
-z_buffer_loop:
-    .macro z_compare arg
-        .local compare_no_match
-        lda z_buffer,x
-        inx
-        cmp arg
-        bne compare_no_match
-        iny
-    compare_no_match:
-    .endmacro
-    .macro z_advance
-        .local skip_reset_x
-        cpx #(z_buffer_len * 4)
-        bmi skip_reset_x
-        ldx #0
-    skip_reset_x:
-    .endmacro
-    .macro z_store arg
-        lda arg
-        sta z_buffer,x
-        inx
-    .endmacro
-
-    ; Compare the previously stored z values
-    ldy #0
-    z_compare zx
-    z_compare zx + 1
-    z_compare zy
-    z_compare zy + 1
-
-    cpy #4
-    bne z_no_matches
-    jmp z_exit
-
-z_no_matches:
-    z_advance
-
-    cpx z_buffer_end
-    bne z_buffer_loop
-
-z_nothing_to_read:
-
-    ; Store and expand
-    z_store zx
-    z_store zx + 1
-    z_store zy
-    z_store zy + 1
-    z_advance
-    stx z_buffer_end
-
-    ; Increment the start roller if necessary (limit size)
-    lda iter
-    cmp #(z_buffer_len * 4)
-    bmi skip_inc_start
-    lda z_buffer_start
-    clc
-    adc #4
-    tax
-    z_advance
-    stx z_buffer_start
-skip_inc_start:
-
-skip_z_buffer:
-
    jmp loop

-z_exit:
-    lda #0
-    sta iter
-
-exit_path:
-    ldx #0
-    lda iter
-    bne next
-    inx
-next:
-    stx z_buffer_active
+peace_out:
    rts

 .endproc

-.macro scale_zoom dest
-    ; clobbers X, flags
+.macro zoom_factor dest, src, zoom, aspect
    .local cont
    .local enough

    ; cx = (sx << (8 - zoom))
+    copy16 dest, src
    ldx zoom
 cont:
    cpx #8
@ -688,12 +548,6 @@ cont:
    inx
    jmp cont
 enough:
-.endmacro
-
-.macro zoom_factor dest, src, zoom, aspect
-    ; clobbers A, X, flags, etc
-    copy16 dest, src
-    scale_zoom dest

    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
@ -790,25 +644,6 @@ shift_done:
    rts
 .endproc

-.macro draw_text_indirect col, len, strptr
-    ; clobbers A, X
-    .local loop
-    .local done
-    ldx #0
-loop:
-    cpx #len
-    beq done
-    txa
-    tay
-    lda (strptr),y
-    tay
-    lda char_map,y
-    sta textbuffer + col,x
-    inx
-    jmp loop
-done:
-.endmacro
-
 .macro draw_text col, len, cstr
    ; clobbers A, X
    .local loop
@ -827,34 +662,9 @@ done:

 .proc vblank_handler
    inc count_frames
-    inc palette_offset
-    jsr update_palette
    jmp XITVBV
 .endproc

-.proc update_palette
-    lda palette
-    sta COLOR4
-
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 1
-    sta COLOR0
-
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 2
-    sta COLOR1
-
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 3
-    sta COLOR2
-.endproc
-
 .proc update_speed
    ; convert frames (u16) to fp
    ; add to frames_total
@ -865,105 +675,6 @@ done:
    ; draw text
 .endproc

-.proc keycheck
-    ; clobbers all
-    ; returns 255 in A if state change or 0 if no change
-
-    ; check keyboard buffer
-    lda CH
-    cmp #$ff
-    beq skip_char
-
-    ; Clear the keyboard buffer and re-enable interrupts
-    ldx #$ff
-    stx CH
-
-    tay
-
-    lda zoom
-    cpy #KEY_PLUS
-    beq plus
-    cpy #KEY_MINUS
-    beq minus
-
-    ; temp = $0010 << (8 - zoom)
-    lda #$10
-    sta temp
-    lda #$00
-    sta temp + 1
-    scale_zoom temp
-
-    cpy #KEY_UP
-    beq up
-    cpy #KEY_DOWN
-    beq down
-    cpy #KEY_LEFT
-    beq left
-    cpy #KEY_RIGHT
-    beq right
-
-skip_char:
-    lda #0
-    rts
-
-plus:
-    cmp #8
-    bpl skip_char
-    inc zoom
-    jmp done
-minus:
-    cmp #1
-    bmi skip_char
-    dec zoom
-    jmp done
-up:
-    sub16 oy, oy, temp 
-    jmp done
-down:
-    add16 oy, oy, temp
-    jmp done
-left:
-    sub16 ox, ox, temp
-    jmp done
-right:
-    add16 ox, ox, temp
-done:
-    lda #255
-    rts
-
-.endproc
-
-.proc clear_screen
-    ; zero the range from framebuffer_top to display_list
-    lda #.lobyte(framebuffer_top)
-    sta temp
-    lda #.hibyte(framebuffer_top)
-    sta temp + 1
-
-zero_page_loop:
-    lda #0
-    ldy #0
-zero_byte_loop:
-    sta (temp),y
-    iny
-    bne zero_byte_loop
-
-    inc temp + 1
-    lda temp + 1
-    cmp #.hibyte(display_list)
-    bne zero_page_loop
-
-    rts
-.endproc
-
-.proc status_bar
-    ; Status bar
-    draw_text 0, str_self_len, str_self
-    draw_text 40 - str_run_len, str_run_len, str_run
-
-    rts
-.endproc
-
 .proc start

    ; ox = 0; oy = 0; zoom = 0
@ -990,7 +701,24 @@ zero_byte_loop:
    lda #0
    sta DMACTL

-    jsr clear_screen
+    ; zero the range from framebuffer_top to framebuffer_end
+    lda #.lobyte(framebuffer_top)
+    sta temp
+    lda #.hibyte(framebuffer_top)
+    sta temp + 1
+
+zero_page_loop:
+    lda #0
+    ldy #0
+zero_byte_loop:
+    sta (temp),y
+    iny
+    bne zero_byte_loop
+
+    inc temp + 1
+    lda temp + 1
+    cmp #.hibyte(framebuffer_end)
+    bne zero_page_loop

    ; Copy the display list into properly aligned memory
    ; Can't cross 1024-byte boundaries :D
@ -1010,15 +738,14 @@ copy_byte_loop:
    sta DLISTH ; actual register
    sta SDLSTH ; shadow register the OS will copy in

+    ; Status bar
+    draw_text 0, str_self_len, str_self
+    draw_text 40 - str_run_len, str_run_len, str_run
+
    ; Re-enable display DMA
    lda #$22
    sta DMACTL

-    ; Initialize the palette
-    lda #0
-    sta palette_offset
-    jsr update_palette
-
    ; install the vblank handler
    lda #7 ; deferred
    ldx #.hibyte(vblank_handler)
@ -1026,14 +753,6 @@ copy_byte_loop:
    jsr SETVBV

 main_loop:
-    jsr clear_screen
-    jsr status_bar
-
-    lda #0
-    sta fill_level
-
-fill_loop:
-
    ; sy = -92 .. 91
    lda #(256-half_height)
    sta sy
@ -1048,53 +767,12 @@ loop_sy:
    sta sx + 1

 loop_sx:
-    ; check the fill mask
-    ldy #0
-
-loop_skip_level:
-    cpy fill_level
-    beq current_level
-
-    lda fill_masks,y
-    and sx
-    bne not_skipped_mask1
-
-    lda fill_masks,y
-    and sy
-    beq skipped_mask
-
-not_skipped_mask1:
-    iny
-    jmp loop_skip_level
-
-current_level:
-    lda fill_masks,y
-    and sx
-    bne skipped_mask
-
-    lda fill_masks,y
-    and sy
-    beq not_skipped_mask
-
-skipped_mask:
-    jmp skipped
-
-not_skipped_mask:
-
-    ; run the fractal!
    zoom_factor cx, sx, zoom, aspect_x
-    add16 cx, cx, ox
    zoom_factor cy, sy, zoom, aspect_y
-    add16 cy, cy, oy
    jsr mandelbrot
    jsr pset

-    jsr keycheck
-    beq no_key
-    ; @fixme clear the pixel stats
-    jmp main_loop

-no_key:
    ; check if we should update the counters
    ;
    ; count_pixels >= width? update!
@ -1106,7 +784,7 @@ no_key:
    ; count_frames >= 120? update!
    lda count_frames
    cmp #120 ; >= 2 seconds
-    bmi skipped
+    bmi skip_status

 update_status:
    ; FR0 = (float)count_pixels & clear count_pixels
@ -1166,11 +844,35 @@ update_status:
    ; convert to ASCII in INBUFF
    jsr FASC

-    ; print the first 6 digits
-    draw_text_indirect speed_start, speed_precision, INBUFF
-    draw_text speed_start + speed_precision, str_speed_len, str_speed
+    ; find the last byte
+    ldy #0
+number_loop:
+    lda (INBUFF),y
+    bmi lastchar

-skipped:
+    tax
+    lda char_map,x
+    sta textbuffer + speed_start,y
+
+    iny
+    bpl number_loop
+lastchar:
+    ; Y is last char
+    ; trim that high bit
+    and #$7f
+    tax
+    lda char_map,x
+    sta textbuffer + speed_start,y
+
+    ; Fill out any remaining spaces
+    lda #0
+space_loop:
+    iny
+    sta textbuffer + speed_start,y
+    cpy #(20)
+    bmi space_loop
+
+skip_status:

    clc
    lda sx
@ -1202,18 +904,9 @@ loop_sx_done:

 loop_sy_done:

-fill_loop_done:
-    inc fill_level
-    lda fill_level
-    cmp #max_fill_level
-    beq loop
-    jmp fill_loop
+    draw_text 40 - str_done_len, str_done_len, str_done

 loop:
    ; finished
-    draw_text 40 - str_done_len, str_done_len, str_done
-    jsr keycheck
-    beq loop
-    jmp main_loop
-
+    jmp loop
 .endproc
--- a/readme.md
+++ b/readme.md
@ -14,7 +14,7 @@ Non-goals:

 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.

-- brooke, january 2023 - february 2024
+-- brion, january 2023

 ## Current state

@ -28,8 +28,6 @@ The mandelbrot calculations are done using 4.12-precision fixed point numbers. I

 Iterations are capped at 255.

-The pixels are run in a progressive layout to get the basic shape on screen faster.
-
 ## Next steps

 Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
@ -37,7 +35,6 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
 Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.

 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
-(done)

 ## Deps and build instructions

--- a/tables.js
+++ b/tables.js
@ -1,38 +0,0 @@
-function db(func) {
-    let lines = [];
-    for (let i = 0; i < 256; i += 16) {
-        let items = [];
-        for (let j = 0; j < 16; j++) {
-            let x = i + j;
-            items.push(func(x));
-        }
-        lines.push('    .byte ' + items.join(', '));
-    }
-    return lines.join('\n');
-}
-
-let squares = [];
-for (let i = 0; i < 512; i++) {
-    squares.push(Math.trunc((i * i + 1) / 2));
-}
-
-console.log(
-`.segment "TABLES"
-
-.export mul_lobyte256
-.export mul_hibyte256
-.export mul_hibyte512
-
-.align 256
-mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
-
-.align 256
-mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
-
-.align 256
-mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
-
-`);
--- a/testme.js
+++ b/testme.js
@ -1,41 +0,0 @@
-// ax = (a + x)2/2 - a2/2 - x2/2 
-
-function half_square(x) {
-    return Math.round(x * x / 2) & 0xffff >>> 0;
-}
-
-function mul8(a, b) {
-    let result = half_square(a + b) & 0xffff;
-    result = (result - half_square(a)) & 0xffff;
-    result = (result - half_square(b)) & 0xffff;
-    result = (result + (b & a & 1)) & 0xffff;
-    return result >>> 0;
-}
-
-function mul16(a, b) {
-    let ah = (a & 0xff00) >>> 8;
-    let al = (a & 0x00ff) >>> 0;
-    let bh = (b & 0xff00) >>> 8;
-    let bl = (b & 0x00ff) >>> 0;
-    let result = (mul8(al, bl) & 0xffff) >>> 0;
-    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
-    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
-    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
-    return result;
-}
-
-let max = 65536;
-//let max = 256;
-//let max = 128;
-//let max = 8;
-
-for (let a = 0; a < max; a++) {
-    for (let b = 0; b < max; b++) {
-        let expected = Math.imul(a, b) >>> 0;
-        //let actual = mul8(a, b);
-        let actual = mul16(a, b);
-        if (expected !== actual) {
-            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
-        }
-    }
-}