slightly faster handling of signed mul

Previously we were flipping the inputs if negative, and then the output if exactly one input was negative. Turns out you can just treat the whole thing as an unsigned mul and then subtract each term from the high word if the other term is negative (see https://stackoverflow.com/a/28827013). This saves a handful of cycles, reducing our runtime to a mere 14.211 ms/px \o/
squares
2024-12-15 20:17:45 -08:00 · 2024-12-14 18:56:26 -08:00 · 2024-12-14 18:53:31 -08:00 · 2024-08-19 13:21:44 -07:00 · 2024-08-18 21:07:53 -07:00 · 2024-08-18 21:06:30 -07:00
7 changed files with 554 additions and 158 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
--- a/.mailmap
+++ b/.mailmap
@ -0,0 +1,2 @@
+Brooke Vibber <bvibber@pobox.com>
+Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>
--- a/8
+++ b/8
@ -2,13 +2,17 @@

 all : mandel.xex

-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+

 %.o : %.s
 	ca65 -o $@ $<

+tables.s : tables.js
+	node tables.js > tables.s
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex

--- a/mandel.s
+++ b/mandel.s
@ -21,13 +21,18 @@ count_pixels = $a3 ; u8
 total_ms     = $a4 ; float48
 total_pixels = $aa ; float48

-temp         = $b0 ; u16
-pixel_ptr    = $b2 ; u16
-pixel_color  = $b4 ; u8
-pixel_mask   = $b5 ; u8
-pixel_shift  = $b6 ; u8
-pixel_offset = $b7 ; u8
-
+z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
+z_buffer_start  = $b1 ; u8: index into z_buffer
+z_buffer_end    = $b2 ; u8: index into z_buffer
+temp            = $b4 ; u16
+temp2           = $b6 ; u16
+pixel_ptr       = $b8 ; u16
+pixel_color     = $ba ; u8
+pixel_mask      = $bb ; u8
+pixel_shift     = $bc ; u8
+pixel_offset    = $bd ; u8
+fill_level      = $be ; u8
+palette_offset  = $bf ; u8

 ; FP registers in zero page
 FR0    = $d4 ; float48
@ -38,6 +43,9 @@ CIX    = $f2 ; u8 - index into INBUFF
 INBUFF = $f3 ; u16 - pointer to ascii
 FLPTR  = $fc ; u16 - pointer to user buffer float48

+CH1    = $02f2 ; previous character read from keyboard
+CH     = $02fc ; current character read from keyboard
+
 LBUFF  = $0580 ; result buffer for FASC routine

 ; FP ROM routine vectors
@ -69,20 +77,40 @@ stride = width >> 2
 DMACTL = $D400
 DLISTL = $D402
 DLISTH = $D403
+WSYNC  = $D40A

 ; OS shadow registers
 SDLSTL = $230
 SDLSTH = $231

 ; interrupt stuff
+SYSVBV = $E45F
 XITVBV = $E462
 SETVBV = $E45C

+COLOR0 = $2C4
+COLOR1 = $2C5
+COLOR2 = $2C6
+COLOR3 = $2C7
+COLOR4 = $2C8
+
+; Keycodes!
+KEY_PLUS  = $06
+KEY_MINUS = $0e
+KEY_UP    = $8e
+KEY_DOWN  = $8f
+KEY_LEFT  = $86
+KEY_RIGHT = $87
+
 .struct float48
    exponent .byte
    mantissa .byte 6
 .endstruct

+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data

 strings:
@ -90,7 +118,7 @@ str_self:
    .byte "MANDEL-6502"
 str_self_end:
 str_speed:
-    .byte "ms/px"
+    .byte " ms/px"
 str_speed_end:
 str_run:
    .byte " RUN"
@ -103,8 +131,9 @@ str_self_len = str_self_end - str_self
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
+speed_precision = 6

-speed_start = str_self_len + 2
+speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
 speed_len = 14 + str_speed_len


@ -121,6 +150,9 @@ char_map:
        .byte 96 + i
    .endrepeat

+hex_chars:
+    .byte "0123456789abcdef"
+
 aspect:
    ; aspect ratio!
    ; pixels at 320w are 5:6 (narrow)
@ -184,10 +216,33 @@ color_map:
        .byte 3
    .endrepeat

+palette:
+    .byte $00
+    .byte $46
+    .byte $78
+    .byte $b4
 .code

+z_buffer_len = 16
+z_buffer_mask = z_buffer_len - 1
+z_buffer:
+    ; the last N zx/zy values
+    .repeat z_buffer_len
+        .word 0
+        .word 0
+    .endrepeat
+
 .export start

+max_fill_level = 6
+fill_masks:
+    .byte %00011111
+    .byte %00001111
+    .byte %00000111
+    .byte %00000011
+    .byte %00000001
+    .byte %00000000
+
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
    clc ; 2 cyc
@ -206,6 +261,12 @@ color_map:
    add 4, dest, arg2, dest
 .endmacro

+; Fold the carry flag into one byte: dest = dest + C.
+; Carry is deliberately not cleared first, so this must directly follow
+; the addition whose carry-out it propagates. Clobbers A and flags.
+.macro add_carry dest
+    lda dest        ; lda leaves the carry flag untouched
+    adc #0          ; adds only the incoming carry
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
@ -283,68 +344,6 @@ color_map:
    neg 4, arg
 .endmacro

-; inner loop for imul16
-; bitnum < 8: 25 or 41 cycles
-; bitnum >= 8: 30 or 46 cycles
-.macro bitmul16 arg1, arg2, result, bitnum
-    .local zero
-    .local one
-    .local next
-
-    ; does 16-bit adds
-    ; arg1 and arg2 are treated as unsigned
-    ; negative signed inputs must be flipped first
-
-    ; 7 cycles up to the branch
-
-    ; check if arg1 has 0 or 1 bit in this place
-    ; 5 cycles either way
-    .if bitnum < 8
-        lda arg1                 ; 3 cyc
-        and #(1 << (bitnum))       ; 2 cyc
-    .else
-        lda arg1 + 1             ; 3 cyc
-        and #(1 << ((bitnum) - 8)) ; 2 cyc
-    .endif
-    bne one ; 2 cyc
-
-zero: ; 18 cyc, 23 cyc
-    lsr result + 3 ; 5 cyc
-    jmp next       ; 3 cyc
-
-one: ; 32 cyc, 37 cyc
-    ; 16-bit add on the top bits
-    clc            ; 2 cyc
-    lda result + 2 ; 3 cyc
-    adc arg2       ; 3 cyc
-    sta result + 2 ; 3 cyc
-    lda result + 3 ; 3 cyc
-    adc arg2 + 1   ; 3 cyc
-    ror a          ; 2 cyc - get a jump on the shift
-    sta result + 3 ; 3 cyc
-next:
-    ror result + 2 ; 5 cyc
-    ror result + 1 ; 5 cyc
-    .if bitnum >= 8
-        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
-        ; when it's all uninitialized data
-        ror result ; 5 cyc
-    .endif
-
-.endmacro
-
-; 5 to 25 cycles
-.macro check_sign arg
-    ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
-    .local positive
-    lda arg + 1   ; 3 cyc
-    bpl positive  ; 2 cyc
-    neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
-positive:
-.endmacro
-
 ; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
@ -368,38 +367,96 @@ positive:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro

-; min 470 cycles
-; max 780 cycles
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+;
+; Unsigned 8-bit x 8-bit -> 16-bit multiply by table lookup:
+;   a * x = (a + x)^2/2 - a^2/2 - x^2/2
+; arg1, arg2: addresses of the 8-bit factors (only read)
+; dest:       address of the 16-bit product, low byte at dest
+; Uses the imported mul_lobyte256 / mul_hibyte256 / mul_hibyte512 tables.
+; Clobbers A, X and flags.
+.macro imul8 dest, arg1, arg2
+    .local under256
+    .local next
+    .local small_product
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
+
+        lda mul_factor_a      ; setup: 6 cycles
+        ;ldx mul_factor_x
+
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x
+        tax                   ; X = low 8 bits of (a + x); carry = bit 8
+        bcc under256
+        lda mul_hibyte512,x
+        bcs next              ; carry is still set from the adc: always taken
+    under256:
+        lda mul_hibyte256,x
+        sec                   ; so both paths reach next with carry set
+    next:
+        sta mul_product_hi
+        lda mul_lobyte256,x   ; one low-byte table serves both sum ranges
+
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x   ; carry set above = no borrow in
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1
+
+        clc
+        adc mul_product_lo    ; A = low byte plus the 0/1 correction
+        bcc small_product
+        inc mul_product_hi    ; propagate the carry into the high byte
+    small_product:
+        sec                   ; - x^2/2: 25 cycles
+        sbc mul_lobyte256,x   ; X still holds x from the ldx above
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+    .endscope
+.endmacro
+
 .proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
+    inter = temp2 ; 16-bit scratch for each partial product

-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
-    check_sign arg1 ; 5 to 25 cyc
-    check_sign arg2 ; 5 to 25 cyc
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2

-    ; zero out the 32-bit temp's top 16 bits
-    lda #0          ; 2 cyc
-    sta result + 2  ; 3 cyc
-    sta result + 3  ; 3 cyc
-    ; the bottom two bytes will get cleared by the shifts
+    imul8 result, arg1, arg2 ; l1*l2 -> result bytes 0-1
+    lda #0                   ; clear the high word before accumulating
+    sta result + 2
+    sta result + 3

-    ; unrolled loop for maximum speed, at the cost
-    ; of a larger routine
-    ; 440 to 696 cycles
-    .repeat 16, bitnum
-        ; bitnum < 8: 25 or 41 cycles
-        ; bitnum >= 8: 30 or 46 cycles
-        bitmul16 arg1, arg2, result, bitnum
-    .endrepeat
+    imul8 inter, arg1 + 1, arg2 ; h1*l2, weight 256
+    add16 result + 1, result + 1, inter ; accumulate into bytes 1-2
+    add_carry result + 3 ; carry out of byte 2 goes into byte 3

-    ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
-    bne positive_result ; 2 cyc
-    neg32 result        ; 34 cyc
-positive_result:
+    imul8 inter, arg1, arg2 + 1 ; l1*h2, weight 256
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1 ; h1*h2, weight 65536
+    add16 result + 2, result + 2, inter ; accumulate into bytes 2-3
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    ; (the partial products above treated both args as unsigned)
+    lda arg1 + 1
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2 ; arg1 < 0: high word -= arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1 ; arg2 < 0: high word -= arg1
+arg2_pos:

    rts ; 6 cyc
 .endproc
@ -462,12 +519,14 @@ initloop:
    sta zx - 1,x
    dex
    bne initloop
+    sta z_buffer_start
+    sta z_buffer_end

 loop:
    ; iter++ & max-iters break
    inc iter
    bne keep_going
-    rts
+    jmp exit_path
 keep_going:

    .macro quick_exit arg, max
@ -484,7 +543,7 @@ keep_going:
    positive:
        cmp #((max) << 4)
        bmi all_done ; 'less than'
-        rts
+        jmp exit_path

    negative:
        cmp #(256 - ((max) << 4))
@ -492,7 +551,7 @@ keep_going:
        bpl all_done    ; 'greater than'

    nope_out:
-        rts
+        jmp exit_path
    
    first_equal:
        lda arg
@ -527,19 +586,100 @@ keep_going:

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
+    lda z_buffer_active
+    beq skip_z_buffer
+
+    ldx z_buffer_start
+    cpx z_buffer_end
+    beq z_nothing_to_read
+
+z_buffer_loop:
+    .macro z_compare arg
+        .local compare_no_match
+        lda z_buffer,x
+        inx
+        cmp arg
+        bne compare_no_match
+        iny
+    compare_no_match:
+    .endmacro
+    .macro z_advance
+        .local skip_reset_x
+        cpx #(z_buffer_len * 4)
+        bmi skip_reset_x
+        ldx #0
+    skip_reset_x:
+    .endmacro
+    .macro z_store arg
+        lda arg
+        sta z_buffer,x
+        inx
+    .endmacro
+
+    ; Compare the previously stored z values
+    ldy #0
+    z_compare zx
+    z_compare zx + 1
+    z_compare zy
+    z_compare zy + 1
+
+    cpy #4
+    bne z_no_matches
+    jmp z_exit
+
+z_no_matches:
+    z_advance
+
+    cpx z_buffer_end
+    bne z_buffer_loop
+
+z_nothing_to_read:
+
+    ; Store and expand
+    z_store zx
+    z_store zx + 1
+    z_store zy
+    z_store zy + 1
+    z_advance
+    stx z_buffer_end
+
+    ; Increment the start roller if necessary (limit size)
+    lda iter
+    cmp #(z_buffer_len * 4)
+    bmi skip_inc_start
+    lda z_buffer_start
+    clc
+    adc #4
+    tax
+    z_advance
+    stx z_buffer_start
+skip_inc_start:
+
+skip_z_buffer:
+
    jmp loop

-peace_out:
+z_exit:
+    lda #0
+    sta iter
+
+exit_path:
+    ldx #0
+    lda iter
+    bne next
+    inx
+next:
+    stx z_buffer_active
    rts

 .endproc

-.macro zoom_factor dest, src, zoom, aspect
+.macro scale_zoom dest
+    ; clobbers X, flags
    .local cont
    .local enough

    ; cx = (sx << (8 - zoom))
-    copy16 dest, src
    ldx zoom
 cont:
    cpx #8
@ -548,6 +688,12 @@ cont:
    inx
    jmp cont
 enough:
+.endmacro
+
+.macro zoom_factor dest, src, zoom, aspect
+    ; clobbers A, X, flags, etc
+    copy16 dest, src
+    scale_zoom dest

    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
@ -644,6 +790,25 @@ shift_done:
    rts
 .endproc

+.macro draw_text_indirect col, len, strptr
+    ; Draw up to len characters of the string at (strptr) into the text
+    ; line, starting at column col.
+    ;   col, len: assemble-time constants
+    ;   strptr:   zero-page pointer to the source characters
+    ; A character with bit 7 set (how FASC marks the final character of
+    ; its output) ends the string: it is drawn with that bit stripped and
+    ; the remaining columns of the field are blanked, instead of using
+    ; the marked byte as a char_map index and then drawing whatever
+    ; stale bytes follow it in the buffer.
+    ; clobbers A, X, Y
+    .local loop
+    .local last
+    .local pad
+    .local done
+    ldy #0
+loop:
+    cpy #len
+    beq done
+    lda (strptr),y
+    bmi last            ; bit 7 set: this is the final character
+    tax
+    lda char_map,x
+    sta textbuffer + col,y
+    iny
+    jmp loop
+last:
+    and #$7f            ; trim the end-of-string marker bit
+    tax
+    lda char_map,x
+    sta textbuffer + col,y
+pad:
+    iny
+    cpy #len
+    beq done
+    lda #0              ; screen code 0 is the blank used for space fill
+    sta textbuffer + col,y
+    jmp pad
+done:
+.endmacro
+
 .macro draw_text col, len, cstr
    ; clobbers A, X
    .local loop
@ -662,9 +827,34 @@ done:

 .proc vblank_handler
    inc count_frames
+    inc palette_offset ; advance the colour-cycling phase once per call
+    jsr update_palette ; push the new phase into the colour registers
    jmp XITVBV
 .endproc

+.proc update_palette
+    ; Load the colour registers from the palette table.
+    ; COLOR4 takes palette[0] unchanged; COLOR0..COLOR2 take
+    ; palette[1..3] plus the top nibble of palette_offset, so the three
+    ; colours step together once every 16 increments of palette_offset.
+    ; clobbers A, flags
+    lda palette
+    sta COLOR4
+
+    clc
+    lda palette_offset
+    and #$f0
+    adc palette + 1
+    sta COLOR0
+
+    clc
+    lda palette_offset
+    and #$f0
+    adc palette + 2
+    sta COLOR1
+
+    clc
+    lda palette_offset
+    and #$f0
+    adc palette + 3
+    sta COLOR2
+
+    ; .endproc emits no code: without an explicit return, execution
+    ; would run on into the procedures that follow this one.
+    rts
+.endproc
+
 .proc update_speed
    ; convert frames (u16) to fp
    ; add to frames_total
@ -675,6 +865,105 @@ done:
    ; draw text
 .endproc

+.proc keycheck
+    ; Poll CH for a key press and apply it to the view state:
+    ;   KEY_PLUS / KEY_MINUS step zoom up / down, kept within 0..8
+    ;   KEY_UP/DOWN/LEFT/RIGHT move oy / ox by temp = $0010 << (8 - zoom)
+    ; Any pending key is consumed, recognised or not.
+    ; clobbers all
+    ; returns 255 in A if state change or 0 if no change
+    ; (the final lda also sets Z, so callers can branch with beq / bne)
+
+    ; check keyboard buffer
+    lda CH
+    cmp #$ff            ; $ff = no key pending
+    beq skip_char
+
+    ; Clear the keyboard buffer and re-enable interrupts
+    ldx #$ff
+    stx CH
+
+    tay                 ; Y = key code (cmp left A intact)
+
+    lda zoom            ; A = zoom for the range checks at plus / minus
+    cpy #KEY_PLUS
+    beq plus
+    cpy #KEY_MINUS
+    beq minus
+
+    ; temp = $0010 << (8 - zoom)
+    lda #$10
+    sta temp
+    lda #$00
+    sta temp + 1
+    scale_zoom temp     ; clobbers X; the key code in Y survives
+
+    cpy #KEY_UP
+    beq up
+    cpy #KEY_DOWN
+    beq down
+    cpy #KEY_LEFT
+    beq left
+    cpy #KEY_RIGHT
+    beq right
+
+skip_char:
+    lda #0
+    rts
+
+plus:
+    cmp #8              ; zoom already at 8 or more: ignore
+    bpl skip_char
+    inc zoom
+    jmp done
+minus:
+    cmp #1              ; zoom already below 1: ignore
+    bmi skip_char
+    dec zoom
+    jmp done
+up:
+    sub16 oy, oy, temp 
+    jmp done
+down:
+    add16 oy, oy, temp
+    jmp done
+left:
+    sub16 ox, ox, temp
+    jmp done
+right:
+    add16 ox, ox, temp
+done:
+    lda #255
+    rts
+
+.endproc
+
+.proc clear_screen
+    ; Zero whole 256-byte pages starting at framebuffer_top, stopping
+    ; when the page pointer's high byte reaches that of display_list.
+    ; Uses temp as the page pointer; clobbers A, Y, flags.
+    lda #.hibyte(framebuffer_top)
+    sta temp + 1
+    lda #.lobyte(framebuffer_top)
+    sta temp
+
+    ldy #0
+next_page:
+    tya                 ; Y is 0 at the top of every page, so A = 0
+fill_byte:
+    sta (temp),y
+    iny
+    bne fill_byte       ; Y wraps back to 0 after 256 stores
+
+    inc temp + 1
+    lda temp + 1
+    cmp #.hibyte(display_list)
+    bne next_page
+
+    rts
+.endproc
+
+.proc status_bar
+    ; Status bar: program name from column 0, and the run marker
+    ; right-aligned so that it ends at column 40.
+    ; Clobbers whatever draw_text clobbers (A and X per its header).
+    draw_text 0, str_self_len, str_self
+    draw_text 40 - str_run_len, str_run_len, str_run
+
+    rts
+.endproc
+
 .proc start

    ; ox = 0; oy = 0; zoom = 0
@ -701,24 +990,7 @@ done:
    lda #0
    sta DMACTL

-    ; zero the range from framebuffer_top to framebuffer_end
-    lda #.lobyte(framebuffer_top)
-    sta temp
-    lda #.hibyte(framebuffer_top)
-    sta temp + 1
-
-zero_page_loop:
-    lda #0
-    ldy #0
-zero_byte_loop:
-    sta (temp),y
-    iny
-    bne zero_byte_loop
-
-    inc temp + 1
-    lda temp + 1
-    cmp #.hibyte(framebuffer_end)
-    bne zero_page_loop
+    jsr clear_screen

    ; Copy the display list into properly aligned memory
    ; Can't cross 1024-byte boundaries :D
@ -738,14 +1010,15 @@ copy_byte_loop:
    sta DLISTH ; actual register
    sta SDLSTH ; shadow register the OS will copy in

-    ; Status bar
-    draw_text 0, str_self_len, str_self
-    draw_text 40 - str_run_len, str_run_len, str_run
-
    ; Re-enable display DMA
    lda #$22
    sta DMACTL

+    ; Initialize the palette
+    lda #0
+    sta palette_offset
+    jsr update_palette
+
    ; install the vblank handler
    lda #7 ; deferred
    ldx #.hibyte(vblank_handler)
@ -753,6 +1026,14 @@ copy_byte_loop:
    jsr SETVBV

 main_loop:
+    jsr clear_screen
+    jsr status_bar
+
+    lda #0
+    sta fill_level
+
+fill_loop:
+
    ; sy = -92 .. 91
    lda #(256-half_height)
    sta sy
@ -767,12 +1048,53 @@ loop_sy:
    sta sx + 1

 loop_sx:
+    ; check the fill mask
+    ldy #0
+
+loop_skip_level:
+    cpy fill_level
+    beq current_level
+
+    lda fill_masks,y
+    and sx
+    bne not_skipped_mask1
+
+    lda fill_masks,y
+    and sy
+    beq skipped_mask
+
+not_skipped_mask1:
+    iny
+    jmp loop_skip_level
+
+current_level:
+    lda fill_masks,y
+    and sx
+    bne skipped_mask
+
+    lda fill_masks,y
+    and sy
+    beq not_skipped_mask
+
+skipped_mask:
+    jmp skipped
+
+not_skipped_mask:
+
+    ; run the fractal!
    zoom_factor cx, sx, zoom, aspect_x
+    add16 cx, cx, ox
    zoom_factor cy, sy, zoom, aspect_y
+    add16 cy, cy, oy
    jsr mandelbrot
    jsr pset

+    jsr keycheck
+    beq no_key
+    ; @fixme clear the pixel stats
+    jmp main_loop

+no_key:
    ; check if we should update the counters
    ;
    ; count_pixels >= width? update!
@ -784,7 +1106,7 @@ loop_sx:
    ; count_frames >= 120? update!
    lda count_frames
    cmp #120 ; >= 2 seconds
-    bmi skip_status
+    bmi skipped

 update_status:
    ; FR0 = (float)count_pixels & clear count_pixels
@ -844,35 +1166,11 @@ update_status:
    ; convert to ASCII in INBUFF
    jsr FASC

-    ; find the last byte
-    ldy #0
-number_loop:
-    lda (INBUFF),y
-    bmi lastchar
+    ; print the first 6 digits
+    draw_text_indirect speed_start, speed_precision, INBUFF
+    draw_text speed_start + speed_precision, str_speed_len, str_speed

-    tax
-    lda char_map,x
-    sta textbuffer + speed_start,y
-
-    iny
-    bpl number_loop
-lastchar:
-    ; Y is last char
-    ; trim that high bit
-    and #$7f
-    tax
-    lda char_map,x
-    sta textbuffer + speed_start,y
-
-    ; Fill out any remaining spaces
-    lda #0
-space_loop:
-    iny
-    sta textbuffer + speed_start,y
-    cpy #(20)
-    bmi space_loop
-
-skip_status:
+skipped:

    clc
    lda sx
@ -904,9 +1202,18 @@ loop_sx_done:

 loop_sy_done:

-    draw_text 40 - str_done_len, str_done_len, str_done
+fill_loop_done:
+    inc fill_level
+    lda fill_level
+    cmp #max_fill_level
+    beq loop
+    jmp fill_loop

 loop:
    ; finished
-    jmp loop
+    draw_text 40 - str_done_len, str_done_len, str_done
+    jsr keycheck
+    beq loop
+    jmp main_loop
+
 .endproc
--- a/readme.md
+++ b/readme.md
@ -14,7 +14,7 @@ Non-goals:

 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.

-- brion, january 2023
+-- brooke, january 2023 - february 2024

 ## Current state

@ -28,6 +28,8 @@ The mandelbrot calculations are done using 4.12-precision fixed point numbers. I

 Iterations are capped at 255.

+The pixels are run in a progressive layout to get the basic shape on screen faster.
+
 ## Next steps

 Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
@ -35,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
 Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.

 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
+(done)

 ## Deps and build instructions

--- a/tables.js
+++ b/tables.js
@ -0,0 +1,38 @@
+function db(func) {
+    let lines = [];
+    for (let i = 0; i < 256; i += 16) {
+        let items = [];
+        for (let j = 0; j < 16; j++) {
+            let x = i + j;
+            items.push(func(x));
+        }
+        lines.push('    .byte ' + items.join(', '));
+    }
+    return lines.join('\n');
+}
+
+let squares = [];
+for (let i = 0; i < 512; i++) {
+    squares.push(Math.trunc((i * i + 1) / 2));
+}
+
+console.log(
+`.segment "TABLES"
+
+.export mul_lobyte256
+.export mul_hibyte256
+.export mul_hibyte512
+
+.align 256
+mul_lobyte256:
+${db((i) => squares[i] & 0xff)}
+
+.align 256
+mul_hibyte256:
+${db((i) => (squares[i] >> 8) & 0xff)}
+
+.align 256
+mul_hibyte512:
+${db((i) => (squares[i + 256] >> 8) & 0xff)}
+
+`);
--- a/testme.js
+++ b/testme.js
@ -0,0 +1,41 @@
+// ax = (a + x)^2/2 - a^2/2 - x^2/2
+
+function half_square(x) {
+    return Math.round(x * x / 2) & 0xffff >>> 0;
+}
+
+function mul8(a, b) {
+    let result = half_square(a + b) & 0xffff;
+    result = (result - half_square(a)) & 0xffff;
+    result = (result - half_square(b)) & 0xffff;
+    result = (result + (b & a & 1)) & 0xffff;
+    return result >>> 0;
+}
+
+function mul16(a, b) {
+    let ah = (a & 0xff00) >>> 8;
+    let al = (a & 0x00ff) >>> 0;
+    let bh = (b & 0xff00) >>> 8;
+    let bl = (b & 0x00ff) >>> 0;
+    let result = (mul8(al, bl) & 0xffff) >>> 0;
+    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
+    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
+    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
+    return result;
+}
+
+let max = 65536;
+//let max = 256;
+//let max = 128;
+//let max = 8;
+
+for (let a = 0; a < max; a++) {
+    for (let b = 0; b < max; b++) {
+        let expected = Math.imul(a, b) >>> 0;
+        //let actual = mul8(a, b);
+        let actual = mul16(a, b);
+        if (expected !== actual) {
+            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
+        }
+    }
+}
Author	SHA1	Message	Date
Brooke Vibber	05133aabdd	slightly faster handling of signed mul previously we were flipping the inputs if negative, and then the output if both inputs were negative turns out you can just treat the whole thing as an unsigned mul and then subtract each term from the high word if the other term is negative. https://stackoverflow.com/a/28827013 this saves a handful of cycles, reducing our runtime to a merge 14.211 ms/px \o/	2024-12-15 20:17:45 -08:00
Brooke Vibber	7f2bc43cff	squares	2024-12-14 18:56:26 -08:00
Brion Vibber	5637783529	Faster imul16 routine Improves runtime from 16.24 ms/px to 14.44 ms/px This uses a routine found on Everything2: https://everything2.com/title/Fast+6502+multiplication which uses a lookup table of squares to do 8-bit imuls, which are then composed into a 16-bit imul	2024-12-14 18:53:31 -08:00
Brooke Vibber	29630c8887	update palette more smoothly	2024-08-19 13:21:44 -07:00
Brooke Vibber	c559b6e76b	palette adjustment	2024-08-18 21:07:53 -07:00
Brooke Vibber	6f05a9bbd0	basic palette cycling	2024-08-18 21:06:30 -07:00
Brooke Vibber	8be03993ab	fix time of drawing of 'DONE' text	2024-08-18 20:29:39 -07:00
Brooke Vibber	ee5b12dae8	mailmap	2024-08-18 20:15:47 -07:00
Brooke Vibber	201d9bf15c	clear screen after zoom/scroll	2024-02-25 15:15:23 -08:00
Brooke Vibber	c152c4346b	Progressive pixel layout	2024-02-04 14:25:15 -08:00
Brion Vibber	510457f97a	add a note to fix stats when changing zoom	2023-03-11 21:15:08 -08:00
Brion Vibber	3d792603db	keyboard nav sorta working	2023-03-11 20:45:32 -08:00
Brion Vibber	b1c26c1edd	WIP fix keyboard check	2023-03-05 16:57:41 -08:00
Brion Vibber	53336f7af1	WIP quick hack to check keyboard this for some reason only works ONCE though I can replicate the logic in BASIC and it works over multiple keys not sure what's wrong	2023-03-05 15:45:44 -08:00
Brion Vibber	24abc21b01	move speed to the right	2023-03-05 13:56:50 -08:00
Brion Vibber	9926ec28e7	clean up speed display now uses ms/px msg	2023-03-05 13:48:39 -08:00
Brion Vibber	0501a364c7	Check for repeated zx/zy values These will never escape, so saves some time in the lake trick is taken from fractint	2023-02-12 11:56:20 -08:00