Compare commits

...

17 commits
status ... main

Author SHA1 Message Date
05133aabdd slightly faster handling of signed mul
previously we were flipping the inputs if negative, and then the
output if both inputs were negative

turns out you can just treat the whole thing as an unsigned mul
and then subtract each term from the high word if the other term
is negative.

https://stackoverflow.com/a/28827013

this saves a handful of cycles, reducing our runtime to a mere
14.211 ms/px \o/
2024-12-15 20:17:45 -08:00
7f2bc43cff squares 2024-12-14 18:56:26 -08:00
5637783529 Faster imul16 routine
Improves runtime from 16.24 ms/px to 14.44 ms/px

This uses a routine found on Everything2:
https://everything2.com/title/Fast+6502+multiplication

which uses a lookup table of squares to do 8-bit imuls,
which are then composed into a 16-bit imul
2024-12-14 18:53:31 -08:00
29630c8887 update palette more smoothly 2024-08-19 13:21:44 -07:00
c559b6e76b palette adjustment 2024-08-18 21:07:53 -07:00
6f05a9bbd0 basic palette cycling 2024-08-18 21:06:30 -07:00
8be03993ab fix time of drawing of 'DONE' text 2024-08-18 20:29:39 -07:00
ee5b12dae8 mailmap 2024-08-18 20:15:47 -07:00
201d9bf15c clear screen after zoom/scroll 2024-02-25 15:15:23 -08:00
c152c4346b Progressive pixel layout 2024-02-04 14:25:15 -08:00
510457f97a add a note to fix stats when changing zoom 2023-03-11 21:15:08 -08:00
3d792603db keyboard nav sorta working 2023-03-11 20:45:32 -08:00
b1c26c1edd WIP fix keyboard check 2023-03-05 16:57:41 -08:00
53336f7af1 WIP quick hack to check keyboard
this for some reason only works ONCE
though I can replicate the logic in BASIC
and it works over multiple keys
not sure what's wrong
2023-03-05 15:45:44 -08:00
24abc21b01 move speed to the right 2023-03-05 13:56:50 -08:00
9926ec28e7 clean up speed display now uses ms/px msg 2023-03-05 13:48:39 -08:00
0501a364c7 Check for repeated zx/zy values
These will never escape, so saves
some time in the lake

trick is taken from fractint
2023-02-12 11:56:20 -08:00
7 changed files with 554 additions and 158 deletions

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
*.o
*.xex
tables.s
.DS_Store

2
.mailmap Normal file
View file

@ -0,0 +1,2 @@
Brooke Vibber <bvibber@pobox.com>
Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>

View file

@ -2,13 +2,17 @@
all : mandel.xex
%.xex : %.o
ld65 -C atari-asm-xex.cfg -o $@ $<
mandel.xex : mandel.o tables.o
ld65 -C ./atari-asm-xex.cfg -o $@ $+
%.o : %.s
ca65 -o $@ $<
tables.s : tables.js
node tables.js > tables.s
clean :
rm -f tables.s
rm -f *.o
rm -f *.xex

617
mandel.s
View file

@ -21,13 +21,18 @@ count_pixels = $a3 ; u8
total_ms = $a4 ; float48
total_pixels = $aa ; float48
temp = $b0 ; u16
pixel_ptr = $b2 ; u16
pixel_color = $b4 ; u8
pixel_mask = $b5 ; u8
pixel_shift = $b6 ; u8
pixel_offset = $b7 ; u8
z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $b1 ; u8: index into z_buffer
z_buffer_end = $b2 ; u8: index into z_buffer
temp = $b4 ; u16
temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8
pixel_mask = $bb ; u8
pixel_shift = $bc ; u8
pixel_offset = $bd ; u8
fill_level = $be ; u8
palette_offset = $bf ; u8
; FP registers in zero page
FR0 = $d4 ; float48
@ -38,6 +43,9 @@ CIX = $f2 ; u8 - index into INBUFF
INBUFF = $f3 ; u16 - pointer to ascii
FLPTR = $fc ; u16 - pointer to user buffer float48
CH1 = $02f2 ; previous character read from keyboard
CH = $02fc ; current character read from keyboard
LBUFF = $0580 ; result buffer for FASC routine
; FP ROM routine vectors
@ -69,20 +77,40 @@ stride = width >> 2
DMACTL = $D400
DLISTL = $D402
DLISTH = $D403
WSYNC = $D40A
; OS shadow registers
SDLSTL = $230
SDLSTH = $231
; interrupt stuff
SYSVBV = $E45F
XITVBV = $E462
SETVBV = $E45C
COLOR0 = $2C4
COLOR1 = $2C5
COLOR2 = $2C6
COLOR3 = $2C7
COLOR4 = $2C8
; Keycodes!
KEY_PLUS = $06
KEY_MINUS = $0e
KEY_UP = $8e
KEY_DOWN = $8f
KEY_LEFT = $86
KEY_RIGHT = $87
.struct float48
exponent .byte
mantissa .byte 6
.endstruct
.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512
.data
strings:
@ -90,7 +118,7 @@ str_self:
.byte "MANDEL-6502"
str_self_end:
str_speed:
.byte "ms/px"
.byte " ms/px"
str_speed_end:
str_run:
.byte " RUN"
@ -103,8 +131,9 @@ str_self_len = str_self_end - str_self
str_speed_len = str_speed_end - str_speed
str_run_len = str_run_end - str_run
str_done_len = str_done_end - str_done
speed_precision = 6
speed_start = str_self_len + 2
speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
speed_len = 14 + str_speed_len
@ -121,6 +150,9 @@ char_map:
.byte 96 + i
.endrepeat
hex_chars:
.byte "0123456789abcdef"
aspect:
; aspect ratio!
; pixels at 320w are 5:6 (narrow)
@ -184,10 +216,33 @@ color_map:
.byte 3
.endrepeat
palette:
.byte $00
.byte $46
.byte $78
.byte $b4
.code
z_buffer_len = 16
z_buffer_mask = z_buffer_len - 1
z_buffer:
; the last N zx/zy values
.repeat z_buffer_len
.word 0
.word 0
.endrepeat
.export start
max_fill_level = 6
fill_masks:
.byte %00011111
.byte %00001111
.byte %00000111
.byte %00000011
.byte %00000001
.byte %00000000
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
clc ; 2 cyc
@ -206,6 +261,12 @@ color_map:
add 4, dest, arg2, dest
.endmacro
; add_carry dest
; Fold the current carry flag into a single byte: dest = dest + C.
; The carry is deliberately NOT cleared first, so this is meant to follow
; an addition whose carry-out must ripple into the next byte up.
; Clobbers A and flags.
.macro add_carry dest
lda dest
; adding zero means only the incoming carry changes the value
adc #0
sta dest
.endmacro
; 2 + 9 * byte cycles
.macro sub bytes, dest, arg1, arg2
sec ; 2 cyc
@ -283,68 +344,6 @@ color_map:
neg 4, arg
.endmacro
; inner loop for imul16
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
.local zero
.local one
.local next
; does 16-bit adds
; arg1 and arg2 are treated as unsigned
; negative signed inputs must be flipped first
; 7 cycles up to the branch
; check if arg1 has 0 or 1 bit in this place
; 5 cycles either way
.if bitnum < 8
lda arg1 ; 3 cyc
and #(1 << (bitnum)) ; 2 cyc
.else
lda arg1 + 1 ; 3 cyc
and #(1 << ((bitnum) - 8)) ; 2 cyc
.endif
bne one ; 2 cyc
zero: ; 18 cyc, 23 cyc
lsr result + 3 ; 5 cyc
jmp next ; 3 cyc
one: ; 32 cyc, 37 cyc
; 16-bit add on the top bits
clc ; 2 cyc
lda result + 2 ; 3 cyc
adc arg2 ; 3 cyc
sta result + 2 ; 3 cyc
lda result + 3 ; 3 cyc
adc arg2 + 1 ; 3 cyc
ror a ; 2 cyc - get a jump on the shift
sta result + 3 ; 3 cyc
next:
ror result + 2 ; 5 cyc
ror result + 1 ; 5 cyc
.if bitnum >= 8
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
; when it's all uninitialized data
ror result ; 5 cyc
.endif
.endmacro
; 5 to 25 cycles
.macro check_sign arg
; Check sign bit and flip argument to positive,
; keeping a count of sign bits in the X register.
.local positive
lda arg + 1 ; 3 cyc
bpl positive ; 2 cyc
neg16 arg ; 18 cyc
inx ; 2 cyc
positive:
.endmacro
; 518 - 828 cyc
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
@ -368,38 +367,96 @@ positive:
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro
; min 470 cycles
; max 780 cycles
; Adapted from https://everything2.com/title/Fast+6502+multiplication
;
; imul8 dest, arg1, arg2
; 8-bit x 8-bit multiply producing a 16-bit product, built from the
; half-square lookup tables using
;   a*x = (a + x)^2/2 - a^2/2 - x^2/2
; dest - receives the product: low byte at dest, high byte at dest + 1
; arg1 - location of the first 8-bit factor ("a")
; arg2 - location of the second 8-bit factor ("x")
; Both factors are used directly as table indexes.
; Clobbers A, X and flags.
.macro imul8 dest, arg1, arg2
.local under256
.local next
.local small_product
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; setup: 6 cycles
;ldx mul_factor_x
clc ; (a + x)^2/2: 23 cycles
adc mul_factor_x
; X = low 8 bits of (a + x); the carry says whether the sum reached 256
tax
bcc under256
; sum >= 256: take the high byte from the table for indexes 256..511
lda mul_hibyte512,x
; carry is still set on this path, so this branch is always taken
bcs next
under256:
lda mul_hibyte256,x
; set carry here so both paths arrive at "next" ready for a borrow-free sbc
sec
next:
sta mul_product_hi
; the same low-byte table is indexed whether or not the sum reached 256
lda mul_lobyte256,x
ldx mul_factor_a ; - a^2/2: 20 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
ldx mul_factor_x ; + x & a & 1: 22 cycles
txa ; (this is a kludge to correct a
and mul_factor_a ; roundoff error that makes odd * odd too low)
and #1
clc
adc mul_product_lo
bcc small_product
inc mul_product_hi
small_product:
; A holds the corrected low byte; it is only stored after the final subtract
sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
.endscope
.endmacro
.proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
inter = temp2
ldx #0 ; 2 cyc
; counts the number of sign bits in X
check_sign arg1 ; 5 to 25 cyc
check_sign arg2 ; 5 to 25 cyc
; zero out the 32-bit temp's top 16 bits
lda #0 ; 2 cyc
sta result + 2 ; 3 cyc
sta result + 3 ; 3 cyc
; the bottom two bytes will get cleared by the shifts
; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
; unrolled loop for maximum speed, at the cost
; of a larger routine
; 440 to 696 cycles
.repeat 16, bitnum
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
bitmul16 arg1, arg2, result, bitnum
.endrepeat
imul8 result, arg1, arg2
lda #0
sta result + 2
sta result + 3
; In case of mixed input signs, return a negative result.
cpx #1 ; 2 cyc
bne positive_result ; 2 cyc
neg32 result ; 34 cyc
positive_result:
imul8 inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc
.endproc
@ -462,12 +519,14 @@ initloop:
sta zx - 1,x
dex
bne initloop
sta z_buffer_start
sta z_buffer_end
loop:
; iter++ & max-iters break
inc iter
bne keep_going
rts
jmp exit_path
keep_going:
.macro quick_exit arg, max
@ -484,7 +543,7 @@ keep_going:
positive:
cmp #((max) << 4)
bmi all_done ; 'less than'
rts
jmp exit_path
negative:
cmp #(256 - ((max) << 4))
@ -492,7 +551,7 @@ keep_going:
bpl all_done ; 'greater than'
nope_out:
rts
jmp exit_path
first_equal:
lda arg
@ -527,19 +586,100 @@ keep_going:
; if may be in the lake, look for looping output with a small buffer
; as an optimization vs running to max iters
lda z_buffer_active
beq skip_z_buffer
ldx z_buffer_start
cpx z_buffer_end
beq z_nothing_to_read
z_buffer_loop:
.macro z_compare arg
.local compare_no_match
lda z_buffer,x
inx
cmp arg
bne compare_no_match
iny
compare_no_match:
.endmacro
.macro z_advance
.local skip_reset_x
cpx #(z_buffer_len * 4)
bmi skip_reset_x
ldx #0
skip_reset_x:
.endmacro
.macro z_store arg
lda arg
sta z_buffer,x
inx
.endmacro
; Compare the previously stored z values
ldy #0
z_compare zx
z_compare zx + 1
z_compare zy
z_compare zy + 1
cpy #4
bne z_no_matches
jmp z_exit
z_no_matches:
z_advance
cpx z_buffer_end
bne z_buffer_loop
z_nothing_to_read:
; Store and expand
z_store zx
z_store zx + 1
z_store zy
z_store zy + 1
z_advance
stx z_buffer_end
; Increment the start roller if necessary (limit size)
lda iter
cmp #(z_buffer_len * 4)
bmi skip_inc_start
lda z_buffer_start
clc
adc #4
tax
z_advance
stx z_buffer_start
skip_inc_start:
skip_z_buffer:
jmp loop
peace_out:
z_exit:
lda #0
sta iter
exit_path:
ldx #0
lda iter
bne next
inx
next:
stx z_buffer_active
rts
.endproc
.macro zoom_factor dest, src, zoom, aspect
.macro scale_zoom dest
; clobbers X, flags
.local cont
.local enough
; cx = (sx << (8 - zoom))
copy16 dest, src
ldx zoom
cont:
cpx #8
@ -548,6 +688,12 @@ cont:
inx
jmp cont
enough:
.endmacro
.macro zoom_factor dest, src, zoom, aspect
; clobbers A, X, flags, etc
copy16 dest, src
scale_zoom dest
; cy = cy * (3 / 4)
; cx = cx * (5 / 4)
@ -644,6 +790,25 @@ shift_done:
rts
.endproc
; draw_text_indirect col, len, strptr
; Copy len characters from the string addressed through the pointer
; strptr into textbuffer starting at column col, translating each
; character through char_map on the way.
; col    - constant column offset into textbuffer
; len    - constant number of characters to copy
; strptr - pointer variable used with (strptr),y indirect addressing
.macro draw_text_indirect col, len, strptr
; clobbers A, X, Y
.local loop
.local done
ldx #0
loop:
cpx #len
beq done
; X is the running index; copy it to Y because (zp),y is the only
; indirect indexed mode
txa
tay
lda (strptr),y
; reuse Y as the index into the character translation table
tay
lda char_map,y
sta textbuffer + col,x
inx
jmp loop
done:
.endmacro
.macro draw_text col, len, cstr
; clobbers A, X
.local loop
@ -662,9 +827,34 @@ done:
; Vertical blank handler, installed by start through SETVBV.
; Each call bumps the frame counter used for timing and the palette
; cycling offset, refreshes the colours, then leaves via XITVBV.
.proc vblank_handler
inc count_frames
inc palette_offset
jsr update_palette
; exit through the vector rather than rts
jmp XITVBV
.endproc
; Load the COLOR0..COLOR2 and COLOR4 locations from the palette table.
; COLOR4 takes palette[0] unchanged. COLOR0..COLOR2 take palette[1..3]
; plus the upper four bits of palette_offset, so they only change once
; every 16 increments of palette_offset.
; Called with jsr from both start and vblank_handler.
; Clobbers A and flags.
.proc update_palette
lda palette
sta COLOR4
clc
lda palette_offset
and #$f0
adc palette + 1
sta COLOR0
clc
lda palette_offset
and #$f0
adc palette + 2
sta COLOR1
clc
lda palette_offset
and #$f0
adc palette + 3
sta COLOR2
; return to the jsr caller instead of running on into the code that
; happens to follow this proc
rts
.endproc
.proc update_speed
; convert frames (u16) to fp
; add to frames_total
@ -675,6 +865,105 @@ done:
; draw text
.endproc
; Poll the keyboard location CH and apply one zoom or scroll command.
; +/- change zoom; the four direction keys move ox/oy by a step of
; $0010 scaled by scale_zoom. A pending key is always consumed, even
; when it is not one of the recognised keycodes.
; The Z flag on return matches A, so callers can branch with beq/bne.
.proc keycheck
; clobbers all
; returns 255 in A if state change or 0 if no change
; check keyboard buffer
lda CH
; $ff in CH means no key is waiting
cmp #$ff
beq skip_char
; Clear the keyboard buffer and re-enable interrupts
ldx #$ff
stx CH
; keep the keycode in Y; A is reused for the zoom range checks below
tay
lda zoom
cpy #KEY_PLUS
beq plus
cpy #KEY_MINUS
beq minus
; temp = $0010 << (8 - zoom)
lda #$10
sta temp
lda #$00
sta temp + 1
; scale_zoom is documented as clobbering X and flags, so the keycode
; in Y survives it
scale_zoom temp
cpy #KEY_UP
beq up
cpy #KEY_DOWN
beq down
cpy #KEY_LEFT
beq left
cpy #KEY_RIGHT
beq right
skip_char:
lda #0
rts
plus:
; A still holds zoom here: refuse to go past 8
cmp #8
bpl skip_char
inc zoom
jmp done
minus:
; A still holds zoom here: refuse to go below 0
cmp #1
bmi skip_char
dec zoom
jmp done
up:
sub16 oy, oy, temp
jmp done
down:
add16 oy, oy, temp
jmp done
left:
sub16 ox, ox, temp
jmp done
right:
add16 ox, ox, temp
done:
lda #255
rts
.endproc
; Fill memory with zero bytes, one 256-byte page at a time, using temp
; as the page pointer.
; Clobbers A, Y, flags and temp.
; NOTE(review): the end test compares only the high byte of the pointer
; against the high byte of display_list, so this presumably relies on
; display_list sitting at a higher address than framebuffer_top - confirm.
.proc clear_screen
; zero the range from framebuffer_top to display_list
lda #.lobyte(framebuffer_top)
sta temp
lda #.hibyte(framebuffer_top)
sta temp + 1
zero_page_loop:
lda #0
ldy #0
zero_byte_loop:
sta (temp),y
iny
; Y wrapping back to 0 means all 256 bytes of this page are written
bne zero_byte_loop
; step the pointer to the next page
inc temp + 1
lda temp + 1
cmp #.hibyte(display_list)
bne zero_page_loop
rts
.endproc
; Draw the fixed parts of the status line: the program name at column 0
; and the run indicator placed so that it ends at column 40.
; Clobbers whatever draw_text clobbers (documented there as A and X).
.proc status_bar
; Status bar
draw_text 0, str_self_len, str_self
draw_text 40 - str_run_len, str_run_len, str_run
rts
.endproc
.proc start
; ox = 0; oy = 0; zoom = 0
@ -701,24 +990,7 @@ done:
lda #0
sta DMACTL
; zero the range from framebuffer_top to framebuffer_end
lda #.lobyte(framebuffer_top)
sta temp
lda #.hibyte(framebuffer_top)
sta temp + 1
zero_page_loop:
lda #0
ldy #0
zero_byte_loop:
sta (temp),y
iny
bne zero_byte_loop
inc temp + 1
lda temp + 1
cmp #.hibyte(framebuffer_end)
bne zero_page_loop
jsr clear_screen
; Copy the display list into properly aligned memory
; Can't cross 1024-byte boundaries :D
@ -738,14 +1010,15 @@ copy_byte_loop:
sta DLISTH ; actual register
sta SDLSTH ; shadow register the OS will copy in
; Status bar
draw_text 0, str_self_len, str_self
draw_text 40 - str_run_len, str_run_len, str_run
; Re-enable display DMA
lda #$22
sta DMACTL
; Initialize the palette
lda #0
sta palette_offset
jsr update_palette
; install the vblank handler
lda #7 ; deferred
ldx #.hibyte(vblank_handler)
@ -753,6 +1026,14 @@ copy_byte_loop:
jsr SETVBV
main_loop:
jsr clear_screen
jsr status_bar
lda #0
sta fill_level
fill_loop:
; sy = -92 .. 91
lda #(256-half_height)
sta sy
@ -767,12 +1048,53 @@ loop_sy:
sta sx + 1
loop_sx:
; check the fill mask
ldy #0
loop_skip_level:
cpy fill_level
beq current_level
lda fill_masks,y
and sx
bne not_skipped_mask1
lda fill_masks,y
and sy
beq skipped_mask
not_skipped_mask1:
iny
jmp loop_skip_level
current_level:
lda fill_masks,y
and sx
bne skipped_mask
lda fill_masks,y
and sy
beq not_skipped_mask
skipped_mask:
jmp skipped
not_skipped_mask:
; run the fractal!
zoom_factor cx, sx, zoom, aspect_x
add16 cx, cx, ox
zoom_factor cy, sy, zoom, aspect_y
add16 cy, cy, oy
jsr mandelbrot
jsr pset
jsr keycheck
beq no_key
; @fixme clear the pixel stats
jmp main_loop
no_key:
; check if we should update the counters
;
; count_pixels >= width? update!
@ -784,7 +1106,7 @@ loop_sx:
; count_frames >= 120? update!
lda count_frames
cmp #120 ; >= 2 seconds
bmi skip_status
bmi skipped
update_status:
; FR0 = (float)count_pixels & clear count_pixels
@ -844,35 +1166,11 @@ update_status:
; convert to ASCII in INBUFF
jsr FASC
; find the last byte
ldy #0
number_loop:
lda (INBUFF),y
bmi lastchar
; print the first 6 digits
draw_text_indirect speed_start, speed_precision, INBUFF
draw_text speed_start + speed_precision, str_speed_len, str_speed
tax
lda char_map,x
sta textbuffer + speed_start,y
iny
bpl number_loop
lastchar:
; Y is last char
; trim that high bit
and #$7f
tax
lda char_map,x
sta textbuffer + speed_start,y
; Fill out any remaining spaces
lda #0
space_loop:
iny
sta textbuffer + speed_start,y
cpy #(20)
bmi space_loop
skip_status:
skipped:
clc
lda sx
@ -904,9 +1202,18 @@ loop_sx_done:
loop_sy_done:
draw_text 40 - str_done_len, str_done_len, str_done
fill_loop_done:
inc fill_level
lda fill_level
cmp #max_fill_level
beq loop
jmp fill_loop
loop:
; finished
jmp loop
draw_text 40 - str_done_len, str_done_len, str_done
jsr keycheck
beq loop
jmp main_loop
.endproc

View file

@ -14,7 +14,7 @@ Non-goals:
Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
-- brion, january 2023
-- brooke, january 2023 - february 2024
## Current state
@ -28,6 +28,8 @@ The mandelbrot calculations are done using 4.12-precision fixed point numbers. I
Iterations are capped at 255.
The pixels are run in a progressive layout to get the basic shape on screen faster.
## Next steps
Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
@ -35,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
(done)
## Deps and build instructions

38
tables.js Normal file
View file

@ -0,0 +1,38 @@
/**
 * Render a 256-entry lookup table as assembler `.byte` directives.
 *
 * @param {(index: number) => number} func - Produces the value for each
 *   index, called once per index in ascending order from 0 to 255.
 * @returns {string} Sixteen newline-separated lines of sixteen values each.
 */
function db(func) {
  const rowCount = 16;
  const rowWidth = 16;
  const rows = Array.from({ length: rowCount }, (_row, rowIndex) => {
    const values = Array.from({ length: rowWidth }, (_cell, column) =>
      func(rowIndex * rowWidth + column),
    );
    return ` .byte ${values.join(', ')}`;
  });
  return rows.join('\n');
}
// Half-square table for indexes 0..511: each entry is (i * i + 1) / 2
// truncated to an integer, i.e. i^2 / 2 rounded up when i is odd.
const squares = Array.from({ length: 512 }, (_unused, i) =>
  Math.trunc((i * i + 1) / 2),
);
// Write the assembler source for the three 256-byte-aligned multiply
// tables to stdout: low bytes and high bytes of the half-squares for
// indexes 0..255, then high bytes for indexes 256..511.
const lowByte = (value) => value & 0xff;
const highByte = (value) => (value >> 8) & 0xff;

const tableSource = `.segment "TABLES"
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.align 256
mul_lobyte256:
${db((i) => lowByte(squares[i]))}
.align 256
mul_hibyte256:
${db((i) => highByte(squares[i]))}
.align 256
mul_hibyte512:
${db((i) => highByte(squares[i + 256]))}
`;

console.log(tableSource);

41
testme.js Normal file
View file

@ -0,0 +1,41 @@
// ax = (a + x)^2/2 - a^2/2 - x^2/2
/**
 * Half of x squared, rounded to the nearest integer (halves round up)
 * and reduced to its low 16 bits.
 *
 * @param {number} x - Value to square.
 * @returns {number} The rounded half-square masked to 0..0xffff.
 */
function half_square(x) {
  const rounded = Math.round((x * x) / 2);
  return rounded & 0xffff;
}
/**
 * Multiply two 8-bit values using only half-square lookups, keeping
 * every intermediate value within 16 bits.
 *
 * @param {number} a - First factor.
 * @param {number} b - Second factor.
 * @returns {number} The 16-bit result as an unsigned number.
 */
function mul8(a, b) {
  const mask16 = 0xffff;
  const sumTerm = half_square(a + b) & mask16;
  const lessA = (sumTerm - half_square(a)) & mask16;
  const lessBoth = (lessA - half_square(b)) & mask16;
  // Both half-squares were rounded up when a and b are odd, so add one back.
  const corrected = (lessBoth + (b & a & 1)) & mask16;
  return corrected >>> 0;
}
/**
 * Multiply two 16-bit values by combining four 8-bit partial products.
 *
 * @param {number} a - First factor; only bits 0..15 are used.
 * @param {number} b - Second factor; only bits 0..15 are used.
 * @returns {number} The low 32 bits of the product as an unsigned number.
 */
function mul16(a, b) {
  const aHigh = (a & 0xff00) >>> 8;
  const aLow = (a & 0x00ff) >>> 0;
  const bHigh = (b & 0xff00) >>> 8;
  const bLow = (b & 0x00ff) >>> 0;

  const lowLow = mul8(aLow, bLow) & 0xffff;
  const plusHighLow = (lowLow + (mul8(aHigh, bLow) << 8)) & 0x00ffffff;
  const plusLowHigh = (plusHighLow + (mul8(aLow, bHigh) << 8)) & 0x01ffffff;
  // The last term can set bit 31, so the masked sum may come out as a
  // negative int32; the unsigned shift reinterprets it.
  const total = (plusLowHigh + (mul8(aHigh, bHigh) << 16)) & 0xffffffff;
  return total >>> 0;
}
// Exhaustive self-check: compare mul16 against Math.imul for every pair
// of factors below `max` and report each mismatch.
// Smaller limits that are handy while debugging: 256, 128, 8.
// (Swap the mul16 call for mul8(a, b) to check the 8-bit routine alone.)
const max = 65536;
for (let a = 0; a < max; a += 1) {
  for (let b = 0; b < max; b += 1) {
    const expected = Math.imul(a, b) >>> 0;
    const actual = mul16(a, b);
    if (actual === expected) {
      continue;
    }
    console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
  }
}