Compare commits

..

No commits in common. "main" and "status" have entirely different histories.
main ... status

7 changed files with 158 additions and 554 deletions

1
.gitignore vendored
View file

@ -1,4 +1,3 @@
*.o
*.xex
tables.s
.DS_Store

View file

@ -1,2 +0,0 @@
Brooke Vibber <bvibber@pobox.com>
Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>

View file

@ -2,17 +2,13 @@
all : mandel.xex
mandel.xex : mandel.o tables.o
ld65 -C ./atari-asm-xex.cfg -o $@ $+
%.xex : %.o
ld65 -C atari-asm-xex.cfg -o $@ $<
%.o : %.s
ca65 -o $@ $<
tables.s : tables.js
node tables.js > tables.s
clean :
rm -f tables.s
rm -f *.o
rm -f *.xex

613
mandel.s
View file

@ -21,18 +21,13 @@ count_pixels = $a3 ; u8
total_ms = $a4 ; float48
total_pixels = $aa ; float48
z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $b1 ; u8: index into z_buffer
z_buffer_end = $b2 ; u8: index into z_buffer
temp = $b4 ; u16
temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8
pixel_mask = $bb ; u8
pixel_shift = $bc ; u8
pixel_offset = $bd ; u8
fill_level = $be ; u8
palette_offset = $bf ; u8
temp = $b0 ; u16
pixel_ptr = $b2 ; u16
pixel_color = $b4 ; u8
pixel_mask = $b5 ; u8
pixel_shift = $b6 ; u8
pixel_offset = $b7 ; u8
; FP registers in zero page
FR0 = $d4 ; float48
@ -43,9 +38,6 @@ CIX = $f2 ; u8 - index into INBUFF
INBUFF = $f3 ; u16 - pointer to ascii
FLPTR = $fc ; u16 - pointer to user buffer float48
CH1 = $02f2 ; previous character read from keyboard
CH = $02fc ; current character read from keyboard
LBUFF = $0580 ; result buffer for FASC routine
; FP ROM routine vectors
@ -77,40 +69,20 @@ stride = width >> 2
DMACTL = $D400
DLISTL = $D402
DLISTH = $D403
WSYNC = $D40A
; OS shadow registers
SDLSTL = $230
SDLSTH = $231
; interrupt stuff
SYSVBV = $E45F
XITVBV = $E462
SETVBV = $E45C
COLOR0 = $2C4
COLOR1 = $2C5
COLOR2 = $2C6
COLOR3 = $2C7
COLOR4 = $2C8
; Keycodes!
KEY_PLUS = $06
KEY_MINUS = $0e
KEY_UP = $8e
KEY_DOWN = $8f
KEY_LEFT = $86
KEY_RIGHT = $87
.struct float48
exponent .byte
mantissa .byte 6
.endstruct
.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512
.data
strings:
@ -131,9 +103,8 @@ str_self_len = str_self_end - str_self
str_speed_len = str_speed_end - str_speed
str_run_len = str_run_end - str_run
str_done_len = str_done_end - str_done
speed_precision = 6
speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
speed_start = str_self_len + 2
speed_len = 14 + str_speed_len
@ -150,9 +121,6 @@ char_map:
.byte 96 + i
.endrepeat
hex_chars:
.byte "0123456789abcdef"
aspect:
; aspect ratio!
; pixels at 320w are 5:6 (narrow)
@ -216,33 +184,10 @@ color_map:
.byte 3
.endrepeat
palette:
.byte $00
.byte $46
.byte $78
.byte $b4
.code
z_buffer_len = 16
z_buffer_mask = z_buffer_len - 1
z_buffer:
; the last N zx/zy values
.repeat z_buffer_len
.word 0
.word 0
.endrepeat
.export start
max_fill_level = 6
fill_masks:
.byte %00011111
.byte %00001111
.byte %00000111
.byte %00000011
.byte %00000001
.byte %00000000
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
clc ; 2 cyc
@ -261,12 +206,6 @@ fill_masks:
add 4, dest, arg2, dest
.endmacro
.macro add_carry dest
lda dest
adc #0
sta dest
.endmacro
; 2 + 9 * byte cycles
.macro sub bytes, dest, arg1, arg2
sec ; 2 cyc
@ -344,6 +283,68 @@ fill_masks:
neg 4, arg
.endmacro
; inner loop for imul16
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
.local zero
.local one
.local next
; does 16-bit adds
; arg1 and arg2 are treated as unsigned
; negative signed inputs must be flipped first
; 7 cycles up to the branch
; check if arg1 has 0 or 1 bit in this place
; 5 cycles either way
.if bitnum < 8
lda arg1 ; 3 cyc
and #(1 << (bitnum)) ; 2 cyc
.else
lda arg1 + 1 ; 3 cyc
and #(1 << ((bitnum) - 8)) ; 2 cyc
.endif
bne one ; 2 cyc
zero: ; 18 cyc, 23 cyc
lsr result + 3 ; 5 cyc
jmp next ; 3 cyc
one: ; 32 cyc, 37 cyc
; 16-bit add on the top bits
clc ; 2 cyc
lda result + 2 ; 3 cyc
adc arg2 ; 3 cyc
sta result + 2 ; 3 cyc
lda result + 3 ; 3 cyc
adc arg2 + 1 ; 3 cyc
ror a ; 2 cyc - get a jump on the shift
sta result + 3 ; 3 cyc
next:
ror result + 2 ; 5 cyc
ror result + 1 ; 5 cyc
.if bitnum >= 8
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
; when it's all uninitialized data
ror result ; 5 cyc
.endif
.endmacro
; 5 to 25 cycles
.macro check_sign arg
; Check sign bit and flip argument to postive,
; keeping a count of sign bits in the X register.
.local positive
lda arg + 1 ; 3 cyc
bpl positive ; 2 cyc
neg16 arg ; 18 cyc
inx ; 2 cyc
positive:
.endmacro
; 518 - 828 cyc
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
@ -367,96 +368,38 @@ fill_masks:
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro
; Adapted from https://everything2.com/title/Fast+6502+multiplication
.macro imul8 dest, arg1, arg2
.local under256
.local next
.local small_product
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; setup: 6 cycles
;ldx mul_factor_x
clc ; (a + x)^2/2: 23 cycles
adc mul_factor_x
tax
bcc under256
lda mul_hibyte512,x
bcs next
under256:
lda mul_hibyte256,x
sec
next:
sta mul_product_hi
lda mul_lobyte256,x
ldx mul_factor_a ; - a^2/2: 20 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
ldx mul_factor_x ; + x & a & 1: 22 cycles
txa ; (this is a kludge to correct a
and mul_factor_a ; roundoff error that makes odd * odd too low)
and #1
clc
adc mul_product_lo
bcc small_product
inc mul_product_hi
small_product:
sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
.endscope
.endmacro
; min 470 cycles
; max 780 cycles
.proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
inter = temp2
; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
ldx #0 ; 2 cyc
; counts the number of sign bits in X
check_sign arg1 ; 5 to 25 cyc
check_sign arg2 ; 5 to 25 cyc
imul8 result, arg1, arg2
lda #0
sta result + 2
sta result + 3
; zero out the 32-bit temp's top 16 bits
lda #0 ; 2 cyc
sta result + 2 ; 3 cyc
sta result + 3 ; 3 cyc
; the bottom two bytes will get cleared by the shifts
imul8 inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter
add_carry result + 3
; unrolled loop for maximum speed, at the cost
; of a larger routine
; 440 to 696 cycles
.repeat 16, bitnum
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
bitmul16 arg1, arg2, result, bitnum
.endrepeat
imul8 inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
; In case of mixed input signs, return a negative result.
cpx #1 ; 2 cyc
bne positive_result ; 2 cyc
neg32 result ; 34 cyc
positive_result:
rts ; 6 cyc
.endproc
@ -519,14 +462,12 @@ initloop:
sta zx - 1,x
dex
bne initloop
sta z_buffer_start
sta z_buffer_end
loop:
; iter++ & max-iters break
inc iter
bne keep_going
jmp exit_path
rts
keep_going:
.macro quick_exit arg, max
@ -543,7 +484,7 @@ keep_going:
positive:
cmp #((max) << 4)
bmi all_done ; 'less than'
jmp exit_path
rts
negative:
cmp #(256 - ((max) << 4))
@ -551,7 +492,7 @@ keep_going:
bpl all_done ; 'greater than'
nope_out:
jmp exit_path
rts
first_equal:
lda arg
@ -586,100 +527,19 @@ keep_going:
; if may be in the lake, look for looping output with a small buffer
; as an optimization vs running to max iters
lda z_buffer_active
beq skip_z_buffer
ldx z_buffer_start
cpx z_buffer_end
beq z_nothing_to_read
z_buffer_loop:
.macro z_compare arg
.local compare_no_match
lda z_buffer,x
inx
cmp arg
bne compare_no_match
iny
compare_no_match:
.endmacro
.macro z_advance
.local skip_reset_x
cpx #(z_buffer_len * 4)
bmi skip_reset_x
ldx #0
skip_reset_x:
.endmacro
.macro z_store arg
lda arg
sta z_buffer,x
inx
.endmacro
; Compare the previously stored z values
ldy #0
z_compare zx
z_compare zx + 1
z_compare zy
z_compare zy + 1
cpy #4
bne z_no_matches
jmp z_exit
z_no_matches:
z_advance
cpx z_buffer_end
bne z_buffer_loop
z_nothing_to_read:
; Store and expand
z_store zx
z_store zx + 1
z_store zy
z_store zy + 1
z_advance
stx z_buffer_end
; Increment the start roller if necessary (limit size)
lda iter
cmp #(z_buffer_len * 4)
bmi skip_inc_start
lda z_buffer_start
clc
adc #4
tax
z_advance
stx z_buffer_start
skip_inc_start:
skip_z_buffer:
jmp loop
z_exit:
lda #0
sta iter
exit_path:
ldx #0
lda iter
bne next
inx
next:
stx z_buffer_active
peace_out:
rts
.endproc
.macro scale_zoom dest
; clobbers X, flags
.macro zoom_factor dest, src, zoom, aspect
.local cont
.local enough
; cx = (sx << (8 - zoom))
copy16 dest, src
ldx zoom
cont:
cpx #8
@ -688,12 +548,6 @@ cont:
inx
jmp cont
enough:
.endmacro
.macro zoom_factor dest, src, zoom, aspect
; clobbers A, X, flags, etc
copy16 dest, src
scale_zoom dest
; cy = cy * (3 / 4)
; cx = cx * (5 / 4)
@ -790,25 +644,6 @@ shift_done:
rts
.endproc
.macro draw_text_indirect col, len, strptr
; clobbers A, X
.local loop
.local done
ldx #0
loop:
cpx #len
beq done
txa
tay
lda (strptr),y
tay
lda char_map,y
sta textbuffer + col,x
inx
jmp loop
done:
.endmacro
.macro draw_text col, len, cstr
; clobbers A, X
.local loop
@ -827,34 +662,9 @@ done:
.proc vblank_handler
inc count_frames
inc palette_offset
jsr update_palette
jmp XITVBV
.endproc
.proc update_palette
lda palette
sta COLOR4
clc
lda palette_offset
and #$f0
adc palette + 1
sta COLOR0
clc
lda palette_offset
and #$f0
adc palette + 2
sta COLOR1
clc
lda palette_offset
and #$f0
adc palette + 3
sta COLOR2
.endproc
.proc update_speed
; convert frames (u16) to fp
; add to frames_total
@ -865,105 +675,6 @@ done:
; draw text
.endproc
.proc keycheck
; clobbers all
; returns 255 in A if state change or 0 if no change
; check keyboard buffer
lda CH
cmp #$ff
beq skip_char
; Clear the keyboard buffer and re-enable interrupts
ldx #$ff
stx CH
tay
lda zoom
cpy #KEY_PLUS
beq plus
cpy #KEY_MINUS
beq minus
; temp = $0010 << (8 - zoom)
lda #$10
sta temp
lda #$00
sta temp + 1
scale_zoom temp
cpy #KEY_UP
beq up
cpy #KEY_DOWN
beq down
cpy #KEY_LEFT
beq left
cpy #KEY_RIGHT
beq right
skip_char:
lda #0
rts
plus:
cmp #8
bpl skip_char
inc zoom
jmp done
minus:
cmp #1
bmi skip_char
dec zoom
jmp done
up:
sub16 oy, oy, temp
jmp done
down:
add16 oy, oy, temp
jmp done
left:
sub16 ox, ox, temp
jmp done
right:
add16 ox, ox, temp
done:
lda #255
rts
.endproc
.proc clear_screen
; zero the range from framebuffer_top to display_list
lda #.lobyte(framebuffer_top)
sta temp
lda #.hibyte(framebuffer_top)
sta temp + 1
zero_page_loop:
lda #0
ldy #0
zero_byte_loop:
sta (temp),y
iny
bne zero_byte_loop
inc temp + 1
lda temp + 1
cmp #.hibyte(display_list)
bne zero_page_loop
rts
.endproc
.proc status_bar
; Status bar
draw_text 0, str_self_len, str_self
draw_text 40 - str_run_len, str_run_len, str_run
rts
.endproc
.proc start
; ox = 0; oy = 0; zoom = 0
@ -990,7 +701,24 @@ zero_byte_loop:
lda #0
sta DMACTL
jsr clear_screen
; zero the range from framebuffer_top to framebuffer_end
lda #.lobyte(framebuffer_top)
sta temp
lda #.hibyte(framebuffer_top)
sta temp + 1
zero_page_loop:
lda #0
ldy #0
zero_byte_loop:
sta (temp),y
iny
bne zero_byte_loop
inc temp + 1
lda temp + 1
cmp #.hibyte(framebuffer_end)
bne zero_page_loop
; Copy the display list into properly aligned memory
; Can't cross 1024-byte boundaries :D
@ -1010,15 +738,14 @@ copy_byte_loop:
sta DLISTH ; actual register
sta SDLSTH ; shadow register the OS will copy in
; Status bar
draw_text 0, str_self_len, str_self
draw_text 40 - str_run_len, str_run_len, str_run
; Re-enable display DMA
lda #$22
sta DMACTL
; Initialize the palette
lda #0
sta palette_offset
jsr update_palette
; install the vblank handler
lda #7 ; deferred
ldx #.hibyte(vblank_handler)
@ -1026,14 +753,6 @@ copy_byte_loop:
jsr SETVBV
main_loop:
jsr clear_screen
jsr status_bar
lda #0
sta fill_level
fill_loop:
; sy = -92 .. 91
lda #(256-half_height)
sta sy
@ -1048,53 +767,12 @@ loop_sy:
sta sx + 1
loop_sx:
; check the fill mask
ldy #0
loop_skip_level:
cpy fill_level
beq current_level
lda fill_masks,y
and sx
bne not_skipped_mask1
lda fill_masks,y
and sy
beq skipped_mask
not_skipped_mask1:
iny
jmp loop_skip_level
current_level:
lda fill_masks,y
and sx
bne skipped_mask
lda fill_masks,y
and sy
beq not_skipped_mask
skipped_mask:
jmp skipped
not_skipped_mask:
; run the fractal!
zoom_factor cx, sx, zoom, aspect_x
add16 cx, cx, ox
zoom_factor cy, sy, zoom, aspect_y
add16 cy, cy, oy
jsr mandelbrot
jsr pset
jsr keycheck
beq no_key
; @fixme clear the pixel stats
jmp main_loop
no_key:
; check if we should update the counters
;
; count_pixels >= width? update!
@ -1106,7 +784,7 @@ no_key:
; count_frames >= 120? update!
lda count_frames
cmp #120 ; >= 2 seconds
bmi skipped
bmi skip_status
update_status:
; FR0 = (float)count_pixels & clear count_pixels
@ -1166,11 +844,35 @@ update_status:
; convert to ASCII in INBUFF
jsr FASC
; print the first 6 digits
draw_text_indirect speed_start, speed_precision, INBUFF
draw_text speed_start + speed_precision, str_speed_len, str_speed
; find the last byte
ldy #0
number_loop:
lda (INBUFF),y
bmi lastchar
skipped:
tax
lda char_map,x
sta textbuffer + speed_start,y
iny
bpl number_loop
lastchar:
; Y is last char
; trim that high bit
and #$7f
tax
lda char_map,x
sta textbuffer + speed_start,y
; Fill out any remaining spaces
lda #0
space_loop:
iny
sta textbuffer + speed_start,y
cpy #(20)
bmi space_loop
skip_status:
clc
lda sx
@ -1202,18 +904,9 @@ loop_sx_done:
loop_sy_done:
fill_loop_done:
inc fill_level
lda fill_level
cmp #max_fill_level
beq loop
jmp fill_loop
draw_text 40 - str_done_len, str_done_len, str_done
loop:
; finished
draw_text 40 - str_done_len, str_done_len, str_done
jsr keycheck
beq loop
jmp main_loop
jmp loop
.endproc

View file

@ -14,7 +14,7 @@ Non-goals:
Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
-- brooke, january 2023 - february 2024
-- brion, january 2023
## Current state
@ -28,8 +28,6 @@ The mandelbrot calculations are done using 4.12-precision fixed point numbers. I
Iterations are capped at 255.
The pixels are run in a progressive layout to get the basic shape on screen faster.
## Next steps
Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
@ -37,7 +35,6 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
(done)
## Deps and build instructions

View file

@ -1,38 +0,0 @@
function db(func) {
let lines = [];
for (let i = 0; i < 256; i += 16) {
let items = [];
for (let j = 0; j < 16; j++) {
let x = i + j;
items.push(func(x));
}
lines.push(' .byte ' + items.join(', '));
}
return lines.join('\n');
}
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log(
`.segment "TABLES"
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.align 256
mul_lobyte256:
${db((i) => squares[i] & 0xff)}
.align 256
mul_hibyte256:
${db((i) => (squares[i] >> 8) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
`);

View file

@ -1,41 +0,0 @@
// ax = (a + x)2/2 - a2/2 - x2/2
function half_square(x) {
return Math.round(x * x / 2) & 0xffff >>> 0;
}
function mul8(a, b) {
let result = half_square(a + b) & 0xffff;
result = (result - half_square(a)) & 0xffff;
result = (result - half_square(b)) & 0xffff;
result = (result + (b & a & 1)) & 0xffff;
return result >>> 0;
}
function mul16(a, b) {
let ah = (a & 0xff00) >>> 8;
let al = (a & 0x00ff) >>> 0;
let bh = (b & 0xff00) >>> 8;
let bl = (b & 0x00ff) >>> 0;
let result = (mul8(al, bl) & 0xffff) >>> 0;
result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
return result;
}
let max = 65536;
//let max = 256;
//let max = 128;
//let max = 8;
for (let a = 0; a < max; a++) {
for (let b = 0; b < max; b++) {
let expected = Math.imul(a, b) >>> 0;
//let actual = mul8(a, b);
let actual = mul16(a, b);
if (expected !== actual) {
console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
}
}
}