Compare commits

..

No commits in common. "cc83c76706519cce3fff61ce46df9589d31025d6" and "61eb1aaf21fdac377e6f04db117aa855ad73b940" have entirely different histories.

3 changed files with 92 additions and 153 deletions

237
mandel.s
View file

@ -1,42 +1,43 @@
; Our zero-page vars
; NOTE(review): every symbol in this list is assigned twice, at two different
; addresses -- once in a layout built from 4-byte fixed8.24 slots and once in
; a layout built from 2-byte fixed4.12 slots. This reads like the before/after
; sides of a change listed together; confirm which set is live before relying
; on any address below.
; The mandelbrot init loop (ldx #(iter - zx + 1) / sta zx - 1,x) zeroes the
; byte range zx..iter inclusive, so whatever sits between those two symbols
; in the live layout is cleared on every call.
ox = $80 ; fixed8.24: center point x
oy = $84 ; fixed8.24: center point y
cx = $88 ; fixed8.24: c_x
cy = $8c ; fixed8.24: c_y
sx = $80 ; i16: screen pixel x
sy = $82 ; i16: screen pixel y
ox = $84 ; fixed4.12: center point x
oy = $86 ; fixed4.12: center point y
cx = $88 ; fixed4.12: c_x
cy = $8a ; fixed4.12: c_y
zx = $8c ; fixed4.12: z_x
zy = $8e ; fixed4.12: z_y
zx = $90 ; fixed8.24: z_x
zy = $94 ; fixed8.24: z_y
zx_2 = $98 ; fixed8.24: z_x^2
zy_2 = $9c ; fixed8.24: z_y^2
zx_2 = $90 ; fixed4.12: z_x^2
zy_2 = $92 ; fixed4.12: z_y^2
zx_zy = $94 ; fixed4.12: z_x * z_y
dist = $96 ; fixed4.12: z_x^2 + z_y^2
zx_zy = $a0 ; fixed8.24: z_x * z_y
dist = $a4 ; fixed8.24: z_x^2 + z_y^2
sx = $a8 ; i16: screen pixel x
sy = $aa ; i16: screen pixel y
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $ad ; u8: index into z_buffer
z_buffer_end = $ae ; u8: index into z_buffer
iter = $af ; u8: iteration count
iter = $a0 ; u8: iteration count
ptr = $b0 ; u16
pixel_ptr = $b2 ; u16
zoom = $b4 ; u8: zoom shift level
fill_level = $b5 ; u8
pixel_color = $b6 ; u8
pixel_mask = $b7 ; u8
pixel_shift = $b8 ; u8
pixel_offset = $b9 ; u8
palette_offset = $ba ; u8
chroma_offset = $bb ; u8
palette_ticks = $bc ; u8
chroma_ticks = $bd ; u8
count_frames = $be ; u8
count_pixels = $bf ; u8
zoom = $a1 ; u8: zoom shift level
count_frames = $a2 ; u8
count_pixels = $a3 ; u8
total_ms = $a4 ; float48
total_pixels = $aa ; float48
total_pixels = $c0 ; float48
total_ms = $c6 ; float48
temp = $cc ; u16
temp2 = $ce ; u16
z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $b1 ; u8: index into z_buffer
z_buffer_end = $b2 ; u8: index into z_buffer
temp = $b4 ; u16
temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8
pixel_mask = $bb ; u8
pixel_shift = $bc ; u8
pixel_offset = $bd ; u8
fill_level = $be ; u8
palette_offset = $bf ; u8
palette_ticks = $c0 ; u8
chroma_ticks = $c1 ; u8
chroma_offset = $c2 ; u8
ptr = $c4 ; u16
; Assemble-time constants (not zero-page addresses).
palette_delay = 23 ; NOTE(review): presumably a tick count paired with palette_ticks -- confirm
chroma_delay = 137 ; NOTE(review): presumably a tick count paired with chroma_ticks -- confirm
@ -292,16 +293,16 @@ viewport_zoom:
.byte 6
; Table of viewport center x coordinates; the bytes are copied into ox.
; Two encodings of the same four entries are listed: 4-byte .dword values
; matching ox as fixed8.24, and 2-byte .word values matching ox as fixed4.12.
; NOTE(review): only one encoding can match the live width of ox -- confirm.
viewport_ox:
.dword $00000000 ; entry 0:  0.0         (fixed8.24)
.dword $ff110000 ; entry 1: -0.93359375  (fixed8.24)
.dword $ff110000 ; entry 2: -0.93359375  (fixed8.24)
.dword $fe400000 ; entry 3: -1.75        (fixed8.24)
.word $0000 ; entry 0:  0.0         (fixed4.12)
.word $f110 ; entry 1: -0.93359375  (fixed4.12)
.word $f110 ; entry 2: -0.93359375  (fixed4.12)
.word $e400 ; entry 3: -1.75        (fixed4.12)
; Table of viewport center y coordinates; the bytes are copied into oy.
; Two encodings of the same four entries are listed: 4-byte .dword values
; matching oy as fixed8.24, and 2-byte .word values matching oy as fixed4.12.
; NOTE(review): only one encoding can match the live width of oy -- confirm.
viewport_oy:
.dword $00000000 ; entry 0:  0.0         (fixed8.24)
.dword $ffb60000 ; entry 1: -0.2890625   (fixed8.24)
.dword $ffbe0000 ; entry 2: -0.2578125   (fixed8.24)
.dword $00000000 ; entry 3:  0.0         (fixed8.24)
.word $0000 ; entry 0:  0.0         (fixed4.12)
.word $fb60 ; entry 1: -0.2890625   (fixed4.12)
.word $fbe0 ; entry 2: -0.2578125   (fixed4.12)
.word $0000 ; entry 3:  0.0         (fixed4.12)
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
@ -320,7 +321,7 @@ viewport_oy:
; add32: dest = arg1 + arg2 on 4-byte operands, via the generic `add` macro.
; dest may alias arg1 or arg2 (callers use e.g. `add32 zx, zx, cx`).
; Exactly one `add 4` expansion: a second `add 4, dest, arg2, dest` would add
; arg2 into the result again and yield arg1 + 2*arg2.
; 38 cycles
.macro add32 dest, arg1, arg2
add 4, dest, arg1, arg2
.endmacro
; 8 cycles
@ -425,25 +426,22 @@ viewport_oy:
round16 arg ; 11-27 cycles
.endmacro
; 16-bit multiply wrapper around imul16_func.
; NOTE(review): two .macro headers (imul16 and imul16_round) share this one
; body and one .endmacro, and the body contains both a copy32 of the full
; product and a shift/round + copy16 of its upper word. This reads like the
; before/after sides of a change listed together -- confirm which is live.
; input: arg1, arg2 as fixed4.12
; output: dest as fixed8.24
.macro imul16 dest, arg1, arg2
.macro imul16_round dest, arg1, arg2, shift
; operands are passed to imul16_func in FR0 and FR1
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; ? cyc
; the product is read back from FR2
copy32 dest, FR2 ; 24 cyc
; rounding variant: shift/round FR2 in place, then keep only FR2+2..FR2+3
shift_round_16 FR2, shift ; 103-119 cycles for shift=4
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro
; 16-bit square wrapper around sqr16_func.
; NOTE(review): two .macro headers (sqr16 and sqr16_round) share this one
; body and one .endmacro, and `jsr sqr16_func` appears twice. This reads like
; the before/after sides of a change listed together -- confirm which is live.
; input: arg as fixed4.12
; output: dest as fixed8.24
.macro sqr16 dest, arg
.macro sqr16_round dest, arg, shift
; disabled alternative: the same result via the general multiply
;imul16_round dest, arg, arg, shift
; the single operand is passed to sqr16_func in FR0
copy16 FR0, arg ; 12 cyc
jsr sqr16_func ; ? cyc
; the result is read back from FR2
copy32 dest, FR2 ; 24 cyc
jsr sqr16_func ; ? cyc
; rounding variant: shift/round FR2 in place, then keep only FR2+2..FR2+3
shift_round_16 FR2, shift ; 103-119 cycles for shift=4
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro
; input: arg as u8
; output: dest as u16
; clobbers a, x
.macro sqr8 dest, arg
ldx arg
@ -453,8 +451,6 @@ viewport_oy:
sta dest + 1
.endmacro
; input: arg as u8
; input/output: dest as u16
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
@ -875,8 +871,8 @@ next:
.proc mandelbrot
; input:
; cx: position scaled to 8.24 fixed point - -128..+127.9
; cy: position scaled to 8.24
; cx: position scaled to 4.12 fixed point - -8..+7.9
; cy: position scaled to 4.12
;
; output:
; iter: iteration count at escape or 0
@ -888,41 +884,12 @@ next:
; zx_zy = 0
; dist = 0
; iter = 0
; lda #00
; ldx #(iter - zx + 1)
;initloop:
; sta zx - 1,x
; dex
; bne initloop
; sta z_buffer_start
; sta z_buffer_end
lda #00
sta zx
sta zx + 1
sta zx + 2
sta zx + 3
sta zy
sta zy + 1
sta zy + 2
sta zy + 3
sta zx_2
sta zx_2 + 1
sta zx_2 + 2
sta zx_2 + 3
sta zy_2
sta zy_2 + 1
sta zy_2 + 2
sta zy_2 + 3
sta zx_zy
sta zx_zy + 1
sta zx_zy + 2
sta zx_zy + 3
sta dist
sta dist + 1
sta dist + 2
sta dist + 3
sta iter
ldx #(iter - zx + 1)
initloop:
sta zx - 1,x
dex
bne initloop
sta z_buffer_start
sta z_buffer_end
@ -934,8 +901,6 @@ loop:
keep_going:
.macro quick_exit arg, max
; arg: fixed8.24
; max: integer
.local positive
.local negative
.local nope_out
@ -943,61 +908,51 @@ keep_going:
.local all_done
; check sign bit
lda arg + 3
lda arg + 1
bmi negative
positive:
cmp #max
cmp #((max) << 4)
bmi all_done ; 'less than'
jmp exit_path
negative:
cmp #(256 - max)
cmp #(256 - ((max) << 4))
beq first_equal ; 'equal' on first byte
bpl all_done ; 'greater than'
nope_out:
jmp exit_path
first_equal:
; following bytes all 0 shows it's really 'equal'
lda arg + 2
bne all_done
lda arg + 1
bne all_done
lda arg
bne all_done
jmp exit_path
beq nope_out ; 2nd byte 0 shows it's really 'equal'
all_done:
.endmacro
; 8.24: (-128 .. 127.9)
; 4.12: (-8 .. +7.9)
; zx = zx_2 - zy_2 + cx
sub32 zx, zx_2, zy_2
add32 zx, zx, cx
sub16 zx, zx_2, zy_2
add16 zx, zx, cx
quick_exit zx, 2
; zy = zx_zy + zx_zy + cy
add32 zy, zx_zy, zx_zy
add32 zy, zy, cy
add16 zy, zx_zy, zx_zy
add16 zy, zy, cy
quick_exit zy, 2
; convert 8.24 -> 4.12: (-8 .. +7.9)
shift_round_16 zx, 4
shift_round_16 zy, 4
; zx_2 = zx * zx
sqr16 zx_2, zx + 2
sqr16_round zx_2, zx, 4
; zy_2 = zy * zy
sqr16 zy_2, zy + 2
sqr16_round zy_2, zy, 4
; zx_zy = zx * zy
imul16 zx_zy, zx + 2, zy + 2
imul16_round zx_zy, zx, zy, 4
; dist = zx_2 + zy_2
add32 dist, zx_2, zy_2
add16 dist, zx_2, zy_2
quick_exit dist, 4
; if may be in the lake, look for looping output with a small buffer
@ -1034,10 +989,10 @@ z_buffer_loop:
; Compare the previously stored z values
ldy #0
z_compare zx + 2
z_compare zx + 3
z_compare zy + 2
z_compare zy + 3
z_compare zx
z_compare zx + 1
z_compare zy
z_compare zy + 1
cpy #4
bne z_no_matches
@ -1052,10 +1007,10 @@ z_no_matches:
z_nothing_to_read:
; Store and expand
z_store zx + 2
z_store zx + 3
z_store zy + 2
z_store zy + 3
z_store zx
z_store zx + 1
z_store zy
z_store zy + 1
z_advance
stx z_buffer_end
@ -1106,17 +1061,14 @@ cont:
enough:
.endmacro
; zoom_factor: copy a 16-bit value from src into dest, apply scale_zoom to
; it, then multiply it by the aspect factor.
; Call sites pass sx with aspect_x for cx, and sy with aspect_y for cy.
; NOTE(review): two .macro headers and two multiply lines (imul16 and
; imul16_round) are listed for this one body. This reads like the before/after
; sides of a change listed together -- confirm which is live.
; NOTE(review): the `zoom` parameter of the four-argument header is not
; referenced anywhere in the visible body.
.macro zoom_factor dest, src, aspect
; output: dest: fixed8.24
; input: src: fixed4.12
; aspect: fixed4.12
.macro zoom_factor dest, src, zoom, aspect
; clobbers A, X, flags, etc
copy16 dest, src
scale_zoom dest
; aspect correction applied by the multiply below:
; cy = cy * (3 / 4)
; cx = cx * (5 / 4)
imul16 dest, dest, aspect
imul16_round dest, dest, aspect, 4
.endmacro
.proc pset
@ -1454,32 +1406,17 @@ zero_byte_loop:
txa
asl a
asl a
tax
lda viewport_ox,x
sta ox
lda viewport_oy,x
sta oy
inx
lda viewport_ox,x
sta ox + 1
lda viewport_oy,x
sta oy + 1
inx
lda viewport_ox,x
sta ox + 2
lda viewport_oy,x
sta oy + 2
inx
lda viewport_ox,x
sta ox + 3
lda viewport_oy,x
sta oy + 3
rts
.endproc
@ -1601,10 +1538,10 @@ skipped_mask:
not_skipped_mask:
; run the fractal!
zoom_factor cx, sx, aspect_x
add32 cx, cx, ox
zoom_factor cy, sy, aspect_y
add32 cy, cy, oy
zoom_factor cx, sx, zoom, aspect_x
add16 cx, cx, ox
zoom_factor cy, sy, zoom, aspect_y
add16 cy, cy, oy
jsr mandelbrot
jsr pset

View file

@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
Iterations are capped at 255.
@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e
## Todo
See ideas in `todo.md`.
See ideas in `todo.md`.

View file

@ -3,11 +3,13 @@ things to try:
* skip add on the top-byte multiply in sqr8/mul8
* should save a few cycles, suggestion by jamey
* perform the zx = zx^2 - zy^2 + cx update in 32-bit space, before rounding
* should improve precision on max zoom, might cost a few cycles
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra bit?
* since the exit comparison would then happen in 6.26 space, I think so
* y-axis mirror optimization