wip
This commit is contained in:
parent
0d086a179c
commit
4a1e35699a
2 changed files with 49 additions and 24 deletions
71
mandel.s
71
mandel.s
|
@ -433,6 +433,13 @@ viewport_oy:
|
||||||
copy16 dest, FR2 + 2 ; 12 cyc
|
copy16 dest, FR2 + 2 ; 12 cyc
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; imul16 dest, arg1, arg2
; Copies the 16-bit operands arg1 and arg2 into FR0 and FR1, calls
; imul16_func, then copies the result from FR2 into dest with copy32.
; No shift or rounding step is performed here; the value left in FR2 is
; stored as-is.
; NOTE(review): presumably dest needs room for 4 bytes (copy32) - confirm
; against the copy32 definition.
; NOTE(review): clobbered registers depend on copy16/copy32/imul16_func,
; which are defined elsewhere - confirm before relying on A/X/Y.
.macro imul16 dest, arg1, arg2
|
||||||
|
copy16 FR0, arg1 ; 12 cyc
|
||||||
|
copy16 FR1, arg2 ; 12 cyc
|
||||||
|
jsr imul16_func ; ? cyc
|
||||||
|
copy32 dest, FR2 ; 24 cyc
|
||||||
|
.endmacro
|
||||||
|
|
||||||
.macro sqr16_round dest, arg, shift
|
.macro sqr16_round dest, arg, shift
|
||||||
;imul16_round dest, arg, arg, shift
|
;imul16_round dest, arg, arg, shift
|
||||||
copy16 FR0, arg ; 12 cyc
|
copy16 FR0, arg ; 12 cyc
|
||||||
|
@ -441,6 +448,12 @@ viewport_oy:
|
||||||
copy16 dest, FR2 + 2 ; 12 cyc
|
copy16 dest, FR2 + 2 ; 12 cyc
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; sqr16 dest, arg
; Copies the 16-bit operand arg into FR0, calls sqr16_func, then copies
; the result from FR2 into dest with copy32.
; No shift or rounding step is performed here; the value left in FR2 is
; stored as-is.
; NOTE(review): presumably dest needs room for 4 bytes (copy32) - confirm
; against the copy32 definition.
; NOTE(review): clobbered registers depend on copy16/copy32/sqr16_func,
; which are defined elsewhere - confirm before relying on A/X/Y.
.macro sqr16 dest, arg
|
||||||
|
copy16 FR0, arg ; 12 cyc
|
||||||
|
jsr sqr16_func ; ? cyc
|
||||||
|
copy32 dest, FR2 ; 24 cyc
|
||||||
|
.endmacro
|
||||||
|
|
||||||
; clobbers a, x
|
; clobbers a, x
|
||||||
.macro sqr8 dest, arg
|
.macro sqr8 dest, arg
|
||||||
ldx arg
|
ldx arg
|
||||||
|
@ -870,8 +883,8 @@ next:
|
||||||
|
|
||||||
.proc mandelbrot
|
.proc mandelbrot
|
||||||
; input:
|
; input:
|
||||||
; cx: position scaled to 4.12 fixed point - -8..+7.9
|
; cx: position scaled to 8.24 fixed point - -128..+127.9
|
||||||
; cy: position scaled to 4.12
|
; cy: position scaled to 8.24
|
||||||
;
|
;
|
||||||
; output:
|
; output:
|
||||||
; iter: iteration count at escape or 0
|
; iter: iteration count at escape or 0
|
||||||
|
@ -909,10 +922,6 @@ next:
|
||||||
sta zy_2 + 1
|
sta zy_2 + 1
|
||||||
sta zy_2 + 2
|
sta zy_2 + 2
|
||||||
sta zy_2 + 3
|
sta zy_2 + 3
|
||||||
sta zx_zy
|
|
||||||
sta zx_zy + 1
|
|
||||||
sta zx_zy + 2
|
|
||||||
sta zx_zy + 3
|
|
||||||
sta dist
|
sta dist
|
||||||
sta dist + 1
|
sta dist + 1
|
||||||
sta dist + 2
|
sta dist + 2
|
||||||
|
@ -929,6 +938,8 @@ loop:
|
||||||
keep_going:
|
keep_going:
|
||||||
|
|
||||||
.macro quick_exit arg, max
|
.macro quick_exit arg, max
|
||||||
|
; arg: fixed8.24
|
||||||
|
; max: integer
|
||||||
.local positive
|
.local positive
|
||||||
.local negative
|
.local negative
|
||||||
.local nope_out
|
.local nope_out
|
||||||
|
@ -936,51 +947,61 @@ keep_going:
|
||||||
.local all_done
|
.local all_done
|
||||||
|
|
||||||
; check sign bit
|
; check sign bit
|
||||||
lda arg + 1
|
lda arg + 3
|
||||||
bmi negative
|
bmi negative
|
||||||
|
|
||||||
positive:
|
positive:
|
||||||
cmp #((max) << 4)
|
cmp #max
|
||||||
bmi all_done ; 'less than'
|
bmi all_done ; 'less than'
|
||||||
jmp exit_path
|
jmp exit_path
|
||||||
|
|
||||||
negative:
|
negative:
|
||||||
cmp #(256 - ((max) << 4))
|
cmp #(256 - max)
|
||||||
beq first_equal ; 'equal' on first byte
|
beq first_equal ; 'equal' on first byte
|
||||||
bpl all_done ; 'greater than'
|
bpl all_done ; 'greater than'
|
||||||
|
|
||||||
nope_out:
|
nope_out:
|
||||||
jmp exit_path
|
jmp exit_path
|
||||||
|
|
||||||
first_equal:
|
first_equal:
|
||||||
|
; following bytes all 0 shows it's really 'equal'
|
||||||
|
lda arg + 2
|
||||||
|
bne all_done
|
||||||
|
lda arg + 1
|
||||||
|
bne all_done
|
||||||
lda arg
|
lda arg
|
||||||
beq nope_out ; 2nd byte 0 shows it's really 'equal'
|
bne all_done
|
||||||
|
jmp exit_path
|
||||||
|
|
||||||
all_done:
|
all_done:
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; 4.12: (-8 .. +7.9)
|
; 8.24: (-128 .. 127.9) / (-8 .. +7.9)
|
||||||
; zx = zx_2 - zy_2 + cx
|
; zx = zx_2 - zy_2 + cx
|
||||||
sub16 zx, zx_2, zy_2
|
sub32 zx, zx_2, zy_2
|
||||||
add16 zx, zx, cx
|
add32 zx, zx, cx
|
||||||
quick_exit zx, 2
|
quick_exit zx, 2
|
||||||
|
|
||||||
; zy = zx_zy + zx_zy + cy
|
; zy = zx_zy + zx_zy + cy
|
||||||
add16 zy, zx_zy, zx_zy
|
add32 zy, zx_zy, zx_zy
|
||||||
add16 zy, zy, cy
|
add32 zy, zy, cy
|
||||||
quick_exit zy, 2
|
quick_exit zy, 2
|
||||||
|
|
||||||
|
; convert 8.24 -> 4.12
|
||||||
|
shift_round_16 zx, 4
|
||||||
|
shift_round_16 zy, 4
|
||||||
|
|
||||||
; zx_2 = zx * zx
|
; zx_2 = zx * zx
|
||||||
sqr16_round zx_2, zx, 4
|
sqr16 zx_2, zx + 2
|
||||||
|
|
||||||
; zy_2 = zy * zy
|
; zy_2 = zy * zy
|
||||||
sqr16_round zy_2, zy, 4
|
sqr16 zy_2, zy + 2
|
||||||
|
|
||||||
; zx_zy = zx * zy
|
; zx_zy = zx * zy
|
||||||
imul16_round zx_zy, zx, zy, 4
|
imul16 zx_zy, zx + 2, zy + 2
|
||||||
|
|
||||||
; dist = zx_2 + zy_2
|
; dist = zx_2 + zy_2
|
||||||
add16 dist, zx_2, zy_2
|
add32 dist, zx_2, zy_2
|
||||||
quick_exit dist, 4
|
quick_exit dist, 4
|
||||||
|
|
||||||
; if may be in the lake, look for looping output with a small buffer
|
; if may be in the lake, look for looping output with a small buffer
|
||||||
|
@ -1090,13 +1111,17 @@ enough:
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
.macro zoom_factor dest, src, zoom, aspect
|
.macro zoom_factor dest, src, zoom, aspect
|
||||||
|
; output: dest: fixed8.24
|
||||||
|
; input: src: fixed4.12
|
||||||
|
; input: zoom: u8 ???
|
||||||
|
; aspect: fixed4.12
|
||||||
; clobbers A, X, flags, etc
|
; clobbers A, X, flags, etc
|
||||||
copy16 dest, src
|
copy16 dest, src
|
||||||
scale_zoom dest
|
scale_zoom dest
|
||||||
|
|
||||||
; cy = cy * (3 / 4)
|
; cy = cy * (3 / 4)
|
||||||
; cx = cx * (5 / 4)
|
; cx = cx * (5 / 4)
|
||||||
imul16_round dest, dest, aspect, 4
|
imul16 dest, dest, aspect
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
.proc pset
|
.proc pset
|
||||||
|
@ -1567,9 +1592,9 @@ not_skipped_mask:
|
||||||
|
|
||||||
; run the fractal!
|
; run the fractal!
|
||||||
zoom_factor cx, sx, zoom, aspect_x
|
zoom_factor cx, sx, zoom, aspect_x
|
||||||
add16 cx, cx, ox
|
add32 cx, cx, ox
|
||||||
zoom_factor cy, sy, zoom, aspect_y
|
zoom_factor cy, sy, zoom, aspect_y
|
||||||
add16 cy, cy, oy
|
add32 cy, cy, oy
|
||||||
jsr mandelbrot
|
jsr mandelbrot
|
||||||
jsr pset
|
jsr pset
|
||||||
|
|
||||||
|
|
2
todo.md
2
todo.md
|
@ -3,7 +3,7 @@ things to try:
|
||||||
* skip add on the top-byte multiply in sqr8/mul8
|
* skip add on the top-byte multiply in sqr8/mul8
|
||||||
* should save a few cycles, suggestion by jamey
|
* should save a few cycles, suggestion by jamey
|
||||||
|
|
||||||
* perform the zx += zx^s + cx in 32-bit space, before rounding
|
* perform the zx_next = zx^s + cx in 32-bit space, before rounding
|
||||||
* should improve precision on max zoom, might cost a few cycles
|
* should improve precision on max zoom, might cost a few cycles
|
||||||
|
|
||||||
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
|
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
|
||||||
|
|
Loading…
Reference in a new issue