diff --git a/mandel.s b/mandel.s index 50213ad..622ff62 100644 --- a/mandel.s +++ b/mandel.s @@ -433,6 +433,13 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro imul16 dest, arg1, arg2 + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; ? cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + .macro sqr16_round dest, arg, shift ;imul16_round dest, arg, arg, shift copy16 FR0, arg ; 12 cyc @@ -441,6 +448,12 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro sqr16 dest, arg + copy16 FR0, arg ; 12 cyc + jsr sqr16_func ; ? cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + ; clobbers a, x .macro sqr8 dest, arg ldx arg @@ -870,8 +883,8 @@ next: .proc mandelbrot ; input: - ; cx: position scaled to 4.12 fixed point - -8..+7.9 - ; cy: position scaled to 4.12 + ; cx: position scaled to 8.24 fixed point - -128..+127.9 + ; cy: position scaled to 8.24 ; ; output: ; iter: iteration count at escape or 0 @@ -909,10 +922,6 @@ next: sta zy_2 + 1 sta zy_2 + 2 sta zy_2 + 3 - sta zx_zy - sta zx_zy + 1 - sta zx_zy + 2 - sta zx_zy + 3 sta dist sta dist + 1 sta dist + 2 @@ -929,6 +938,8 @@ loop: keep_going: .macro quick_exit arg, max + ; arg: fixed8.24 + ; max: integer .local positive .local negative .local nope_out @@ -936,51 +947,61 @@ keep_going: .local all_done ; check sign bit - lda arg + 1 + lda arg + 3 bmi negative positive: - cmp #((max) << 4) + cmp #max bmi all_done ; 'less than' jmp exit_path negative: - cmp #(256 - ((max) << 4)) + cmp #(256 - max) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: jmp exit_path - + first_equal: + ; following bytes all 0 shows it's really 'equal' + lda arg + 2 + bne all_done + lda arg + 1 + bne all_done lda arg - beq nope_out ; 2nd byte 0 shows it's really 'equal' + bne all_done + jmp exit_path all_done: .endmacro - ; 4.12: (-8 .. +7.9) + ; 8.24: (-128 .. 127.9) / (-8 .. +7.9) ; zx = zx_2 - zy_2 + cx - sub16 zx, zx_2, zy_2 - add16 zx, zx, cx + sub32 zx, zx_2, zy_2 + add32 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy - add16 zy, zx_zy, zx_zy - add16 zy, zy, cy + add32 zy, zx_zy, zx_zy + add32 zy, zy, cy quick_exit zy, 2 + ; convert 8.24 -> 4.12 + shift_round_16 zx, 4 + shift_round_16 zy, 4 + ; zx_2 = zx * zx - sqr16_round zx_2, zx, 4 + sqr16 zx_2, zx + 2 ; zy_2 = zy * zy - sqr16_round zy_2, zy, 4 + sqr16 zy_2, zy + 2 ; zx_zy = zx * zy - imul16_round zx_zy, zx, zy, 4 + imul16 zx_zy, zx + 2, zy + 2 ; dist = zx_2 + zy_2 - add16 dist, zx_2, zy_2 + add32 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer @@ -1090,13 +1111,17 @@ enough: .endmacro .macro zoom_factor dest, src, zoom, aspect + ; output: dest: fixed8.24 + ; input: src: fixed4.12 + ; input: zoom: u8 ??? + ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16_round dest, dest, aspect, 4 + imul16 dest, dest, aspect .endmacro .proc pset @@ -1567,9 +1592,9 @@ not_skipped_mask: ; run the fractal! zoom_factor cx, sx, zoom, aspect_x - add16 cx, cx, ox + add32 cx, cx, ox zoom_factor cy, sy, zoom, aspect_y - add16 cy, cy, oy + add32 cy, cy, oy jsr mandelbrot jsr pset diff --git a/todo.md b/todo.md index 6fb0282..29217cd 100644 --- a/todo.md +++ b/todo.md @@ -3,7 +3,7 @@ things to try: * skip add on the top-byte multiply in sqr8/mul8 * should save a few cycles, suggestion by jamey -* perform the zx += zx^s + cx in 32-bit space, before rounding +* perform the zx_next = zx^s + cx in 32-bit space, before rounding * should improve precision on max zoom, might cost a few cycles * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D