diff --git a/mandel.s b/mandel.s index f4862c6..be1f59e 100644 --- a/mandel.s +++ b/mandel.s @@ -1,17 +1,17 @@ ; Our zero-page vars sx = $80 ; i16: screen pixel x sy = $82 ; i16: screen pixel y -ox = $84 ; fixed4.12: center point x -oy = $86 ; fixed4.12: center point y -cx = $84 ; fixed4.12: c_x -cy = $86 ; fixed4.12: c_y -zx = $88 ; fixed4.12: z_x -zy = $8a ; fixed4.12: z_y +ox = $84 ; fixed3.13: center point x +oy = $86 ; fixed3.13: center point y +cx = $84 ; fixed3.13: c_x +cy = $86 ; fixed3.13: c_y +zx = $88 ; fixed3.13: z_x +zy = $8a ; fixed3.13: z_y -zx_2 = $90 ; fixed8.24: z_x^2 -zy_2 = $94 ; fixed8.24: z_y^2 -zx_zy = $98 ; fixed8.24: z_x * z_y -dist = $9c ; fixed8.24: z_x^2 + z_y^2 +zx_2 = $90 ; fixed6.26: z_x^2 +zy_2 = $94 ; fixed6.26: z_y^2 +zx_zy = $98 ; fixed6.26: z_x * z_y +dist = $9c ; fixed6.26: z_x^2 + z_y^2 iter = $a0 ; u8: iteration count zoom = $a1 ; u8: zoom shift level @@ -42,6 +42,8 @@ half_height = height >> 1 width = 160 half_width = width >> 1 stride = width >> 2 +width_ratio_3_13 = (5 << 11) ; 5/4 +height_ratio_3_13 = (3 << 11) ; 3/4 DMACTL = $D400 DLISTL = $D402 @@ -99,12 +101,18 @@ aspect: ; 184h is the equiv of 220.8h at square pixels ; 320 / 220.8 = 1.45 display aspect ratio aspect_x: - .word 5 << (12 - 2) + .word 5 << (13 - 2) aspect_y: - .word 3 << (12 - 2) + .word 3 << (13 - 2) +bit_masks: + .byte 3 + .byte 3 << 2 + .byte 3 << 4 + .byte 3 << 6 + display_list_start: ; 24 lines overscan .repeat 3 @@ -160,7 +168,7 @@ color_map: .endmacro .macro add32 dest, arg1, arg2 - add 4, dest, arg2, dest + add 4, dest, arg1, arg2 .endmacro ; 2 + 9 * byte cycles @@ -236,6 +244,21 @@ color_map: neg 4, arg .endmacro +.macro extend_8_16 dest, src + ; clobbers A, X + ; 14-15 cycles + .local positive + .local negative + ldx #0 ; 2 cyc + lda src ; 3 cyc + sta dest ; 3 cyc + bpl positive ; 2-3 cyc +negative: + dex ; 2 cyc +positive: + stx dest + 1 ; 3 cyc +.endmacro + ; inner loop for imul16 ; bitnum < 8: 25 or 41 cycles ; bitnum >= 8: 30 or 46 cycles @@ 
-254,10 +277,10 @@ color_map: ; 5 cycles either way .if bitnum < 8 lda arg1 ; 3 cyc - and #(1 << (bitnum)) ; 2 cyc + and #(1 << bitnum) ; 2 cyc .else lda arg1 + 1 ; 3 cyc - and #(1 << ((bitnum) - 8)) ; 2 cyc + and #(1 << (bitnum - 8)) ; 2 cyc .endif bne one ; 2 cyc @@ -284,6 +307,7 @@ next: ror result ; 5 cyc .endif + .endmacro ; 5 to 25 cycles @@ -306,18 +330,11 @@ positive: copy32 dest, FR2 ; 24 cyc .endmacro -.macro shift_round_16 arg, shift - .repeat shift - shl32 arg - .endrepeat - round16 arg -.endmacro - -.macro imul16_round dest, arg1, arg2, shift +.macro imul16_round dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc jsr imul16_func ; 470-780 cyc - shift_round_16 FR2, shift + round16 FR2 ; 5-28 cyc copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -421,60 +438,71 @@ next: ; dist = 0 ; iter = 0 lda #00 - ldx #(iter - zx + 1) + ldx #(iter - zx) initloop: - sta zx - 1,x + sta zx,x dex - bne initloop + bpl initloop loop: - ; iter++ & max-iters break - inc iter - bne keep_going + ; 1939 - 3007 cyc + + ; iter++ & max-iters break = 7 cyc + inc iter ; 5 cyc + bne keep_going ; 2 cyc rts keep_going: - .macro quick_exit arg - .local keep_going - lda arg + 1 - cmp #(4 << 4) - bmi keep_going - rts - keep_going: - .endmacro - ; 4.12: (-8 .. +7.9) - ; zx = zx_2 - zy_2 + cx + ; zx = zx_2 - zy_2 + cx = 3 * 20 = 60 cyc sub16 zx, zx_2, zy_2 add16 zx, zx, cx - quick_exit zx - ; zy = zx_zy + zx_zy + cy - add16 zy, zx_zy, zx_zy + ; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc + add16 zy, zx_zy, zx_zy add16 zy, zy, cy - ; zx_2 = zx * zx - imul16_round zx_2, zx, zx, 4 - quick_exit dist + ; 8.24: (-128 .. 
+127.9) + ; zx_2 = zx * zx = 518 - 828 cyc + imul16 zx_2, zx, zx - ; zy_2 = zy * zy - imul16_round zy_2, zy, zy, 4 - quick_exit dist + ; zy_2 = zy * zy = 518 - 828 cyc + imul16 zy_2, zy, zy - ; zx_zy = zx * zy - imul16_round zx_zy, zx, zy, 4 - quick_exit dist + ; zx_zy = zx * zy = 518 - 828 cyc + imul16 zx_zy, zx, zy - ; dist = zx_2 + zy_2 - add16 dist, zx_2, zy_2 - quick_exit dist + ; dist = zx_2 + zy_2 = 38 cyc + add32 dist, zx_2, zy_2 + + ; if dist >= 4 break, else continue iterating = 7 cyc + lda dist + 3 ; 3 cyc + cmp #4 ; 2 cyc + bmi still_in ; 2 cyc + rts +still_in: + + ; shift and round zx_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc + .repeat 4 ; 60 cyc + shl24 zx_2 ; 15 cyc + .endrepeat + round16 zx_2 ; 5-28 cycles + + ; shift and round zy_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc + .repeat 4 ; 60 cyc + shl24 zy_2 ; 15 cyc + .endrepeat + round16 zy_2 ; 5-28 cycles + + ; shift and round zx_zy to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc + .repeat 4 ; 60 cyc + shl24 zx_zy ; 15 cyc + .endrepeat + round16 zx_zy ; 5-28 cycles ; if may be in the lake, look for looping output with a small buffer ; as an optimization vs running to max iters - jmp loop - -peace_out: - rts + jmp loop ; 3 cycles .endproc @@ -495,7 +523,7 @@ enough: ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16_round dest, dest, aspect, 4 + imul16_round dest, dest, aspect .endmacro .proc pset @@ -556,9 +584,6 @@ point: ; pixel_mask <<= pixel_shift (shifting in ones) and #3 sta pixel_shift - lda #3 - sec - sbc pixel_shift tax shift_loop: beq shift_done @@ -612,13 +637,9 @@ done: sta ox + 1 sta oy sta oy + 1 - - ; zoom = 2x - lda #1 sta zoom ; Disable display DMA - lda #0 sta DMACTL ; zero the range from framebuffer_top to framebuffer_end