diff --git a/mandel.s b/mandel.s index 4ac8d4d..fc30532 100644 --- a/mandel.s +++ b/mandel.s @@ -262,7 +262,10 @@ palette_chroma_entries = 15 .code -z_buffer_len = 16 +;z_buffer_len = 16 ; 10.863 ms/px +;z_buffer_len = 12 ; 10.619 ms/px +z_buffer_len = 8 ; 10.612 ms/px +;z_buffer_len = 4 ; 12.395 ms/px z_buffer_mask = z_buffer_len - 1 z_buffer: ; the last N zx/zy values @@ -273,11 +276,12 @@ z_buffer: .export start -max_fill_level = 6 +;max_fill_level = 6 +max_fill_level = 3 fill_masks: - .byte %00011111 - .byte %00001111 - .byte %00000111 +; .byte %00011111 +; .byte %00001111 +; .byte %00000111 .byte %00000011 .byte %00000001 .byte %00000000 @@ -310,18 +314,21 @@ viewport_oy: .endrepeat .endmacro +; 20 cycles .macro add16 dest, arg1, arg2 add 2, dest, arg1, arg2 .endmacro +; 38 cycles .macro add32 dest, arg1, arg2 add 4, dest, arg2, dest .endmacro +; 8 cycles .macro add_carry dest - lda dest - adc #0 - sta dest + lda dest ; 3 cyc + adc #0 ; 2 cyc + sta dest ; 3 cyc .endmacro ; 2 + 9 * byte cycles @@ -334,29 +341,35 @@ viewport_oy: .endrepeat .endmacro +; 20 cycles .macro sub16 dest, arg1, arg2 sub 2, dest, arg1, arg2 .endmacro +; 38 cycles .macro sub32 dest, arg1, arg2 sub 4, dest, arg1, arg2 .endmacro +; 3 + 5 * bytes cycles .macro shl bytes, arg - asl arg + asl arg ; 3 cyc .repeat bytes-1, i - rol arg + 1 + i + rol arg + 1 + i ; 5 cyc .endrepeat .endmacro +; 13 cycles .macro shl16 arg shl 2, arg .endmacro +; 18 cycles .macro shl24 arg shl 3, arg .endmacro +; 23 cycles .macro shl32 arg shl 4, arg .endmacro @@ -369,14 +382,17 @@ viewport_oy: .endrepeat .endmacro +; 12 cycles .macro copy16 dest, arg copy 2, dest, arg .endmacro +; 24 cycles .macro copy32 dest, arg copy 4, dest, arg .endmacro +; 36 cycles .macro copyfloat dest, arg copy 6, dest, arg .endmacro @@ -401,18 +417,20 @@ viewport_oy: neg 4, arg .endmacro +; 11-27 + 23 * shift cycles +; 103-119 cycles for shift=4 .macro shift_round_16 arg, shift .repeat shift - shl32 arg + shl32 arg ; 23 cycles .endrepeat - round16 arg + round16 arg ; 11-27 cycles .endmacro .macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc jsr imul16_func ; ? cyc - shift_round_16 FR2, shift + shift_round_16 FR2, shift ; 103-119 cycles for shift=4 copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -420,7 +438,7 @@ viewport_oy: ;imul16_round dest, arg, arg, shift copy16 FR0, arg ; 12 cyc jsr sqr16_func ; ? cyc - shift_round_16 FR2, shift + shift_round_16 FR2, shift ; 103-119 cycles for shift=4 copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -806,6 +824,7 @@ arg2_pos: sqr16_impl 1 .endproc +; 11-27 cycles .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment @@ -818,21 +837,28 @@ arg2_pos: ; round down if negative ; < $8000: round down - lda arg + 1 - cmp #$80 - beq high_half - bpl increment - bmi next + ; $8000 17 + ; $8001 27 + ; $8100 21 + ; $7fff 11 + + lda arg + 1 ; 3 cyc + cmp #$80 ; 2 cyc + beq high_half ; 2 cyc + + bpl increment ; 2 cyc + + bmi next ; 2 cyc high_half: - lda arg - beq check_sign - bpl increment - bmi next + lda arg ; 3 cyc + beq check_sign ; 2 cyc + + jmp increment ; 3 cyc check_sign: - lda arg + 3 - bmi next + lda arg + 3 ; 3 cyc + bmi next ; 2 cyc increment: ; 5-10 cyc inc arg + 2 ; 5 cyc