diff --git a/mandel.s b/mandel.s
index 2e16b53..9eb6ce1 100644
--- a/mandel.s
+++ b/mandel.s
@@ -374,6 +374,17 @@ viewport_oy:
     copy16 dest, FR2 + 2 ; 12 cyc
 .endmacro
 
+; dest = arg * arg, shifted and rounded by imul16_round.
+; For now this multiplies arg by itself; the sqr16_func call sequence
+; below stays commented out.
+.macro sqr16_round dest, arg, shift
+    imul16_round dest, arg, arg, shift
+    ;copy16 FR0, arg ; 12 cyc
+    ;jsr sqr16_func ; ? cyc
+    ;shift_round_16 FR2, shift
+    ;copy16 dest, FR2 + 2 ; 12 cyc
+.endmacro
+
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg
@@ -537,6 +548,14 @@ init:
     lda #.hibyte(imul16xe_func)
     sta imul16_func + 2
 
+    ; ditto for sqr16_func -> sqr16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+
     ; create the lookup table
     ; go through the input set, in four 16KB chunks
 
@@ -684,6 +703,48 @@ arg2_pos:
     rts ; 6 cyc
 .endmacro
 
+; Subroutine body: 32-bit result (FR2) = square of the 16-bit arg (FR0).
+; xe is forwarded to imul8. Uses temp2 as scratch; clobbers a, x (sqr8).
+; Ends in rts, so it is meant to be expanded inside a .proc.
+.macro sqr16_impl xe
+    .local arg
+    .local result
+    .local inter
+    .local arg_pos
+    arg = FR0 ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2
+
+    ; hl * hl
+    ; (h*256 + l) * (h*256 + l)
+    ; h*256*(h*256 + l) + l*(h*256 + l)
+    ; h*h*256*256 + h*l*256 + h*l*256 + l*l
+
+    sqr8 result, arg ; l*l into the low word
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg + 1, arg, xe ; h*l, added twice at byte offset 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    sqr8 inter, arg + 1 ; h*h (sqr8 takes only dest, arg)
+    add16 result + 2, result + 2, inter
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg + 1
+    bpl arg_pos
+    sub16 result + 2, result + 2, arg
+    sub16 result + 2, result + 2, arg
+arg_pos:
+
+    rts ; 6 cyc
+.endmacro
+
 .proc imul16_func
     imul16_impl 0
 .endproc
@@ -692,6 +753,16 @@ arg2_pos:
     imul16_impl 1
 .endproc
 
+; 16-bit square, expanded from sqr16_impl with xe = 0.
+.proc sqr16_func
+    sqr16_impl 0
+.endproc
+
+; 16-bit square, expanded from sqr16_impl with xe = 1.
+.proc sqr16xe_func
+    sqr16_impl 1
+.endproc
+
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -803,10 +874,10 @@ keep_going:
     quick_exit zy, 2
 
     ; zx_2 = zx * zx
-    imul16_round zx_2, zx, zx, 4
+    sqr16_round zx_2, zx, 4
 
     ; zy_2 = zy * zy
-    imul16_round zy_2, zy, zy, 4
+    sqr16_round zy_2, zy, 4
 
     ; zx_zy = zx * zy
     imul16_round zx_zy, zx, zy, 4