diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index fb43089..9f871ca 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -8,6 +8,7 @@ MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; #MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; + TABLES: file = %O, define = yes, start = $a000, size = $c000 - $a000; } FILES { %O: format = atari; @@ -22,5 +23,5 @@ SEGMENTS { RODATA: load = MAIN, type = ro optional = yes; DATA: load = MAIN, type = rw optional = yes; BSS: load = MAIN, type = bss, optional = yes, define = yes; - TABLES: load = MAIN, type = ro, optional = yes, align = 256; + TABLES: load = TABLES, type = ro, optional = yes, align = 256; } diff --git a/mandel.s b/mandel.s index 7bfb577..a5bcb35 100644 --- a/mandel.s +++ b/mandel.s @@ -113,6 +113,8 @@ KEY_RIGHT = $87 .import mul_lobyte256 .import mul_hibyte256 .import mul_hibyte512 +.import sqr_lobyte +.import sqr_hibyte .data @@ -701,42 +703,40 @@ arg2_pos: .endmacro .macro sqr16_impl xe - .local arg - .local result - .local inter - .local arg_pos - arg = FR0 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 + .scope + arg = FR0 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + ;inter = temp2 + inter = FR1 - ; hl * hl - ; (h*256 + l) * (h*256 + l) - ; h*256*(h*256 + l) + l*(h*256 + l) - ; h*h*256*256 + h*l*256 + h*l*256 + l*l + lda arg + 1 + bpl arg_pos + neg16 arg + arg_pos: - sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 + ; hl * hl + ; (h*256 + l) * (h*256 + l) + ; h*256*(h*256 + l) + l*(h*256 + l) + ; h*h*256*256 + h*l*256 + h*l*256 + l*l - imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - add16 result + 1, result + 1, inter - add_carry result + 3 + sqr8 result, arg + ;imul8 inter, arg, arg, xe + lda #0 + sta result + 2 + sta result + 3 - sqr8 inter, arg + 1, arg + 1, xe - add16 result + 2, result + 2, inter + imul8 inter, arg + 1, arg, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + add16 result + 1, result + 1, inter + add_carry result + 3 - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg + 1 - bpl arg_pos - sub16 result + 2, result + 2, arg - sub16 result + 2, result + 2, arg -arg_pos: + sqr8 inter, arg + 1 + ;imul8 inter, arg + 1, arg + 1, xe + add16 result + 2, result + 2, inter - rts ; 6 cyc + rts ; 6 cyc + .endscope .endmacro .proc imul16_func @@ -748,11 +748,11 @@ arg_pos: .endproc .proc sqr16_func - imul16_impl 0 + sqr16_impl 0 .endproc .proc sqr16xe_func - imul16_impl 1 + sqr16_impl 1 .endproc .macro round16 arg