4 changed files with 171 additions and 227 deletions
--- a/4
+++ b/4
@ -2,8 +2,8 @@
 all : mandel.xex
+# The linker config stays a prerequisite so that editing it forces a relink;
+# $(filter %.o,$^) hands only the object files to ld65 as link inputs.
 mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $(filter %.o,$^)
 %.o : %.s
 	ca65 -o $@ $<
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@ -6,10 +6,7 @@ SYMBOLS {
 }
 MEMORY {
    ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
+    MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
    # Keep $4000-7fff clear for expanded RAM access window
    # NOTE(review): with MAIN sized up to $BC20 the linker no longer enforces
    # this gap (or the $a000-$bfff one below) - confirm the image ends below $4000
    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
    # Keep $a000-$bfff clear for BASIC cartridge
 }
 FILES {
    %O: format = atari;
@ -24,5 +21,5 @@ SEGMENTS {
    RODATA:   load = MAIN,    type = ro   optional = yes;
    DATA:     load = MAIN,    type = rw   optional = yes;
    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
+    TABLES:   load = MAIN,    type = ro,  optional = yes, align = 256;
 }
--- a/mandel.s
+++ b/mandel.s
@ -62,11 +62,11 @@ FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
 FMOVE  = $DDB6 ; MOVE FR0 TO FR1
 ; High data: fixed addresses, listed in ascending order
 ; NOTE(review): the linker config also declares a TABLES memory area at
 ; $8000-$9fff - confirm nothing is linked into the range used here
-framebuffer_top    = $a000
+framebuffer_top    = $8000
-textbuffer         = $af00
+textbuffer         = $8f00
-framebuffer_bottom = $b000
+framebuffer_bottom = $9000
-display_list       = $bf00
+display_list       = $9f00
-framebuffer_end    = $c000
+framebuffer_end    = $a000
 height = 184
 half_height = height >> 1
@ -107,14 +107,12 @@ KEY_RIGHT = $87
 ; 48-bit (6-byte) float layout: one exponent byte followed by five mantissa bytes.
 .struct float48
    exponent .byte
    ; 5 mantissa bytes keep the struct at 6 bytes total, as the name float48 implies
    ; (a 6-byte mantissa would make every float48 7 bytes wide)
    mantissa .byte 5
 .endstruct
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
 .import sqr_lobyte
 .import sqr_hibyte
 .data
@ -376,150 +374,137 @@ viewport_oy:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
-.macro sqr16_round dest, arg, shift
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
-    ;imul16_round dest, arg, arg, shift
+.macro imul8 dest, arg1, arg2
-    copy16 FR0, arg   ; 12 cyc
+    .local under256
-    jsr sqr16_func      ; ? cyc
+    .local next
-    shift_round_16 FR2, shift
+    .local small_product
-    copy16 dest, FR2 + 2  ; 12 cyc
+    ; dest / dest + 1 receive the low / high byte of (byte at arg1) * (byte at arg2); clobbers a, x
-.endmacro
+    ; 81-92 cycles
    .scope
        mul_factor_a   = arg1
        mul_factor_x   = arg2
        mul_product_lo = dest
        mul_product_hi = dest + 1
-; clobbers a, x
+        lda mul_factor_a      ; 3 cyc
-.macro sqr8 dest, arg
+
-    ldx arg
+        ; (a + x)^2/2 - the sum a + x (plus carry for the 9th bit) indexes the tables
-    lda sqr_lobyte,x
+        clc                   ; 2 cyc
-    sta dest
+        adc mul_factor_x      ; 3 cyc
-    lda sqr_hibyte,x
+        tax                   ; 2 cyc
-    sta dest + 1
+        bcc under256          ; 2 cyc - sum fit in 8 bits
        lda mul_hibyte512,x   ; 4 cyc
        bcs next              ; 2 cyc - always taken here: the adc above carried out
    under256:
        lda mul_hibyte256,x   ; 4 cyc
        sec                   ; 2 cyc - both paths reach next with carry set for the sbc below
    next:
        sta mul_product_hi    ; 3 cyc
        lda mul_lobyte256,x   ; 4 cyc
        ; - a^2/2
        ldx mul_factor_a      ; 3 cyc
        sbc mul_lobyte256,x   ; 4 cyc
        sta mul_product_lo    ; 3 cyc
        lda mul_product_hi    ; 3 cyc
        sbc mul_hibyte256,x   ; 4 cyc
        sta mul_product_hi    ; 3 cyc
        ; + x & a & 1:
        ; (this is a kludge to correct a
        ; roundoff error that makes odd * odd too low)
        ldx mul_factor_x      ; 3 cyc
        txa                   ; 2 cyc
        and mul_factor_a      ; 3 cyc
        and #1                ; 2 cyc
        clc                   ; 2 cyc
        adc mul_product_lo    ; 3 cyc
        bcc small_product     ; 2 cyc
        inc mul_product_hi    ; 5 cyc - carry out of the low byte
        ; - x^2/2 (a still holds the corrected low byte; x still holds mul_factor_x)
    small_product:
        sec                   ; 2 cyc
        sbc mul_lobyte256,x   ; 4 cyc
        sta mul_product_lo    ; 3 cyc
        lda mul_product_hi    ; 3 cyc
        sbc mul_hibyte256,x   ; 4 cyc
        sta mul_product_hi    ; 3 cyc
    .endscope
 .endmacro
 ; lookup table for top byte -> PORTB value for bank-switch
 ; entry i = top two bits of i moved into bits 2-3, OR'd with a fixed mask for the other bits
 ; NOTE(review): $e1 and $e3 differ only in bit 1 of the PORTB value - confirm which ROM/BASIC setting is intended
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
    .repeat 256, i
-        .byte ((i & $c0) >> 4) | $e3
+        .byte ((i & $c0) >> 4) | $e1
    .endrepeat
 ; bank_switch bank: write the PORTB value for an assemble-time bank number
 ; (bank goes into bits 2-3; same fixed mask as bank_switch_table). Clobbers a.
 .macro bank_switch bank
-    lda #((bank << 2) | $e3)
+    lda #((bank << 2) | $e1)
    sta PORTB
 .endmacro
 ; imul8: multiply the bytes at arg1 and arg2 into dest (low byte) / dest + 1 (high byte).
 ; xe picks the implementation at assembly time: non-zero = banked 64KB table, zero = half-square tables.
 .macro imul8 dest, arg1, arg2, xe
    .if xe
        ; using 64KB lookup table
        ; 58-77 cycles
        ; clobbers a, x, y, dest to dest + 3
        .scope
            output = dest
            ptr = dest + 2 ; scratch space assumed
-            ; bottom 14 bits except the LSB are the per-bank table index
+; 58-77 cycles
-            ; add $4000 for the bank pointer
+; clobbers a, x, y, dest to dest + 3
-            lda arg1     ; 3 cyc
+.macro imul8xe dest, arg1, arg2
-            and #$fe     ; 2 cyc
+.local done
-            sta ptr      ; 3 cyc
+.local output
-            lda arg2     ; 3 cyc
+.local ptr
            and #$3f     ; 2 cyc
            clc          ; 2 cyc
            adc #$40     ; 2 cyc
            sta ptr + 1  ; 3 cyc
-            ; top 2 bits are the table bank selector
+    output = dest
-            ldx arg2                ; 3 cyc
+    ptr = dest + 2 ; scratch space assumed; must be zero page for the (ptr),y loads below
-            lda bank_switch_table,x ; 4 cyc
+
-            sta PORTB               ; 4 cyc
+    ; bottom 14 bits except the LSB are the per-bank table index
    ; add $4000 for the bank pointer
    lda arg1     ; 3 cyc
    and #$fe     ; 2 cyc
    sta ptr      ; 3 cyc
    lda arg2     ; 3 cyc
    and #$3f     ; 2 cyc
    clc          ; 2 cyc
    adc #$40     ; 2 cyc - pointer high byte ends up in $40-$7f
    sta ptr + 1  ; 3 cyc
    ; top 2 bits are the table bank selector
    ldx arg2                ; 3 cyc
    lda bank_switch_table,x ; 4 cyc
    sta PORTB               ; 4 cyc
-            ; copy the entry into output
+    ; copy the entry into output
-            ldy #0       ; 2 cyc
+    ldy #0       ; 2 cyc
-            lda (ptr),y  ; 5 cyc
+    lda (ptr),y  ; 5 cyc
-            sta output   ; 3 cyc
+    sta output   ; 3 cyc
-            iny          ; 2 cyc
+    iny          ; 2 cyc
-            lda (ptr),y  ; 5 cyc
+    lda (ptr),y  ; 5 cyc
-            sta output+1 ; 3 cyc
+    sta output+1 ; 3 cyc
-            ; note: we are not restoring memory to save 6 cycles!
+    ; note: we are not restoring memory to save 6 cycles!
-            ; this means those 16kb have to be switched back to base RAM
+    ; this means those 16kb have to be switched back to base RAM
-            ; if we need to use them anywhere else
+    ; if we need to use them anywhere else
-            ;;; restore memory
+    ;;; restore memory
-            ;;lda #$81     ; 2 cyc - disabled
+    ;;lda #$81     ; 2 cyc - disabled
-            ;;sta PORTB    ; 4 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
-            ; check that 1 bit we skipped to fit into space
+    ; check that 1 bit we skipped to fit into space
-            lda arg1     ; 3 cyc
+    lda arg1     ; 3 cyc
-            and #1       ; 2 cyc
+    and #1       ; 2 cyc
-            beq done     ; 2 cyc
+    beq done     ; 2 cyc
-            ; add the second param one last time for the skipped bit
+    ; add the second param one last time for the skipped bit
-            clc          ; 2 cyc
+    clc          ; 2 cyc
-            lda arg2     ; 3 cyc
+    lda arg2     ; 3 cyc
-            adc output   ; 3 cyc
+    adc output   ; 3 cyc
-            sta output   ; 3 cyc
+    sta output   ; 3 cyc
-            lda #0       ; 2 cyc
+    lda #0       ; 2 cyc
-            adc output+1 ; 3 cyc
+    adc output+1 ; 3 cyc
-            sta output+1 ; 3 cyc
+    sta output+1 ; 3 cyc
-        done:
+done:
        .endscope
    .else
        ; Using base 48k RAM compatibility mode
        ; Small table of half squares
        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
        ; 81-92 cycles
        ; clobbers a, x
        .scope
            mul_factor_a   = arg1
            mul_factor_x   = arg2
            mul_product_lo = dest
            mul_product_hi = dest + 1
            lda mul_factor_a      ; 3 cyc
            ; (a + x)^2/2
            clc                   ; 2 cyc
            adc mul_factor_x      ; 3 cyc
            tax                   ; 2 cyc
            bcc under256          ; 2 cyc
            lda mul_hibyte512,x   ; 4 cyc
            bcs next              ; 2 cyc - always taken here: the adc above carried out
        under256:
            lda mul_hibyte256,x   ; 4 cyc
            sec                   ; 2 cyc - both paths reach next with carry set for the sbc below
        next:
            sta mul_product_hi    ; 3 cyc
            lda mul_lobyte256,x   ; 4 cyc
            ; - a^2/2
            ldx mul_factor_a      ; 3 cyc
            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc
            ; + x & a & 1:
            ; (this is a kludge to correct a
            ; roundoff error that makes odd * odd too low)
            ldx mul_factor_x      ; 3 cyc
            txa                   ; 2 cyc
            and mul_factor_a      ; 3 cyc
            and #1                ; 2 cyc
            clc                   ; 2 cyc
            adc mul_product_lo    ; 3 cyc
            bcc small_product     ; 2 cyc
            inc mul_product_hi    ; 5 cyc
            ; - x^2/2
        small_product:
            sec                   ; 2 cyc
            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc
        .endscope
    .endif
 .endmacro
 .proc imul8xe_init
@ -547,14 +532,6 @@ init:
    lda #.hibyte(imul16xe_func)
    sta imul16_func + 2
    ; ditto for sqr16_func -> sqr16xe_func
    lda #$4c ; 'jmp' opcode
    sta sqr16_func
    lda #.lobyte(sqr16xe_func)
    sta sqr16_func + 1
    lda #.hibyte(sqr16xe_func)
    sta sqr16_func + 2
    ; create the lookup table
    ; go through the input set, in four 16KB chunks
@ -655,13 +632,7 @@ inner_loop:
 .endproc
-.macro imul16_impl xe
+.proc imul16_func
    .local arg1
    .local arg2
    .local result
    .local inter
    .local arg1_pos
    .local arg2_pos
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
@ -672,20 +643,20 @@ inner_loop:
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
-    imul8 result, arg1, arg2, xe
+    imul8 result, arg1, arg2
    lda #0
    sta result + 2
    sta result + 3
-    imul8 inter, arg1 + 1, arg2, xe
+    imul8 inter, arg1 + 1, arg2
    add16 result + 1, result + 1, inter
    add_carry result + 3
-    imul8 inter, arg1, arg2 + 1, xe
+    imul8 inter, arg1, arg2 + 1
    add16 result + 1, result + 1, inter
    add_carry result + 3
-    imul8 inter, arg1 + 1, arg2 + 1, xe
+    imul8 inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter
    ; In case of negative inputs, adjust high word
@ -700,59 +671,47 @@ arg1_pos:
 arg2_pos:
    rts ; 6 cyc
 .endmacro
 .macro sqr16_impl xe
    ; Squares the signed 16-bit value in FR0 into the 32-bit FR2.
    ; FR0 may be negated in place and FR1 is used as scratch.
    ; Ends with rts, so the expansion is the complete body of a routine.
    ; xe is forwarded to imul8 to pick its implementation.
    .scope
        sq_in   = FR0 ; 16-bit input, replaced by its absolute value
        sq_out  = FR2 ; 32-bit square
        sq_part = FR1 ; partial-product scratch (temp2 was the earlier choice)

        ; work on |input|: the square is the same either way
        lda sq_in + 1
        bpl sq_nonneg
        neg16 sq_in
    sq_nonneg:
        ; with input = h*256 + l:
        ;   input^2 = h*h*256*256 + 2*(h*l*256) + l*l

        ; upper word starts out empty
        lda #0
        sta sq_out + 3
        sta sq_out + 2

        ; l*l fills the lower word
        sqr8 sq_out, sq_in

        ; h*l is added in twice, one byte up
        imul8 sq_part, sq_in + 1, sq_in, xe
        .repeat 2
            add16 sq_out + 1, sq_out + 1, sq_part
            add_carry sq_out + 3
        .endrepeat

        ; h*h goes two bytes up
        sqr8 sq_part, sq_in + 1
        add16 sq_out + 2, sq_out + 2, sq_part
        rts ; 6 cyc
    .endscope
 .endmacro
 ; 16-bit multiply routine built from imul16_impl with xe = 0.
 .proc imul16_func
    imul16_impl 0 ; xe = 0: imul8 expands to its .else branch (half-square tables in base RAM)
 .endproc
 ; 16 x 16 -> 32-bit signed multiply built from four imul8xe partial products.
 ; Inputs in FR0 and FR1, product in FR2; temp2 is scratch. Returns with rts.
 .proc imul16xe_func
-    imul16_impl 1
+    arg1 = FR0   ; 16-bit arg (clobbered)
-.endproc
+    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
    ; NOTE(review): imul8xe uses dest + 2 / dest + 3 as a (ptr),y pointer, so temp2
    ; presumably provides 4 bytes in zero page - confirm
    inter = temp2
-.proc sqr16_func
+    ; h1l1 * h2l2
-    sqr16_impl 0
+    ; (h1*256 + l1) * (h2*256 + l2)
-.endproc
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
-.proc sqr16xe_func
+    imul8xe result, arg1, arg2
-    sqr16_impl 1
+    lda #0
    ; clear the upper word (imul8xe used result + 2 / + 3 as its pointer scratch)
    sta result + 2
    sta result + 3
    imul8xe inter, arg1 + 1, arg2
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8xe inter, arg1, arg2 + 1
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8xe inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter
    ; In case of negative inputs, adjust high word
    ; https://stackoverflow.com/a/28827013
    lda arg1 + 1
    bpl arg1_pos
    sub16 result + 2, result + 2, arg2 ; arg1 negative: high word -= arg2
 arg1_pos:
    lda arg2 + 1
    bpl arg2_pos
    sub16 result + 2, result + 2, arg1 ; arg2 negative: high word -= arg1
 arg2_pos:
    rts ; 6 cyc
 .endproc
 .macro round16 arg
@ -866,10 +825,10 @@ keep_going:
    quick_exit zy, 2
    ; zx_2 = zx * zx
-    sqr16_round zx_2, zx, 4
+    imul16_round zx_2, zx, zx, 4
    ; zy_2 = zy * zy
-    sqr16_round zy_2, zy, 4
+    imul16_round zy_2, zy, zy, 4
    ; zx_zy = zx * zy
    imul16_round zx_zy, zx, zy, 4
--- a/tables.js
+++ b/tables.js
@ -22,10 +22,7 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
 .export sqr_lobyte
 .export sqr_hibyte
 ; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@ -38,13 +35,4 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 ; (i * i) for the plain squares
 .align 256
 sqr_lobyte:
 ${db((i) => (i * i) & 0xff)}
 .align 256
 sqr_hibyte:
 ${db((i) => ((i * i) >> 8) & 0xff)}
 `);