fix bank switch on xl/xe

was accidentally enabling basic rom :D — timings: 5m46s (11.759 ms/px) on 800xl; 5m30s (11.215 ms/px) on 130xe
moving the framebuffer into the basic space
2024-12-30 03:56:35 -08:00 · 2024-12-29 21:19:55 -08:00 · 2024-12-29 21:06:48 -08:00 · 2024-12-29 20:37:58 -08:00 · 2024-12-29 17:56:14 -08:00 · 2024-12-29 17:37:06 -08:00
4 changed files with 227 additions and 171 deletions
--- a/4
+++ b/4
@ -2,8 +2,8 @@
 all : mandel.xex
-mandel.xex : mandel.o tables.o
+mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ $+
+	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
 %.o : %.s
 	ca65 -o $@ $<
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@ -6,7 +6,10 @@ SYMBOLS {
 }
 MEMORY {
    ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
+    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
    # Keep $4000-7fff clear for expanded RAM access window
    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
    # Keep $a000-$bfff out of the load image: BASIC cartridge space, which mandel.s uses for the framebuffer
 }
 FILES {
    %O: format = atari;
@ -21,5 +24,5 @@ SEGMENTS {
    RODATA:   load = MAIN,    type = ro   optional = yes;
    DATA:     load = MAIN,    type = rw   optional = yes;
    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = MAIN,    type = ro,  optional = yes, align = 256;
+    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
 }
--- a/mandel.s
+++ b/mandel.s
@ -62,11 +62,11 @@ FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
 FMOVE  = $DDB6 ; MOVE FR0 TO FR1
 ; High data
-framebuffer_top    = $8000
+framebuffer_top    = $a000
-textbuffer         = $8f00
+textbuffer         = $af00
-framebuffer_bottom = $9000
+framebuffer_bottom = $b000
-display_list       = $9f00
+display_list       = $bf00
-framebuffer_end    = $a000
+framebuffer_end    = $c000
 height = 184
 half_height = height >> 1
@ -107,12 +107,14 @@ KEY_RIGHT = $87
 ; Field layout for a 48-bit float: 1 exponent byte + 5 mantissa bytes = 6 bytes.
 ; (With the old `mantissa .byte 6` the struct came to 7 bytes, i.e. 56 bits.)
 .struct float48
    exponent .byte
-    mantissa .byte 6
+    mantissa .byte 5
 .endstruct
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
 .import sqr_lobyte
 .import sqr_hibyte
 .data
@ -374,137 +376,150 @@ viewport_oy:
    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
-; Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro sqr16_round dest, arg, shift
-.macro imul8 dest, arg1, arg2
+    ;imul16_round dest, arg, arg, shift
-    .local under256
+    copy16 FR0, arg   ; 12 cyc
-    .local next
+    jsr sqr16_func      ; ? cyc
-    .local small_product
+    shift_round_16 FR2, shift
-    ; circa 92 cycles? this doesn't seem right
+    copy16 dest, FR2 + 2  ; 12 cyc
-    ; 81-92 cycles
+.endmacro
    .scope
        mul_factor_a   = arg1
        mul_factor_x   = arg2
        mul_product_lo = dest
        mul_product_hi = dest + 1
-        lda mul_factor_a      ; 3 cyc
+; clobbers a, x
-
+.macro sqr8 dest, arg
-        ; (a + x)^2/2
+    ldx arg
-        clc                   ; 2 cyc         
+    lda sqr_lobyte,x
-        adc mul_factor_x      ; 3 cyc
+    sta dest
-        tax                   ; 2 cyc
+    lda sqr_hibyte,x
-        bcc under256          ; 2 cyc
+    sta dest + 1
        lda mul_hibyte512,x   ; 4 cyc
        bcs next              ; 2 cyc
    under256:
        lda mul_hibyte256,x   ; 4 cyc
        sec                   ; 2 cyc
    next:
        sta mul_product_hi    ; 3 cyc
        lda mul_lobyte256,x   ; 4 cyc
        ; - a^2/2
        ldx mul_factor_a      ; 3 cyc
        sbc mul_lobyte256,x   ; 4 cyc
        sta mul_product_lo    ; 3 cyc
        lda mul_product_hi    ; 3 cyc
        sbc mul_hibyte256,x   ; 4 cyc
        sta mul_product_hi    ; 3 cyc
        ; + x & a & 1:
        ; (this is a kludge to correct a
        ; roundoff error that makes odd * odd too low)
        ldx mul_factor_x      ; 3 cyc
        txa                   ; 2 cyc
        and mul_factor_a      ; 3 cyc
        and #1                ; 2 cyc
        clc                   ; 2 cyc
        adc mul_product_lo    ; 3 cyc
        bcc small_product     ; 2 cyc
        inc mul_product_hi    ; 5 cyc
        ; - x^2/2
    small_product:
        sec                   ; 2 cyc
        sbc mul_lobyte256,x   ; 4 cyc
        sta mul_product_lo    ; 3 cyc
        lda mul_product_hi    ; 3 cyc
        sbc mul_hibyte256,x   ; 4 cyc
        sta mul_product_hi    ; 3 cyc
    .endscope
 .endmacro
 ; lookup table for top byte -> PORTB value for bank-switch
 ; entry i = ((i & $c0) >> 4) | $e3: the top two bits of the index are moved
 ; down into bits 2-3 and OR'd onto a fixed base of $e3. The xe path of imul8
 ; indexes this table with arg2 and stores the byte straight to PORTB.
 ; The base was $e1 before this change; per the commit message that value
 ; was accidentally enabling the BASIC ROM on XL/XE.
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
    .repeat 256, i
-        .byte ((i & $c0) >> 4) | $e1
+        .byte ((i & $c0) >> 4) | $e3
    .endrepeat
 ; Write ((bank << 2) | $e3) to PORTB: `bank` is shifted left two bits and
 ; OR'd onto the same $e3 base that bank_switch_table uses.
 ; `bank` has to be an assemble-time constant, since it is folded into an
 ; immediate operand.
 ; clobbers a
 .macro bank_switch bank
-    lda #((bank << 2) | $e1)
+    lda #((bank << 2) | $e3)
    sta PORTB
 .endmacro
 .macro imul8 dest, arg1, arg2, xe
    .if xe
        ; using 64KB lookup table
        ; 58-77 cycles
        ; clobbers x, y, dest to dest + 3
        .scope
            output = dest
            ptr = dest + 2 ; scratch space assumed
-; 58-77 cycles
+            ; bottom 14 bits except the LSB are the per-bank table index
-; clobbers x, y, dest to dest + 3
+            ; add $4000 for the bank pointer
-.macro imul8xe dest, arg1, arg2
+            lda arg1     ; 3 cyc
-.local done
+            and #$fe     ; 2 cyc
-.local output
+            sta ptr      ; 3 cyc
-.local ptr
+            lda arg2     ; 3 cyc
            and #$3f     ; 2 cyc
            clc          ; 2 cyc
            adc #$40     ; 2 cyc
            sta ptr + 1  ; 3 cyc
-    output = dest
+            ; top 2 bits are the table bank selector
-    ptr = dest + 2 ; scratch space assumed
+            ldx arg2                ; 3 cyc
-
+            lda bank_switch_table,x ; 4 cyc
-    ; bottom 14 bits except the LSB are the per-bank table index
+            sta PORTB               ; 4 cyc
    ; add $4000 for the bank pointer
    lda arg1     ; 3 cyc
    and #$fe     ; 2 cyc
    sta ptr      ; 3 cyc
    lda arg2     ; 3 cyc
    and #$3f     ; 2 cyc
    clc          ; 2 cyc
    adc #$40     ; 2 cyc
    sta ptr + 1  ; 3 cyc
    ; top 2 bits are the table bank selector
    ldx arg2                ; 3 cyc
    lda bank_switch_table,x ; 4 cyc
    sta PORTB               ; 4 cyc
-    ; copy the entry into output
+            ; copy the entry into output
-    ldy #0       ; 2 cyc
+            ldy #0       ; 2 cyc
-    lda (ptr),y  ; 5 cyc
+            lda (ptr),y  ; 5 cyc
-    sta output   ; 3 cyc
+            sta output   ; 3 cyc
-    iny          ; 2 cyc
+            iny          ; 2 cyc
-    lda (ptr),y  ; 5 cyc
+            lda (ptr),y  ; 5 cyc
-    sta output+1 ; 3 cyc
+            sta output+1 ; 3 cyc
-    ; note: we are not restoring memory to save 6 cycles!
+            ; note: we are not restoring memory to save 6 cycles!
-    ; this means those 16kb have to be switched back to base RAM
+            ; this means those 16kb have to be switched back to base RAM
-    ; if we need to use them anywhere else
+            ; if we need to use them anywhere else
-    ;;; restore memory
+            ;;; restore memory
-    ;;lda #$81     ; 2 cyc - disabled
+            ;;lda #$81     ; 2 cyc - disabled
-    ;;sta PORTB    ; 4 cyc - disabled
+            ;;sta PORTB    ; 4 cyc - disabled
-    ; check that 1 bit we skipped to fit into space
+            ; check that 1 bit we skipped to fit into space
-    lda arg1     ; 3 cyc
+            lda arg1     ; 3 cyc
-    and #1       ; 2 cyc
+            and #1       ; 2 cyc
-    beq done     ; 2 cyc
+            beq done     ; 2 cyc
-    ; add the second param one last time for the skipped bit
+            ; add the second param one last time for the skipped bit
-    clc          ; 2 cyc
+            clc          ; 2 cyc
-    lda arg2     ; 3 cyc
+            lda arg2     ; 3 cyc
-    adc output   ; 3 cyc
+            adc output   ; 3 cyc
-    sta output   ; 3 cyc
+            sta output   ; 3 cyc
-    lda #0       ; 2 cyc
+            lda #0       ; 2 cyc
-    adc output+1 ; 3 cyc
+            adc output+1 ; 3 cyc
-    sta output+1 ; 3 cyc
+            sta output+1 ; 3 cyc
-done:
+        done:
        .endscope
    .else
        ; Using base 48k RAM compatibility mode
        ; Small table of half squares
        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
        ; 81-92 cycles
        .scope
            mul_factor_a   = arg1
            mul_factor_x   = arg2
            mul_product_lo = dest
            mul_product_hi = dest + 1
            lda mul_factor_a      ; 3 cyc
            ; (a + x)^2/2
            clc                   ; 2 cyc         
            adc mul_factor_x      ; 3 cyc
            tax                   ; 2 cyc
            bcc under256          ; 2 cyc
            lda mul_hibyte512,x   ; 4 cyc
            bcs next              ; 2 cyc
        under256:
            lda mul_hibyte256,x   ; 4 cyc
            sec                   ; 2 cyc
        next:
            sta mul_product_hi    ; 3 cyc
            lda mul_lobyte256,x   ; 4 cyc
            ; - a^2/2
            ldx mul_factor_a      ; 3 cyc
            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc
            ; + x & a & 1:
            ; (this is a kludge to correct a
            ; roundoff error that makes odd * odd too low)
            ldx mul_factor_x      ; 3 cyc
            txa                   ; 2 cyc
            and mul_factor_a      ; 3 cyc
            and #1                ; 2 cyc
            clc                   ; 2 cyc
            adc mul_product_lo    ; 3 cyc
            bcc small_product     ; 2 cyc
            inc mul_product_hi    ; 5 cyc
            ; - x^2/2
        small_product:
            sec                   ; 2 cyc
            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc
        .endscope
    .endif
 .endmacro
 .proc imul8xe_init
@ -532,6 +547,14 @@ init:
    lda #.hibyte(imul16xe_func)
    sta imul16_func + 2
    ; ditto for sqr16_func -> sqr16xe_func
    lda #$4c ; 'jmp' opcode
    sta sqr16_func
    lda #.lobyte(sqr16xe_func)
    sta sqr16_func + 1
    lda #.hibyte(sqr16xe_func)
    sta sqr16_func + 2
    ; create the lookup table
    ; go through the input set, in four 16KB chunks
@ -632,7 +655,13 @@ inner_loop:
 .endproc
-.proc imul16_func
+.macro imul16_impl xe
    .local arg1
    .local arg2
    .local result
    .local inter
    .local arg1_pos
    .local arg2_pos
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
@ -643,20 +672,20 @@ inner_loop:
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
-    imul8 result, arg1, arg2
+    imul8 result, arg1, arg2, xe
    lda #0
    sta result + 2
    sta result + 3
-    imul8 inter, arg1 + 1, arg2
+    imul8 inter, arg1 + 1, arg2, xe
    add16 result + 1, result + 1, inter
    add_carry result + 3
-    imul8 inter, arg1, arg2 + 1
+    imul8 inter, arg1, arg2 + 1, xe
    add16 result + 1, result + 1, inter
    add_carry result + 3
-    imul8 inter, arg1 + 1, arg2 + 1
+    imul8 inter, arg1 + 1, arg2 + 1, xe
    add16 result + 2, result + 2, inter
    ; In case of negative inputs, adjust high word
@ -671,47 +700,59 @@ arg1_pos:
 arg2_pos:
    rts ; 6 cyc
 .endmacro
 ; Body of a 16-bit square routine: FR2 (32-bit) = FR0 (16-bit) squared.
 ; The expansion ends in rts, so it is meant to be the entire body of a
 ; subroutine (see the sqr16_func / sqr16xe_func procs).
 ;   xe - passed straight through to imul8 to pick its implementation
 ;        (non-zero: 64KB lookup table path, zero: base-RAM table path)
 ; FR0 is modified in place when negative, and FR1 is used as scratch for
 ; the partial products (the xe path of imul8 clobbers dest through dest + 3).
 ; sqr8 clobbers a and x.
 .macro sqr16_impl xe
    .scope
        arg = FR0    ; 16-bit arg (clobbered)
        result = FR2 ; 32-bit result
        ;inter = temp2
        inter = FR1
        ; If the high byte has its sign bit set, run neg16 on the argument
        ; first. NOTE(review): neg16 is defined elsewhere; presumably a
        ; two's-complement negate, making arg non-negative - confirm.
        ; Unlike imul16_impl there is no high-word adjustment afterwards.
        lda arg + 1
        bpl arg_pos
        neg16 arg
    arg_pos:
        ; hl * hl
        ; (h*256 + l) * (h*256 + l)
        ; h*256*(h*256 + l) + l*(h*256 + l)
        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
        ; l*l goes into the low 16 bits of the result via the squares table
        sqr8 result, arg
        ;imul8 inter, arg, arg, xe
        ; clear the upper 16 bits before accumulating the other terms
        lda #0
        sta result + 2
        sta result + 3
        ; h*l is computed once and added in twice, one byte up (the *256),
        ; propagating the carry into the top byte each time
        imul8 inter, arg + 1, arg, xe
        add16 result + 1, result + 1, inter
        add_carry result + 3
        add16 result + 1, result + 1, inter
        add_carry result + 3
        ; h*h is added two bytes up (the *256*256)
        sqr8 inter, arg + 1
        ;imul8 inter, arg + 1, arg + 1, xe
        add16 result + 2, result + 2, inter
        rts ; 6 cyc
    .endscope
 .endmacro
 .proc imul16_func
    imul16_impl 0
 .endproc
 .proc imul16xe_func
-    arg1 = FR0   ; 16-bit arg (clobbered)
+    imul16_impl 1
-    arg2 = FR1   ; 16-bit arg (clobbered)
+.endproc
    result = FR2 ; 32-bit result
    inter = temp2
-    ; h1l1 * h2l2
+.proc sqr16_func
-    ; (h1*256 + l1) * (h2*256 + l2)
+    sqr16_impl 0
-    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+.endproc
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
-    imul8xe result, arg1, arg2
+.proc sqr16xe_func
-    lda #0
+    sqr16_impl 1
    sta result + 2
    sta result + 3
    imul8xe inter, arg1 + 1, arg2
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8xe inter, arg1, arg2 + 1
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8xe inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter
    ; In case of negative inputs, adjust high word
    ; https://stackoverflow.com/a/28827013
    lda arg1 + 1
    bpl arg1_pos
    sub16 result + 2, result + 2, arg2
 arg1_pos:
    lda arg2 + 1
    bpl arg2_pos
    sub16 result + 2, result + 2, arg1
 arg2_pos:
    rts ; 6 cyc
 .endproc
 .macro round16 arg
@ -825,10 +866,10 @@ keep_going:
    quick_exit zy, 2
    ; zx_2 = zx * zx
-    imul16_round zx_2, zx, zx, 4
+    sqr16_round zx_2, zx, 4
    ; zy_2 = zy * zy
-    imul16_round zy_2, zy, zy, 4
+    sqr16_round zy_2, zy, 4
    ; zx_zy = zx * zy
    imul16_round zx_zy, zx, zy, 4
--- a/tables.js
+++ b/tables.js
@ -22,7 +22,10 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
 .export sqr_lobyte
 .export sqr_hibyte
 ; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 ; (i * i) for the plain squares
 .align 256
 sqr_lobyte:
 ${db((i) => (i * i) & 0xff)}
 .align 256
 sqr_hibyte:
 ${db((i) => ((i * i) >> 8) & 0xff)}
 `);
Author	SHA1	Message	Date
Brooke Vibber	70d2c91f03	fix bank switch on xl/xe was accidentally enabling basic rom :D 5m46s - 11.759 ms/px - 800xl 5m30s - 11.215 ms/px - 130xe	2024-12-30 03:56:35 -08:00
Brooke Vibber	acac5a8df4	moving the framebuffer into the basic space fails on 130xe and 800xl for some reason works on 800 as expected	2024-12-29 21:19:55 -08:00
Brooke Vibber	883f926e57	split memory, wip appears to work on 800 but xl/xe overlap basic lol	2024-12-29 21:06:48 -08:00
Brooke Vibber	0c63430dd9	wip tables segment to be	2024-12-29 20:37:58 -08:00
Brooke Vibber	3ab5006aa3	wip refacotring	2024-12-29 17:56:14 -08:00
Brooke Vibber	f903272335	refactoring and start on squares	2024-12-29 17:37:06 -08:00
Brooke Vibber	8ad996981a	whoops	2024-12-29 13:19:58 -08:00