From 8ad996981abdc4a74f2e220fe2ae970c1bd90960 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 13:19:58 -0800 Subject: [PATCH 1/7] whoops --- mandel.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 3579b0f..cf52fdb 100644 --- a/mandel.s +++ b/mandel.s @@ -107,7 +107,7 @@ KEY_RIGHT = $87 .struct float48 exponent .byte - mantissa .byte 6 + mantissa .byte 5 .endstruct .import mul_lobyte256 From f903272335b5749ef805ddbd31b52a58051e6b94 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 17:37:06 -0800 Subject: [PATCH 2/7] refactoring and start on squares --- mandel.s | 284 +++++++++++++++++++++++++----------------------------- tables.js | 12 +++ 2 files changed, 143 insertions(+), 153 deletions(-) diff --git a/mandel.s b/mandel.s index cf52fdb..2e16b53 100644 --- a/mandel.s +++ b/mandel.s @@ -374,65 +374,13 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; Adapted from https://everything2.com/title/Fast+6502+multiplication -.macro imul8 dest, arg1, arg2 - .local under256 - .local next - .local small_product - ; circa 92 cycles? this doesn't seem right - ; 81-92 cycles - .scope - mul_factor_a = arg1 - mul_factor_x = arg2 - mul_product_lo = dest - mul_product_hi = dest + 1 - - lda mul_factor_a ; 3 cyc - - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc - next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc - - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc - - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 - small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - .endscope +; clobbers a, x +.macro sqr8 dest, arg + ldx arg + lda sqr_lobyte,x + sta dest + lda sqr_hibyte,x + sta dest + 1 .endmacro ; lookup table for top byte -> PORTB value for bank-switch @@ -447,64 +395,121 @@ bank_switch_table: sta PORTB .endmacro +.macro imul8 dest, arg1, arg2, xe + .if xe + ; using 64KB lookup table + ; 58-77 cycles + ; clobbers x, y, dest to dest + 3 + .scope + output = dest + ptr = dest + 2 ; scratch space assumed -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + lda arg2 ; 3 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled + ; note: we are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc + ; add the second param one last time for the skipped bit + clc ; 2 cyc + lda arg2 ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc -done: + done: + .endscope + .else + ; Using base 48k RAM compatibility mode + ; Small table of half squares + ; Adapted from https://everything2.com/title/Fast+6502+multiplication + ; 81-92 cycles + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; 3 cyc + + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc + under256: + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc + next: + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc + + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc + + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc mul_product_hi ; 5 cyc + + ; - x^2/2 + small_product: + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + .endscope + .endif .endmacro .proc imul8xe_init @@ -632,7 +637,13 @@ inner_loop: .endproc -.proc imul16_func +.macro imul16_impl xe + .local arg1 + .local arg2 + .local result + .local inter + .local arg1_pos + .local arg2_pos arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result @@ -643,20 +654,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8 result, arg1, arg2 + imul8 result, arg1, arg2, xe lda #0 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2 + imul8 inter, arg1 + 1, arg2, xe add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1 + imul8 inter, arg1, arg2 + 1, xe add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1 + imul8 inter, arg1 + 1, arg2 + 1, xe add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -671,47 +682,14 @@ arg1_pos: arg2_pos: rts ; 6 cyc +.endmacro + +.proc imul16_func + imul16_impl 0 .endproc .proc imul16xe_func - arg1 = FR0 ; 16-bit arg (clobbered) - arg2 = FR1 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 - - ; h1l1 * h2l2 - ; (h1*256 + l1) * (h2*256 + l2) - ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) - ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - - imul8xe result, arg1, arg2 - lda #0 - sta result + 2 - sta result + 3 - - imul8xe inter, arg1 + 1, arg2 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1, arg2 + 1 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1 + 1, arg2 + 1 - add16 result + 2, result + 2, inter - - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg1 + 1 - bpl arg1_pos - sub16 result + 2, result + 2, arg2 -arg1_pos: - lda arg2 + 1 - bpl arg2_pos - sub16 result + 2, result + 2, arg1 -arg2_pos: - - rts ; 6 cyc + imul16_impl 1 .endproc .macro round16 arg diff --git a/tables.js b/tables.js index c772f81..50cbef9 100644 --- a/tables.js +++ b/tables.js @@ -22,7 +22,10 @@ console.log( .export mul_lobyte256 .export mul_hibyte256 .export mul_hibyte512 +.export sqr_lobyte +.export sqr_hibyte +; (i * i + 1) / 2 for the multiplier .align 256 mul_lobyte256: ${db((i) => squares[i] & 0xff)} @@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)} mul_hibyte512: ${db((i) => (squares[i + 256] >> 8) & 0xff)} +; (i * i) for the plain squares +.align 256 +sqr_lobyte: +${db((i) => (i * i) & 0xff)} + +.align 256 +sqr_hibyte: +${db((i) => ((i * i) >> 8) & 0xff)} + `); From 3ab5006aa3033cf595311fc6a09247c0c04f9c14 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 17:56:14 -0800 Subject: [PATCH 3/7] wip refacotring --- mandel.s | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 2e16b53..9eb6ce1 100644 --- a/mandel.s +++ b/mandel.s @@ -374,6 +374,14 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro sqr16_round dest, arg, shift + imul16_round dest, arg, arg, shift + ;copy16 FR0, arg ; 12 cyc + ;jsr sqr16_func ; ? cyc + ;shift_round_16 FR2, shift + ;copy16 dest, FR2 + 2 ; 12 cyc +.endmacro + ; clobbers a, x .macro sqr8 dest, arg ldx arg @@ -537,6 +545,14 @@ init: lda #.hibyte(imul16xe_func) sta imul16_func + 2 + ; ditto for sqr16_func -> sqr16xe_func + lda #$4c ; 'jmp' opcode + sta sqr16_func + lda #.lobyte(sqr16xe_func) + sta sqr16_func + 1 + lda #.hibyte(sqr16xe_func) + sta sqr16_func + 2 + ; create the lookup table ; go through the input set, in four 16KB chunks @@ -684,6 +700,45 @@ arg2_pos: rts ; 6 cyc .endmacro +.macro sqr16_impl xe + .local arg + .local result + .local inter + .local arg_pos + arg = FR0 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ; hl * hl + ; (h*256 + l) * (h*256 + l) + ; h*256*(h*256 + l) + l*(h*256 + l) + ; h*h*256*256 + h*l*256 + h*l*256 + l*l + + sqr8 result, arg + lda #0 + sta result + 2 + sta result + 3 + + imul8 inter, arg + 1, arg, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + add16 result + 1, result + 1, inter + add_carry result + 3 + + sqr8 inter, arg + 1, arg + 1, xe + add16 result + 2, result + 2, inter + + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg + 1 + bpl arg_pos + sub16 result + 2, result + 2, arg + sub16 result + 2, result + 2, arg +arg_pos: + + rts ; 6 cyc +.endmacro + .proc imul16_func imul16_impl 0 .endproc @@ -692,6 +747,14 @@ arg2_pos: imul16_impl 1 .endproc +.proc sqr16_func + imul16_impl 0 +.endproc + +.proc sqr16xe_func + imul16_impl 1 +.endproc + .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment @@ -803,10 +866,10 @@ keep_going: quick_exit zy, 2 ; zx_2 = zx * zx - imul16_round zx_2, zx, zx, 4 + sqr16_round zx_2, zx, 4 ; zy_2 = zy * zy - imul16_round zy_2, zy, zy, 4 + sqr16_round zy_2, zy, 4 ; zx_zy = zx * zy imul16_round zx_zy, zx, zy, 4 From 0c63430dd95a1c3e72e0fb0c252e233ddc0c9d79 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 20:37:58 -0800 Subject: [PATCH 4/7] wip tables segment to be --- Makefile | 4 ++-- atari-asm-xex.cfg | 3 ++- mandel.s | 10 +++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 008bf8c..bd14c7d 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ all : mandel.xex -mandel.xex : mandel.o tables.o - ld65 -C ./atari-asm-xex.cfg -o $@ $+ +mandel.xex : mandel.o tables.o atari-asm-xex.cfg + ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o %.o : %.s ca65 -o $@ $< diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index 6e6498d..fb43089 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -6,7 +6,8 @@ SYMBOLS { } MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; - MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; + #MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; + MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; } FILES { %O: format = atari; diff --git a/mandel.s b/mandel.s index 9eb6ce1..7bfb577 100644 --- a/mandel.s +++ b/mandel.s @@ -375,11 +375,11 @@ viewport_oy: .endmacro .macro sqr16_round dest, arg, shift - imul16_round dest, arg, arg, shift - ;copy16 FR0, arg ; 12 cyc - ;jsr sqr16_func ; ? cyc - ;shift_round_16 FR2, shift - ;copy16 dest, FR2 + 2 ; 12 cyc + ;imul16_round dest, arg, arg, shift + copy16 FR0, arg ; 12 cyc + jsr sqr16_func ; ? cyc + shift_round_16 FR2, shift + copy16 dest, FR2 + 2 ; 12 cyc .endmacro ; clobbers a, x From 883f926e575cbc4720b25826b2252495a0621d81 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 21:06:48 -0800 Subject: [PATCH 5/7] split memory, wip appears to work on 800 but xl/xe overlap basic lol --- atari-asm-xex.cfg | 3 ++- mandel.s | 64 +++++++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index fb43089..9f871ca 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -8,6 +8,7 @@ MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; #MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; + TABLES: file = %O, define = yes, start = $a000, size = $c000 - $a000; } FILES { %O: format = atari; @@ -22,5 +23,5 @@ SEGMENTS { RODATA: load = MAIN, type = ro optional = yes; DATA: load = MAIN, type = rw optional = yes; BSS: load = MAIN, type = bss, optional = yes, define = yes; - TABLES: load = MAIN, type = ro, optional = yes, align = 256; + TABLES: load = TABLES, type = ro, optional = yes, align = 256; } diff --git a/mandel.s b/mandel.s index 7bfb577..a5bcb35 100644 --- a/mandel.s +++ b/mandel.s @@ -113,6 +113,8 @@ KEY_RIGHT = $87 .import mul_lobyte256 .import mul_hibyte256 .import mul_hibyte512 +.import sqr_lobyte +.import sqr_hibyte .data @@ -701,42 +703,40 @@ arg2_pos: .endmacro .macro sqr16_impl xe - .local arg - .local result - .local inter - .local arg_pos - arg = FR0 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 + .scope + arg = FR0 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + ;inter = temp2 + inter = FR1 - ; hl * hl - ; (h*256 + l) * (h*256 + l) - ; h*256*(h*256 + l) + l*(h*256 + l) - ; h*h*256*256 + h*l*256 + h*l*256 + l*l + lda arg + 1 + bpl arg_pos + neg16 arg + arg_pos: - sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 + ; hl * hl + ; (h*256 + l) * (h*256 + l) + ; h*256*(h*256 + l) + l*(h*256 + l) + ; h*h*256*256 + h*l*256 + h*l*256 + l*l - imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - add16 result + 1, result + 1, inter - add_carry result + 3 + sqr8 result, arg + ;imul8 inter, arg, arg, xe + lda #0 + sta result + 2 + sta result + 3 - sqr8 inter, arg + 1, arg + 1, xe - add16 result + 2, result + 2, inter + imul8 inter, arg + 1, arg, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + add16 result + 1, result + 1, inter + add_carry result + 3 - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg + 1 - bpl arg_pos - sub16 result + 2, result + 2, arg - sub16 result + 2, result + 2, arg -arg_pos: + sqr8 inter, arg + 1 + ;imul8 inter, arg + 1, arg + 1, xe + add16 result + 2, result + 2, inter - rts ; 6 cyc + rts ; 6 cyc + .endscope .endmacro .proc imul16_func @@ -748,11 +748,11 @@ arg_pos: .endproc .proc sqr16_func - imul16_impl 0 + sqr16_impl 0 .endproc .proc sqr16xe_func - imul16_impl 1 + sqr16_impl 1 .endproc .macro round16 arg From acac5a8df42f7128a785d8d6efd65b69ad2178bf Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 21:19:55 -0800 Subject: [PATCH 6/7] moving the framebuffer into the basic space fails on 130xe and 800xl for some reason works on 800 as expected --- atari-asm-xex.cfg | 5 +++-- mandel.s | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index 9f871ca..93b80f3 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -6,9 +6,10 @@ SYMBOLS { } MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; - #MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; - TABLES: file = %O, define = yes, start = $a000, size = $c000 - $a000; + # Keep $4000-7fff clear for expanded RAM access window + TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000; + # Keep $a000-$bfff clear for BASIC cartridge } FILES { %O: format = atari; diff --git a/mandel.s b/mandel.s index a5bcb35..8517685 100644 --- a/mandel.s +++ b/mandel.s @@ -62,11 +62,11 @@ FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) FMOVE = $DDB6 ; MOVE FR0 TO FR1 ; High data -framebuffer_top = $8000 -textbuffer = $8f00 -framebuffer_bottom = $9000 -display_list = $9f00 -framebuffer_end = $a000 +framebuffer_top = $a000 +textbuffer = $af00 +framebuffer_bottom = $b000 +display_list = $bf00 +framebuffer_end = $c000 height = 184 half_height = height >> 1 From 70d2c91f03dd4e2b90dd2419e060bbb220747dd9 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 03:56:35 -0800 Subject: [PATCH 7/7] fix bank switch on xl/xe was accidentally enabling basic rom :D 5m46s - 11.759 ms/px - 800xl 5m30s - 11.215 ms/px - 130xe --- mandel.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 8517685..9f594e8 100644 --- a/mandel.s +++ b/mandel.s @@ -397,11 +397,11 @@ viewport_oy: ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 4) | $e1 + .byte ((i & $c0) >> 4) | $e3 .endrepeat .macro bank_switch bank - lda #((bank << 2) | $e1) + lda #((bank << 2) | $e3) sta PORTB .endmacro