From f10bb4fe18d560b8db777d591484ef93da545327 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sat, 11 Feb 2023 12:24:48 -0800 Subject: [PATCH 01/59] WIP alternate imul16 not working at present --- .gitignore | 1 + Makefile | 8 +++- mandel.s | 124 ++++++++++++++++++++++++++++++++++++++++++++++++----- tables.js | 33 ++++++++++++++ testme.js | 41 ++++++++++++++++++ 5 files changed, 194 insertions(+), 13 deletions(-) create mode 100644 tables.js create mode 100644 testme.js diff --git a/.gitignore b/.gitignore index 8d2f7ce..771e47a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.xex +tables.s .DS_Store diff --git a/Makefile b/Makefile index 25148b4..008bf8c 100644 --- a/Makefile +++ b/Makefile @@ -2,13 +2,17 @@ all : mandel.xex -%.xex : %.o - ld65 -C atari-asm-xex.cfg -o $@ $< +mandel.xex : mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ $+ %.o : %.s ca65 -o $@ $< +tables.s : tables.js + node tables.js > tables.s + clean : + rm -f tables.s rm -f *.o rm -f *.xex diff --git a/mandel.s b/mandel.s index 097b700..71bc6c2 100644 --- a/mandel.s +++ b/mandel.s @@ -22,11 +22,12 @@ total_ms = $a4 ; float48 total_pixels = $aa ; float48 temp = $b0 ; u16 -pixel_ptr = $b2 ; u16 -pixel_color = $b4 ; u8 -pixel_mask = $b5 ; u8 -pixel_shift = $b6 ; u8 -pixel_offset = $b7 ; u8 +temp2 = $b2 ; u16 +pixel_ptr = $b4 ; u16 +pixel_color = $b6 ; u8 +pixel_mask = $b7 ; u8 +pixel_shift = $b8 ; u8 +pixel_offset = $b9 ; u8 ; FP registers in zero page @@ -83,6 +84,10 @@ SETVBV = $E45C mantissa .byte 6 .endstruct +.import mul_lobyte256 +.import mul_hibyte256 +.import mul_hibyte512 + .data strings: @@ -206,6 +211,12 @@ color_map: add 4, dest, arg2, dest .endmacro +.macro add_carry dest + lda dest + adc #0 + sta dest +.endmacro + ; 2 + 9 * byte cycles .macro sub bytes, dest, arg1, arg2 sec ; 2 cyc @@ -336,12 +347,12 @@ next: ; 5 to 25 cycles .macro check_sign arg ; Check sign bit and flip argument to postive, - ; keeping a count of sign bits in the X register. 
+ ; keeping a count of sign bits in the Y register. .local positive lda arg + 1 ; 3 cyc bpl positive ; 2 cyc neg16 arg ; 18 cyc - inx ; 2 cyc + iny ; 2 cyc positive: .endmacro @@ -370,13 +381,13 @@ positive: ; min 470 cycles ; max 780 cycles -.proc imul16_func +.proc imul16_func_orig arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result - ldx #0 ; 2 cyc - ; counts the number of sign bits in X + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y check_sign arg1 ; 5 to 25 cyc check_sign arg2 ; 5 to 25 cyc @@ -396,7 +407,98 @@ positive: .endrepeat ; In case of mixed input signs, return a negative result. - cpx #1 ; 2 cyc + cpy #1 ; 2 cyc + bne positive_result ; 2 cyc + neg32 result ; 34 cyc +positive_result: + + rts ; 6 cyc +.endproc + +; Adapted from https://everything2.com/title/Fast+6502+multiplication +.macro imul8 dest, arg1, arg2 + .local under256 + .local next + .local small_product + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; setup: 6 cycles + ;ldx mul_factor_x + + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next + under256: + lda mul_hibyte256,x + sec + next: + sta mul_product_hi + lda mul_lobyte256,x + + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 + + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi + small_product: + sec ; - x^2/2: 25 cycles + sbc mul_lobyte256,x + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + .endscope +.endmacro + +.proc imul16_func + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ldy #0 
; 2 cyc + ; counts the number of sign bits in Y + check_sign arg1 ; 5 to 25 cyc + check_sign arg2 ; 5 to 25 cyc + + lda #0 + sta result + 0 + sta result + 1 + sta result + 2 + sta result + 3 + + imul8 inter, arg1, arg2 + add16 result, result, inter + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter + + ; In case of mixed input signs, return a negative result. + cpy #1 ; 2 cyc bne positive_result ; 2 cyc neg32 result ; 34 cyc positive_result: diff --git a/tables.js b/tables.js new file mode 100644 index 0000000..5afc3c0 --- /dev/null +++ b/tables.js @@ -0,0 +1,33 @@ +function db(func) { + let lines = []; + for (let i = 0; i < 256; i += 16) { + let items = []; + for (let j = 0; j < 16; j++) { + let x = i + j; + items.push(func(x)); + } + lines.push(' .byte ' + items.join(', ')); + } + return lines.join('\n'); +} + +console.log( +`.segment "TABLES" + +.export mul_lobyte256 +.export mul_hibyte256 +.export mul_hibyte512 + +.align 256 +mul_lobyte256: +${db((x) => Math.round(x * x / 2) & 0xff)} + +.align 256 +mul_hibyte256: +${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)} + +.align 256 +mul_hibyte512: +${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)} + +`); diff --git a/testme.js b/testme.js new file mode 100644 index 0000000..e12e706 --- /dev/null +++ b/testme.js @@ -0,0 +1,41 @@ +// ax = (a + x)2/2 - a2/2 - x2/2 + +function half_square(x) { + return Math.round(x * x / 2) & 0xffff >>> 0; +} + +function mul8(a, b) { + let result = half_square(a + b) & 0xffff; + result = (result - half_square(a)) & 0xffff; + result = (result - half_square(b)) & 0xffff; + result = (result + (b & a & 1)) & 0xffff; + return result >>> 0; +} + +function mul16(a, b) { + let ah = (a & 0xff00) >>> 8; + let al = (a & 0x00ff) >>> 0; + let bh = (b & 0xff00) >>> 8; + let bl = (b & 0x00ff) >>> 0; + 
let result = (mul8(al, bl) & 0xffff) >>> 0; + result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; + result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; + result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; + return result; +} + +let max = 65536; +//let max = 256; +//let max = 128; +//let max = 8; + +for (let a = 0; a < max; a++) { + for (let b = 0; b < max; b++) { + let expected = Math.imul(a, b) >>> 0; + //let actual = mul8(a, b); + let actual = mul16(a, b); + if (expected !== actual) { + console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`); + } + } +} \ No newline at end of file From 0631886466d78205bb05afa09a9af24004861f32 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 11 Nov 2024 12:10:08 -0800 Subject: [PATCH 02/59] whee --- mandel.s | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mandel.s b/mandel.s index 7a998ad..fcc7867 100644 --- a/mandel.s +++ b/mandel.s @@ -529,6 +529,11 @@ positive_result: check_sign arg1 ; 5 to 25 cyc check_sign arg2 ; 5 to 25 cyc + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + lda #0 sta result + 0 sta result + 1 From 5637783529e71dbbdc568d853d5df5616f25970c Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sat, 11 Feb 2023 12:24:48 -0800 Subject: [PATCH 03/59] Faster imul16 routine Improves runtime from 16.24 ms/px to 14.44 ms/px This uses a routine found on Everything2: https://everything2.com/title/Fast+6502+multiplication which uses a lookup table of squares to do 8-bit imuls, which are then composed into a 16-bit imul --- .gitignore | 1 + Makefile | 8 ++- mandel.s | 176 +++++++++++++++++++++++++++++------------------------ tables.js | 38 ++++++++++++ testme.js | 41 +++++++++++++ 5 files changed, 183 insertions(+), 81 deletions(-) create mode 100644 tables.js create mode 100644 testme.js diff --git a/.gitignore b/.gitignore index 8d2f7ce..771e47a 100644 --- a/.gitignore 
+++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.xex +tables.s .DS_Store diff --git a/Makefile b/Makefile index 25148b4..008bf8c 100644 --- a/Makefile +++ b/Makefile @@ -2,13 +2,17 @@ all : mandel.xex -%.xex : %.o - ld65 -C atari-asm-xex.cfg -o $@ $< +mandel.xex : mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ $+ %.o : %.s ca65 -o $@ $< +tables.s : tables.js + node tables.js > tables.s + clean : + rm -f tables.s rm -f *.o rm -f *.xex diff --git a/mandel.s b/mandel.s index 3db6a77..1244a02 100644 --- a/mandel.s +++ b/mandel.s @@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not z_buffer_start = $b1 ; u8: index into z_buffer z_buffer_end = $b2 ; u8: index into z_buffer temp = $b4 ; u16 - -pixel_ptr = $b6 ; u16 -pixel_color = $b8 ; u8 -pixel_mask = $b9 ; u8 -pixel_shift = $ba ; u8 -pixel_offset = $bb ; u8 -fill_level = $bc ; u8 -palette_offset = $bd ; u8 +temp2 = $b6 ; u16 +pixel_ptr = $b8 ; u16 +pixel_color = $ba ; u8 +pixel_mask = $bb ; u8 +pixel_shift = $bc ; u8 +pixel_offset = $bd ; u8 +fill_level = $be ; u8 +palette_offset = $bf ; u8 ; FP registers in zero page FR0 = $d4 ; float48 @@ -107,6 +107,10 @@ KEY_RIGHT = $87 mantissa .byte 6 .endstruct +.import mul_lobyte256 +.import mul_hibyte256 +.import mul_hibyte512 + .data strings: @@ -257,6 +261,12 @@ fill_masks: add 4, dest, arg2, dest .endmacro +.macro add_carry dest + lda dest + adc #0 + sta dest +.endmacro + ; 2 + 9 * byte cycles .macro sub bytes, dest, arg1, arg2 sec ; 2 cyc @@ -334,65 +344,15 @@ fill_masks: neg 4, arg .endmacro -; inner loop for imul16 -; bitnum < 8: 25 or 41 cycles -; bitnum >= 8: 30 or 46 cycles -.macro bitmul16 arg1, arg2, result, bitnum - .local zero - .local one - .local next - - ; does 16-bit adds - ; arg1 and arg2 are treated as unsigned - ; negative signed inputs must be flipped first - - ; 7 cycles up to the branch - - ; check if arg1 has 0 or 1 bit in this place - ; 5 cycles either way - .if bitnum < 8 - lda arg1 ; 3 cyc - and #(1 << (bitnum)) ; 2 
cyc - .else - lda arg1 + 1 ; 3 cyc - and #(1 << ((bitnum) - 8)) ; 2 cyc - .endif - bne one ; 2 cyc - -zero: ; 18 cyc, 23 cyc - lsr result + 3 ; 5 cyc - jmp next ; 3 cyc - -one: ; 32 cyc, 37 cyc - ; 16-bit add on the top bits - clc ; 2 cyc - lda result + 2 ; 3 cyc - adc arg2 ; 3 cyc - sta result + 2 ; 3 cyc - lda result + 3 ; 3 cyc - adc arg2 + 1 ; 3 cyc - ror a ; 2 cyc - get a jump on the shift - sta result + 3 ; 3 cyc -next: - ror result + 2 ; 5 cyc - ror result + 1 ; 5 cyc - .if bitnum >= 8 - ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte - ; when it's all uninitialized data - ror result ; 5 cyc - .endif - -.endmacro - ; 5 to 25 cycles .macro check_sign arg ; Check sign bit and flip argument to postive, - ; keeping a count of sign bits in the X register. + ; keeping a count of sign bits in the Y register. .local positive lda arg + 1 ; 3 cyc bpl positive ; 2 cyc neg16 arg ; 18 cyc - inx ; 2 cyc + iny ; 2 cyc positive: .endmacro @@ -419,35 +379,93 @@ positive: copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; min 470 cycles -; max 780 cycles +; Adapted from https://everything2.com/title/Fast+6502+multiplication +.macro imul8 dest, arg1, arg2 + .local under256 + .local next + .local small_product + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; setup: 6 cycles + ;ldx mul_factor_x + + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next + under256: + lda mul_hibyte256,x + sec + next: + sta mul_product_hi + lda mul_lobyte256,x + + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 + + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi + small_product: + sec ; - 
x^2/2: 25 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + .endscope +.endmacro + .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result + inter = temp2 - ldx #0 ; 2 cyc - ; counts the number of sign bits in X + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y check_sign arg1 ; 5 to 25 cyc check_sign arg2 ; 5 to 25 cyc - - ; zero out the 32-bit temp's top 16 bits - lda #0 ; 2 cyc - sta result + 2 ; 3 cyc - sta result + 3 ; 3 cyc - ; the bottom two bytes will get cleared by the shifts - ; unrolled loop for maximum speed, at the cost - ; of a larger routine - ; 440 to 696 cycles - .repeat 16, bitnum - ; bitnum < 8: 25 or 41 cycles - ; bitnum >= 8: 30 or 46 cycles - bitmul16 arg1, arg2, result, bitnum - .endrepeat + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + + imul8 result, arg1, arg2 + lda #0 + sta result + 2 + sta result + 3 + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter ; In case of mixed input signs, return a negative result. 
- cpx #1 ; 2 cyc + cpy #1 ; 2 cyc bne positive_result ; 2 cyc neg32 result ; 34 cyc positive_result: diff --git a/tables.js b/tables.js new file mode 100644 index 0000000..c772f81 --- /dev/null +++ b/tables.js @@ -0,0 +1,38 @@ +function db(func) { + let lines = []; + for (let i = 0; i < 256; i += 16) { + let items = []; + for (let j = 0; j < 16; j++) { + let x = i + j; + items.push(func(x)); + } + lines.push(' .byte ' + items.join(', ')); + } + return lines.join('\n'); +} + +let squares = []; +for (let i = 0; i < 512; i++) { + squares.push(Math.trunc((i * i + 1) / 2)); +} + +console.log( +`.segment "TABLES" + +.export mul_lobyte256 +.export mul_hibyte256 +.export mul_hibyte512 + +.align 256 +mul_lobyte256: +${db((i) => squares[i] & 0xff)} + +.align 256 +mul_hibyte256: +${db((i) => (squares[i] >> 8) & 0xff)} + +.align 256 +mul_hibyte512: +${db((i) => (squares[i + 256] >> 8) & 0xff)} + +`); diff --git a/testme.js b/testme.js new file mode 100644 index 0000000..e12e706 --- /dev/null +++ b/testme.js @@ -0,0 +1,41 @@ +// ax = (a + x)2/2 - a2/2 - x2/2 + +function half_square(x) { + return Math.round(x * x / 2) & 0xffff >>> 0; +} + +function mul8(a, b) { + let result = half_square(a + b) & 0xffff; + result = (result - half_square(a)) & 0xffff; + result = (result - half_square(b)) & 0xffff; + result = (result + (b & a & 1)) & 0xffff; + return result >>> 0; +} + +function mul16(a, b) { + let ah = (a & 0xff00) >>> 8; + let al = (a & 0x00ff) >>> 0; + let bh = (b & 0xff00) >>> 8; + let bl = (b & 0x00ff) >>> 0; + let result = (mul8(al, bl) & 0xffff) >>> 0; + result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; + result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; + result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; + return result; +} + +let max = 65536; +//let max = 256; +//let max = 128; +//let max = 8; + +for (let a = 0; a < max; a++) { + for (let b = 0; b < max; b++) { + let expected = Math.imul(a, b) >>> 0; + //let actual = mul8(a, b); + let 
actual = mul16(a, b); + if (expected !== actual) { + console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`); + } + } +} \ No newline at end of file From 7f2bc43cff173e7dffd9a5629bb9bcb56f374259 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 14 Dec 2024 18:56:26 -0800 Subject: [PATCH 04/59] squares --- readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/readme.md b/readme.md index 6b57378..873793f 100644 --- a/readme.md +++ b/readme.md @@ -37,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. +(done) ## Deps and build instructions From 05133aabdd59739805bbe7bb2eb32e9815120718 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 15 Dec 2024 20:17:45 -0800 Subject: [PATCH 05/59] slightly faster handling of signed mul previously we were flipping the inputs if negative, and then the output if exactly one input was negative turns out you can just treat the whole thing as an unsigned mul and then subtract each term from the high word if the other term is negative. https://stackoverflow.com/a/28827013 this saves a handful of cycles, reducing our runtime to a mere 14.211 ms/px \o/ --- mandel.s | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/mandel.s b/mandel.s index 1244a02..3622995 100644 --- a/mandel.s +++ b/mandel.s @@ -344,18 +344,6 @@ fill_masks: neg 4, arg .endmacro -; 5 to 25 cycles -.macro check_sign arg - ; Check sign bit and flip argument to postive, - ; keeping a count of sign bits in the Y register. 
- .local positive - lda arg + 1 ; 3 cyc - bpl positive ; 2 cyc - neg16 arg ; 18 cyc - iny ; 2 cyc -positive: -.endmacro - ; 518 - 828 cyc .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc @@ -438,11 +426,6 @@ positive: result = FR2 ; 32-bit result inter = temp2 - ldy #0 ; 2 cyc - ; counts the number of sign bits in Y - check_sign arg1 ; 5 to 25 cyc - check_sign arg2 ; 5 to 25 cyc - ; h1l1 * h2l2 ; (h1*256 + l1) * (h2*256 + l2) ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) @@ -464,11 +447,16 @@ positive: imul8 inter, arg1 + 1, arg2 + 1 add16 result + 2, result + 2, inter - ; In case of mixed input signs, return a negative result. - cpy #1 ; 2 cyc - bne positive_result ; 2 cyc - neg32 result ; 34 cyc -positive_result: + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg1 + 1 + bpl arg1_pos + sub16 result + 2, result + 2, arg2 +arg1_pos: + lda arg2 + 1 + bpl arg2_pos + sub16 result + 2, result + 2, arg1 +arg2_pos: rts ; 6 cyc .endproc From 405cec6d511947ccc1a0dcc3c79e06e4ac1a5278 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 25 Dec 2024 10:51:27 -0800 Subject: [PATCH 06/59] WIP imul8 via table experiments planning to try a 64KB table of 8x7-bit multiplies in the high memory on a 130XE or other high-memory-capable machine not yet working or finished too many cycles of overhead per invocation --- imul8xe.s | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ mandel.s | 71 ++++++++++++++++++++++------------------- 2 files changed, 133 insertions(+), 32 deletions(-) create mode 100644 imul8xe.s diff --git a/imul8xe.s b/imul8xe.s new file mode 100644 index 0000000..5cbb852 --- /dev/null +++ b/imul8xe.s @@ -0,0 +1,94 @@ +FR0 = $d4 ; float48 +PORTB = $d301 + + +EXTENDED_RAM = $4000 ; 16KiB bank on the XE +bankswitch = ; ??? 
+ +; input in X/Y (lo/hi) +; output in FR0 +; clobbers FR0 +; 128 cycles +proc imul8xe + output = FR0 + ptr = FR0 + 2 + + lda #0 ; 2 cyc + sta ptr ; 3 cyc + sta ptr + 1 ; 3 cyc + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + txa ; 2 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + tya ; 2 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + tya ; 2 cyc + and #$c0 ; 2 cyc + ; shift in extended RAM mode 2x 1 bits + sec ; 2 cyc + ror ; 2 cyc + ror ; 2 cyc + ; shift in 0 bits + asr ; 2 cyc + asr ; 2 cyc + asr ; 2 cyc + + ; save the second param for later + phy ; 3 cyc + + ; disable interrupts + lda NMIEN ; 4 cyc + pha ; 3 cyc + lda #0 ; 2 cyc + sta NMIEN ; 4 cyc + + ; set the standard top RAM and OS ROM on + or #$81 ; 2 cyc + sta PORTB ; 4 cyc + + + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc + + ; restore memory + lda #$81 ; 2 cyc + sta PORTB ; 4 cyc + + ; restore interrupts + pla ; 3 cyc + sta NMIEN ; 4 cyc + + ; check that 1 bit we skipped to fit into space + txa ; 2 cyc + and $#1 ; 2 cyc + beq done ; 2 cyc + + ; add the second param one last tie for the skipped bit + clc ; 2 cyc + pla ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc + +done: + pla + rts ; 6 cyc +endproc + +proc imul8xe_init + rts +endproc diff --git a/mandel.s b/mandel.s index 3622995..3b0bc9f 100644 --- a/mandel.s +++ b/mandel.s @@ -372,51 +372,58 @@ fill_masks: .local under256 .local next .local small_product + ; circa 92 cycles? 
this doesn't seem right .scope mul_factor_a = arg1 mul_factor_x = arg2 mul_product_lo = dest mul_product_hi = dest + 1 - lda mul_factor_a ; setup: 6 cycles - ;ldx mul_factor_x + lda mul_factor_a ; 3 cyc - clc ; (a + x)^2/2: 23 cycles - adc mul_factor_x - tax - bcc under256 - lda mul_hibyte512,x - bcs next + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc under256: - lda mul_hibyte256,x - sec + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc next: - sta mul_product_hi - lda mul_lobyte256,x + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc - ldx mul_factor_a ; - a^2/2: 20 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc - ldx mul_factor_x ; + x & a & 1: 22 cycles - txa ; (this is a kludge to correct a - and mul_factor_a ; roundoff error that makes odd * odd too low) - and #1 + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc - clc - adc mul_product_lo - bcc small_product - inc mul_product_hi + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc mul_product_hi ; 5 cyc + + ; - x^2/2 small_product: - sec ; - x^2/2: 25 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc .endscope .endmacro From f996c3cbcd84b3aff3fd39bf3daee9a6c60a9e2a Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 25 Dec 2024 12:47:37 -0800 Subject: [PATCH 07/59] provisional maybe 
old mode runs in 81-92 cycles provisional code runs in 58-77 cycles if it works ;) --- imul8xe.s | 76 ++++++++++++++++++++----------------------------------- mandel.s | 1 + 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/imul8xe.s b/imul8xe.s index 5cbb852..d12f53f 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -3,55 +3,38 @@ PORTB = $d301 EXTENDED_RAM = $4000 ; 16KiB bank on the XE -bankswitch = ; ??? -; input in X/Y (lo/hi) -; output in FR0 -; clobbers FR0 -; 128 cycles -proc imul8xe - output = FR0 - ptr = FR0 + 2 +; lookup table for top byte -> PORTB value for bank-switch +.align 256 +bankswitch: + .repeat 256, i + .byte ((i & $c0) >> 5) | $c1 + .endrepeat - lda #0 ; 2 cyc - sta ptr ; 3 cyc - sta ptr + 1 ; 3 cyc +; 58-77 cycles +.macro imul8xe dest, arg1, arg2 +.local done +.local output +.local ptr + + output = dest + ptr = dest + 2 ; scratch space assumed ; bottom 14 bits except the LSB are the per-bank table index ; add $4000 for the bank pointer - txa ; 2 cyc + lda arg1 ; 3 cyc and #$fe ; 2 cyc sta ptr ; 3 cyc - tya ; 2 cyc + lda arg2 ; 3 cyc and #$3f ; 2 cyc clc ; 2 cyc adc #$40 ; 2 cyc sta ptr + 1 ; 3 cyc ; top 2 bits are the table bank selector - tya ; 2 cyc - and #$c0 ; 2 cyc - ; shift in extended RAM mode 2x 1 bits - sec ; 2 cyc - ror ; 2 cyc - ror ; 2 cyc - ; shift in 0 bits - asr ; 2 cyc - asr ; 2 cyc - asr ; 2 cyc - - ; save the second param for later - phy ; 3 cyc - - ; disable interrupts - lda NMIEN ; 4 cyc - pha ; 3 cyc - lda #0 ; 2 cyc - sta NMIEN ; 4 cyc - - ; set the standard top RAM and OS ROM on - or #$81 ; 2 cyc - sta PORTB ; 4 cyc + ldx arg2 ; 3 cyc + lda bank_switch,x ; 4 cyc + sta PORTB ; 4 cyc ; copy the entry into output @@ -62,22 +45,21 @@ proc imul8xe lda (ptr),y ; 5 cyc sta output+1 ; 3 cyc - ; restore memory - lda #$81 ; 2 cyc - sta PORTB ; 4 cyc - - ; restore interrupts - pla ; 3 cyc - sta NMIEN ; 4 cyc + ; note: we are not restoring memory to save 6 cycles! 
+ ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled ; check that 1 bit we skipped to fit into space - txa ; 2 cyc + lda arg1 ; 3 cyc and $#1 ; 2 cyc beq done ; 2 cyc ; add the second param one last tie for the skipped bit clc ; 2 cyc - pla ; 3 cyc + lda arg2 ; 3 cyc adc output ; 3 cyc sta output ; 3 cyc lda #0 ; 2 cyc @@ -85,9 +67,7 @@ proc imul8xe sta output+1 ; 3 cyc done: - pla - rts ; 6 cyc -endproc +.endmacro proc imul8xe_init rts diff --git a/mandel.s b/mandel.s index 3b0bc9f..e0a8570 100644 --- a/mandel.s +++ b/mandel.s @@ -373,6 +373,7 @@ fill_masks: .local next .local small_product ; circa 92 cycles? this doesn't seem right + ; 81-92 cycles .scope mul_factor_a = arg1 mul_factor_x = arg2 From 829d2860e8f946a088218fa5cde2e07067e0dfa6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 12:04:01 -0800 Subject: [PATCH 08/59] :P --- imul8xe.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/imul8xe.s b/imul8xe.s index d12f53f..15adf64 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -12,6 +12,7 @@ bankswitch: .endrepeat ; 58-77 cycles +; clobbers x, y, dest to dest + 3 .macro imul8xe dest, arg1, arg2 .local done .local output @@ -54,10 +55,10 @@ bankswitch: ; check that 1 bit we skipped to fit into space lda arg1 ; 3 cyc - and $#1 ; 2 cyc + and #1 ; 2 cyc beq done ; 2 cyc - ; add the second param one last tie for the skipped bit + ; add the second param one last time for the skipped bit clc ; 2 cyc lda arg2 ; 3 cyc adc output ; 3 cyc From a9d551a98d01a3634cb5068fc00506f2b398f8d2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 17:50:59 -0800 Subject: [PATCH 09/59] first draft initializer --- imul8xe.s | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/imul8xe.s b/imul8xe.s index 15adf64..855e044 100644 --- a/imul8xe.s +++ 
b/imul8xe.s @@ -70,6 +70,106 @@ bankswitch: done: .endmacro +.macro bank_switch bank + lda #((bank << 1) | $c1) + sta PORTB +.endmacro + proc imul8xe_init + + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + rts endproc + +; Initialize a 16 KB chunk of the table +; input: multipliers in temp +; output: new multipliers in temp +; clobbers: temp, temp2 +proc imul8xe_init_section + arg1 = FR1 + arg2 = FR2 + result = FR0 + ptr = temp2 + + lda #$00 + sta ptr + lda #$40 + sta ptr + 1 + + ldx #0 + ldy #0 + + ; outer loop: $00 -> $3f +outer_loop: + + ; reset result to 0 + lda #0 + sta result + sta result + 1 + + ; inner loop: $00 -> $ff +inner_loop: + + ; copy result to data set + lda result + sta (ptr),y + lda result + 1 + sta (ptr),y + + ; result += 2 * arg2 + clc + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + + ; inner loop check + inc arg1 + inc arg1 + inc ptr + inc ptr + bne inner_loop + + ; outer loop check + inc arg2 + inc ptr + 1 + lda ptr + 1 + cmp #$40 + bne outer_loop + + rts + +endproc From 34ce9da030ea3ee9853a8e5ecf64f65798faaded Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:17:01 -0800 Subject: [PATCH 10/59] builds, not used yet --- mandel.s | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/mandel.s b/mandel.s index e0a8570..d824193 100644 --- a/mandel.s +++ b/mandel.s @@ -74,6 +74,9 @@ width = 160 half_width = width >> 1 stride = width >> 2 +EXTENDED_RAM = $4000 ; 16KiB bank on the XE +PORTB = $D301 ; memory & 
bank-switch for XL/XE + DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 @@ -428,6 +431,176 @@ fill_masks: .endscope .endmacro +; lookup table for top byte -> PORTB value for bank-switch +;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes +bankswitch: + .repeat 256, i + .byte ((i & $c0) >> 5) | $c1 + .endrepeat + +; 58-77 cycles +; clobbers x, y, dest to dest + 3 +.macro imul8xe dest, arg1, arg2 +.local done +.local output +.local ptr + + output = dest + ptr = dest + 2 ; scratch space assumed + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + lda arg2 ; 3 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch,x ; 4 cyc + sta PORTB ; 4 cyc + + + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc + + ; note: we are not restoring memory to save 6 cycles! 
+ ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled + + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc + + ; add the second param one last time for the skipped bit + clc ; 2 cyc + lda arg2 ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc + +done: +.endmacro + +.macro bank_switch bank + lda #((bank << 1) | $c1) + sta PORTB +.endmacro + +.proc imul8xe_init + + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + + rts +.endproc + +; Initialize a 16 KB chunk of the table +; input: multipliers in temp +; output: new multipliers in temp +; clobbers: temp, temp2 +.proc imul8xe_init_section + arg1 = FR1 + arg2 = FR2 + result = FR0 + ptr = temp2 + + lda #$00 + sta ptr + lda #$40 + sta ptr + 1 + + ldx #0 + ldy #0 + + ; outer loop: $00 -> $3f +outer_loop: + + ; reset result to 0 + lda #0 + sta result + sta result + 1 + + ; inner loop: $00 -> $ff +inner_loop: + + ; copy result to data set + lda result + sta (ptr),y + lda result + 1 + sta (ptr),y + + ; result += 2 * arg2 + clc + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + + ; inner loop check + inc arg1 + inc arg1 + inc ptr + inc ptr + bne inner_loop + + ; outer loop check + inc arg2 + inc ptr + 1 + lda ptr + 1 + cmp #$40 + bne outer_loop + + rts + +.endproc + .proc imul16_func arg1 = FR0 ; 16-bit arg 
(clobbered) arg2 = FR1 ; 16-bit arg (clobbered) From 45c5a4cb2d62d6fbed4ba64364220eb8827369f0 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:20:10 -0800 Subject: [PATCH 11/59] called, gets lost --- mandel.s | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index d824193..a8f3cac 100644 --- a/mandel.s +++ b/mandel.s @@ -433,7 +433,7 @@ fill_masks: ; lookup table for top byte -> PORTB value for bank-switch ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes -bankswitch: +bank_switch_table: .repeat 256, i .byte ((i & $c0) >> 5) | $c1 .endrepeat @@ -460,9 +460,9 @@ bankswitch: sta ptr + 1 ; 3 cyc ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch,x ; 4 cyc - sta PORTB ; 4 cyc + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc ; copy the entry into output @@ -612,20 +612,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8 result, arg1, arg2 + imul8xe result, arg1, arg2 lda #0 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2 + imul8xe inter, arg1 + 1, arg2 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1 + imul8xe inter, arg1, arg2 + 1 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1 + imul8xe inter, arg1 + 1, arg2 + 1 add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -1147,6 +1147,8 @@ zero_byte_loop: .proc start + jsr imul8xe_init + ; ox = 0; oy = 0; zoom = 0 ; count_frames = 0; count_pixels = 0 lda #0 From 0cde31905e62b9b97d8df3ea03c73a89bbb5d602 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:35:37 -0800 Subject: [PATCH 12/59] runs but doesn't work --- mandel.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index a8f3cac..79d9c78 100644 --- a/mandel.s +++ 
b/mandel.s @@ -548,7 +548,6 @@ done: lda #$40 sta ptr + 1 - ldx #0 ldy #0 ; outer loop: $00 -> $3f @@ -566,7 +565,9 @@ inner_loop: lda result sta (ptr),y lda result + 1 + iny sta (ptr),y + dey ; result += 2 * arg2 clc @@ -594,7 +595,7 @@ inner_loop: inc arg2 inc ptr + 1 lda ptr + 1 - cmp #$40 + cmp #$80 bne outer_loop rts From e84a990789b13c6c67e63cdb2db2a2be2b7893a6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 21:41:03 -0800 Subject: [PATCH 13/59] tweaks: --- mandel.s | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 79d9c78..8c6130b 100644 --- a/mandel.s +++ b/mandel.s @@ -435,9 +435,15 @@ fill_masks: ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 5) | $c1 + .byte ((i & $c0) >> 4) | $d1 .endrepeat +.macro bank_switch bank + lda #((bank << 2) | $d1) + sta PORTB +.endmacro + + ; 58-77 cycles ; clobbers x, y, dest to dest + 3 .macro imul8xe dest, arg1, arg2 @@ -497,11 +503,6 @@ bank_switch_table: done: .endmacro -.macro bank_switch bank - lda #((bank << 1) | $c1) - sta PORTB -.endmacro - .proc imul8xe_init ; go through the input set, in four 16KB chunks @@ -576,13 +577,14 @@ inner_loop: sta result lda #0 adc result + 1 - sta result + sta result + 1 + clc lda arg2 adc result sta result lda #0 adc result + 1 - sta result + sta result + 1 ; inner loop check inc arg1 From ee1c2687054d760d21ffe8f1be97eb5eb6ecc7b9 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 21:49:13 -0800 Subject: [PATCH 14/59] it works --- mandel.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 8c6130b..3ff91d1 100644 --- a/mandel.s +++ b/mandel.s @@ -435,11 +435,11 @@ fill_masks: ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 4) | $d1 + .byte ((i & $c0) >> 4) | $e1 .endrepeat 
.macro bank_switch bank - lda #((bank << 2) | $d1) + lda #((bank << 2) | $e1) sta PORTB .endmacro From 83cba4afa3e28cc8f6b0377c9edc49e60af36187 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Fri, 27 Dec 2024 18:37:03 -0800 Subject: [PATCH 15/59] Runtime detection of XE-style extended memory Uses the "big multiplication table" in 64KB of extended memory if bank switching appears to work, otherwise uses the table of squares lookups. Initial view clocks in at 13.133 ms/px for the XE version and still 14.211 ms/px for the 400/800/XL version. Tested in emulator with 130XE and XL+Ultimate 1MB upgrade configs, and base implementation on the 800XL emulator. --- mandel.s | 75 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/mandel.s b/mandel.s index 3ff91d1..d198989 100644 --- a/mandel.s +++ b/mandel.s @@ -347,14 +347,6 @@ fill_masks: neg 4, arg .endmacro -; 518 - 828 cyc -.macro imul16 dest, arg1, arg2 - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc - copy32 dest, FR2 ; 24 cyc -.endmacro - .macro shift_round_16 arg, shift .repeat shift shl32 arg @@ -365,7 +357,7 @@ fill_masks: .macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc + jsr imul16_func ; ? 
cyc shift_round_16 FR2, shift copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -505,6 +497,30 @@ done: .proc imul8xe_init + bank_switch 0 + lda #0 + sta EXTENDED_RAM + bank_switch 1 + lda #1 + sta EXTENDED_RAM + bank_switch 0 + lda EXTENDED_RAM + beq init + + ; no bank switching available, we just overwrite the value in base ram + rts + +init: + + ; patch imul16_func into a forwarding thunk to imul16xe_func + lda #$4c ; 'jmp' opcode + sta imul16_func + lda #.lobyte(imul16xe_func) + sta imul16_func + 1 + lda #.hibyte(imul16xe_func) + sta imul16_func + 2 + + ; create the lookup table ; go through the input set, in four 16KB chunks arg1 = FR1 @@ -615,6 +631,47 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + imul8 result, arg1, arg2 + lda #0 + sta result + 2 + sta result + 3 + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter + + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg1 + 1 + bpl arg1_pos + sub16 result + 2, result + 2, arg2 +arg1_pos: + lda arg2 + 1 + bpl arg2_pos + sub16 result + 2, result + 2, arg1 +arg2_pos: + + rts ; 6 cyc +.endproc + +.proc imul16xe_func + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + imul8xe result, arg1, arg2 lda #0 sta result + 2 From f32cc5fa7cdd117c26b5e923a7a29bcac8079f45 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Fri, 27 Dec 2024 19:15:19 -0800 Subject: [PATCH 16/59] whoops --- atari-asm-xex.cfg | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 atari-asm-xex.cfg diff 
--git a/atari-asm-xex.cfg b/atari-asm-xex.cfg new file mode 100644 index 0000000..6e6498d --- /dev/null +++ b/atari-asm-xex.cfg @@ -0,0 +1,25 @@ +FEATURES { + STARTADDRESS: default = $2E00; +} +SYMBOLS { + __STARTADDRESS__: type = export, value = %S; +} +MEMORY { + ZP: file = "", define = yes, start = $0082, size = $007E; + MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; +} +FILES { + %O: format = atari; +} +FORMATS { + atari: runad = start; +} +SEGMENTS { + ZEROPAGE: load = ZP, type = zp, optional = yes; + EXTZP: load = ZP, type = zp, optional = yes; # to enable modules to be able to link to C and assembler programs + CODE: load = MAIN, type = rw, define = yes; + RODATA: load = MAIN, type = ro optional = yes; + DATA: load = MAIN, type = rw optional = yes; + BSS: load = MAIN, type = bss, optional = yes, define = yes; + TABLES: load = MAIN, type = ro, optional = yes, align = 256; +} From d83b811444a1de0cf2f0dd58772d89517fe3d48c Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 28 Dec 2024 15:13:06 -0800 Subject: [PATCH 17/59] remove stray copy of the expanded-ram imul it's not finished or working, just keep the core one :D --- imul8xe.s | 175 ------------------------------------------------------ 1 file changed, 175 deletions(-) delete mode 100644 imul8xe.s diff --git a/imul8xe.s b/imul8xe.s deleted file mode 100644 index 855e044..0000000 --- a/imul8xe.s +++ /dev/null @@ -1,175 +0,0 @@ -FR0 = $d4 ; float48 -PORTB = $d301 - - -EXTENDED_RAM = $4000 ; 16KiB bank on the XE - -; lookup table for top byte -> PORTB value for bank-switch -.align 256 -bankswitch: - .repeat 256, i - .byte ((i & $c0) >> 5) | $c1 - .endrepeat - -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 
cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch,x ; 4 cyc - sta PORTB ; 4 cyc - - - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc - - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled - - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc - - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc - -done: -.endmacro - -.macro bank_switch bank - lda #((bank << 1) | $c1) - sta PORTB -.endmacro - -proc imul8xe_init - - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -endproc - -; Initialize a 16 KB chunk of the table -; input: multipliers in temp -; output: new multipliers in temp -; clobbers: temp, temp2 -proc imul8xe_init_section - arg1 = FR1 - arg2 = FR2 - result = FR0 - ptr = temp2 - - lda #$00 - sta ptr - lda #$40 - sta ptr + 1 - - ldx #0 - ldy #0 - - ; outer loop: $00 -> $3f -outer_loop: - - ; reset result to 0 - lda #0 - sta result - sta result + 1 - - ; inner loop: $00 -> $ff -inner_loop: - - ; copy result to data set - lda result - sta (ptr),y - lda result + 1 - sta (ptr),y - - ; 
result += 2 * arg2 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result - - ; inner loop check - inc arg1 - inc arg1 - inc ptr - inc ptr - bne inner_loop - - ; outer loop check - inc arg2 - inc ptr + 1 - lda ptr + 1 - cmp #$40 - bne outer_loop - - rts - -endproc From 0fcf4d66763a3c6f76dd99c1c5883d03238d4321 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 28 Dec 2024 17:40:21 -0800 Subject: [PATCH 18/59] comment tweak --- mandel.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index d198989..7c5c652 100644 --- a/mandel.s +++ b/mandel.s @@ -1209,7 +1209,7 @@ zero_byte_loop: jsr imul8xe_init - ; ox = 0; oy = 0; zoom = 0 + ; ox = 0; oy = 0 ; count_frames = 0; count_pixels = 0 lda #0 sta ox From 504457595a7d4ad1a351b70ea508031c3ad55714 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 28 Dec 2024 18:11:35 -0800 Subject: [PATCH 19/59] correct zoom border checks --- mandel.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mandel.s b/mandel.s index 7c5c652..fe29001 100644 --- a/mandel.s +++ b/mandel.s @@ -1148,11 +1148,13 @@ skip_char: rts plus: + lda zoom cmp #8 bpl skip_char inc zoom jmp done minus: + lda zoom cmp #1 bmi skip_char dec zoom From 2b0167226e9a4bfb399bca4d856080ab28fa621c Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sat, 28 Dec 2024 20:44:27 -0800 Subject: [PATCH 20/59] todos --- todo.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 todo.md diff --git a/todo.md b/todo.md new file mode 100644 index 0000000..ed4e628 --- /dev/null +++ b/todo.md @@ -0,0 +1,26 @@ +things to try: + +* fix the pan/zoom bug where it doesn't reset loop right :( + +* add some preset viewports that can be switched via number keys (1, 2, 3 etc) + +* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D + +* square-root special case of 
multiplication for zx*zx and zy*zy + * the hi1*hi2 and lo1*lo2 8-bit muls can be optimized into a 512-byte lookup table + * jamey on mastodon tried this but had some problems. see what happens on our version! + +* double-check rounding behavior is correct + +* try 3.13 fixed point instead of 4.12 for more precision + * can we get away without the extra bit? + +* y-axis mirror optimization + +* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering + +* rework the palette cycling to look more like an advancing flow + +* extact viewport for display & re-input via keyboard + +* fujinet screenshot/viewport uploader From 0fc5ba914f7dab24046a631f889cf2c6db0a0cbe Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 12:29:36 -0800 Subject: [PATCH 21/59] fix pan/zoom bug was missing an rts on update_palette this happened to fall through to keycheck which if timing was wrong would dutifully process the viewport change and return to update_palette's caller which in turn was -not- expecting to reset the outer loop fixed --- mandel.s | 2 ++ todo.md | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index fe29001..d6ae4c6 100644 --- a/mandel.s +++ b/mandel.s @@ -1094,6 +1094,8 @@ done: and #$f0 adc palette + 3 sta COLOR2 + + rts .endproc .proc update_speed diff --git a/todo.md b/todo.md index ed4e628..aebaae3 100644 --- a/todo.md +++ b/todo.md @@ -1,7 +1,5 @@ things to try: -* fix the pan/zoom bug where it doesn't reset loop right :( - * add some preset viewports that can be switched via number keys (1, 2, 3 etc) * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D From 2118890977591785a15aef0fcbce86414ebdd7db Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 13:10:35 -0800 Subject: [PATCH 22/59] add an alternate viewport (compile-time currently) zoomed to max --- mandel.s | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 
insertions(+), 9 deletions(-) diff --git a/mandel.s b/mandel.s index d6ae4c6..3e60083 100644 --- a/mandel.s +++ b/mandel.s @@ -246,6 +246,18 @@ fill_masks: .byte %00000001 .byte %00000000 +viewport_zoom: + .byte 1 + .byte 8 + +viewport_ox: + .word $0000 + .word $f110 + +viewport_oy: + .word $0000 + .word $fbe0 + ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 clc ; 2 cyc @@ -1213,13 +1225,27 @@ zero_byte_loop: jsr imul8xe_init - ; ox = 0; oy = 0 + ; initialize viewport + ;ldx #0 ; overview + ldx #1 ; closeup + lda viewport_zoom,x + sta zoom + + txa + asl a + tax + lda viewport_ox,x + sta ox + lda viewport_oy,x + sta oy + inx + lda viewport_ox,x + sta ox + 1 + lda viewport_oy,x + sta oy + 1 + ; count_frames = 0; count_pixels = 0 lda #0 - sta ox - sta ox + 1 - sta oy - sta oy + 1 sta count_frames sta count_pixels @@ -1229,10 +1255,6 @@ zero_byte_loop: ldx #total_pixels jsr ZF1 - ; zoom = 2x - lda #1 - sta zoom - ; Disable display DMA lda #0 sta DMACTL From 15fc5367f9054de770afec40958295cbd26c7b46 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 13:18:54 -0800 Subject: [PATCH 23/59] switch with the overview as default for now --- mandel.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 3e60083..3579b0f 100644 --- a/mandel.s +++ b/mandel.s @@ -1226,8 +1226,8 @@ zero_byte_loop: jsr imul8xe_init ; initialize viewport - ;ldx #0 ; overview - ldx #1 ; closeup + ldx #0 ; overview + ;ldx #1 ; closeup lda viewport_zoom,x sta zoom From 8ad996981abdc4a74f2e220fe2ae970c1bd90960 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 13:19:58 -0800 Subject: [PATCH 24/59] whoops --- mandel.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 3579b0f..cf52fdb 100644 --- a/mandel.s +++ b/mandel.s @@ -107,7 +107,7 @@ KEY_RIGHT = $87 .struct float48 exponent .byte - mantissa .byte 6 + mantissa .byte 5 .endstruct .import mul_lobyte256 From
f903272335b5749ef805ddbd31b52a58051e6b94 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 17:37:06 -0800 Subject: [PATCH 25/59] refactoring and start on squares --- mandel.s | 284 +++++++++++++++++++++++++----------------------------- tables.js | 12 +++ 2 files changed, 143 insertions(+), 153 deletions(-) diff --git a/mandel.s b/mandel.s index cf52fdb..2e16b53 100644 --- a/mandel.s +++ b/mandel.s @@ -374,65 +374,13 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; Adapted from https://everything2.com/title/Fast+6502+multiplication -.macro imul8 dest, arg1, arg2 - .local under256 - .local next - .local small_product - ; circa 92 cycles? this doesn't seem right - ; 81-92 cycles - .scope - mul_factor_a = arg1 - mul_factor_x = arg2 - mul_product_lo = dest - mul_product_hi = dest + 1 - - lda mul_factor_a ; 3 cyc - - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc - next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc - - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc - - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 - small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - .endscope +; clobbers a, x +.macro sqr8 dest, arg + ldx arg + lda sqr_lobyte,x + sta dest + lda sqr_hibyte,x + sta dest + 1 .endmacro ; lookup table for top byte -> PORTB value for bank-switch @@ -447,64 +395,121 
@@ bank_switch_table: sta PORTB .endmacro +.macro imul8 dest, arg1, arg2, xe + .if xe + ; using 64KB lookup table + ; 58-77 cycles + ; clobbers x, y, dest to dest + 3 + .scope + output = dest + ptr = dest + 2 ; scratch space assumed -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + lda arg2 ; 3 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled + ; note: we are not restoring memory to save 6 cycles! 
+ ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc + ; add the second param one last time for the skipped bit + clc ; 2 cyc + lda arg2 ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc -done: + done: + .endscope + .else + ; Using base 48k RAM compatibility mode + ; Small table of half squares + ; Adapted from https://everything2.com/title/Fast+6502+multiplication + ; 81-92 cycles + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; 3 cyc + + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc + under256: + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc + next: + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc + + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc + + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc mul_product_hi ; 5 cyc + + ; - x^2/2 + small_product: + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 
3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + .endscope + .endif .endmacro .proc imul8xe_init @@ -632,7 +637,13 @@ inner_loop: .endproc -.proc imul16_func +.macro imul16_impl xe + .local arg1 + .local arg2 + .local result + .local inter + .local arg1_pos + .local arg2_pos arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result @@ -643,20 +654,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8 result, arg1, arg2 + imul8 result, arg1, arg2, xe lda #0 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2 + imul8 inter, arg1 + 1, arg2, xe add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1 + imul8 inter, arg1, arg2 + 1, xe add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1 + imul8 inter, arg1 + 1, arg2 + 1, xe add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -671,47 +682,14 @@ arg1_pos: arg2_pos: rts ; 6 cyc +.endmacro + +.proc imul16_func + imul16_impl 0 .endproc .proc imul16xe_func - arg1 = FR0 ; 16-bit arg (clobbered) - arg2 = FR1 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 - - ; h1l1 * h2l2 - ; (h1*256 + l1) * (h2*256 + l2) - ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) - ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - - imul8xe result, arg1, arg2 - lda #0 - sta result + 2 - sta result + 3 - - imul8xe inter, arg1 + 1, arg2 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1, arg2 + 1 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1 + 1, arg2 + 1 - add16 result + 2, result + 2, inter - - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg1 + 1 - bpl arg1_pos - sub16 result + 2, result + 2, arg2 -arg1_pos: - lda arg2 + 1 - bpl arg2_pos - sub16 result + 2, result + 2, arg1 -arg2_pos: 
- - rts ; 6 cyc + imul16_impl 1 .endproc .macro round16 arg diff --git a/tables.js b/tables.js index c772f81..50cbef9 100644 --- a/tables.js +++ b/tables.js @@ -22,7 +22,10 @@ console.log( .export mul_lobyte256 .export mul_hibyte256 .export mul_hibyte512 +.export sqr_lobyte +.export sqr_hibyte +; (i * i + 1) / 2 for the multiplier .align 256 mul_lobyte256: ${db((i) => squares[i] & 0xff)} @@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)} mul_hibyte512: ${db((i) => (squares[i + 256] >> 8) & 0xff)} +; (i * i) for the plain squares +.align 256 +sqr_lobyte: +${db((i) => (i * i) & 0xff)} + +.align 256 +sqr_hibyte: +${db((i) => ((i * i) >> 8) & 0xff)} + `); From 3ab5006aa3033cf595311fc6a09247c0c04f9c14 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 17:56:14 -0800 Subject: [PATCH 26/59] wip refactoring --- mandel.s | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 2e16b53..9eb6ce1 100644 --- a/mandel.s +++ b/mandel.s @@ -374,6 +374,14 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro sqr16_round dest, arg, shift + imul16_round dest, arg, arg, shift + ;copy16 FR0, arg ; 12 cyc + ;jsr sqr16_func ; ?
cyc + ;shift_round_16 FR2, shift + ;copy16 dest, FR2 + 2 ; 12 cyc +.endmacro + ; clobbers a, x .macro sqr8 dest, arg ldx arg @@ -537,6 +545,14 @@ init: lda #.hibyte(imul16xe_func) sta imul16_func + 2 + ; ditto for sqr16_func -> sqr16xe_func + lda #$4c ; 'jmp' opcode + sta sqr16_func + lda #.lobyte(sqr16xe_func) + sta sqr16_func + 1 + lda #.hibyte(sqr16xe_func) + sta sqr16_func + 2 + ; create the lookup table ; go through the input set, in four 16KB chunks @@ -684,6 +700,45 @@ arg2_pos: rts ; 6 cyc .endmacro +.macro sqr16_impl xe + .local arg + .local result + .local inter + .local arg_pos + arg = FR0 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ; hl * hl + ; (h*256 + l) * (h*256 + l) + ; h*256*(h*256 + l) + l*(h*256 + l) + ; h*h*256*256 + h*l*256 + h*l*256 + l*l + + sqr8 result, arg + lda #0 + sta result + 2 + sta result + 3 + + imul8 inter, arg + 1, arg, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + add16 result + 1, result + 1, inter + add_carry result + 3 + + sqr8 inter, arg + 1, arg + 1, xe + add16 result + 2, result + 2, inter + + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg + 1 + bpl arg_pos + sub16 result + 2, result + 2, arg + sub16 result + 2, result + 2, arg +arg_pos: + + rts ; 6 cyc +.endmacro + .proc imul16_func imul16_impl 0 .endproc @@ -692,6 +747,14 @@ arg2_pos: imul16_impl 1 .endproc +.proc sqr16_func + imul16_impl 0 +.endproc + +.proc sqr16xe_func + imul16_impl 1 +.endproc + .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment @@ -803,10 +866,10 @@ keep_going: quick_exit zy, 2 ; zx_2 = zx * zx - imul16_round zx_2, zx, zx, 4 + sqr16_round zx_2, zx, 4 ; zy_2 = zy * zy - imul16_round zy_2, zy, zy, 4 + sqr16_round zy_2, zy, 4 ; zx_zy = zx * zy imul16_round zx_zy, zx, zy, 4 From 0c63430dd95a1c3e72e0fb0c252e233ddc0c9d79 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 20:37:58 -0800 Subject: 
[PATCH 27/59] wip tables segment to be --- Makefile | 4 ++-- atari-asm-xex.cfg | 3 ++- mandel.s | 10 +++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 008bf8c..bd14c7d 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ all : mandel.xex -mandel.xex : mandel.o tables.o - ld65 -C ./atari-asm-xex.cfg -o $@ $+ +mandel.xex : mandel.o tables.o atari-asm-xex.cfg + ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o %.o : %.s ca65 -o $@ $< diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index 6e6498d..fb43089 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -6,7 +6,8 @@ SYMBOLS { } MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; - MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; + #MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; + MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; } FILES { %O: format = atari; diff --git a/mandel.s b/mandel.s index 9eb6ce1..7bfb577 100644 --- a/mandel.s +++ b/mandel.s @@ -375,11 +375,11 @@ viewport_oy: .endmacro .macro sqr16_round dest, arg, shift - imul16_round dest, arg, arg, shift - ;copy16 FR0, arg ; 12 cyc - ;jsr sqr16_func ; ? cyc - ;shift_round_16 FR2, shift - ;copy16 dest, FR2 + 2 ; 12 cyc + ;imul16_round dest, arg, arg, shift + copy16 FR0, arg ; 12 cyc + jsr sqr16_func ; ? 
cyc + shift_round_16 FR2, shift + copy16 dest, FR2 + 2 ; 12 cyc .endmacro ; clobbers a, x From 883f926e575cbc4720b25826b2252495a0621d81 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 21:06:48 -0800 Subject: [PATCH 28/59] split memory, wip appears to work on 800 but xl/xe overlap basic lol --- atari-asm-xex.cfg | 3 ++- mandel.s | 64 +++++++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index fb43089..9f871ca 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -8,6 +8,7 @@ MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; #MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; + TABLES: file = %O, define = yes, start = $a000, size = $c000 - $a000; } FILES { %O: format = atari; @@ -22,5 +23,5 @@ SEGMENTS { RODATA: load = MAIN, type = ro optional = yes; DATA: load = MAIN, type = rw optional = yes; BSS: load = MAIN, type = bss, optional = yes, define = yes; - TABLES: load = MAIN, type = ro, optional = yes, align = 256; + TABLES: load = TABLES, type = ro, optional = yes, align = 256; } diff --git a/mandel.s b/mandel.s index 7bfb577..a5bcb35 100644 --- a/mandel.s +++ b/mandel.s @@ -113,6 +113,8 @@ KEY_RIGHT = $87 .import mul_lobyte256 .import mul_hibyte256 .import mul_hibyte512 +.import sqr_lobyte +.import sqr_hibyte .data @@ -701,42 +703,40 @@ arg2_pos: .endmacro .macro sqr16_impl xe - .local arg - .local result - .local inter - .local arg_pos - arg = FR0 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 + .scope + arg = FR0 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + ;inter = temp2 + inter = FR1 - ; hl * hl - ; (h*256 + l) * (h*256 + l) - ; h*256*(h*256 + l) + l*(h*256 + l) - ; h*h*256*256 + h*l*256 + h*l*256 + l*l + lda arg + 1 + bpl arg_pos + neg16 arg + arg_pos: - sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 + ; 
hl * hl + ; (h*256 + l) * (h*256 + l) + ; h*256*(h*256 + l) + l*(h*256 + l) + ; h*h*256*256 + h*l*256 + h*l*256 + l*l - imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - add16 result + 1, result + 1, inter - add_carry result + 3 + sqr8 result, arg + ;imul8 inter, arg, arg, xe + lda #0 + sta result + 2 + sta result + 3 - sqr8 inter, arg + 1, arg + 1, xe - add16 result + 2, result + 2, inter + imul8 inter, arg + 1, arg, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + add16 result + 1, result + 1, inter + add_carry result + 3 - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg + 1 - bpl arg_pos - sub16 result + 2, result + 2, arg - sub16 result + 2, result + 2, arg -arg_pos: + sqr8 inter, arg + 1 + ;imul8 inter, arg + 1, arg + 1, xe + add16 result + 2, result + 2, inter - rts ; 6 cyc + rts ; 6 cyc + .endscope .endmacro .proc imul16_func @@ -748,11 +748,11 @@ arg_pos: .endproc .proc sqr16_func - imul16_impl 0 + sqr16_impl 0 .endproc .proc sqr16xe_func - imul16_impl 1 + sqr16_impl 1 .endproc .macro round16 arg From acac5a8df42f7128a785d8d6efd65b69ad2178bf Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Sun, 29 Dec 2024 21:19:55 -0800 Subject: [PATCH 29/59] moving the framebuffer into the basic space fails on 130xe and 800xl for some reason works on 800 as expected --- atari-asm-xex.cfg | 5 +++-- mandel.s | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index 9f871ca..93b80f3 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -6,9 +6,10 @@ SYMBOLS { } MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; - #MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; - TABLES: file = %O, define = yes, start = $a000, size = $c000 - $a000; + # Keep $4000-7fff clear for expanded RAM access window + TABLES: file = %O, 
define = yes, start = $8000, size = $a000 - $8000; + # Keep $a000-$bfff clear for BASIC cartridge } FILES { %O: format = atari; diff --git a/mandel.s b/mandel.s index a5bcb35..8517685 100644 --- a/mandel.s +++ b/mandel.s @@ -62,11 +62,11 @@ FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) FMOVE = $DDB6 ; MOVE FR0 TO FR1 ; High data -framebuffer_top = $8000 -textbuffer = $8f00 -framebuffer_bottom = $9000 -display_list = $9f00 -framebuffer_end = $a000 +framebuffer_top = $a000 +textbuffer = $af00 +framebuffer_bottom = $b000 +display_list = $bf00 +framebuffer_end = $c000 height = 184 half_height = height >> 1 From 70d2c91f03dd4e2b90dd2419e060bbb220747dd9 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 03:56:35 -0800 Subject: [PATCH 30/59] fix bank switch on xl/xe was accidentally enabling basic rom :D 5m46s - 11.759 ms/px - 800xl 5m30s - 11.215 ms/px - 130xe --- mandel.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 8517685..9f594e8 100644 --- a/mandel.s +++ b/mandel.s @@ -397,11 +397,11 @@ viewport_oy: ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 4) | $e1 + .byte ((i & $c0) >> 4) | $e3 .endrepeat .macro bank_switch bank - lda #((bank << 2) | $e1) + lda #((bank << 2) | $e3) sta PORTB .endmacro From c4b98c7be27558c662a23849126d2a802c9bf4bc Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 05:35:22 -0800 Subject: [PATCH 31/59] optimize out a temporary down to 11.076 ms/px on xe --- mandel.s | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mandel.s b/mandel.s index 9f594e8..0239167 100644 --- a/mandel.s +++ b/mandel.s @@ -393,6 +393,18 @@ viewport_oy: sta dest + 1 .endmacro +; clobbers a, x +.macro sqr8_add16 dest, arg + ldx arg + clc + lda sqr_lobyte,x + adc dest + sta dest + lda sqr_hibyte,x + adc dest + 1 + sta dest + 1 +.endmacro + ; lookup table for top byte 
-> PORTB value for bank-switch ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: @@ -720,7 +732,6 @@ arg2_pos: ; h*h*256*256 + h*l*256 + h*l*256 + l*l sqr8 result, arg - ;imul8 inter, arg, arg, xe lda #0 sta result + 2 sta result + 3 @@ -731,9 +742,7 @@ arg2_pos: add16 result + 1, result + 1, inter add_carry result + 3 - sqr8 inter, arg + 1 - ;imul8 inter, arg + 1, arg + 1, xe - add16 result + 2, result + 2, inter + sqr8_add16 result + 2, arg + 1 rts ; 6 cyc .endscope From e51aa91e4e159f4ad632759f5c5bc7c8e5e6603f Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 06:48:04 -0800 Subject: [PATCH 32/59] notes --- todo.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/todo.md b/todo.md index aebaae3..2e28c8e 100644 --- a/todo.md +++ b/todo.md @@ -4,11 +4,7 @@ things to try: * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D -* square-root special case of multiplication for zx*zx and zy*zy - * the hi1*hi2 and lo1*lo2 8-bit muls can be optimized into a 512-byte lookup table - * jamey on mastodon tried this but had some problems. see what happens on our version! - -* double-check rounding behavior is correct +* optimize out a store/load with mul8_add16 and mul8_add24 * try 3.13 fixed point instead of 4.12 for more precision * can we get away without the extra bit? 
From 100c0f33148c411a6bb066f0de32e1f79ffe2c78 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 09:16:08 -0800 Subject: [PATCH 33/59] 1/2/3 selectable viewports --- mandel.s | 65 ++++++++++++++++++++++++++++++++++++++++++-------------- todo.md | 2 -- 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/mandel.s b/mandel.s index 0239167..90396d4 100644 --- a/mandel.s +++ b/mandel.s @@ -104,6 +104,9 @@ KEY_UP = $8e KEY_DOWN = $8f KEY_LEFT = $86 KEY_RIGHT = $87 +KEY_1 = $1f +KEY_2 = $1e +KEY_3 = $1a .struct float48 exponent .byte @@ -250,14 +253,17 @@ fill_masks: viewport_zoom: .byte 1 + .byte 6 .byte 8 viewport_ox: .word $0000 .word $f110 + .word $f110 viewport_oy: .word $0000 + .word $fb60 .word $fbe0 ; 2 + 9 * byte cycles @@ -1206,7 +1212,13 @@ done: beq left cpy #KEY_RIGHT beq right - + cpy #KEY_1 + beq one + cpy #KEY_2 + beq two + cpy #KEY_3 + beq three + skip_char: lda #0 rts @@ -1234,6 +1246,19 @@ left: jmp done right: add16 ox, ox, temp + jmp done +one: + ldx #0 + jmp load_key_viewport +two: + ldx #1 + jmp load_key_viewport +three: + ldx #2 + ; fall through +load_key_viewport: + jsr load_viewport + ; fall through done: lda #255 rts @@ -1271,13 +1296,10 @@ zero_byte_loop: rts .endproc -.proc start +; input: viewport selector in x +; clobbers: a, x +.proc load_viewport - jsr imul8xe_init - - ; initialize viewport - ldx #0 ; overview - ;ldx #1 ; closeup lda viewport_zoom,x sta zoom @@ -1294,16 +1316,16 @@ zero_byte_loop: lda viewport_oy,x sta oy + 1 - ; count_frames = 0; count_pixels = 0 - lda #0 - sta count_frames - sta count_pixels + rts +.endproc - ; total_ms = 0.0; total_pixels = 0.0 - ldx #total_ms - jsr ZF1 - ldx #total_pixels - jsr ZF1 +.proc start + + jsr imul8xe_init + + ; initialize viewport + ldx #0 ; overview + jsr load_viewport ; Disable display DMA lda #0 @@ -1345,6 +1367,17 @@ copy_byte_loop: jsr SETVBV main_loop: + ; count_frames = 0; count_pixels = 0 + lda #0 + sta count_frames + sta count_pixels + + ; total_ms = 0.0; 
total_pixels = 0.0 + ldx #total_ms + jsr ZF1 + ldx #total_pixels + jsr ZF1 + jsr clear_screen jsr status_bar diff --git a/todo.md b/todo.md index 2e28c8e..6c6d84d 100644 --- a/todo.md +++ b/todo.md @@ -1,7 +1,5 @@ things to try: -* add some preset viewports that can be switched via number keys (1, 2, 3 etc) - * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * optimize out a store/load with mul8_add16 and mul8_add24 From 64a6cf50f3a5d7aa46632b6ab8f83120e2c49448 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 10:21:52 -0800 Subject: [PATCH 34/59] awesome new palette cycler --- mandel.s | 108 ++++++++++++++++++++++++++++++++++++++++++------------- todo.md | 2 -- 2 files changed, 84 insertions(+), 26 deletions(-) diff --git a/mandel.s b/mandel.s index 90396d4..198e40c 100644 --- a/mandel.s +++ b/mandel.s @@ -13,13 +13,13 @@ zy_2 = $92 ; fixed4.12: z_y^2 zx_zy = $94 ; fixed4.12: z_x * z_y dist = $96 ; fixed4.12: z_x^2 + z_y^2 -iter = $a0 ; u8: iteration count +iter = $a0 ; u8: iteration count -zoom = $a1 ; u8: zoom shift level -count_frames = $a2 ; u8 -count_pixels = $a3 ; u8 -total_ms = $a4 ; float48 -total_pixels = $aa ; float48 +zoom = $a1 ; u8: zoom shift level +count_frames = $a2 ; u8 +count_pixels = $a3 ; u8 +total_ms = $a4 ; float48 +total_pixels = $aa ; float48 z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not z_buffer_start = $b1 ; u8: index into z_buffer @@ -34,6 +34,14 @@ pixel_offset = $bd ; u8 fill_level = $be ; u8 palette_offset = $bf ; u8 +palette_ticks = $c0 ; u8 +chroma_ticks = $c1 ; u8 +chroma_offset = $c2 ; u8 + +palette_delay = 120 +chroma_delay = 120 + + ; FP registers in zero page FR0 = $d4 ; float48 FRE = $da @@ -224,11 +232,26 @@ color_map: .byte 3 .endrepeat -palette: - .byte $00 - .byte $46 - .byte $78 - .byte $b4 + +palette_start: + .byte $04 + .byte $08 + .byte $0e +palette_repeat: + .byte $03 + .byte $09 + +palette_entries = 3 + +palette_chroma: + .repeat 
15, i + .byte (i + 1) << 4 + .endrepeat + .repeat 2, i + .byte (i + 1) << 4 + .endrepeat +palette_chroma_entries = 15 + .code z_buffer_len = 16 @@ -1136,31 +1159,65 @@ done: .proc vblank_handler inc count_frames + + inc chroma_ticks + lda chroma_ticks + cmp #(chroma_delay) + bne skip_chroma + + lda #0 + sta chroma_ticks + + inc chroma_offset + lda chroma_offset + cmp #(palette_chroma_entries) + bne skip_chroma + + lda #0 + sta chroma_offset +skip_chroma: + + inc palette_ticks + lda palette_ticks + cmp #(palette_delay) + bne skip_luma + + lda #0 + sta palette_ticks + inc palette_offset + lda palette_offset + cmp #(palette_entries) + bne skip_luma + + lda #0 + sta palette_offset + +skip_luma: jsr update_palette jmp XITVBV .endproc .proc update_palette - lda palette + lda #0 sta COLOR4 - clc - lda palette_offset - and #$f0 - adc palette + 1 + ldx chroma_offset + ldy palette_offset + lda palette_chroma,x + ora palette_start,y sta COLOR0 - clc - lda palette_offset - and #$f0 - adc palette + 2 + inx + iny + lda palette_chroma,x + ora palette_start,y sta COLOR1 - clc - lda palette_offset - and #$f0 - adc palette + 3 + inx + iny + lda palette_chroma,x + ora palette_start,y sta COLOR2 rts @@ -1358,6 +1415,9 @@ copy_byte_loop: ; Initialize the palette lda #0 sta palette_offset + sta palette_delay + sta chroma_offset + sta chroma_delay jsr update_palette ; install the vblank handler diff --git a/todo.md b/todo.md index 6c6d84d..a8675af 100644 --- a/todo.md +++ b/todo.md @@ -11,8 +11,6 @@ things to try: * 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering -* rework the palette cycling to look more like an advancing flow - * extact viewport for display & re-input via keyboard * fujinet screenshot/viewport uploader From 71d8d93abc60b7b2a8f9730257c13cb5751a5905 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 11:33:55 -0800 Subject: [PATCH 35/59] even better palette cycling --- mandel.s | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/mandel.s b/mandel.s index 198e40c..9b6b32d 100644 --- a/mandel.s +++ b/mandel.s @@ -38,8 +38,8 @@ palette_ticks = $c0 ; u8 chroma_ticks = $c1 ; u8 chroma_offset = $c2 ; u8 -palette_delay = 120 -chroma_delay = 120 +palette_delay = 23 +chroma_delay = 137 ; FP registers in zero page @@ -1208,13 +1208,13 @@ skip_luma: ora palette_start,y sta COLOR0 - inx + ;inx iny lda palette_chroma,x ora palette_start,y sta COLOR1 - inx + ;inx iny lda palette_chroma,x ora palette_start,y From 14125a398aa37d08e0468f2bfd56379cf884d48d Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 11:35:45 -0800 Subject: [PATCH 36/59] cycle 'in' not 'out' --- mandel.s | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mandel.s b/mandel.s index 9b6b32d..985df82 100644 --- a/mandel.s +++ b/mandel.s @@ -234,12 +234,12 @@ color_map: palette_start: - .byte $04 - .byte $08 .byte $0e + .byte $08 + .byte $04 palette_repeat: - .byte $03 - .byte $09 + .byte $0e + .byte $08 palette_entries = 3 @@ -1206,7 +1206,7 @@ skip_luma: ldy palette_offset lda palette_chroma,x ora palette_start,y - sta COLOR0 + sta COLOR2 ;inx iny @@ -1218,7 +1218,7 @@ skip_luma: iny lda palette_chroma,x ora palette_start,y - sta COLOR2 + sta COLOR0 rts .endproc From 63e74d51520f24e4e2708feaf4071ee4d8191e0f Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 13:44:31 -0800 Subject: [PATCH 37/59] tweak --- mandel.s | 5 ++++- todo.md | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index 985df82..844a286 100644 --- a/mandel.s +++ b/mandel.s @@ -434,13 +434,16 @@ viewport_oy: sta dest + 1 .endmacro +.segment "TABLES" ; lookup table for top byte -> PORTB value for bank-switch -;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes +.align 256 bank_switch_table: .repeat 256, i .byte ((i & $c0) >> 4) | $e3 .endrepeat +.code + .macro bank_switch bank lda #((bank << 2) | $e3) sta PORTB diff --git 
a/todo.md b/todo.md index a8675af..4aaedc0 100644 --- a/todo.md +++ b/todo.md @@ -11,6 +11,6 @@ things to try: * 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering -* extact viewport for display & re-input via keyboard +* extract viewport for display & re-input via keyboard * fujinet screenshot/viewport uploader From 3bd9b1ac3164d4895b6564f48ecca76150f0384b Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 14:09:02 -0800 Subject: [PATCH 38/59] micro-optimizations in imul8xe 53-72 cycles overview in 10.896 ms/px --- mandel.s | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mandel.s b/mandel.s index 844a286..97a8863 100644 --- a/mandel.s +++ b/mandel.s @@ -37,6 +37,7 @@ palette_offset = $bf ; u8 palette_ticks = $c0 ; u8 chroma_ticks = $c1 ; u8 chroma_offset = $c2 ; u8 +ptr = $c4 ; u16 palette_delay = 23 chroma_delay = 137 @@ -452,21 +453,19 @@ bank_switch_table: .macro imul8 dest, arg1, arg2, xe .if xe ; using 64KB lookup table - ; 58-77 cycles - ; clobbers x, y, dest to dest + 3 + ; 53-72 cycles + ; clobbers x, y, dest, ptr .scope output = dest - ptr = dest + 2 ; scratch space assumed ; bottom 14 bits except the LSB are the per-bank table index ; add $4000 for the bank pointer lda arg1 ; 3 cyc and #$fe ; 2 cyc - sta ptr ; 3 cyc + tay ; 2 cyc lda arg2 ; 3 cyc and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc + ora #$40 ; 2 cyc sta ptr + 1 ; 3 cyc ; top 2 bits are the table bank selector @@ -476,7 +475,6 @@ bank_switch_table: ; copy the entry into output - ldy #0 ; 2 cyc lda (ptr),y ; 5 cyc sta output ; 3 cyc iny ; 2 cyc @@ -609,6 +607,9 @@ init: lda #$00 sta arg1 sta arg2 + sta ptr + lda #$40 + sta ptr + 1 ; $00 * $00 -> $3f * $ff bank_switch 0 From 9b7f6b8937a0c7e647eec09f87c12b10de4f7ad8 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 14:22:03 -0800 Subject: [PATCH 39/59] add a viewport in the front spike --- mandel.s | 15 +++++++++++++++ 1 file changed, 15 insertions(+) 
diff --git a/mandel.s b/mandel.s index 97a8863..9704a22 100644 --- a/mandel.s +++ b/mandel.s @@ -116,6 +116,13 @@ KEY_RIGHT = $87 KEY_1 = $1f KEY_2 = $1e KEY_3 = $1a +KEY_4 = 24 +KEY_5 = 29 +KEY_6 = 27 +KEY_7 = 51 +KEY_8 = 53 +KEY_9 = 48 +KEY_0 = 50 .struct float48 exponent .byte @@ -279,16 +286,19 @@ viewport_zoom: .byte 1 .byte 6 .byte 8 + .byte 6 viewport_ox: .word $0000 .word $f110 .word $f110 + .word $e400 viewport_oy: .word $0000 .word $fb60 .word $fbe0 + .word $0000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -1279,6 +1289,8 @@ skip_luma: beq two cpy #KEY_3 beq three + cpy #KEY_4 + beq four skip_char: lda #0 @@ -1316,6 +1328,9 @@ two: jmp load_key_viewport three: ldx #2 + jmp load_key_viewport +four: + ldx #3 ; fall through load_key_viewport: jsr load_viewport From 6db8cef82d4117ae2b3ede21e9ed3cf1ab720a22 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 15:17:50 -0800 Subject: [PATCH 40/59] 51-70 cycles for xe :D --- mandel.s | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mandel.s b/mandel.s index 9704a22..4ac8d4d 100644 --- a/mandel.s +++ b/mandel.s @@ -463,28 +463,27 @@ bank_switch_table: .macro imul8 dest, arg1, arg2, xe .if xe ; using 64KB lookup table - ; 53-72 cycles + ; 51-70 cycles ; clobbers x, y, dest, ptr .scope output = dest - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - tay ; 2 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - ora #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - ; top 2 bits are the table bank selector ldx arg2 ; 3 cyc lda bank_switch_table,x ; 4 cyc sta PORTB ; 4 cyc + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + txa ; 2 cyc + and #$3f ; 2 cyc + ora #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc ; copy the entry into output + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + tay ; 2 cyc lda (ptr),y ; 5 cyc sta output ; 3 cyc iny ; 2 cyc @@ -503,9 
+502,9 @@ bank_switch_table: and #1 ; 2 cyc beq done ; 2 cyc - ; add the second param one last time for the skipped bit + ; add arg2 one last time for the skipped bit clc ; 2 cyc - lda arg2 ; 3 cyc + txa ; 2 cyc adc output ; 3 cyc sta output ; 3 cyc lda #0 ; 2 cyc From e6cbe0bc6be5e97b151d0ffd0696b290992722c9 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 16:43:18 -0800 Subject: [PATCH 41/59] notes --- todo.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/todo.md b/todo.md index 4aaedc0..1281de7 100644 --- a/todo.md +++ b/todo.md @@ -2,14 +2,13 @@ things to try: * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D -* optimize out a store/load with mul8_add16 and mul8_add24 - * try 3.13 fixed point instead of 4.12 for more precision * can we get away without the extra bit? * y-axis mirror optimization * 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering + * maybe redo tiering to just 4x4, 2x2, 1x1? * extract viewport for display & re-input via keyboard From ed79c80b167607f0c59d7c8f33569f9bf3e981f5 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 16:50:25 -0800 Subject: [PATCH 42/59] update readme --- readme.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/readme.md b/readme.md index 873793f..f297d60 100644 --- a/readme.md +++ b/readme.md @@ -14,15 +14,18 @@ Non-goals: Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals. --- brooke, january 2023 - february 2024 +-- brooke, january 2023 - december 2024 ## Current state -Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet. +Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys. 
-The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. +The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. -The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input. +* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition +* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops +* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications +* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. @@ -30,17 +33,18 @@ Iterations are capped at 255. The pixels are run in a progressive layout to get the basic shape on screen faster. -## Next steps +There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D -Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it! +There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. -Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. - -I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. 
-(done) +There's some cute color cycling. ## Deps and build instructions I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that. Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices. + +## Todo + +See ideas in `todo.md`. \ No newline at end of file From 67649d47434b8b30a9c6a3319616e6531d3ba6a5 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 19:17:02 -0800 Subject: [PATCH 43/59] annotations, tweak --- mandel.s | 55 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/mandel.s b/mandel.s index 4ac8d4d..787243f 100644 --- a/mandel.s +++ b/mandel.s @@ -310,18 +310,21 @@ viewport_oy: .endrepeat .endmacro +; 20 cycles .macro add16 dest, arg1, arg2 add 2, dest, arg1, arg2 .endmacro +; 38 cycles .macro add32 dest, arg1, arg2 add 4, dest, arg2, dest .endmacro +; 8 cycles .macro add_carry dest - lda dest - adc #0 - sta dest + lda dest ; 3 cyc + adc #0 ; 2 cyc + sta dest ; 3 cyc .endmacro ; 2 + 9 * byte cycles @@ -334,29 +337,35 @@ viewport_oy: .endrepeat .endmacro +; 20 cycles .macro sub16 dest, arg1, arg2 sub 2, dest, arg1, arg2 .endmacro +; 38 cycles .macro sub32 dest, arg1, arg2 sub 4, dest, arg1, arg2 .endmacro +; 3 + 5 * bytes cycles .macro shl bytes, arg - asl arg + asl arg ; 3 cyc .repeat bytes-1, i - rol arg + 1 + i + rol arg + 1 + i ; 5 cyc .endrepeat .endmacro +; 13 cycles .macro shl16 arg shl 2, arg .endmacro +; 18 cycles .macro shl24 arg shl 3, arg .endmacro +; 23 cycles .macro shl32 arg shl 4, arg .endmacro @@ -369,14 +378,17 @@ viewport_oy: .endrepeat .endmacro +; 12 cycles .macro copy16 dest, arg copy 2, dest, arg .endmacro +; 24 cycles .macro copy32 dest, arg copy 4, dest, arg .endmacro +; 36 cycles .macro copyfloat dest, arg copy 6, dest, arg .endmacro @@ -401,9 +413,10 @@ viewport_oy: neg 4, 
arg .endmacro +; 23 * shift .macro shift_round_16 arg, shift .repeat shift - shl32 arg + shl32 arg ; 23 cycles .endrepeat round16 arg .endmacro @@ -806,6 +819,7 @@ arg2_pos: sqr16_impl 1 .endproc +; 11-27 cycles .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment @@ -818,21 +832,28 @@ arg2_pos: ; round down if negative ; < $8000: round down - lda arg + 1 - cmp #$80 - beq high_half - bpl increment - bmi next + ; $8000 17 + ; $8001 27 + ; $8100 21 + ; $7fff 11 + + lda arg + 1 ; 3 cyc + cmp #$80 ; 2 cyc + beq high_half ; 2 cyc + + bpl increment ; 2 cyc + + bmi next ; 2 cyc high_half: - lda arg - beq check_sign - bpl increment - bmi next + lda arg ; 3 cyc + beq check_sign ; 2 cyc + + jmp increment ; 3 cyc check_sign: - lda arg + 3 - bmi next + lda arg + 3 ; 3 cyc + bmi next ; 2 cyc increment: ; 5-10 cyc inc arg + 2 ; 5 cyc From ec42f672d43ab8aecb863791ec55b22569436524 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 19:48:28 -0800 Subject: [PATCH 44/59] use an 8-item z buffer for slightly fasterness --- mandel.s | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 787243f..39e71b0 100644 --- a/mandel.s +++ b/mandel.s @@ -262,7 +262,10 @@ palette_chroma_entries = 15 .code -z_buffer_len = 16 +;z_buffer_len = 16 ; 10.863 ms/px +;z_buffer_len = 12 ; 10.619 ms/px +z_buffer_len = 8 ; 10.612 ms/px +;z_buffer_len = 4 ; 12.395 ms/px z_buffer_mask = z_buffer_len - 1 z_buffer: ; the last N zx/zy values From 0a7293d8bca6cc56182c356c993002ae1482f017 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 19:52:35 -0800 Subject: [PATCH 45/59] do 4x4 2x2 1x1 only in prep for bigger pixels --- mandel.s | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mandel.s b/mandel.s index 39e71b0..b88105b 100644 --- a/mandel.s +++ b/mandel.s @@ -276,11 +276,12 @@ z_buffer: .export start -max_fill_level = 6 +;max_fill_level = 6 +max_fill_level = 3 fill_masks: 
- .byte %00011111 - .byte %00001111 - .byte %00000111 +; .byte %00011111 +; .byte %00001111 +; .byte %00000111 .byte %00000011 .byte %00000001 .byte %00000000 From b56dc1e98bfeb3c18c4f90df0e0d19fbe5362cde Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Mon, 30 Dec 2024 20:38:33 -0800 Subject: [PATCH 46/59] notes --- mandel.s | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mandel.s b/mandel.s index b88105b..fc30532 100644 --- a/mandel.s +++ b/mandel.s @@ -417,19 +417,20 @@ viewport_oy: neg 4, arg .endmacro -; 23 * shift +; 11-27 + 23 * shift cycles +; 103-119 cycles for shift=4 .macro shift_round_16 arg, shift .repeat shift shl32 arg ; 23 cycles .endrepeat - round16 arg + round16 arg ; 11-27 cycles .endmacro .macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc jsr imul16_func ; ? cyc - shift_round_16 FR2, shift + shift_round_16 FR2, shift ; 103-119 cycles for shift=4 copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -437,7 +438,7 @@ viewport_oy: ;imul16_round dest, arg, arg, shift copy16 FR0, arg ; 12 cyc jsr sqr16_func ; ? 
cyc - shift_round_16 FR2, shift + shift_round_16 FR2, shift ; 103-119 cycles for shift=4 copy16 dest, FR2 + 2 ; 12 cyc .endmacro From 61eb1aaf21fdac377e6f04db117aa855ad73b940 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 05:11:26 -0800 Subject: [PATCH 47/59] notes --- todo.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/todo.md b/todo.md index 1281de7..6fb0282 100644 --- a/todo.md +++ b/todo.md @@ -1,5 +1,11 @@ things to try: +* skip add on the top-byte multiply in sqr8/mul8 + * should save a few cycles, suggestion by jamey + +* perform the zx += zx^s + cx in 32-bit space, before rounding + * should improve precision on max zoom, might cost a few cycles + * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * try 3.13 fixed point instead of 4.12 for more precision From 0d086a179cf8e91b839f306bb597ef9e6125f6b2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:20:53 -0800 Subject: [PATCH 48/59] wip --- mandel.s | 108 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/mandel.s b/mandel.s index fc30532..50213ad 100644 --- a/mandel.s +++ b/mandel.s @@ -1,43 +1,42 @@ ; Our zero-page vars -sx = $80 ; i16: screen pixel x -sy = $82 ; i16: screen pixel y -ox = $84 ; fixed4.12: center point x -oy = $86 ; fixed4.12: center point y -cx = $88 ; fixed4.12: c_x -cy = $8a ; fixed4.12: c_y -zx = $8c ; fixed4.12: z_x -zy = $8e ; fixed4.12: z_y +ox = $80 ; fixed8.24: center point x +oy = $84 ; fixed8.24: center point y +cx = $88 ; fixed8.24: c_x +cy = $8c ; fixed8.24: c_y -zx_2 = $90 ; fixed4.12: z_x^2 -zy_2 = $92 ; fixed4.12: z_y^2 -zx_zy = $94 ; fixed4.12: z_x * z_y -dist = $96 ; fixed4.12: z_x^2 + z_y^2 +zx = $90 ; fixed8.24: z_x +zy = $94 ; fixed8.24: z_y +zx_2 = $98 ; fixed8.24: z_x^2 +zy_2 = $9c ; fixed8.24: z_y^2 -iter = $a0 ; u8: iteration count +zx_zy = $a0 ; fixed8.24: z_x * z_y +dist = $a4 ; fixed8.24: z_x^2 + 
z_y^2 +sx = $a8 ; i16: screen pixel x +sy = $aa ; i16: screen pixel y +z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not +z_buffer_start = $ad ; u8: index into z_buffer +z_buffer_end = $ae ; u8: index into z_buffer +iter = $af ; u8: iteration count -zoom = $a1 ; u8: zoom shift level -count_frames = $a2 ; u8 -count_pixels = $a3 ; u8 -total_ms = $a4 ; float48 -total_pixels = $aa ; float48 +ptr = $b0 ; u16 +pixel_ptr = $b2 ; u16 +zoom = $b4 ; u8: zoom shift level +fill_level = $b5 ; u8 +pixel_color = $b6 ; u8 +pixel_mask = $b7 ; u8 +pixel_shift = $b8 ; u8 +pixel_offset = $b9 ; u8 +palette_offset = $ba ; u8 +chroma_offset = $bb ; u8 +palette_ticks = $bc ; u8 +chroma_ticks = $bd ; u8 +count_frames = $be ; u8 +count_pixels = $bf ; u8 -z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not -z_buffer_start = $b1 ; u8: index into z_buffer -z_buffer_end = $b2 ; u8: index into z_buffer -temp = $b4 ; u16 -temp2 = $b6 ; u16 -pixel_ptr = $b8 ; u16 -pixel_color = $ba ; u8 -pixel_mask = $bb ; u8 -pixel_shift = $bc ; u8 -pixel_offset = $bd ; u8 -fill_level = $be ; u8 -palette_offset = $bf ; u8 - -palette_ticks = $c0 ; u8 -chroma_ticks = $c1 ; u8 -chroma_offset = $c2 ; u8 -ptr = $c4 ; u16 +total_pixels = $c0 ; float48 +total_ms = $c6 ; float48 +temp = $cc ; u16 +temp2 = $ce ; u16 palette_delay = 23 chroma_delay = 137 @@ -884,12 +883,41 @@ next: ; zx_zy = 0 ; dist = 0 ; iter = 0 +; lda #00 +; ldx #(iter - zx + 1) +;initloop: +; sta zx - 1,x +; dex +; bne initloop +; sta z_buffer_start +; sta z_buffer_end + lda #00 - ldx #(iter - zx + 1) -initloop: - sta zx - 1,x - dex - bne initloop + sta zx + sta zx + 1 + sta zx + 2 + sta zx + 3 + sta zy + sta zy + 1 + sta zy + 2 + sta zy + 3 + sta zx_2 + sta zx_2 + 1 + sta zx_2 + 2 + sta zx_2 + 3 + sta zy_2 + sta zy_2 + 1 + sta zy_2 + 2 + sta zy_2 + 3 + sta zx_zy + sta zx_zy + 1 + sta zx_zy + 2 + sta zx_zy + 3 + sta dist + sta dist + 1 + sta dist + 2 + sta dist + 3 + sta iter sta z_buffer_start sta z_buffer_end 
From 4a1e35699adcce1af0f60ea51573e8a215975c66 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:19:45 -0800 Subject: [PATCH 49/59] wip --- mandel.s | 71 ++++++++++++++++++++++++++++++++++++++------------------ todo.md | 2 +- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/mandel.s b/mandel.s index 50213ad..622ff62 100644 --- a/mandel.s +++ b/mandel.s @@ -433,6 +433,13 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro imul16 dest, arg1, arg2 + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; ? cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + .macro sqr16_round dest, arg, shift ;imul16_round dest, arg, arg, shift copy16 FR0, arg ; 12 cyc @@ -441,6 +448,12 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro +.macro sqr16 dest, arg + copy16 FR0, arg ; 12 cyc + jsr sqr16_func ; ? cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + ; clobbers a, x .macro sqr8 dest, arg ldx arg @@ -870,8 +883,8 @@ next: .proc mandelbrot ; input: - ; cx: position scaled to 4.12 fixed point - -8..+7.9 - ; cy: position scaled to 4.12 + ; cx: position scaled to 8.24 fixed point - -128..+127.9 + ; cy: position scaled to 8.24 ; ; output: ; iter: iteration count at escape or 0 @@ -909,10 +922,6 @@ next: sta zy_2 + 1 sta zy_2 + 2 sta zy_2 + 3 - sta zx_zy - sta zx_zy + 1 - sta zx_zy + 2 - sta zx_zy + 3 sta dist sta dist + 1 sta dist + 2 @@ -929,6 +938,8 @@ loop: keep_going: .macro quick_exit arg, max + ; arg: fixed8.24 + ; max: integer .local positive .local negative .local nope_out @@ -936,51 +947,61 @@ keep_going: .local all_done ; check sign bit - lda arg + 1 + lda arg + 3 bmi negative positive: - cmp #((max) << 4) + cmp #max bmi all_done ; 'less than' jmp exit_path negative: - cmp #(256 - ((max) << 4)) + cmp #(256 - max) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: jmp exit_path - + first_equal: + ; following bytes all 0 shows it's really 'equal' + lda arg + 2 + bne all_done + lda arg + 1 + 
bne all_done lda arg - beq nope_out ; 2nd byte 0 shows it's really 'equal' + bne all_done + jmp exit_path all_done: .endmacro - ; 4.12: (-8 .. +7.9) + ; 8.24: (-128 .. 127.9) / (-8 .. +7.9) ; zx = zx_2 - zy_2 + cx - sub16 zx, zx_2, zy_2 - add16 zx, zx, cx + sub32 zx, zx_2, zy_2 + add32 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy - add16 zy, zx_zy, zx_zy - add16 zy, zy, cy + add32 zy, zx_zy, zx_zy + add32 zy, zy, cy quick_exit zy, 2 + ; convert 8.24 -> 4.12 + shift_round_16 zx, 4 + shift_round_16 zy, 4 + ; zx_2 = zx * zx - sqr16_round zx_2, zx, 4 + sqr16 zx_2, zx + 2 ; zy_2 = zy * zy - sqr16_round zy_2, zy, 4 + sqr16 zy_2, zy + 2 ; zx_zy = zx * zy - imul16_round zx_zy, zx, zy, 4 + imul16 zx_zy, zx + 2, zy + 2 ; dist = zx_2 + zy_2 - add16 dist, zx_2, zy_2 + add32 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer @@ -1090,13 +1111,17 @@ enough: .endmacro .macro zoom_factor dest, src, zoom, aspect + ; output: dest: fixed8.24 + ; input: src: fixed4.12 + ; input: zoom: u8 ??? + ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16_round dest, dest, aspect, 4 + imul16 dest, dest, aspect .endmacro .proc pset @@ -1567,9 +1592,9 @@ not_skipped_mask: ; run the fractal! 
zoom_factor cx, sx, zoom, aspect_x - add16 cx, cx, ox + add32 cx, cx, ox zoom_factor cy, sy, zoom, aspect_y - add16 cy, cy, oy + add32 cy, cy, oy jsr mandelbrot jsr pset diff --git a/todo.md b/todo.md index 6fb0282..29217cd 100644 --- a/todo.md +++ b/todo.md @@ -3,7 +3,7 @@ things to try: * skip add on the top-byte multiply in sqr8/mul8 * should save a few cycles, suggestion by jamey -* perform the zx += zx^s + cx in 32-bit space, before rounding +* perform the zx_next = zx^s + cx in 32-bit space, before rounding * should improve precision on max zoom, might cost a few cycles * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D From 7184b8e03f2748efd532277995afe5fa7d4a3cf6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 07:33:20 -0800 Subject: [PATCH 50/59] wip --- mandel.s | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 622ff62..86a6b48 100644 --- a/mandel.s +++ b/mandel.s @@ -292,16 +292,16 @@ viewport_zoom: .byte 6 viewport_ox: - .word $0000 - .word $f110 - .word $f110 - .word $e400 + .dword $00000000 + .dword $ff110000 + .dword $ff110000 + .dword $fe400000 viewport_oy: - .word $0000 - .word $fb60 - .word $fbe0 - .word $0000 + .dword $00000000 + .dword $ffb60000 + .dword $ffbe0000 + .dword $00000000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -1459,17 +1459,32 @@ zero_byte_loop: txa asl a + asl a + tax lda viewport_ox,x sta ox lda viewport_oy,x sta oy + inx lda viewport_ox,x sta ox + 1 lda viewport_oy,x sta oy + 1 + inx + lda viewport_ox,x + sta ox + 2 + lda viewport_oy,x + sta oy + 2 + + inx + lda viewport_ox,x + sta ox + 3 + lda viewport_oy,x + sta oy + 3 + rts .endproc From 13257309dc3a6493e05575404f5deddd09e9192d Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 08:34:02 -0800 Subject: [PATCH 51/59] init fix --- mandel.s | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mandel.s 
b/mandel.s index 86a6b48..76816c2 100644 --- a/mandel.s +++ b/mandel.s @@ -922,6 +922,10 @@ next: sta zy_2 + 1 sta zy_2 + 2 sta zy_2 + 3 + sta zx_zy + sta zx_zy + 1 + sta zx_zy + 2 + sta zx_zy + 3 sta dist sta dist + 1 sta dist + 2 From 2fcb30b76a66819ab96ec3353b8ce4978f723675 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 08:56:59 -0800 Subject: [PATCH 52/59] wip --- mandel.s | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mandel.s b/mandel.s index 76816c2..0400003 100644 --- a/mandel.s +++ b/mandel.s @@ -980,7 +980,7 @@ keep_going: all_done: .endmacro - ; 8.24: (-128 .. 127.9) / (-8 .. +7.9) + ; 8.24: (-128 .. 127.9) ; zx = zx_2 - zy_2 + cx sub32 zx, zx_2, zy_2 add32 zx, zx, cx @@ -991,7 +991,7 @@ keep_going: add32 zy, zy, cy quick_exit zy, 2 - ; convert 8.24 -> 4.12 + ; convert 8.24 -> 4.12: (-8 .. +7.9) shift_round_16 zx, 4 shift_round_16 zy, 4 @@ -1042,10 +1042,10 @@ z_buffer_loop: ; Compare the previously stored z values ldy #0 - z_compare zx - z_compare zx + 1 - z_compare zy - z_compare zy + 1 + z_compare zx + 2 + z_compare zx + 3 + z_compare zy + 2 + z_compare zy + 3 cpy #4 bne z_no_matches @@ -1060,10 +1060,10 @@ z_no_matches: z_nothing_to_read: ; Store and expand - z_store zx - z_store zx + 1 - z_store zy - z_store zy + 1 + z_store zx + 2 + z_store zx + 3 + z_store zy + 2 + z_store zy + 3 z_advance stx z_buffer_end From d2f41f964435b3803ce694a70bf38687fd467caa Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:02:42 -0800 Subject: [PATCH 53/59] wip --- mandel.s | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/mandel.s b/mandel.s index 0400003..8b63941 100644 --- a/mandel.s +++ b/mandel.s @@ -425,14 +425,8 @@ viewport_oy: round16 arg ; 11-27 cycles .endmacro -.macro imul16_round dest, arg1, arg2, shift - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; ? 
cyc - shift_round_16 FR2, shift ; 103-119 cycles for shift=4 - copy16 dest, FR2 + 2 ; 12 cyc -.endmacro - +; input: arg1, arg2 as fixed4.12 +; output: dest as fixed8.24 .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc @@ -440,20 +434,16 @@ viewport_oy: copy32 dest, FR2 ; 24 cyc .endmacro -.macro sqr16_round dest, arg, shift - ;imul16_round dest, arg, arg, shift - copy16 FR0, arg ; 12 cyc - jsr sqr16_func ; ? cyc - shift_round_16 FR2, shift ; 103-119 cycles for shift=4 - copy16 dest, FR2 + 2 ; 12 cyc -.endmacro - +; input: arg as fixed4.12 +; output: dest as fixed8.24 .macro sqr16 dest, arg copy16 FR0, arg ; 12 cyc jsr sqr16_func ; ? cyc copy32 dest, FR2 ; 24 cyc .endmacro +; input: arg as u8 +; output: dest as u16 ; clobbers a, x .macro sqr8 dest, arg ldx arg From 1e0f577e099b3d7787d6e6d4fce1813ccd6b489c Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:09:11 -0800 Subject: [PATCH 54/59] wip --- mandel.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mandel.s b/mandel.s index 8b63941..6977582 100644 --- a/mandel.s +++ b/mandel.s @@ -453,6 +453,8 @@ viewport_oy: sta dest + 1 .endmacro +; input: arg as u8 +; input/output: dest as u16 ; clobbers a, x .macro sqr8_add16 dest, arg ldx arg From 81bf7f3c434646f0374c35f20131050bd314d1b2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 09:53:22 -0800 Subject: [PATCH 55/59] tweak --- mandel.s | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mandel.s b/mandel.s index 6977582..4ab6c19 100644 --- a/mandel.s +++ b/mandel.s @@ -1106,10 +1106,9 @@ cont: enough: .endmacro -.macro zoom_factor dest, src, zoom, aspect +.macro zoom_factor dest, src, aspect ; output: dest: fixed8.24 ; input: src: fixed4.12 - ; input: zoom: u8 ??? ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src @@ -1602,9 +1601,9 @@ skipped_mask: not_skipped_mask: ; run the fractal! 
- zoom_factor cx, sx, zoom, aspect_x + zoom_factor cx, sx, aspect_x add32 cx, cx, ox - zoom_factor cy, sy, zoom, aspect_y + zoom_factor cy, sy, aspect_y add32 cy, cy, oy jsr mandelbrot jsr pset From 2e8893fd7892429bc07bd1d653ef1319be7d2d7b Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 13:54:53 -0800 Subject: [PATCH 56/59] haha fuck me --- mandel.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mandel.s b/mandel.s index 4ab6c19..04edec5 100644 --- a/mandel.s +++ b/mandel.s @@ -320,7 +320,7 @@ viewport_oy: ; 38 cycles .macro add32 dest, arg1, arg2 - add 4, dest, arg2, dest + add 4, dest, arg1, arg2 .endmacro ; 8 cycles From cc83c76706519cce3fff61ce46df9589d31025d6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 14:16:43 -0800 Subject: [PATCH 57/59] update docs for 32-bit intermediates --- readme.md | 4 ++-- todo.md | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/readme.md b/readme.md index f297d60..d60644c 100644 --- a/readme.md +++ b/readme.md @@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication -The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. +The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. Iterations are capped at 255. @@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e ## Todo -See ideas in `todo.md`. \ No newline at end of file +See ideas in `todo.md`. 
diff --git a/todo.md b/todo.md index 29217cd..284d653 100644 --- a/todo.md +++ b/todo.md @@ -3,13 +3,11 @@ things to try: * skip add on the top-byte multiply in sqr8/mul8 * should save a few cycles, suggestion by jamey -* perform the zx_next = zx^s + cx in 32-bit space, before rounding - * should improve precision on max zoom, might cost a few cycles - * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * try 3.13 fixed point instead of 4.12 for more precision * can we get away without the extra bit? + * since exit compare space would be 6.26 i think so * y-axis mirror optimization From 7985ea9a399554340a76f8cfc340bb566d86a952 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Tue, 31 Dec 2024 14:45:38 -0800 Subject: [PATCH 58/59] fix panning for 32-bit --- mandel.s | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/mandel.s b/mandel.s index 04edec5..fe86366 100644 --- a/mandel.s +++ b/mandel.s @@ -1341,12 +1341,15 @@ skip_luma: cpy #KEY_MINUS beq minus - ; temp = $0010 << (8 - zoom) - lda #$10 - sta temp + ; temp+temp2 = $00010000 << (8 - zoom) lda #$00 + sta temp sta temp + 1 - scale_zoom temp + lda #$01 + sta temp + 2 + lda #$00 + sta temp + 3 + scale_zoom temp + 2 cpy #KEY_UP beq up @@ -1356,14 +1359,7 @@ skip_luma: beq left cpy #KEY_RIGHT beq right - cpy #KEY_1 - beq one - cpy #KEY_2 - beq two - cpy #KEY_3 - beq three - cpy #KEY_4 - beq four + jmp number_keys skip_char: lda #0 @@ -1382,17 +1378,28 @@ minus: dec zoom jmp done up: - sub16 oy, oy, temp + sub32 oy, oy, temp jmp done down: - add16 oy, oy, temp + add32 oy, oy, temp jmp done left: - sub16 ox, ox, temp + sub32 ox, ox, temp jmp done right: - add16 ox, ox, temp + add32 ox, ox, temp jmp done + +number_keys: + cpy #KEY_1 + beq one + cpy #KEY_2 + beq two + cpy #KEY_3 + beq three + cpy #KEY_4 + beq four + one: ldx #0 jmp load_key_viewport From d8601bb856ac0858ea7a06f4c60f162f1664c52a Mon Sep 17 00:00:00 2001
From: Brooke Vibber Date: Tue, 31 Dec 2024 15:03:43 -0800 Subject: [PATCH 59/59] fix fix --- mandel.s | 1 + 1 file changed, 1 insertion(+) diff --git a/mandel.s b/mandel.s index fe86366..b8985b3 100644 --- a/mandel.s +++ b/mandel.s @@ -1399,6 +1399,7 @@ number_keys: beq three cpy #KEY_4 beq four + jmp skip_char one: ldx #0