diff --git a/.gitignore b/.gitignore
index 8d2f7ce..771e47a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
diff --git a/Makefile b/Makefile
index 25148b4..008bf8c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,17 @@
 
 all : mandel.xex
 
-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 
 %.o : %.s
 	ca65 -o $@ $<
 
+tables.s : tables.js
+	node tables.js > tables.s
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
 
diff --git a/mandel.s b/mandel.s
index 097b700..023a1ea 100644
--- a/mandel.s
+++ b/mandel.s
@@ -22,11 +22,12 @@ total_ms = $a4 ; float48
 total_pixels = $aa ; float48
 
 temp = $b0 ; u16
-pixel_ptr = $b2 ; u16
-pixel_color = $b4 ; u8
-pixel_mask = $b5 ; u8
-pixel_shift = $b6 ; u8
-pixel_offset = $b7 ; u8
+temp2 = $b2 ; u16
+pixel_ptr = $b4 ; u16
+pixel_color = $b6 ; u8
+pixel_mask = $b7 ; u8
+pixel_shift = $b8 ; u8
+pixel_offset = $b9 ; u8
 
 ; FP registers in zero page
 
@@ -83,6 +84,10 @@ SETVBV = $E45C
     mantissa .byte 6
 .endstruct
 
+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data
 
 strings:
@@ -206,6 +211,14 @@ color_map:
     add 4, dest, arg2, dest
 .endmacro
 
+; Fold the current carry flag into one byte: dest = dest + C.
+; The carry from the preceding add must still be live. Clobbers A.
+.macro add_carry dest
+    lda dest
+    adc #0
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
     sec ; 2 cyc
@@ -336,12 +349,12 @@ next:
 ; 5 to 25 cycles
 .macro check_sign arg
     ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register.
     .local positive
     lda arg + 1 ; 3 cyc
     bpl positive ; 2 cyc
     neg16 arg ; 18 cyc
-    inx ; 2 cyc
+    iny ; 2 cyc
 positive:
 .endmacro
 
@@ -370,13 +383,13 @@ positive:
 
 ; min 470 cycles
 ; max 780 cycles
-.proc imul16_func
+.proc imul16_func_orig
     arg1 = FR0 ; 16-bit arg (clobbered)
     arg2 = FR1 ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
 
-    ldx #0 ; 2 cyc
-    ; counts the number of sign bits in X
+    ldy #0 ; 2 cyc
+    ; counts the number of sign bits in Y
     check_sign arg1 ; 5 to 25 cyc
     check_sign arg2 ; 5 to 25 cyc
 
@@ -396,7 +409,102 @@ positive:
     .endrepeat
 
     ; In case of mixed input signs, return a negative result.
-    cpx #1 ; 2 cyc
+    cpy #1 ; 2 cyc
+    bne positive_result ; 2 cyc
+    neg32 result ; 34 cyc
+positive_result:
+
+    rts ; 6 cyc
+.endproc
+
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+; 8-bit * 8-bit -> 16-bit multiply by table lookup:
+;   a * x = (a + x)^2/2 - a^2/2 - x^2/2  (+1 when both factors are odd)
+; dest gets the product, low byte at dest and high byte at dest + 1.
+; Clobbers A, X and the flags; Y is not touched.
+; NOTE(review): the mul_* tables are generated by tables.js (not visible
+; here); presumably they hold the low/high bytes of n^2/2 - confirm.
+.macro imul8 dest, arg1, arg2
+    .scope
+    mul_factor_a = arg1
+    mul_factor_x = arg2
+    mul_product_lo = dest
+    mul_product_hi = dest + 1
+
+    lda mul_factor_a ; setup: 6 cycles
+    ;ldx mul_factor_x
+
+    clc ; (a + x)^2/2: 23 cycles
+    adc mul_factor_x
+    tax
+    bcc under256
+    lda mul_hibyte512,x ; sum overflowed a byte: use the upper table
+    bcs next ; carry is still set, which the sbc below needs
+  under256:
+    lda mul_hibyte256,x
+    sec
+  next:
+    sta mul_product_hi
+    lda mul_lobyte256,x
+
+    ldx mul_factor_a ; - a^2/2: 20 cycles
+    sbc mul_lobyte256,x
+    sta mul_product_lo
+    lda mul_product_hi
+    sbc mul_hibyte256,x
+    sta mul_product_hi
+
+    ldx mul_factor_x ; + x & a & 1: 22 cycles
+    txa ; (this is a kludge to correct a
+    and mul_factor_a ; roundoff error that makes odd * odd too low)
+    and #1
+
+    clc
+    adc mul_product_lo
+    bcc small_product
+    inc mul_product_hi
+  small_product:
+    sec ; - x^2/2: 25 cycles
+    sbc mul_lobyte256,x
+    sta mul_product_lo ; keep the finished low byte before A is reloaded
+    lda mul_product_hi
+    sbc mul_hibyte256,x
+    sta mul_product_hi
+    .endscope
+.endmacro
+
+; Signed 16-bit * 16-bit -> 32-bit multiply built from four imul8
+; partial products. Sign bits are counted in Y because imul8 clobbers X.
+.proc imul16_func
+    arg1 = FR0 ; 16-bit arg (clobbered)
+    arg2 = FR1 ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+
+    ldy #0 ; 2 cyc
+    ; counts the number of sign bits in Y
+    check_sign arg1 ; 5 to 25 cyc
+    check_sign arg2 ; 5 to 25 cyc
+
+    lda #0
+    sta result + 0
+    sta result + 1
+    sta result + 2
+    sta result + 3
+
+    imul8 temp, arg1, arg2
+    add16 result, result, temp
+
+    imul8 temp, arg1 + 1, arg2
+    add16 result + 1, result + 1, temp
+
+    imul8 temp, arg1, arg2 + 1
+    add16 result + 1, result + 1, temp
+    add_carry result + 3
+
+    imul8 temp, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, temp
+
+    ; In case of mixed input signs, return a negative result.
+    cpy #1 ; 2 cyc
     bne positive_result ; 2 cyc
     neg32 result ; 34 cyc
 positive_result: