@nesterarm.s
@rww - custom arm routines for nesterds
	.section .itcm

	.align	4
	.arm

@==========================================================================
@PPU functions
@==========================================================================
	.global	PPU_render_bgASM
	.extern	ppuGlobals

@=====================================	
@PPU_render_bg
@NES background rendering
@=====================================
@-------------------------------------------------------------------------
@ Byte offsets into ppuGlobals used by PPU_render_bgASM.
@ These mirror the literal offsets this routine has always used; they must
@ stay in sync with the C-side definition of ppuGlobals.
@-------------------------------------------------------------------------
	.equ	PPUG_BG_PATTERN_TABLE_ADDR,	16		@ halfword
	.equ	PPUG_LOOPY_V,				22		@ halfword
	.equ	PPUG_LOOPY_X,				26		@ byte
	.equ	PPUG_VRAM_BANKS,			36		@ array of 1KB bank pointers
	.equ	PPUG_SOLID_BUF,				84		@ array of 32-bit flags
	.equ	VRAM_BANKS_TO_BG_PAL,		1408	@ bg_pal - PPU_VRAM_banks

	.equ	VRAM_BANK_MASK,				0x03FF	@ offset inside a 1KB bank
	.equ	SIDE_MARGIN,				8
	.equ	NES_COLOR_BASE,				64
	.equ	BG_WRITTEN_FLAG,			1

@-------------------------------------------------------------------------
@ DRAW_BG_PIXEL mask, idx
@ Emits background pixel number idx (0..7) of the current tile.
@   in:  r0  = pattern_hi, r12 = pattern_lo, r8 = attrib_bits,
@        r1  = &bg_pal[0], r4 = p, r5 = solid
@   out: p[idx]     = NES_COLOR_BASE + bg_pal[col]  (col forced to 0 when
@                     both pattern bits are clear)
@        solid[idx] = BG_WRITTEN_FLAG if either pattern bit is set, else 0
@   clobbers: r2, flags
@ Branch-free: the Z flag produced by the "tst r2, #3" survives until the
@ final moveq/movne because none of the instructions in between set flags.
@-------------------------------------------------------------------------
.macro DRAW_BG_PIXEL mask, idx
	tst		r12, #\mask
	moveq	r2, r8					@ col = attrib_bits
	orrne	r2, r8, #1				@ if(pattern_lo & mask) col |= 0x01
	tst		r0, #\mask
	orrne	r2, r2, #2				@ if(pattern_hi & mask) col |= 0x02
	tst		r2, #3					@ Z set -> both pattern bits clear
	moveq	r2, #0					@ transparent pixels use bg_pal[0]
	ldrb	r2, [r1, r2]
	add		r2, r2, #NES_COLOR_BASE
	strb	r2, [r4, #\idx]
	movne	r2, #BG_WRITTEN_FLAG
	moveq	r2, #0
	str		r2, [r5, #(\idx*4)]
.endm

@-------------------------------------------------------------------------
@ FETCH_ATTRIB_BITS
@ Reloads attrib_bits (r8) for the 2x2-tile quadrant the current tile is in.
@   in:  r1  = PPU_VRAM_banks, r3 = VRAM_BANK_MASK, r9 = attrib_addr,
@        r11 = tile_x, r12 = (tile_y & 2) << 1
@   out: r8  = ((NESVRAM(attrib_addr) >> shift) & 3) << 2
@        with shift = ((tile_y & 2) << 1) | (tile_x & 2), i.e. 0, 2, 4 or 6
@   clobbers: r0, r2
@-------------------------------------------------------------------------
.macro FETCH_ATTRIB_BITS
	mov		r2, r9, lsr #10
	ldr		r2, [r1, r2, lsl #2]	@ bank pointer for attrib_addr
	and		r0, r9, r3
	ldrb	r8, [r2, r0]			@ NESVRAM(attrib_addr)
	and		r0, r11, #2
	orr		r0, r0, r12				@ quadrant shift
	mov		r8, r8, lsr r0
	and		r8, r8, #3
	mov		r8, r8, lsl #2
.endm

@-------------------------------------------------------------------------
@ PPU_render_bgASM
@ Renders one scanline of NES background: 33 tiles of 8 pixels each.
@
@   in:   r0 = buf (byte pixel buffer for the scanline)
@   out:  r0 = buf (restored before returning)
@         pixels are stored from buf + (SIDE_MARGIN - loopy_x) onwards and
@         the matching 32-bit entries of solid_buf are set/cleared
@   preserves r4-r11; clobbers r1-r3, r12, flags
@
@ All PPU state is read from ppuGlobals at the offsets named above.
@
@ Register roles inside the tile loop:
@   r0  scratch / pattern_hi        r7  bg_pattern_table_addr + fine Y
@   r1  PPU_VRAM_banks (bg_pal      r8  attrib_bits
@       while pixels are drawn)     r9  attrib_addr
@   r2  scratch                     r10 name_addr
@   r3  VRAM_BANK_MASK              r11 tile_x
@   r4  p                           r12 scratch / pattern_lo
@   r5  solid                       [sp]    saved buf
@   r6  tiles left - 1              [sp,#4] (tile_y & 2) << 1
@
@ After every tile the routine advances tile_x/name_addr and, on the
@ matching boundaries, refreshes attrib_bits, steps attrib_addr, and wraps
@ into the horizontally adjacent nametable (bit 10 of the address), which
@ keeps name_addr/attrib_addr inside 0x2000-0x2FFF.
@-------------------------------------------------------------------------
	.type	PPU_render_bgASM, %function
PPU_render_bgASM:
	@callee-saved registers this routine uses
	stmfd	sp!, {r4-r11}

	ldr		r1, =ppuGlobals
	ldrh	r7, [r1, #PPUG_LOOPY_V]

	@tile_x = loopy_v & 0x1F, tile_y = (loopy_v & 0x03E0) >> 5
	and		r11, r7, #31
	mov		r12, r7, lsr #5
	and		r12, r12, #31

	@name_addr = 0x2000 + (loopy_v & 0x0FFF)
	@(ldrh zero-extends, so clearing bits 12-15 is the same as the AND)
	bic		r10, r7, #0xF000
	add		r10, r10, #0x2000

	@attrib_addr = 0x2000 + (loopy_v & 0x0C00) + 0x03C0
	@              + ((tile_y & 0xFFFC) << 1) + (tile_x >> 2)
	and		r9, r7, #0x0C00
	bic		r8, r12, #3
	add		r9, r9, r8, lsl #1
	add		r9, r9, r11, lsr #2
	add		r9, r9, #0x2000
	add		r9, r9, #0x03C0

	@r7 = bg_pattern_table_addr + ((loopy_v & 0x7000) >> 12)
	and		r7, r7, #0x7000
	mov		r7, r7, lsr #12
	ldrh	r2, [r1, #PPUG_BG_PATTERN_TABLE_ADDR]
	add		r7, r7, r2

	@p = buf + (SIDE_MARGIN - loopy_x), solid = solid_buf + (SIDE_MARGIN - loopy_x)
	ldrb	r2, [r1, #PPUG_LOOPY_X]
	rsb		r2, r2, #SIDE_MARGIN
	add		r4, r0, r2
	add		r5, r1, #PPUG_SOLID_BUF
	add		r5, r5, r2, lsl #2

	@from here on r1 is the base of PPU_VRAM_banks
	add		r1, r1, #PPUG_VRAM_BANKS

	@r3 stays VRAM_BANK_MASK for the whole routine
	ldr		r3, =VRAM_BANK_MASK

	@only bit 1 of tile_y is needed from here; keep it pre-shifted so it can
	@be OR'ed with (tile_x & 2) to form the attribute quadrant shift
	and		r12, r12, #2
	mov		r12, r12, lsl #1

	@[sp] = buf, [sp, #4] = (tile_y & 2) << 1
	stmfd	sp!, {r0, r12}

	@attrib_bits for the first tile
	FETCH_ATTRIB_BITS

	@33 tiles: the loop runs while the counter is >= 0 after the decrement
	mov		r6, #32

	@================================
.LbgRenderLoop:
	@r0 = tile index = NESVRAM(name_addr)
	mov		r0, r10, lsr #10
	ldr		r0, [r1, r0, lsl #2]
	and		r2, r10, r3
	ldrb	r0, [r0, r2]

	@r0 = pattern_addr = bg_pattern_table_addr + fine Y + (tile index << 4)
	add		r0, r7, r0, lsl #4

	@TODO: CHECK_MMC2(pattern_addr/r0);

	@r12 = pattern_lo = NESVRAM(pattern_addr)
	mov		r12, r0, lsr #10
	ldr		r12, [r1, r12, lsl #2]
	and		r2, r0, r3
	ldrb	r12, [r12, r2]

	@r0 = pattern_hi = NESVRAM(pattern_addr + 8)
	add		r0, r0, #8
	mov		r2, r0, lsr #10
	ldr		r2, [r1, r2, lsl #2]
	and		r0, r0, r3
	ldrb	r0, [r2, r0]

	@r1 temporarily becomes &bg_pal[0] while the eight pixels are drawn
	add		r1, r1, #VRAM_BANKS_TO_BG_PAL

	DRAW_BG_PIXEL	0x80, 0
	DRAW_BG_PIXEL	0x40, 1
	DRAW_BG_PIXEL	0x20, 2
	DRAW_BG_PIXEL	0x10, 3
	DRAW_BG_PIXEL	0x08, 4
	DRAW_BG_PIXEL	0x04, 5
	DRAW_BG_PIXEL	0x02, 6
	DRAW_BG_PIXEL	0x01, 7

	@back to PPU_VRAM_banks
	sub		r1, r1, #VRAM_BANKS_TO_BG_PAL

	@p += 8, solid += 8 (32-bit entries)
	add		r4, r4, #8
	add		r5, r5, #32

	@next tile
	add		r10, r10, #1
	add		r11, r11, #1

	@odd tile_x: still inside the same attribute quadrant
	tst		r11, #1
	bne		.LbgNextTile

	@tile_x % 4 == 2: same attribute byte, other quadrant
	tst		r11, #3
	bne		.LbgReloadAttrib

	@tile_x % 4 == 0 but not 32: next attribute byte in the same nametable
	tst		r11, #31
	bne		.LbgNextAttribByte

	@tile_x == 32: continue at column 0 of the horizontally adjacent
	@nametable. Step back to the start of the row first and only then flip
	@bit 10, so a carry out of the row arithmetic can never leak into the
	@nametable-select bit.
	sub		r10, r10, #32
	eor		r10, r10, #0x0400
	sub		r9, r9, #8
	eor		r9, r9, #0x0400
	sub		r11, r11, #32

.LbgNextAttribByte:
	add		r9, r9, #1

.LbgReloadAttrib:
	ldr		r12, [sp, #4]			@ (tile_y & 2) << 1
	FETCH_ATTRIB_BITS

.LbgNextTile:
	subs	r6, r6, #1
	bpl		.LbgRenderLoop
	@================================

	@restore buf; r12 receives the saved quadrant value, which is not needed
	@any more (r12 is a scratch register)
	ldmfd	sp!, {r0, r12}

	@TODO: PPU_bg_clip_left8

	ldmfd	sp!, {r4-r11}
	bx		lr

	@keep the literals (ppuGlobals address, bank mask) next to the routine
	.ltorg

@==========================================================================
@ARM utility functions (http://dualis.1emulation.com/dsti.html)
@==========================================================================
	.text
	.align	2
	.arm

@ These routines are executable code, so they are assembled into .text
@ rather than a read-only data section, as word-aligned ARM-state code.
@ Each one is a leaf routine that only touches r0 and returns with bx lr.
@ The C prototypes noted below are presumed from the register usage -
@ confirm against the declarations on the C side.

	.global GetTCMInfo
	.global GetCRInfo
	.global SetCRInfo

	.type	GetTCMInfo, %function
	.type	GetCRInfo, %function
	.type	SetCRInfo, %function

@ GetTCMInfo - presumably "u32 GetTCMInfo(void)"
@ out: r0 = CP15 register c0, c0 with opcode2 = 2
@      (NOTE(review): this is the TCM size register on the ARM946E-S -
@      confirm against the core's reference manual)
GetTCMInfo:
	mrc		p15, 0, r0, c0, c0, 2
	bx		lr

@ GetCRInfo - presumably "u32 GetCRInfo(void)"
@ out: r0 = CP15 register c1, c0 with opcode2 = 0 (the CP15 control register)
GetCRInfo:
	mrc		p15, 0, r0, c1, c0, 0
	bx		lr

@ SetCRInfo - presumably "void SetCRInfo(u32 value)"
@ in:  r0 = value written to CP15 register c1, c0 with opcode2 = 0
@      (the CP15 control register). The value is written as-is, with no
@      read-modify-write and no masking of reserved bits.
SetCRInfo:
	mcr		p15, 0, r0, c1, c0, 0
	bx		lr
