
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <NDS.h>

#include "_console.h"
#include "_consolewritelog.h"
#include "_const.h"
#include "memtool.h"

#include "customjpeg.h"

#include "arm9tcm.h"

// --------------------------

// RGB15 lookup table filled in by customjpeg_CreateYUV2RGBTable(): 32*32*32 u16
// entries, indexed by (cr<<10)|(cb<<5)|y. NULL until the table has been created.
static u16 *pYUV2RGBTable=NULL;

typedef s32 DCTELEM;		/* 16 or 32 bits is fine */

#define DCTSIZE		    8	/* The basic DCT block is 8x8 samples */
#define DCTSIZE2	    64	/* DCTSIZE squared; # of elements in a block */

// ------------------------------

// Read cursors into the coefficient stream currently being decoded. Both are
// set by the customjpeg_Decode* entry points and advanced by the DCT*_asm loaders.
//   pDCs : one s16 DC value per block.
//   pACs : per block, one count byte followed by that many s8 AC coefficients.
static s16 *pDCs;
static s8 *pACs;

#include "jpeg_idct_ifast.h"

// Dequantize the next 8x8 block from the pDCs/pACs streams into pblock and hand
// it to the 5-bit IDCT routine.
//   pQuantizeTable : 64 multipliers stored at the same index as the coefficient
//                    they scale; AC entries are multiplied with smulbb, so only
//                    their low 16 bits take part.
//   pblock         : receives the 64 values of the block.
//   isY            : true selects the luma path (DC bias of 64, clipy IDCT),
//                    false the chroma path (clipc IDCT).
// Consumes one s16 from pDCs, one count byte from pACs and then that many s8
// AC coefficients. Coefficients arrive in zigzag order; positions beyond the
// count are zero-filled.
// The asm defines global labels, hence noinline: a second copy of the body
// would define them twice.
static __attribute__ ((noinline)) void DCT5bit_asm(s32 *pQuantizeTable,DCTELEM *pblock,bool isY)
{
  pblock[0]=((DCTELEM)*pDCs++)*((DCTELEM)pQuantizeTable[0]);
  
  u32 accnt=(u32)*pACs++;
  
  if(accnt==0){
    // DC only: the block is flat, so the IDCT is skipped and every entry gets
    // the same value (a 5-bit level times 2, i.e. an even number in 0..62).
    s32 dc=pblock[0]>>5;
    if(isY==true) dc-=64;
    if(dc<0) dc=0;
    if(127<dc) dc=127;
    dc=(dc>>2)*2;
    for(u32 idx=0;idx<DCTSIZE2;idx++){
      pblock[idx]=dc;
    }
    return;
  }
  
// D(ofs,zigzag): if more than 'ofs' AC coefficients are present, load the next
// one, scale it and store it at 'zigzag'; otherwise jump into the matching
// entry of the N() chain below.
#define D(ofs,zigzag) \
  "cmps %3,#"ofs" \n" \
  "ldrhisb r2,[%0],#1 \n" \
  "ldrhi r3,[%2,#"zigzag"*4] \n" \
  "bls DCT5bity_asm_loadnull"ofs" \n" \
  "smulbbhi r2,r2,r3 \n" \
  "str r2,[%1,#"zigzag"*4] \n" \
  
// N(ofs,zigzag): zero-fill position 'zigzag'; execution falls through the
// remaining N() entries so every later position is cleared as well.
#define N(ofs,zigzag) \
  "DCT5bity_asm_loadnull"ofs": \n" \
  "str r4,[%1,#"zigzag"*4] \n" \
  
  asm volatile(
    "mov r4,#0 \n"
    
    D("0x00","0x01") D("0x01","0x08") D("0x02","0x10") D("0x03","0x09") D("0x04","0x02") D("0x05","0x03") D("0x06","0x0a") D("0x07","0x11")
    D("0x08","0x18") D("0x09","0x20") D("0x0a","0x19") D("0x0b","0x12") D("0x0c","0x0b") D("0x0d","0x04") D("0x0e","0x05") D("0x0f","0x0c")
    D("0x10","0x13") D("0x11","0x1a") D("0x12","0x21") D("0x13","0x28") D("0x14","0x30") D("0x15","0x29") D("0x16","0x22") D("0x17","0x1b")
    D("0x18","0x14") D("0x19","0x0d") D("0x1a","0x06") D("0x1b","0x07") D("0x1c","0x0e") D("0x1d","0x15") D("0x1e","0x1c") D("0x1f","0x23")
    D("0x20","0x2a") D("0x21","0x31") D("0x22","0x38") D("0x23","0x39") D("0x24","0x32") D("0x25","0x2b") D("0x26","0x24") D("0x27","0x1d")
    D("0x28","0x16") D("0x29","0x0f") D("0x2a","0x17") D("0x2b","0x1e") D("0x2c","0x25") D("0x2d","0x2c") D("0x2e","0x33") D("0x2f","0x3a")
    D("0x30","0x3b") D("0x31","0x34") D("0x32","0x2d") D("0x33","0x26") D("0x34","0x1f") D("0x35","0x27") D("0x36","0x2e") D("0x37","0x35")
    D("0x38","0x3c") D("0x39","0x3d") D("0x3a","0x36") D("0x3b","0x2f") D("0x3c","0x37") D("0x3d","0x3e") D("0x3e","0x3f")
    
    "b DCT5bity_asm_loadend \n"
    
    N("0x00","0x01") N("0x01","0x08") N("0x02","0x10") N("0x03","0x09") N("0x04","0x02") N("0x05","0x03") N("0x06","0x0a") N("0x07","0x11")
    N("0x08","0x18") N("0x09","0x20") N("0x0a","0x19") N("0x0b","0x12") N("0x0c","0x0b") N("0x0d","0x04") N("0x0e","0x05") N("0x0f","0x0c")
    N("0x10","0x13") N("0x11","0x1a") N("0x12","0x21") N("0x13","0x28") N("0x14","0x30") N("0x15","0x29") N("0x16","0x22") N("0x17","0x1b")
    N("0x18","0x14") N("0x19","0x0d") N("0x1a","0x06") N("0x1b","0x07") N("0x1c","0x0e") N("0x1d","0x15") N("0x1e","0x1c") N("0x1f","0x23")
    N("0x20","0x2a") N("0x21","0x31") N("0x22","0x38") N("0x23","0x39") N("0x24","0x32") N("0x25","0x2b") N("0x26","0x24") N("0x27","0x1d")
    N("0x28","0x16") N("0x29","0x0f") N("0x2a","0x17") N("0x2b","0x1e") N("0x2c","0x25") N("0x2d","0x2c") N("0x2e","0x33") N("0x2f","0x3a")
    N("0x30","0x3b") N("0x31","0x34") N("0x32","0x2d") N("0x33","0x26") N("0x34","0x1f") N("0x35","0x27") N("0x36","0x2e") N("0x37","0x35")
    N("0x38","0x3c") N("0x39","0x3d") N("0x3a","0x36") N("0x3b","0x2f") N("0x3c","0x37") N("0x3d","0x3e") N("0x3e","0x3f")
    
    "DCT5bity_asm_loadend: \n"
    
    // "cc": cmps rewrites the condition flags.
    // "memory": the asm reads the AC stream and writes pblock[1..63], neither of
    // which is visible to the compiler through the register operands.
    : "+r"(pACs) : "r"(pblock), "r"(pQuantizeTable), "r"(accnt)
    : "r2","r3","r4","cc","memory"
  );
  
#undef D
#undef N

  // The IDCT routines are declared outside this function (not visible here).
  if(isY==true){
    jpeg_idct_ifast5bit_clipy_asm(pblock);
    }else{
    jpeg_idct_ifast5bit_clipc_asm(pblock);
  }
}

// Dequantize the next 8x8 block from the pDCs/pACs streams into pblock and hand
// it to jpeg_idct_ifast13bit_asm().
//   pQuantizeTable : 64 multipliers stored at the same index as the coefficient
//                    they scale; AC entries are multiplied with smulbb, so only
//                    their low 16 bits take part.
//   pblock         : receives the 64 values of the block.
// Consumes one s16 from pDCs, one count byte from pACs and then that many s8
// AC coefficients. Coefficients arrive in zigzag order; positions beyond the
// count are zero-filled.
// The asm defines global labels, so this function must not be inlined or cloned.
static CODE_IN_ITCM_NOINLINE void DCT13bit_asm(s32 *pQuantizeTable,DCTELEM *pblock)
{
  pblock[0]=((DCTELEM)*pDCs++)*((DCTELEM)pQuantizeTable[0]);
  
  u32 accnt=(u32)*pACs++;
  
  if(accnt==0){
    // DC only: replicate pblock[0] (the dequantized DC, unscaled) into the other
    // 63 entries; the IDCT is not run in this case.
    // The fill pointer is advanced by the post-indexed ldr and every stmia!, so
    // it has to be a read-write operand on a scratch copy rather than an
    // input-only operand bound to pblock.
    DCTELEM *pfill=pblock;
    asm volatile(
      "ldr r0,[%0],#4 \n"
      "mov r1,r0 \n mov r2,r0 \n mov r3,r0 \n mov r4,r0 \n mov r5,r0 \n mov r6,r0 \n mov r7,r0 \n"
      "stmia %0!,{r1,r2,r3,r4,r5,r6,r7} \n"          // entries 1..7
      "stmia %0!,{r0,r1,r2,r3,r4,r5,r6,r7} \n"       // 7 more rows of 8 entries
      "stmia %0!,{r0,r1,r2,r3,r4,r5,r6,r7} \n"
      "stmia %0!,{r0,r1,r2,r3,r4,r5,r6,r7} \n"
      "stmia %0!,{r0,r1,r2,r3,r4,r5,r6,r7} \n"
      "stmia %0!,{r0,r1,r2,r3,r4,r5,r6,r7} \n"
      "stmia %0!,{r0,r1,r2,r3,r4,r5,r6,r7} \n"
      "stmia %0!,{r0,r1,r2,r3,r4,r5,r6,r7} \n"
      // "memory": pblock[0] must really be stored before the ldr above reads it,
      // and the 63 stores must be visible to whoever reads the block next.
      : "+r"(pfill)
      :
      : "r0","r1","r2","r3","r4","r5","r6","r7","memory"
    );
    return;
  }
  
// D(ofs,zigzag): if more than 'ofs' AC coefficients are present, load the next
// one, scale it and store it at 'zigzag'; otherwise jump into the matching
// entry of the N() chain below.
#define D(ofs,zigzag) \
  "cmps %3,#"ofs" \n" \
  "ldrhisb r2,[%0],#1 \n" \
  "ldrhi r3,[%2,#"zigzag"*4] \n" \
  "bls DCT13bity_asm_loadnull"ofs" \n" \
  "smulbbhi r2,r2,r3 \n" \
  "str r2,[%1,#"zigzag"*4] \n" \
  
// N(ofs,zigzag): zero-fill position 'zigzag'; execution falls through the
// remaining N() entries so every later position is cleared as well.
#define N(ofs,zigzag) \
  "DCT13bity_asm_loadnull"ofs": \n" \
  "str r4,[%1,#"zigzag"*4] \n" \
  
  asm volatile(
    "mov r4,#0 \n"
    
    D("0x00","0x01") D("0x01","0x08") D("0x02","0x10") D("0x03","0x09") D("0x04","0x02") D("0x05","0x03") D("0x06","0x0a") D("0x07","0x11")
    D("0x08","0x18") D("0x09","0x20") D("0x0a","0x19") D("0x0b","0x12") D("0x0c","0x0b") D("0x0d","0x04") D("0x0e","0x05") D("0x0f","0x0c")
    D("0x10","0x13") D("0x11","0x1a") D("0x12","0x21") D("0x13","0x28") D("0x14","0x30") D("0x15","0x29") D("0x16","0x22") D("0x17","0x1b")
    D("0x18","0x14") D("0x19","0x0d") D("0x1a","0x06") D("0x1b","0x07") D("0x1c","0x0e") D("0x1d","0x15") D("0x1e","0x1c") D("0x1f","0x23")
    D("0x20","0x2a") D("0x21","0x31") D("0x22","0x38") D("0x23","0x39") D("0x24","0x32") D("0x25","0x2b") D("0x26","0x24") D("0x27","0x1d")
    D("0x28","0x16") D("0x29","0x0f") D("0x2a","0x17") D("0x2b","0x1e") D("0x2c","0x25") D("0x2d","0x2c") D("0x2e","0x33") D("0x2f","0x3a")
    D("0x30","0x3b") D("0x31","0x34") D("0x32","0x2d") D("0x33","0x26") D("0x34","0x1f") D("0x35","0x27") D("0x36","0x2e") D("0x37","0x35")
    D("0x38","0x3c") D("0x39","0x3d") D("0x3a","0x36") D("0x3b","0x2f") D("0x3c","0x37") D("0x3d","0x3e") D("0x3e","0x3f")
    
    "b DCT13bity_asm_loadend \n"
    
    N("0x00","0x01") N("0x01","0x08") N("0x02","0x10") N("0x03","0x09") N("0x04","0x02") N("0x05","0x03") N("0x06","0x0a") N("0x07","0x11")
    N("0x08","0x18") N("0x09","0x20") N("0x0a","0x19") N("0x0b","0x12") N("0x0c","0x0b") N("0x0d","0x04") N("0x0e","0x05") N("0x0f","0x0c")
    N("0x10","0x13") N("0x11","0x1a") N("0x12","0x21") N("0x13","0x28") N("0x14","0x30") N("0x15","0x29") N("0x16","0x22") N("0x17","0x1b")
    N("0x18","0x14") N("0x19","0x0d") N("0x1a","0x06") N("0x1b","0x07") N("0x1c","0x0e") N("0x1d","0x15") N("0x1e","0x1c") N("0x1f","0x23")
    N("0x20","0x2a") N("0x21","0x31") N("0x22","0x38") N("0x23","0x39") N("0x24","0x32") N("0x25","0x2b") N("0x26","0x24") N("0x27","0x1d")
    N("0x28","0x16") N("0x29","0x0f") N("0x2a","0x17") N("0x2b","0x1e") N("0x2c","0x25") N("0x2d","0x2c") N("0x2e","0x33") N("0x2f","0x3a")
    N("0x30","0x3b") N("0x31","0x34") N("0x32","0x2d") N("0x33","0x26") N("0x34","0x1f") N("0x35","0x27") N("0x36","0x2e") N("0x37","0x35")
    N("0x38","0x3c") N("0x39","0x3d") N("0x3a","0x36") N("0x3b","0x2f") N("0x3c","0x37") N("0x3d","0x3e") N("0x3e","0x3f")
    
    "DCT13bity_asm_loadend: \n"
    
    // "cc": cmps rewrites the condition flags.
    // "memory": the asm reads the AC stream and writes pblock[1..63], neither of
    // which is visible to the compiler through the register operands.
    : "+r"(pACs) : "r"(pblock), "r"(pQuantizeTable), "r"(accnt)
    : "r2","r3","r4","cc","memory"
  );
  
#undef D
#undef N

  // Declared outside this function (not visible here).
  jpeg_idct_ifast13bit_asm(pblock);
}

// Convert one 8x8 block of Y/Cb/Cr samples to RGB15 pixels through pYUV2RGBTable.
//   pBuf        : top-left destination pixel; consecutive rows are 64 pixels apart.
//   py,pcb,pcr  : 8x8 sample blocks, one s32 per sample, row-major.
// For every pixel the asm forms y|(cb<<5)|(cr<<10) and uses it as a BYTE offset
// into the u16 table, so each sample has to be twice its 5-bit level (an even
// value in 0..62). The DC-only path of DCT5bit_asm produces exactly that;
// presumably the 5-bit IDCT does too - confirm against jpeg_idct_ifast.h.
// Two pixels are packed per 32-bit store, so pBuf is assumed to be 4-byte
// aligned - TODO confirm with the callers' buffers.
static void YUV111toRGB15_asm(u16 *pBuf,const DCTELEM *py,const DCTELEM *pcb,const DCTELEM *pcr)
{
// Emits the code for the horizontally adjacent pixels x and x+1 of the current
// row: two table lookups (r4 = pixel x, r7 = pixel x+1) and one word store.
#define YUV111_PIXELPAIR(x) \
      "ldr r4,[%1,#("x"+0)*4] \n" \
      "ldr r5,[%2,#("x"+0)*4] \n" \
      "ldr r7,[%1,#("x"+1)*4] \n" \
      "orr r4,r4,r5,lsl #5 \n" \
      "ldr r5,[%3,#("x"+0)*4] \n" \
      "ldr r8,[%2,#("x"+1)*4] \n" \
      "orr r4,r4,r5,lsl #10 \n" \
      "ldrh r4,[%4,r4] \n" \
      \
      "orr r7,r7,r8,lsl #5 \n" \
      "ldr r8,[%3,#("x"+1)*4] \n" \
      "orr r7,r7,r8,lsl #10 \n" \
      "ldrh r7,[%4,r7] \n" \
      \
      "orr r4,r4,r7,lsl #16 \n" \
      "str r4,[%0,#"x"*2] \n"

  for(u32 y=0;y<DCTSIZE;y++){
    asm volatile(
      YUV111_PIXELPAIR("0")
      YUV111_PIXELPAIR("2")
      YUV111_PIXELPAIR("4")
      YUV111_PIXELPAIR("6")
      
      // "memory": the sample blocks and the table are read, and the pixels
      // written, behind the compiler's back.
      : : "r"(pBuf), "r"(py),"r"(pcb),"r"(pcr), "r"(pYUV2RGBTable)
      : "r4","r5","r7","r8","memory"
    );
    
    pBuf+=64; // next output row
    py+=DCTSIZE; pcb+=DCTSIZE; pcr+=DCTSIZE;
  }

#undef YUV111_PIXELPAIR
}

// ----------------------------------------------

// Arguments for one 8x8 luma quadrant of a YUV411 macroblock; the converters
// take an array of four of these.
// YUV411_13bit_toRGB15_asm reads _pcb/_pcr at hard-coded byte offsets
// (4*2 and 4*3), so the order and size of the members must not change.
typedef struct {
  u16 *_pBuf; // top-left destination pixel of this quadrant (rows are 64 pixels apart)
  const DCTELEM *py; // the quadrant's 8x8 luma block
  const DCTELEM *_pcb,*_pcr; // first sample of the quadrant's 4x4 area inside the shared 8x8 Cb/Cr blocks
} TYUV411toRGB15_Data;

// Convert a constant to 16.16 fixed point; the conversion to s32 truncates.
// The argument is parenthesized so compound expressions scale as a whole.
#define FIX16(x) ((s32)((x)*0x10000))

// In-place clamps: MAX raises x to at least v, MIN lowers x to at most v.
// Each expands to a complete statement (the trailing ';' is part of the macro,
// kept for compatibility with existing uses) and evaluates x more than once.
#define MAX(x,v) (x)=(((x)<(v)) ? (v) : (x));
#define MIN(x,v) (x)=(((v)<(x)) ? (v) : (x));

// Plain C converter for one YUV411 macroblock: pData points at four quadrant
// descriptors. Every chroma sample colours a 2x2 group of luma samples. Each
// channel is clamped at 0, gets the previous pixel's discarded low 3 bits added,
// is reduced to 5 bits and clamped at 0x1f; the carry runs on across pixels,
// rows and quadrants in processing order.
static void YUV411toRGB15(const TYUV411toRGB15_Data *pData)
{
  u32 carryR=0,carryG=0,carryB=0;
  
  const TYUV411toRGB15_Data *pQuad=pData;
  for(u32 quad=0;quad<4;quad++,pQuad++){
    for(u32 cy=0;cy<4;cy++){
      for(u32 cx=0;cx<4;cx++){
        const u32 chromaIdx=(cy*DCTSIZE)+cx;
        const s32 cb=pQuad->_pcb[chromaIdx];
        const s32 cr=pQuad->_pcr[chromaIdx];
        
        // Chroma contribution per channel, shared by the 2x2 luma group.
        const s32 dr=(FIX16( 1.4020)*cr)/0x10000;
        const s32 dg=((FIX16(-0.3441)*cb)+(FIX16(-0.7139)*cr))/0x10000;
        const s32 db=((FIX16( 1.7718)*cb)+(FIX16(-0.0012)*cr))/0x10000;
        
        const DCTELEM *pLuma=&pQuad->py[((cy*2)*DCTSIZE)+(cx*2)];
        u16 *pOut=&pQuad->_pBuf[((cy*2)*64)+(cx*2)];
        
        // Fetch the four luma samples first: top-left, top-right, bottom-left, bottom-right.
        s32 luma[4];
        for(u32 sub=0;sub<4;sub++){
          luma[sub]=pLuma[((sub>>1)*DCTSIZE)+(sub&1)]+0x80;
        }
        
        for(u32 sub=0;sub<4;sub++){
          s32 r=luma[sub]+dr;
          if(r<0) r=0;
          r+=carryR; carryR=r&7; r>>=3;
          if(0x1f<r) r=0x1f;
          
          s32 g=luma[sub]+dg;
          if(g<0) g=0;
          g+=carryG; carryG=g&7; g>>=3;
          if(0x1f<g) g=0x1f;
          
          s32 b=luma[sub]+db;
          if(b<0) b=0;
          b+=carryB; carryB=b&7; b>>=3;
          if(0x1f<b) b=0x1f;
          
          pOut[((sub>>1)*64)+(sub&1)]=RGB15(r,g,b)|BIT15;
        }
      }
    }
  }
}

// ARM-assembly converter for one YUV411 macroblock whose samples come from the
// 13-bit IDCT. pData points at four quadrant descriptors. Every chroma sample
// colours a 2x2 group of luma samples. Per channel: luma+0x1000 plus the chroma
// term is clamped at 0, the previous pixel's discarded low 8 bits are added,
// the sum is shifted right by 8 and clamped at 0x1f. The carry (gr/gg/gb) runs
// on across pixels, rows and quadrants in processing order.
// Unlike the C version (YUV411toRGB15) the blue term uses only 1.7718*cb; the
// -0.0012*cr contribution is left out.
// Pixels are written in pairs with 32-bit stores, so the destination pointers
// are assumed to be 4-byte aligned - TODO confirm with the callers' buffers.
static CODE_IN_ITCM_NOINLINE void YUV411_13bit_toRGB15_asm(const TYUV411toRGB15_Data *pData)
{
  u32 gr=0,gg=0,gb=0;
  
  for(u32 idx=0;idx<4;idx++){
    for(u32 y=0;y<4;y++){
      for(u32 x=0;x<4;x++){
        s32 tr,tg,tb;

// Byte offsets of the chroma pointers inside TYUV411toRGB15_Data.
#define DataOffset__pcb "4*2"
#define DataOffset__pcr "4*3"

// ----------

#define FIX_1_4020 "0x166E9" // 1.4020*0x10000
#define FIX_0_3441 "0xFFFFA7EA" // -0.3441*0x10000
#define FIX_0_7139 "0xFFFF493E" // -0.7139*0x10000
#define FIX_1_7718 "0x1C594" // 1.7718*0x10000

#define REG_tr "%0"
#define REG_tg "%1"
#define REG_tb "%2"
#define REG_pData "%3"
#define REG_x "%4"
#define REG_y "%5"

#define REG_cb "r8"
#define REG_cr "r9"
#define REG_FIX1 "r10"
#define REG_FIX2 "r11"

        // Chroma terms tr/tg/tb for this 2x2 group. smulwb/smlawb multiply the
        // 16.16 constant by the low halfword of the sample and keep the top 32
        // bits of the 48-bit product. All inputs are consumed before the first
        // output register is written.
        asm volatile(
          // cb=pData->_pcb[(y*DCTSIZE)+x];
          "ldr "REG_cb",["REG_pData",#"DataOffset__pcb"] \n"
          "add "REG_cb","REG_cb","REG_y",lsl #5 \n" // row stride: 8 samples * 4 bytes = 32 bytes
          "ldr "REG_cb",["REG_cb","REG_x",lsl #2] \n"
          
          // cr=pData->_pcr[(y*DCTSIZE)+x];
          "ldr "REG_cr",["REG_pData",#"DataOffset__pcr"] \n"
          "add "REG_cr","REG_cr","REG_y",lsl #5 \n" // row stride: 8 samples * 4 bytes = 32 bytes
          "ldr "REG_cr",["REG_cr","REG_x",lsl #2] \n"
          
          // tr=1.4020*cr; tg=-0.3441*cb-0.7139*cr; tb=1.7718*cb;
          "ldr "REG_FIX1",="FIX_1_4020" \n"
          "ldr "REG_FIX2",="FIX_0_3441" \n"
          "smulwb "REG_tr","REG_FIX1","REG_cr" \n"
          "smulwb "REG_tg","REG_FIX2","REG_cb" \n"
          "ldr "REG_FIX1",="FIX_0_7139" \n"
          "ldr "REG_FIX2",="FIX_1_7718" \n"
          "smlawb "REG_tg","REG_FIX1","REG_cr","REG_tg" \n"
          "smulwb "REG_tb","REG_FIX2","REG_cb" \n"
          
          : "=r"(tr), "=r"(tg), "=r"(tb) : "r"(pData), "r"(x), "r"(y)
          : REG_cb,REG_cr,REG_FIX1,REG_FIX2
        );
        
#undef FIX_1_4020
#undef FIX_0_3441
#undef FIX_0_7139
#undef FIX_1_7718

#undef REG_tr
#undef REG_tg
#undef REG_tb
#undef REG_pData
#undef REG_x
#undef REG_y

#undef REG_cb
#undef REG_cr
#undef REG_FIX1
#undef REG_FIX2

#undef DataOffset__pcb
#undef DataOffset__pcr

// ----------

        const DCTELEM *_py=&pData->py[((y*2)*DCTSIZE)+(x*2)];
        
        u16 *__pBuf=&pData->_pBuf[((y*2)*64)+(x*2)];
        
// ----------

#define REG_gr "%0"
#define REG_gg "%1"
#define REG_gb "%2"
#define REG_tr "%3"
#define REG_tg "%4"
#define REG_tb "%5"
#define REG___pBuf "%6"
#define REG__py "%7"

#define REG_y0 "r8"
#define REG_y1 "r9"
#define REG_r "r10"
#define REG_g "r11"
#define REG_b "r12"
#define REG_rgb "r14"

// One colour channel:
//   c=luma+t; if(c<0) c=0; c+=err; err=c&0xff; c>>=8; if(0x1f<c) c=0x1f;
#define Y411_CHANNEL(c,err,luma,t) \
          "adds "c","luma","t" \n" \
          "movlt "c",#0 \n" \
          "add "c","err" \n" \
          "and "err","c",#0xff \n" \
          "lsr "c",#8 \n" \
          "cmps "c",#0x1f \n" \
          "movgt "c",#0x1f \n"

// All three channels of one pixel, in r,g,b order.
#define Y411_RGB(luma) \
          Y411_CHANNEL(REG_r,REG_gr,luma,REG_tr) \
          Y411_CHANNEL(REG_g,REG_gg,luma,REG_tg) \
          Y411_CHANNEL(REG_b,REG_gb,luma,REG_tb)

// Left pixel of a pair: starts REG_rgb with the pixel in the low halfword.
#define Y411_PIXEL_LO(luma) \
          Y411_RGB(luma) \
          "orr "REG_rgb","REG_r","REG_g",lsl #5 \n" \
          "orr "REG_rgb","REG_rgb","REG_b",lsl #10 \n" \
          "orr "REG_rgb",#0x8000 \n"

// Right pixel of a pair: merged into the high halfword of REG_rgb.
#define Y411_PIXEL_HI(luma) \
          Y411_RGB(luma) \
          "orr "REG_rgb","REG_rgb","REG_r",lsl #16+0 \n" \
          "orr "REG_rgb","REG_rgb","REG_g",lsl #16+5 \n" \
          "orr "REG_rgb","REG_rgb","REG_b",lsl #16+10 \n" \
          "orr "REG_rgb",#0x80000000 \n"

        asm volatile(
          // y0=_py[(0*DCTSIZE)+0]+0x1000;
          // y1=_py[(0*DCTSIZE)+1]+0x1000;
          "ldmia "REG__py",{"REG_y0","REG_y1"} \n"
          "add "REG_y0",#0x1000 \n"
          "add "REG_y1",#0x1000 \n"
          
          Y411_PIXEL_LO(REG_y0)
          Y411_PIXEL_HI(REG_y1)
          
          // top row of the 2x2 group
          "str "REG_rgb",["REG___pBuf"] \n"
          
          // y0=_py[(1*DCTSIZE)+0]+0x1000;
          // y1=_py[(1*DCTSIZE)+1]+0x1000;
          "ldr "REG_y0",["REG__py",#(8*4)+(0*4)] \n"
          "ldr "REG_y1",["REG__py",#(8*4)+(1*4)] \n"
          "add "REG_y0",#0x1000 \n"
          "add "REG_y1",#0x1000 \n"
          
          Y411_PIXEL_LO(REG_y0)
          Y411_PIXEL_HI(REG_y1)
          
          // bottom row of the 2x2 group, one 64-pixel output row further down
          "str "REG_rgb",["REG___pBuf",#64*2] \n"
          
          // "cc": adds/cmps rewrite the condition flags.
          // "memory": the luma block is read and the pixels are written behind
          // the compiler's back.
          : "+r"(gr), "+r"(gg), "+r"(gb) : "r"(tr), "r"(tg), "r"(tb), "r"(__pBuf), "r"(_py)
          : REG_y0,REG_y1,REG_r,REG_g,REG_b,REG_rgb,"cc","memory"
        );

#undef Y411_PIXEL_HI
#undef Y411_PIXEL_LO
#undef Y411_RGB
#undef Y411_CHANNEL

#undef REG_gr
#undef REG_gg
#undef REG_gb
#undef REG_tr
#undef REG_tg
#undef REG_tb
#undef REG___pBuf
#undef REG__py

#undef REG_y0
#undef REG_y1
#undef REG_r
#undef REG_g
#undef REG_b
#undef REG_rgb

      }
    }
    pData++;
  }
  
  // The "ldr rX,=constant" pseudo-instructions above take their constants from
  // a literal pool. .pool emits that pool here, and the branch skips over it.
  asm volatile(
    "b YUV411_13bit_toRGB15_asm_end \n"
    ".pool \n"
    "YUV411_13bit_toRGB15_asm_end: \n"
    : : :
  );
}

#undef FIX16

// Convert a value to 16.16 fixed point; the conversion to s32 truncates.
// The argument is parenthesized so compound expressions scale as a whole.
#define FIX16(x) ((s32)((x)*0x10000))

// Reset the table pointer so that the next customjpeg_CreateYUV2RGBTable() call
// allocates and fills a new table. Nothing is freed here: a table created
// earlier is simply forgotten.
void customjpeg_InitYUV2RGBTable(void)
{
  pYUV2RGBTable=NULL;
}

void customjpeg_CreateYUV2RGBTable(void)
{
  if(pYUV2RGBTable!=NULL) return;
  
  pYUV2RGBTable=(u16*)malloc(32*32*32*2); // Ȃ̂safemallocgȂB
  
  for(s32 y=0;y<32;y++){
    for(s32 cr=0;cr<32;cr++){
      for(s32 cb=0;cb<32;cb++){
        s32 ty=y;
        s32 tcb=cb-16;
        s32 tcr=cr-16;
        s32 r=(FIX16(ty)                     +(FIX16( 1.4020)*tcr))/0x10000;
        s32 g=(FIX16(ty)+(FIX16(-0.3441)*tcb)+(FIX16(-0.7139)*tcr))/0x10000;
        s32 b=(FIX16(ty)+(FIX16( 1.7718)*tcb)+(FIX16(-0.0012)*tcr))/0x10000;
        if(r<0) r=0;
        if(31<r) r=31;
        if(g<0) g=0;
        if(31<g) g=31;
        if(b<0) b=0;
        if(31<b) b=31;
        pYUV2RGBTable[(cr<<10)|(cb<<5)|(y<<0)]=RGB15(r,g,b)|BIT15;
      }
    }
  }
}

#undef FIX16

// Decode an 8x8 grid of YUV111 blocks (one Y, Cb and Cr block per 8x8 pixels)
// into a 64x64 RGB15 image.
//   pQuantizeTable : 64 dequantization multipliers, shared by all three components.
//   pData          : DCTSIZE*DCTSIZE*3 s16 DC values, immediately followed by the
//                    AC stream (per block: a count byte, then that many s8 values).
//                    Accessed as s16, so it is assumed to be 2-byte aligned - TODO confirm.
//   pBuf           : destination, 64 pixels per row.
// Returns without writing anything if the colour table could not be created.
void customjpeg_DecodeYUV111(s32 *pQuantizeTable,u8 *pData,u16 *pBuf)
{
  customjpeg_CreateYUV2RGBTable();
  // Without the table the converter would index through a NULL pointer.
  if(pYUV2RGBTable==NULL) return;
  
  pDCs=(s16*)pData;
  pACs=(s8*)&pDCs[DCTSIZE*DCTSIZE*3];
  
//  PrfStart();
  // 4.352ms (original author's timing note)
  for(u32 y=0;y<8;y++){
    for(u32 x=0;x<8;x++){
      static DCTELEM _y[DCTSIZE2],_cb[DCTSIZE2],_cr[DCTSIZE2];
      
      // Blocks are stored in Y, Cb, Cr order.
      DCT5bit_asm(pQuantizeTable,_y,true);
      DCT5bit_asm(pQuantizeTable,_cb,false);
      DCT5bit_asm(pQuantizeTable,_cr,false);
      
      YUV111toRGB15_asm(&pBuf[x*DCTSIZE],_y,_cb,_cr);
    }
    pBuf+=DCTSIZE*64; // next row of blocks: 8 pixel rows further down
  }
//  PrfEnd(0); ShowLogHalt();
}

// Decode a 4x4 grid of YUV411 macroblocks (four Y blocks plus one Cb and one Cr
// block per 16x16 pixels) into a 64x64 RGB15 image.
//   pQuantizeTable : 64 dequantization multipliers, shared by all components.
//   pData          : 4*4*6 s16 DC values, immediately followed by the AC stream.
//   pBuf           : destination, 64 pixels per row.
void customjpeg_DecodeYUV411(s32 *pQuantizeTable,u8 *pData,u16 *pBuf)
{
  static __attribute__ ((section (".dtcm"))) DCTELEM _y0[DCTSIZE2],_y1[DCTSIZE2],_y2[DCTSIZE2],_y3[DCTSIZE2],_cb[DCTSIZE2],_cr[DCTSIZE2];
  
  pDCs=(s16*)pData;
  pACs=(s8*)&pDCs[4*4*6];
  
  // Quadrant q covers the 8x8 area at column (q&1), row (q>>1) of the macroblock
  // and uses the matching 4x4 corner of the Cb/Cr blocks.
  DCTELEM *const pLuma[4]={ _y0, _y1, _y2, _y3 };
  TYUV411toRGB15_Data Quads[4];
  for(u32 q=0;q<4;q++){
    const u32 qx=q&1;
    const u32 qy=q>>1;
    Quads[q]._pBuf=NULL;
    Quads[q].py=pLuma[q];
    Quads[q]._pcb=&_cb[((4*qy)*DCTSIZE)+(4*qx)];
    Quads[q]._pcr=&_cr[((4*qy)*DCTSIZE)+(4*qx)];
  }
  
  // 3.759ms (original author's timing note)
  for(u32 mby=0;mby<4;mby++){
    // Point each quadrant at the first macroblock of this row.
    for(u32 q=0;q<4;q++){
      Quads[q]._pBuf=&pBuf[((8*(q>>1))*64)+(8*(q&1))];
    }
    
    for(u32 mbx=0;mbx<4;mbx++){
      // Blocks are stored in Y0, Y1, Y2, Y3, Cb, Cr order.
      for(u32 q=0;q<4;q++){
        DCT13bit_asm(pQuantizeTable,pLuma[q]);
      }
      DCT13bit_asm(pQuantizeTable,_cb);
      DCT13bit_asm(pQuantizeTable,_cr);
      
      YUV411_13bit_toRGB15_asm(Quads);
      
      // Step 16 pixels right to the next macroblock.
      for(u32 q=0;q<4;q++){
        Quads[q]._pBuf+=DCTSIZE*2;
      }
    }
    pBuf+=(DCTSIZE*2)*64; // next row of macroblocks: 16 pixel rows further down
  }
}

