#include "idct_clip_table.h"
#define IDCT_SSE32_C
#include "idct_sse32.h"

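/*  Inverse Discrete Cosine Transform modelled on the IEEE 1180 reference
 *  (separable 8x1 direct matrix multiply).  Unlike the 64-bit floating
 *  point reference, this version computes in 32-bit single precision
 *  using SSE instructions.
*/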

/* Forward declaration; the definition follows the coefficient table below. */
void __stdcall idct_sse32(short *block);

/*
 *  8x8 coefficient table used by both passes of idct_sse32().
 *
 *  Each row is read as eight consecutive floats and multiplied
 *  element-wise with eight input values.  The entries agree, to six
 *  decimals, with 0.5 * c(u) * cos((2x + 1) * u * pi / 16) for row x and
 *  column u, where c(0) = 1/sqrt(2) and c(u) = 1 otherwise.
 */
static const float ref_dct_matrix_t[8][8] =
{
    { 0.353553,  0.490393,  0.461940,  0.415735,  0.353553,  0.277785,  0.191342,  0.097545},  /* row 0 */
    { 0.353553,  0.415735,  0.191342, -0.097545, -0.353553, -0.490393, -0.461940, -0.277785},  /* row 1 */
    { 0.353553,  0.277785, -0.191342, -0.490393, -0.353553,  0.097545,  0.461940,  0.415735},  /* row 2 */
    { 0.353553,  0.097545, -0.461940, -0.277785,  0.353553,  0.415735, -0.191342, -0.490393},  /* row 3 */
    { 0.353553, -0.097545, -0.461940,  0.277785,  0.353553, -0.415735, -0.191342,  0.490393},  /* row 4 */
    { 0.353553, -0.277785, -0.191342,  0.490393, -0.353553, -0.097545,  0.461940, -0.415735},  /* row 5 */
    { 0.353553, -0.415735,  0.191342,  0.097545, -0.353553,  0.490393, -0.461940,  0.277785},  /* row 6 */
    { 0.353553, -0.490393,  0.461940, -0.415735,  0.353553, -0.277785,  0.191342, -0.097545}   /* row 7 */
};

/*
 *  In-place 8x8 inverse DCT computed with single-precision SSE dot
 *  products against ref_dct_matrix_t.
 *
 *  block : 64 consecutive shorts; read as input coefficients and
 *          overwritten with the transformed, clipped samples.
 *
 *  Pass 1: tmp[j*8 + i] = dot(block[i*8 .. i*8+7], ref_dct_matrix_t[j]).
 *  Pass 2: block[j*8 + i] =
 *              idct_clip_table[round(dot(tmp[i*8 .. i*8+7],
 *                                        ref_dct_matrix_t[j]))
 *                              + IDCT_CLIP_TABLE_OFFSET]
 *
 *  The final float-to-int conversion rounds according to the MXCSR
 *  rounding mode (round-to-nearest-even unless the caller changed it).
 *
 *  NOTE(review): the clip table lookup is not bounds-checked here;
 *  presumably idct_clip_table covers every value this transform can
 *  produce - confirm against idct_clip_table.h.
 */
void __stdcall idct_sse32(short *block)
{
	int i, j;
	float tmp[64];
	float fblock[64];

	/* emms empties the MMX/x87 tag word before the floating-point work
	 * below; presumably callers may arrive with MMX state live - confirm. */
	__asm{emms }

	/* Widen the 64 input coefficients to float. */
	for (i=0; i<64; i++)
		fblock[i]= (float)block[i];

	/* Pass 1: dot each row of fblock with each table row; the result is
	 * stored transposed (tmp[j*8 + i]). */
	for (i=0; i<8; i++)
	{
		for (j=0; j<8; j++)
		{
			__asm{
				// Compute byte offsets (8 floats = 32 bytes per row) and clear the accumulator.
				mov eax, dword ptr [i]
				xorps xmm7, xmm7

				mov ebx, dword ptr [j]
				shl eax, 5

				shl ebx, 5
				lea edx, [fblock +eax]

				lea edi, [ref_dct_matrix_t +ebx]
				movups xmm1, [edx]

				movups xmm2, [edi]
				movups xmm3, [edx +16]

				// Element-wise products of the low and high halves of both rows.
				mulps xmm1, xmm2
				movups xmm4, [edi +16]

				mulps xmm3, xmm4

				addps xmm7, xmm1

				addps xmm7, xmm3

				// Horizontal sum, step 1: rotate lanes so lane 0 = s0+s1 and lane 2 = s2+s3.
				// Interleaved: eax becomes the byte offset of tmp[j*8 + i] (i*4 + j*32).
				movaps xmm1, xmm7
				mov eax, dword ptr [i]

				shufps xmm7, xmm1, 0x39
				shl eax, 2

				addps xmm7, xmm1
				add eax, ebx

				// Horizontal sum, step 2: bring lane 2 down to lane 0 and add.
				movaps xmm1, xmm7
				lea edi, [tmp +eax]

				shufps xmm7, xmm1,0x2

				addss xmm7, xmm1

				movss [edi], xmm7
			}
		}
	}

	/* Pass 2: dot each row of tmp with each table row, convert to
	 * integer, clip through idct_clip_table and store to block[j*8 + i]. */
	for (j=0; j<8; j++)
	{
		for (i=0; i<8; i++)
		{
			__asm{
				// Byte offsets of tmp row i and table row j; clear the accumulator.
				mov eax, dword ptr [i]
				xorps xmm7, xmm7

				shl eax, 5
				mov ebx, dword ptr [j]

				lea edx, [tmp +eax]
				shl ebx, 5

				movups xmm1, [edx]
				lea edi, [ref_dct_matrix_t +ebx]

				movups xmm3, [edx +16]
				movups xmm2, [edi]

				movups xmm4, [edi +16]
				mulps xmm1, xmm2

				mulps xmm3, xmm4

				addps xmm7, xmm1

				addps xmm7, xmm3

				// Horizontal sum of the four lanes into lane 0 (same sequence as pass 1).
				movaps xmm1, xmm7

				shufps xmm7, xmm1, 0x39

				addps xmm7, xmm1

				movaps xmm1, xmm7

				shufps xmm7, xmm1,0x2

				addss xmm7, xmm1

				// Round to integer using the MXCSR rounding mode instead of truncating toward zero.
				cvtss2si eax, xmm7

				// Address of the 16-bit clip table entry for the rounded value.
				lea ecx, [eax +IDCT_CLIP_TABLE_OFFSET]
				mov eax, dword ptr [j]

				lea ebx, [idct_clip_table +ecx*2]
				mov ecx, dword ptr [i]

				// edx = j*8 + i, the destination element index in block.
				lea edx, [ecx+eax*8]

				mov eax, dword ptr [block]
				mov cx, word ptr [ebx]

				mov word ptr [eax+edx*2],cx
			}
		}
	}
}