/* Reference_IDCT.c, Inverse Discrete Fourier Transform, double precision          */
/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

#include <math.h>
#include "idct_clip_table.h"
#define IDCT_SSE32_C
#include "idct_sse32.h"

/*  Perform IEEE 1180 reference (64-bit floating point, separable 8x1
 *  direct matrix multiply) Inverse Discrete Cosine Transform
*/

void __stdcall idct_sse32(short *block);

static const float ref_dct_matrix_t[8][8] =
{
    {/* [0][0-7] */ 0.353553, 0.490393, 0.461940, 0.415735, 0.353553, 0.277785, 0.191342, 0.097545},
    {/* [1][0-7] */ 0.353553, 0.415735, 0.191342, -0.097545, -0.353553, -0.490393, -0.461940, -0.277785},
    {/* [2][0-7] */ 0.353553, 0.277785, -0.191342, -0.490393, -0.353553, 0.097545, 0.461940, 0.415735},
    {/* [3][0-7] */ 0.353553, 0.097545, -0.461940, -0.277785, 0.353553, 0.415735, -0.191342, -0.490393},
    {/* [4][0-7] */ 0.353553, -0.097545, -0.461940, 0.277785, 0.353553, -0.415735, -0.191342, 0.490393},
    {/* [5][0-7] */ 0.353553, -0.277785, -0.191342, 0.490393, -0.353553, -0.097545, 0.461940, -0.415735},
    {/* [6][0-7] */ 0.353553, -0.415735, 0.191342, 0.097545, -0.353553, 0.490393, -0.461940, 0.277785},
    {/* [7][0-7] */ 0.353553, -0.490393, 0.461940, -0.415735, 0.353553, -0.277785, 0.191342, -0.097545}
};

void __stdcall idct_sse32(short *block)
{
	int i, j, v;
	float partial_product;
	float tmp[64];
	float fblock[64];

	__asm {
		finit;
	}

	for (i=0; i<8; i++)
	{
		for (j=0; j<8; j++)
			fblock[i*8 +j]= (float)block[i*8 +j];
	}
	
	for (i=0; i<8; i++)
	{
		for (j=0; j<8; j++)
		{
			partial_product = 0.0;
			__asm{
				//オフセットの計算と初期化。
				mov eax, dword ptr [i];
				mov ebx, dword ptr [j];
				shl eax, 5; //i*8 *4(byte) :offset
				shl ebx, 5; //j*8 *4(byte) :offset
				lea edi, [ref_dct_matrix_t +ebx];
				lea edx, [fblock +eax]; 
				
				//４個まとめて計算。
				// partial_product+= ref_dct_matrix[k][j]*block[8*i+k];
				movss xmm7, partial_product;// partial_product=0
				movups xmm1, [edx] ; //[block +8*i] (x0-x3)
				movups xmm2, [edi] ; //[ref_dct_matrix_t +8*j]
				mulps xmm1, xmm2 ;
				addps xmm7, xmm1 ;
				//残りの４個もまとめて計算
				movups xmm1, [edx +16] ;
				movups xmm2, [edi +16] ;
				mulps xmm1, xmm2 ;
				addps xmm7, xmm1 ;
				//８個の合計を求める
				movaps xmm1, xmm7 ;
				shufps xmm7, xmm1, 0x39 ;
				addps xmm7, xmm1 ;
				movaps xmm1, xmm7 ;
				shufps xmm7, xmm1,0x2 ;
				addss xmm7, xmm1 ;
				//出力〜。
				movss [partial_product], xmm7 ;
			}
			tmp[8*j+i] = partial_product;
		}
	}	
	/* Transpose operation is integrated into address mapping by switching 
	loop order of i and j */
		
	for (j=0; j<8; j++)
	{
		for (i=0; i<8; i++)
		{
			partial_product = 0.0;
			//for (k=0; k<8; k++)
			//partial_product+= ref_dct_matrix_t[i][k] *tmp[8*j+k];
			__asm{
				//initialize.
				mov eax, dword ptr [i];
				mov ebx, dword ptr [j];
				shl eax, 5 ;//i*8 *4(byte) :offset
				shl ebx, 5 ;//j*8 *4(byte) :offset
				lea edi, [ref_dct_matrix_t +ebx] ;
				lea edx, [tmp +eax] ;
				
				// partial_product+= ref_dct_matrix[k][j]*block[8*i+k];
				//計算準備。xmmレジスタへfloat4個（16バイト）ロード。
				movss xmm7, dword ptr [partial_product] ;
				movups xmm1, [edx] ;//[tmp +8*i]
				movups xmm2, [edi] ;//[ref_dct_matrix_t +8*i]
				mulps xmm1, xmm2 ;
				addps xmm7, xmm1 ;
				//残り4個もロード。
				movups xmm1, [edx +16] ;
				movups xmm2, [edi +16] ;
				mulps xmm1, xmm2 ;
				addps xmm7, xmm1 ;
				//Caluclation sum.
				movaps xmm1, xmm7 ;
				shufps xmm7, xmm1, 0x39 ;
				addps xmm7, xmm1 ;
				movaps xmm1, xmm7 ;
				shufps xmm7, xmm1,0x2 ;
				addss xmm7, xmm1 ;

				//save :partial data
				movss [partial_product], xmm7 ;
			}
				
			v = (int) floor(partial_product+0.5);
			block[8*j+i] = idct_clip_table[IDCT_CLIP_TABLE_OFFSET+v];
		}
	}
}
