/* TransformIMDCT_SSE2.c */
/* 2008/11/14            */

#include "StdAfx.h"

#include "VorbisTest.h"

#include "TransformDecoder.h"

/* */

/*
 * FFT8_SSE2
 *
 * Runs a three-step butterfly network over n consecutive groups of eight
 * complex values held as separate real / imaginary arrays.
 *
 *   n      : number of 8-value groups to process.
 *   Ar, Ai : input real / imaginary parts, 8 * n floats each.
 *   Zr, Zi : output real / imaginary parts, 8 * n floats each.
 *
 * Every access is an aligned 4-float load or store, so all four pointers
 * must be 16-byte aligned.  Each group is fully loaded before anything is
 * stored, so the output arrays may be the input arrays.  The results are
 * written in the lane order the butterflies leave them in; no reordering
 * is done here.
 */
void FFT8_SSE2(
	INT32        n,
	const FLOAT* Ar,
	const FLOAT* Ai,
	FLOAT*       Zr,
	FLOAT*       Zi)
{
	/* Step.1 rotation factors: cos / sin of k * pi / 4 for k = 0..3. */
	static const __m128 COS_8 = { 1.0F, 0.707106781F, 0.0F, -0.707106781F };
	static const __m128 SIN_8 = { 0.0F, 0.707106781F, 1.0F,  0.707106781F };

	/* Step.2 rotation factors: cos / sin of k * pi / 2 for k = 0, 1, twice. */
	static const __m128 COS_4 = { 1.0F, 0.0F, 1.0F, 0.0F };
	static const __m128 SIN_4 = { 0.0F, 1.0F, 0.0F, 1.0F };

	INT32 group;

	for (group = 0; group < n; group++) {
		const INT32 base = group * 8;

		/* Step.1: combine elements 0..3 with elements 4..7. */
		__m128 re_a = _mm_load_ps(Ar + base + 0);
		__m128 re_b = _mm_load_ps(Ar + base + 4);

		__m128 im_a = _mm_load_ps(Ai + base + 0);
		__m128 im_b = _mm_load_ps(Ai + base + 4);

		__m128 sum_re = _mm_add_ps(re_a, re_b);
		__m128 sum_im = _mm_add_ps(im_a, im_b);

		__m128 dif_re = _mm_sub_ps(re_a, re_b);
		__m128 dif_im = _mm_sub_ps(im_a, im_b);

		/* The difference is multiplied by (cos + i * sin). */
		__m128 rot_re = _mm_sub_ps(_mm_mul_ps(dif_re, COS_8), _mm_mul_ps(dif_im, SIN_8));
		__m128 rot_im = _mm_add_ps(_mm_mul_ps(dif_im, COS_8), _mm_mul_ps(dif_re, SIN_8));

		/* Step.2 */
		/* a: lanes 0,1 of the sums followed by lanes 0,1 of the rotated differences */
		/* b: lanes 2,3 of the sums followed by lanes 2,3 of the rotated differences */

		re_a = _mm_shuffle_ps(sum_re, rot_re, _MM_SHUFFLE(1, 0, 1, 0));
		re_b = _mm_shuffle_ps(sum_re, rot_re, _MM_SHUFFLE(3, 2, 3, 2));

		im_a = _mm_shuffle_ps(sum_im, rot_im, _MM_SHUFFLE(1, 0, 1, 0));
		im_b = _mm_shuffle_ps(sum_im, rot_im, _MM_SHUFFLE(3, 2, 3, 2));

		sum_re = _mm_add_ps(re_a, re_b);
		sum_im = _mm_add_ps(im_a, im_b);

		dif_re = _mm_sub_ps(re_a, re_b);
		dif_im = _mm_sub_ps(im_a, im_b);

		rot_re = _mm_sub_ps(_mm_mul_ps(dif_re, COS_4), _mm_mul_ps(dif_im, SIN_4));
		rot_im = _mm_add_ps(_mm_mul_ps(dif_im, COS_4), _mm_mul_ps(dif_re, SIN_4));

		/* Step.3 */
		/* a: lanes 0,2 of the sums followed by lanes 0,2 of the rotated differences */
		/* b: lanes 1,3 of the sums followed by lanes 1,3 of the rotated differences */

		re_a = _mm_shuffle_ps(sum_re, rot_re, _MM_SHUFFLE(2, 0, 2, 0));
		re_b = _mm_shuffle_ps(sum_re, rot_re, _MM_SHUFFLE(3, 1, 3, 1));

		im_a = _mm_shuffle_ps(sum_im, rot_im, _MM_SHUFFLE(2, 0, 2, 0));
		im_b = _mm_shuffle_ps(sum_im, rot_im, _MM_SHUFFLE(3, 1, 3, 1));

		/* Output: the last butterfly needs no rotation (plain sum / difference). */
		_mm_store_ps(Zr + base + 0, _mm_add_ps(re_a, re_b));
		_mm_store_ps(Zr + base + 4, _mm_sub_ps(re_a, re_b));

		_mm_store_ps(Zi + base + 0, _mm_add_ps(im_a, im_b));
		_mm_store_ps(Zi + base + 4, _mm_sub_ps(im_a, im_b));
	}
}

/* */

/*
 * FFT_N_SSE2
 *
 * Complex transform over t->n points, held as separate real / imaginary
 * arrays.
 *
 *   t      : transform state.  t->xr / t->xi are work buffers of t->n
 *            floats, t->start is the index of the first (largest) butterfly
 *            stage, and t->t holds the per-stage rotation tables C / S and
 *            the output index tables I.
 *   ar, ai : input real / imaginary parts, t->n floats each.
 *   zr, zi : output real / imaginary parts, t->n floats each.
 *
 * The buffers and the C / S tables are accessed with aligned 4-float loads
 * and stores, so they must be 16-byte aligned.  The input is copied into the
 * work buffers before anything is written to zr / zi, so the output arrays
 * may be the input arrays.
 * NOTE(review): zr / zi are expected to be distinct from t->xr / t->xi,
 * since the last pass gathers from the work buffers while writing the
 * output - confirm against callers.
 */
void FFT_N_SSE2(
	FFT_N_t*     t,
	const FLOAT* ar,
	const FLOAT* ai,
	FLOAT*       zr,
	FLOAT*       zi)
{
	INT32 i, j;
	INT32 span;

	/* Copy the input into the work buffers; all later passes run in place there. */
	{
		FLOAT* xr = t->xr;
		FLOAT* xi = t->xi;

		const FLOAT* ar0 = ar;
		const FLOAT* ai0 = ai;

		FLOAT* end = xr + t->n;
		while (xr < end) {
			__m128 r0 = _mm_load_ps(ar0);
			__m128 i0 = _mm_load_ps(ai0);

			_mm_store_ps(xr, r0);
			_mm_store_ps(xi, i0);

			xr += 4;
			xi += 4;

			ar0 += 4;
			ai0 += 4;
		}
	}

	/*
	 * Butterfly stages, from stage t->start down to stage 0.  Stage i works
	 * on blocks of 1 << (i + 4) elements; the number of blocks doubles with
	 * every stage (1 << span).  Within a block the lower and upper halves
	 * are combined: lower = a + b, upper = (a - b) * (C + i * S).
	 */
	for (i = t->start, span = 0; i >= 0; i--, span++) {
		INT32 n = 1 << (i + 4);
		INT32 n2 = n / 2;
		INT32 pos = 0;
		INT32 count = 1 << span;

		for (j = 0; j < count; j++, pos += n) {
			FLOAT* xr0 = t->xr + pos;
			FLOAT* xi0 = t->xi + pos;

			FLOAT* xr1 = t->xr + pos + n2;
			FLOAT* xi1 = t->xi + pos + n2;

			/* Every block of a stage starts again at the head of that stage's tables. */
			const FLOAT* C = t->t->C[i];
			const FLOAT* S = t->t->S[i];

			FLOAT* end = xr0 + n2;
			while (xr0 < end) {
				__m128 xr_0 = _mm_load_ps(xr0);
				__m128 xr_1 = _mm_load_ps(xr1);

				__m128 xi_0 = _mm_load_ps(xi0);
				__m128 xi_1 = _mm_load_ps(xi1);

				__m128 yr = _mm_sub_ps(xr_0, xr_1);
				__m128 yi = _mm_sub_ps(xi_0, xi_1);

				__m128 C0 = _mm_load_ps(C);
				__m128 S0 = _mm_load_ps(S);

				xr_0 = _mm_add_ps(xr_0, xr_1);
				xi_0 = _mm_add_ps(xi_0, xi_1);

				xr_1 = _mm_sub_ps(_mm_mul_ps(yr, C0), _mm_mul_ps(yi, S0));
				xi_1 = _mm_add_ps(_mm_mul_ps(yi, C0), _mm_mul_ps(yr, S0));

				_mm_store_ps(xr0, xr_0);
				_mm_store_ps(xr1, xr_1);

				_mm_store_ps(xi0, xi_0);
				_mm_store_ps(xi1, xi_1);

				xr0 += 4;
				xi0 += 4;

				xr1 += 4;
				xi1 += 4;

				C += 4;
				S += 4;
			}
		}
	}

	/* The remaining 8-element blocks are finished in place by the fixed-size kernel. */
	FFT8_SSE2(t->n / 8, t->xr, t->xi, t->xr, t->xi);

	/*
	 * Gather the results into the caller's arrays: output element k is work
	 * element I[t->start][k].
	 *
	 * The elements are copied as FLOAT through correctly typed pointers;
	 * accessing the float buffers through an integer pointer type would break
	 * C's effective-type (strict aliasing) rule.
	 */
	{
		const INT16* idx = t->t->I[t->start];

		const FLOAT* xr = t->xr;
		const FLOAT* xi = t->xi;

		FLOAT* zr0 = zr;
		FLOAT* zi0 = zi;

		FLOAT* end = zr0 + t->n;
		while (zr0 < end) {
			INT16 o = idx[0];
			zr0[0] = xr[o];
			zi0[0] = xi[o];

			o = idx[1];
			zr0[1] = xr[o];
			zi0[1] = xi[o];

			o = idx[2];
			zr0[2] = xr[o];
			zi0[2] = xi[o];

			o = idx[3];
			zr0[3] = xr[o];
			zi0[3] = xi[o];

			idx += 4;

			zr0 += 4;
			zi0 += 4;
		}
	}
}

/* */

/*
 * Forward declaration of the transform core called by IMDCT_N_Execute_SSE2:
 * it reads the input x and leaves its result in t->y.  Because it is declared
 * static here, the definition later in this file has internal linkage.
 */
static void IMDCT_N_DCT_SSE2(
	IMDCT_N_t*   t,
	const FLOAT* x);

/*
 * IMDCT_N_Execute_SSE2
 *
 * Runs the transform core on x and unfolds the t->n / 2 values it leaves
 * in t->y into t->n output samples:
 *
 *   first  n/4 samples :  y[0 .. n/4) in reverse order
 *   next   n/2 samples : -y[0 .. n/2)
 *   last   n/4 samples : -(last n/4 values of y[0 .. n/2)) in reverse order
 *
 *   t : transform state (length t->n, result buffer t->y).
 *   x : input coefficients, passed on to IMDCT_N_DCT_SSE2.
 *   z : output, t->n floats.
 *
 * z and t->y are accessed with aligned 4-float loads / stores and must be
 * 16-byte aligned.
 */
void IMDCT_N_Execute_SSE2(
	IMDCT_N_t*   t,
	const FLOAT* x,
	FLOAT*       z)
{
	const INT32 half    = t->n / 2;
	const INT32 quarter = t->n / 4;

	const FLOAT* src;
	FLOAT*       dst = z;
	FLOAT*       stop;

	IMDCT_N_DCT_SSE2(t, x);

	/* Part 1: walk y backwards from y[n/4 - 4], reversing the lanes of each vector. */
	src = t->y + quarter - 4;
	for (stop = dst + quarter; dst < stop; dst += 4, src -= 4) {
		__m128 v = _mm_load_ps(src);
		v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
		_mm_store_ps(dst, v);
	}

	/* Part 2: straight copy of y with the sign flipped (0 - y), eight floats per pass. */
	src = t->y;
	for (stop = dst + half; dst < stop; dst += 8, src += 8) {
		__m128 lo = _mm_sub_ps(_mm_setzero_ps(), _mm_load_ps(src + 0));
		__m128 hi = _mm_sub_ps(_mm_setzero_ps(), _mm_load_ps(src + 4));

		_mm_store_ps(dst + 0, lo);
		_mm_store_ps(dst + 4, hi);
	}

	/* Part 3: walk y backwards from y[n/2 - 4], reversing the lanes and flipping the sign. */
	src = t->y + half - 4;
	for (stop = dst + quarter; dst < stop; dst += 4, src -= 4) {
		__m128 v = _mm_load_ps(src);
		v = _mm_sub_ps(_mm_setzero_ps(), _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)));
		_mm_store_ps(dst, v);
	}
}

/*
 * IMDCT_N_DCT_SSE2
 *
 * Transform core of the inverse MDCT.  Reads t->n / 2 input values from x
 * and writes t->n / 2 values to t->y in three phases:
 *
 *   1. pre-twiddle : pairs even-indexed inputs taken from the front of x
 *                    with odd-indexed inputs taken from the back, rotates
 *                    them with the C / S tables and stores t->n / 4 complex
 *                    values in t->xr / t->xi;
 *   2. FFT_N_SSE2  : complex transform of t->xr / t->xi in place;
 *   3. post-twiddle: rotates the transform output again and interleaves the
 *                    results into t->y, working outwards from the middle.
 *
 * x, t->xr, t->xi, t->y and the tables are accessed with aligned 4-float
 * loads / stores and must be 16-byte aligned.  The function has internal
 * linkage through its static prototype earlier in the file.
 */
void IMDCT_N_DCT_SSE2(
	IMDCT_N_t*   t,
	const FLOAT* x)
{
	const INT32 n2 = t->n / 2;
	const INT32 n4 = t->n / 4;
	const INT32 n8 = t->n / 8;

	/* Phase 1: pre-twiddle. */
	{
		const FLOAT* cos_tab = t->t->C[t->index];
		const FLOAT* sin_tab = t->t->S[t->index];

		const FLOAT* head = x;           /* moves forward, 8 floats per pass  */
		const FLOAT* tail = x + n2 - 8;  /* moves backward, 8 floats per pass */

		FLOAT* re_out = t->xr;
		FLOAT* im_out = t->xi;

		INT32 k;

		for (k = 0; k < n4; k += 4) {
			__m128 head_lo = _mm_load_ps(head + 0);
			__m128 head_hi = _mm_load_ps(head + 4);

			__m128 tail_lo = _mm_load_ps(tail + 0);
			__m128 tail_hi = _mm_load_ps(tail + 4);

			/* head[0], head[2], head[4], head[6] */
			__m128 ev = _mm_shuffle_ps(head_lo, head_hi, _MM_SHUFFLE(2, 0, 2, 0));
			/* tail[7], tail[5], tail[3], tail[1] */
			__m128 od = _mm_shuffle_ps(tail_hi, tail_lo, _MM_SHUFFLE(1, 3, 1, 3));

			__m128 c = _mm_load_ps(cos_tab + k);
			__m128 s = _mm_load_ps(sin_tab + k);

			/* re = od * c - ev * s,  im = -(od * s + ev * c) */
			__m128 re     = _mm_sub_ps(_mm_mul_ps(od, c), _mm_mul_ps(ev, s));
			__m128 im_pos = _mm_add_ps(_mm_mul_ps(od, s), _mm_mul_ps(ev, c));

			_mm_store_ps(re_out + k, re);
			_mm_store_ps(im_out + k, _mm_sub_ps(_mm_setzero_ps(), im_pos));

			head += 8;
			tail -= 8;
		}
	}

	/* Phase 2: complex transform, input and output both in t->xr / t->xi. */
	FFT_N_SSE2(
		t->fft,
		t->xr, t->xi,
		t->xr, t->xi);

	/* Phase 3: post-twiddle. */
	{
		FLOAT* const       out = t->y;

		const FLOAT* const re  = t->xr;
		const FLOAT* const im  = t->xi;

		const FLOAT* const cs  = t->t->C[t->index];
		const FLOAT* const sn  = t->t->S[t->index];

		INT32 m;

		/*
		 * Each pass takes four complex values just below the middle (index
		 * n8 - 4 - m, group "a") and four just above it (index n8 + m,
		 * group "b"), and writes eight floats below and eight floats above
		 * the middle of y.
		 */
		for (m = 0; 2 * m < n4; m += 4) {
			const INT32 a = n8 - 4 - m;
			const INT32 b = n8 + m;

			FLOAT* const dn = out + n4 - 8 - 2 * m;
			FLOAT* const up = out + n4 + 2 * m;

			__m128 re_a = _mm_load_ps(re + a);
			__m128 re_b = _mm_load_ps(re + b);

			__m128 im_a = _mm_load_ps(im + a);
			__m128 im_b = _mm_load_ps(im + b);

			__m128 c_a = _mm_load_ps(cs + a);
			__m128 s_a = _mm_load_ps(sn + a);

			__m128 c_b = _mm_load_ps(cs + b);
			__m128 s_b = _mm_load_ps(sn + b);

			/* Values for the even output slots: -(im * s + re * c), same side. */
			__m128 even_dn = _mm_sub_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(im_a, s_a), _mm_mul_ps(re_a, c_a)));
			__m128 even_up = _mm_sub_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(im_b, s_b), _mm_mul_ps(re_b, c_b)));

			/* Values for the odd output slots: re * s - im * c, taken from the opposite side. */
			__m128 odd_up = _mm_sub_ps(_mm_mul_ps(re_a, s_a), _mm_mul_ps(im_a, c_a));
			__m128 odd_dn = _mm_sub_ps(_mm_mul_ps(re_b, s_b), _mm_mul_ps(im_b, c_b));

			/* First shuffle: e0 e1 o3 o2  and  e2 e3 o1 o0. */
			__m128 dn_0 = _mm_shuffle_ps(even_dn, odd_dn, _MM_SHUFFLE(2, 3, 1, 0));
			__m128 dn_1 = _mm_shuffle_ps(even_dn, odd_dn, _MM_SHUFFLE(0, 1, 3, 2));

			__m128 up_0 = _mm_shuffle_ps(even_up, odd_up, _MM_SHUFFLE(2, 3, 1, 0));
			__m128 up_1 = _mm_shuffle_ps(even_up, odd_up, _MM_SHUFFLE(0, 1, 3, 2));

			/* Second shuffle swaps the middle lanes: e0 o3 e1 o2  and  e2 o1 e3 o0. */
			dn_0 = _mm_shuffle_ps(dn_0, dn_0, _MM_SHUFFLE(3, 1, 2, 0));
			dn_1 = _mm_shuffle_ps(dn_1, dn_1, _MM_SHUFFLE(3, 1, 2, 0));
			up_0 = _mm_shuffle_ps(up_0, up_0, _MM_SHUFFLE(3, 1, 2, 0));
			up_1 = _mm_shuffle_ps(up_1, up_1, _MM_SHUFFLE(3, 1, 2, 0));

			_mm_store_ps(dn + 0, dn_0);
			_mm_store_ps(dn + 4, dn_1);

			_mm_store_ps(up + 0, up_0);
			_mm_store_ps(up + 4, up_1);
		}
	}
}

/* */

