/* TransformIMDCT.c */
/* 2008/11/10       */

#include "StdAfx.h"

#include "VorbisTest.h"

#include "TransformDecoder.h"

/* */

/*
 * FFT8
 *
 * Runs three radix-2 butterfly stages over n consecutive groups of eight
 * complex values. Real and imaginary parts live in separate arrays.
 *
 *   n      : number of 8-point groups to process
 *   Ar, Ai : input real / imaginary parts, n * 8 elements each
 *   Zr, Zi : output real / imaginary parts, n * 8 elements each
 *
 * The eight results of a group are stored in the order the butterflies
 * produce them, not in natural order; callers in this file reorder them
 * afterwards (TT_FFT8 through its SH table, FFT_N through its index table).
 *
 * All eight inputs of a group are read into locals before any output of
 * that group is written, so the call may be made in place (Zr == Ar,
 * Zi == Ai), which is how FFT_N uses it.
 */
static void FFT8(
	INT32        n,
	const FLOAT* Ar,
	const FLOAT* Ai,
	FLOAT*       Zr,
	FLOAT*       Zi)
{
	/* C[k] = cos(k * pi / 4), S[k] = sin(k * pi / 4) for k = 0..3 */
	static const FLOAT C[4] = { 1.0F, 0.707106781F, 0.0F, -0.707106781F };
	static const FLOAT S[4] = { 0.0F, 0.707106781F, 1.0F,  0.707106781F };

	/* working values of the current group */
	FLOAT xr[8];
	FLOAT xi[8];

	/* differences awaiting their twiddle multiplication */
	FLOAT yr[4];
	FLOAT yi[4];

	const FLOAT* ar = Ar;
	const FLOAT* ai = Ai;

	FLOAT* zr = Zr;
	FLOAT* zi = Zi;

	const FLOAT* end = ar + n * 8;

	while (ar < end) {
		/* Step 1: span-4 butterflies. Sums go to x[0..3]; differences are
		   multiplied by (C[k] + i*S[k]) and go to x[4..7]. */
		xr[0] = ar[0] + ar[4];
		xr[1] = ar[1] + ar[5];
		xr[2] = ar[2] + ar[6];
		xr[3] = ar[3] + ar[7];

		xi[0] = ai[0] + ai[4];
		xi[1] = ai[1] + ai[5];
		xi[2] = ai[2] + ai[6];
		xi[3] = ai[3] + ai[7];

		yr[0] = ar[0] - ar[4];
		yr[1] = ar[1] - ar[5];
		yr[2] = ar[2] - ar[6];
		yr[3] = ar[3] - ar[7];

		yi[0] = ai[0] - ai[4];
		yi[1] = ai[1] - ai[5];
		yi[2] = ai[2] - ai[6];
		yi[3] = ai[3] - ai[7];

		xr[4] = yr[0] * C[0] - yi[0] * S[0];
		xr[5] = yr[1] * C[1] - yi[1] * S[1];
		xr[6] = yr[2] * C[2] - yi[2] * S[2];
		xr[7] = yr[3] * C[3] - yi[3] * S[3];

		xi[4] = yi[0] * C[0] + yr[0] * S[0];
		xi[5] = yi[1] * C[1] + yr[1] * S[1];
		xi[6] = yi[2] * C[2] + yr[2] * S[2];
		xi[7] = yi[3] * C[3] + yr[3] * S[3];

		/* Step 2: span-2 butterflies inside each half. Differences are
		   multiplied by table entry 0 (1 + 0i) or entry 2 (0 + 1i). */
		yr[0] = xr[0] - xr[2];
		yr[1] = xr[1] - xr[3];
		yr[2] = xr[4] - xr[6];
		yr[3] = xr[5] - xr[7];

		yi[0] = xi[0] - xi[2];
		yi[1] = xi[1] - xi[3];
		yi[2] = xi[4] - xi[6];
		yi[3] = xi[5] - xi[7];

		xr[0] += xr[2];
		xr[1] += xr[3];
		xr[4] += xr[6];
		xr[5] += xr[7];

		xi[0] += xi[2];
		xi[1] += xi[3];
		xi[4] += xi[6];
		xi[5] += xi[7];

		xr[2] = yr[0] * C[0] - yi[0] * S[0];
		xr[3] = yr[1] * C[2] - yi[1] * S[2];
		xr[6] = yr[2] * C[0] - yi[2] * S[0];
		xr[7] = yr[3] * C[2] - yi[3] * S[2];

		xi[2] = yi[0] * C[0] + yr[0] * S[0];
		xi[3] = yi[1] * C[2] + yr[1] * S[2];
		xi[6] = yi[2] * C[0] + yr[2] * S[0];
		xi[7] = yi[3] * C[2] + yr[3] * S[2];

		/* Step 3: span-1 butterflies (no twiddle). The four sums are
		   stored in z[0..3], the four differences in z[4..7]. */
		zr[0] = xr[0] + xr[1];
		zr[1] = xr[4] + xr[5];
		zr[2] = xr[2] + xr[3];
		zr[3] = xr[6] + xr[7];

		zi[0] = xi[0] + xi[1];
		zi[1] = xi[4] + xi[5];
		zi[2] = xi[2] + xi[3];
		zi[3] = xi[6] + xi[7];

		zr[4] = xr[0] - xr[1];
		zr[5] = xr[4] - xr[5];
		zr[6] = xr[2] - xr[3];
		zr[7] = xr[6] - xr[7];

		zi[4] = xi[0] - xi[1];
		zi[5] = xi[4] - xi[5];
		zi[6] = xi[2] - xi[3];
		zi[7] = xi[6] - xi[7];

		/* advance to the next group of eight */
		ar += 8;
		ai += 8;

		zr += 8;
		zi += 8;
	}
}

/* */

/*
 * FFT_Swap
 *
 * Writes n entries into I as n/2 consecutive pairs. For pair k the first
 * entry is k (an index into the lower half) and the second entry is
 * n/2 + ((k + 1) & (n/2 - 1)), i.e. an index into the upper half advanced
 * by one position with wrap-around.
 *
 * NOTE(review): the wrap-around relies on n/2 being a power of two; every
 * caller in this file passes n = 1 << s.
 */
static void FFT_Swap(INT16* I, INT32 n)
{
	const INT32 half = n / 2;
	const INT32 mask = half - 1;

	INT16* out = I;
	INT32  k;

	for (k = 0; k < half; k++) {
		*out++ = (INT16)k;
		*out++ = (INT16)(half + ((k + 1) & mask));
	}
}

/*
 * FFT_Init
 *
 * Fills the shared FFT lookup tables in *t, taking all storage from pool.
 * Table slot i (0..7) serves transform size n = 1 << (i + 4), i.e. 16..2048.
 *
 *   t->C[i], t->S[i] : n/2 entries each, cos / sin of (pi * j / (n/2))
 *   t->I[i]          : n entries, the gather order FFT_N applies to its
 *                      scratch buffers when producing the final output
 *
 * The function returns early, leaving the tables incomplete, when any
 * allocation fails. There is no return value, so the caller is not told.
 */
static void FFT_Init(MemoryPool_t* pool, FFT_t* t)
{
	/* reordering applied inside each group of eight for the size-16 table */
	static const UINT8 SH[8] = { 0, 7, 6, 5, 4, 3, 2, 1 };

	INT32 i, j;

	/* twiddle factor tables */
	for (i = 0; i < 8; i++) {
		INT32 n  = 1 << (i + 4);
		INT32 n2 = n / 2;

		t->C[i] = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * n2);
		t->S[i] = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * n2);
		if (t->C[i] == NULL || t->S[i] == NULL) {
			return;
		}

		for (j = 0; j < n2; j++) {
			DOUBLE th = M_PI * j / n2;
			t->C[i][j] = (FLOAT)cos(th);
			t->S[i][j] = (FLOAT)sin(th);
		}
	}

	/* storage for the index tables */
	for (i = 0; i < 8; i++) {
		INT32 n = 1 << (i + 4);

		t->I[i] = (INT16*)MemoryPool_Allocate(pool, sizeof(INT16) * n);
		/* Test the pointer just allocated. The previous code compared the
		   table array t->I itself with NULL, which can never be true, so
		   a failed allocation reached FFT_Swap below as a NULL pointer. */
		if (t->I[i] == NULL) {
			return;
		}
	}

	/* contents of the index tables, smallest size first */
	for (i = 0; i < 8; i++) {
		INT32 n = 1 << (i + 4);

		FFT_Swap(t->I[i], n);

		if (i == 0) {
			/* size 16: keep the group of eight an entry belongs to and
			   remap its position inside the group through SH */
			for (j = 0; j < 16; j++) {
				INT32 k = t->I[0][j];
				INT32 m = k % 8;
				k = (k / 8) * 8 + SH[m];
				t->I[0][j] = (INT16)k;
			}

		} else {
			/* larger sizes: send each half through the previous (half
			   size) table; upper-half entries keep their n/2 offset */
			const INT16* pv = t->I[i - 1];

			INT32 n2 = n / 2;
			for (j = 0; j < n; j++) {
				INT32 k = t->I[i][j];
				INT32 x = (k < n2) ? pv[k] : pv[k - n2] + n2;
				t->I[i][j] = (INT16)x;
			}
		}
	}
}

/* */

/*
 * FFT_N_Create
 *
 * Prepares *t for transforms of 1 << s complex points.
 *
 *   pool : source of the two scratch buffers (1 << s FLOATs each)
 *   s    : log2 of the transform size; accepted range is 4..11
 *   fft  : shared tables, stored by pointer (not copied)
 *   t    : state to initialise
 *
 * Returns FALSE when s is out of range or a scratch buffer cannot be
 * allocated, TRUE otherwise.
 */
static BOOL FFT_N_Create(MemoryPool_t* pool, INT32 s, const FFT_t* fft, FFT_N_t* t)
{
	const INT32 slot = s - 4;   /* table slot that matches this size */

	if (slot < 0 || slot > 7) {
		return FALSE;
	}

	t->n     = 1 << s;
	t->start = slot;
	t->t     = fft;

	/* both buffers are requested before either result is examined */
	t->xr = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * t->n);
	t->xi = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * t->n);

	if (t->xr != NULL && t->xi != NULL) {
		return TRUE;
	}

	return FALSE;
}

/*
 * FFT_N
 *
 * Transforms t->n complex points using the tables referenced by t.
 *
 *   t      : state prepared by FFT_N_Create; its xr/xi buffers are scratch
 *   ar, ai : input real / imaginary parts, t->n elements each
 *   zr, zi : output real / imaginary parts, t->n elements each
 *
 * The input is copied into the scratch buffers first and the output is
 * gathered from them last, so ar/ai may be the same arrays as zr/zi
 * (IMDCT_N_DCT calls it that way).
 */
static void FFT_N(
	FFT_N_t*     t,
	const FLOAT* ar,
	const FLOAT* ai,
	FLOAT*       zr,
	FLOAT*       zi)
{
	INT32 i, j, k;
	INT32 span;

	FLOAT yr[4];
	FLOAT yi[4];

	/* gather order for the final output, chosen by transform size */
	const INT16* idx = t->t->I[t->start];

	/* work on a private copy of the input */
	for (i = 0; i < t->n; i++) {
		t->xr[i] = ar[i];
		t->xi[i] = ai[i];
	}

	/* One butterfly stage per table slot, from slot t->start down to 0.
	   The stage for slot i handles `count` sub-blocks of n = 1 << (i + 4)
	   points; this local n is the sub-block size, not t->n. */
	for (i = t->start, span = 0; i >= 0; i--, span++) {
		INT32 n = 1 << (i + 4);
		INT32 n2 = n / 2;
		INT32 pos = 0;
		INT32 count = 1 << span;

		for (j = 0; j < count; j++, pos += n) {
			/* lower half of the sub-block */
			FLOAT* xr0 = t->xr + pos;
			FLOAT* xi0 = t->xi + pos;

			/* upper half of the sub-block */
			FLOAT* xr1 = t->xr + pos + n2;
			FLOAT* xi1 = t->xi + pos + n2;

			/* twiddle tables restart at the top for every sub-block */
			const FLOAT* C = t->t->C[i];
			const FLOAT* S = t->t->S[i];

			/* four butterflies per iteration: the lower half receives the
			   sum, the upper half receives the difference multiplied by
			   (C + i*S) */
			for (k = 0; k < n2; k += 4) {
				yr[0] = xr0[0] - xr1[0];
				yr[1] = xr0[1] - xr1[1];
				yr[2] = xr0[2] - xr1[2];
				yr[3] = xr0[3] - xr1[3];

				yi[0] = xi0[0] - xi1[0];
				yi[1] = xi0[1] - xi1[1];
				yi[2] = xi0[2] - xi1[2];
				yi[3] = xi0[3] - xi1[3];

				xr0[0] += xr1[0];
				xr0[1] += xr1[1];
				xr0[2] += xr1[2];
				xr0[3] += xr1[3];

				xi0[0] += xi1[0];
				xi0[1] += xi1[1];
				xi0[2] += xi1[2];
				xi0[3] += xi1[3];

				xr1[0] = C[0] * yr[0] - S[0] * yi[0];
				xr1[1] = C[1] * yr[1] - S[1] * yi[1];
				xr1[2] = C[2] * yr[2] - S[2] * yi[2];
				xr1[3] = C[3] * yr[3] - S[3] * yi[3];

				xi1[0] = C[0] * yi[0] + S[0] * yr[0];
				xi1[1] = C[1] * yi[1] + S[1] * yr[1];
				xi1[2] = C[2] * yi[2] + S[2] * yr[2];
				xi1[3] = C[3] * yi[3] + S[3] * yr[3];

				xr0 += 4;
				xi0 += 4;

				xr1 += 4;
				xi1 += 4;

				C += 4;
				S += 4;
			}
		}
	}

	/* the remaining 8-point groups are finished in place */
	FFT8(t->n / 8, t->xr, t->xi, t->xr, t->xi);

	/* gather the scratch contents into the caller's arrays via idx */
	for (i = 0; i < t->n; i++) {
		j = *(idx++);
		zr[i] = t->xr[j];
		zi[i] = t->xi[j];
	}
}

/* */

/*
 * IMDCT_Init
 *
 * Fills the shared IMDCT twiddle tables in *t from memory taken out of
 * pool. Slot i (0..7) serves block size n = 1 << (i + 6) and holds n/4
 * entries of cos / sin evaluated at 2 * pi * (j + 1/8) / n.
 *
 * The function returns early, leaving later slots untouched, when an
 * allocation fails. There is no return value, so the caller is not told.
 */
static void IMDCT_Init(MemoryPool_t* pool, IMDCT_t* t)
{
	INT32 slot;

	for (slot = 0; slot < 8; slot++) {
		const INT32 size    = 1 << (slot + 6);
		const INT32 quarter = size / 4;

		FLOAT* cosTab;
		FLOAT* sinTab;
		INT32  k;

		cosTab = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * quarter);
		sinTab = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * quarter);

		/* publish both pointers before checking them */
		t->C[slot] = cosTab;
		t->S[slot] = sinTab;

		if (cosTab == NULL || sinTab == NULL) {
			return;
		}

		for (k = 0; k < quarter; k++) {
			DOUBLE angle = 2 * M_PI * (k + 1.0 / 8.0) / size;
			cosTab[k] = (FLOAT)cos(angle);
			sinTab[k] = (FLOAT)sin(angle);
		}
	}
}

/* */

/*
 * IMDCT_N_Create
 *
 * Prepares *t for IMDCT blocks of n = 1 << s.
 *
 *   pool  : source of the work buffers
 *   s     : log2 of the block size; accepted range is 6..13
 *   table : shared twiddle tables, stored by pointer
 *   fft   : FFT state used by the transform, stored by pointer.
 *           NOTE(review): the callers in this file create it with
 *           s - 2 (n/4 points); that is not verified here.
 *   t     : state to initialise
 *
 * Allocates xr and xi (n/4 FLOATs each) and y (n/2 FLOATs).
 * Returns FALSE when s is out of range or an allocation fails.
 */
static BOOL IMDCT_N_Create(
	MemoryPool_t*  pool,
	INT32          s,
	const IMDCT_t* table,
	FFT_N_t*       fft,
	IMDCT_N_t*     t)
{
	const INT32 slot = s - 6;

	if (slot < 0 || slot > 7) {
		return FALSE;
	}

	t->n     = 1 << s;
	t->index = slot;
	t->t     = table;
	t->fft   = fft;

	/* complex work buffers: both are requested before either is checked */
	t->xr = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * t->n / 4);
	t->xi = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * t->n / 4);
	if (t->xr == NULL || t->xi == NULL) {
		return FALSE;
	}

	/* real output buffer of the core transform */
	t->y = (FLOAT*)MemoryPool_Allocate(pool, sizeof(FLOAT) * t->n / 2);

	return (t->y != NULL) ? TRUE : FALSE;
}

/* Core transform, defined below: fills t->y (n/2 values) from x. */
static void IMDCT_N_DCT(
	IMDCT_N_t*   t,
	const FLOAT* x);

/*
 * IMDCT_N_Execute
 *
 * Runs the core transform on x and expands its n/2 results (t->y) into
 * the n output samples z, written in three consecutive parts:
 *
 *   n/4 values : y[n/4 - 1] down to y[0]
 *   n/2 values : -y[0] up to -y[n/2 - 1]
 *   n/4 values : -y[n/2 - 1] down to -y[n/4]
 *
 * Values are produced in groups of four (eight for the middle part), as
 * in the table-driven loops elsewhere in this file.
 */
static void IMDCT_N_Execute(
	IMDCT_N_t*   t,
	const FLOAT* x,
	FLOAT*       z)
{
	const INT32 half    = t->n / 2;
	const INT32 quarter = t->n / 4;

	const FLOAT* y   = t->y;
	FLOAT*       dst = z;

	INT32 base;
	INT32 lane;

	IMDCT_N_DCT(t, x);

	/* first part: first quarter of y, reversed */
	for (base = 0; base < quarter; base += 4) {
		for (lane = 0; lane < 4; lane++) {
			dst[lane] = y[quarter - 1 - base - lane];
		}
		dst += 4;
	}

	/* second part: all of y, negated; eight values per step of four */
	for (base = 0; base < quarter; base += 4) {
		for (lane = 0; lane < 8; lane++) {
			dst[lane] = -y[2 * base + lane];
		}
		dst += 8;
	}

	/* third part: second half of y, reversed and negated */
	for (base = 0; base < quarter; base += 4) {
		for (lane = 0; lane < 4; lane++) {
			dst[lane] = -y[half - 1 - base - lane];
		}
		dst += 4;
	}
}

/*
 * IMDCT_N_DCT
 *
 * Core of the IMDCT: turns the first n/2 values of x into n/2 values
 * stored in t->y, in three phases.
 *
 *   1. Pre-twiddle: build n/4 complex values in t->xr / t->xi.
 *   2. Complex transform of those values through FFT_N, in place.
 *   3. Post-twiddle: rotate the results and interleave them into t->y.
 *
 * NOTE(review): phase 2 presumes t->fft was created for n/4 points;
 * the callers in this file do so (s - 2), but it is not checked here.
 */
static void IMDCT_N_DCT(
	IMDCT_N_t*   t,
	const FLOAT* x)
{
	INT32 i;
	INT32 n2 = t->n / 2;
	INT32 n4 = t->n / 4;
	INT32 n8 = t->n / 8;

	{
		/* Phase 1. Each complex value pairs an even-indexed sample taken
		   from the front of x (x0[0], x0[2], ...) with an odd-indexed
		   sample taken from the back of the first half (x1[7], x1[5],
		   ...), rotated by the table entry (C, S). */
		FLOAT* xr = t->xr;
		FLOAT* xi = t->xi;

		const FLOAT* x0 = x;
		const FLOAT* x1 = x + n2 - 8;

		const FLOAT* C = t->t->C[t->index];
		const FLOAT* S = t->t->S[t->index];

		for (i = 0; i < n4; i += 4) {
			xr[0] = x1[7] * C[0] - x0[0] * S[0];
			xr[1] = x1[5] * C[1] - x0[2] * S[1];
			xr[2] = x1[3] * C[2] - x0[4] * S[2];
			xr[3] = x1[1] * C[3] - x0[6] * S[3];

			xi[0] = -(x1[7] * S[0] + x0[0] * C[0]);
			xi[1] = -(x1[5] * S[1] + x0[2] * C[1]);
			xi[2] = -(x1[3] * S[2] + x0[4] * C[2]);
			xi[3] = -(x1[1] * S[3] + x0[6] * C[3]);

			xr += 4;
			xi += 4;

			/* eight input samples are consumed from each end per step */
			x0 += 8;
			x1 -= 8;

			C += 4;
			S += 4;
		}
	}

	/* Phase 2. Input and output are the same arrays; FFT_N copies its
	   input into its own scratch buffers before writing any output. */
	FFT_N(
		t->fft,
		t->xr, t->xi,
		t->xr, t->xi);

	{
		/* Phase 3. Work outward from the middle: the "0" pointers walk
		   down from the centre, the "1" pointers walk up from it. */
		FLOAT* z0 = t->y + n4 - 8;
		FLOAT* z1 = t->y + n4;

		const FLOAT* xr0 = t->xr + n8 - 4;
		const FLOAT* xr1 = t->xr + n8;

		const FLOAT* xi0 = t->xi + n8 - 4;
		const FLOAT* xi1 = t->xi + n8;

		const FLOAT* C0 = t->t->C[t->index] + n8 - 4;
		const FLOAT* S0 = t->t->S[t->index] + n8 - 4;

		const FLOAT* C1 = t->t->C[t->index] + n8;
		const FLOAT* S1 = t->t->S[t->index] + n8;

		for (i = 0; i < n8; i += 4) {
			/* even output slots: -(xi * S + xr * C), same side, same
			   lane order */
			z0[0] = -(xi0[0] * S0[0] + xr0[0] * C0[0]);
			z0[2] = -(xi0[1] * S0[1] + xr0[1] * C0[1]);
			z0[4] = -(xi0[2] * S0[2] + xr0[2] * C0[2]);
			z0[6] = -(xi0[3] * S0[3] + xr0[3] * C0[3]);

			z1[0] = -(xi1[0] * S1[0] + xr1[0] * C1[0]);
			z1[2] = -(xi1[1] * S1[1] + xr1[1] * C1[1]);
			z1[4] = -(xi1[2] * S1[2] + xr1[2] * C1[2]);
			z1[6] = -(xi1[3] * S1[3] + xr1[3] * C1[3]);

			/* odd output slots: xr * S - xi * C, taken from the opposite
			   side with the lane order reversed */
			z1[1] =  xr0[3] * S0[3] - xi0[3] * C0[3];
			z1[3] =  xr0[2] * S0[2] - xi0[2] * C0[2];
			z1[5] =  xr0[1] * S0[1] - xi0[1] * C0[1];
			z1[7] =  xr0[0] * S0[0] - xi0[0] * C0[0];

			z0[1] =  xr1[3] * S1[3] - xi1[3] * C1[3];
			z0[3] =  xr1[2] * S1[2] - xi1[2] * C1[2];
			z0[5] =  xr1[1] * S1[1] - xi1[1] * C1[1];
			z0[7] =  xr1[0] * S1[0] - xi1[0] * C1[0];

			z0 -= 8;
			z1 += 8;

			xr0 -= 4;
			xr1 += 4;

			xi0 -= 4;
			xi1 += 4;

			C0 -= 4;
			C1 += 4;

			S0 -= 4;
			S1 += 4;
		}
	}
}

/* */

/* Lookup tables shared by every FFT_N_t / IMDCT_N_t created in this file;
   filled in by TransformIMDCT_Init(). */
static FFT_t   s_fft;
static IMDCT_t s_imdct;

/*
 * TransformIMDCT_Init
 *
 * Builds the shared FFT and IMDCT lookup tables (s_fft, s_imdct) with
 * memory taken from pool. Both helpers return nothing, so an allocation
 * failure inside them is not reported to the caller.
 */
void TransformIMDCT_Init(MemoryPool_t* pool)
{
	FFT_Init  (pool, &s_fft  );
	IMDCT_Init(pool, &s_imdct);
}

/*
 * TransformIMDCT_Uninit
 *
 * Counterpart of TransformIMDCT_Init. Currently does nothing: the tables
 * live in the pool handed to TransformIMDCT_Init and are not released here.
 */
void TransformIMDCT_Uninit(void)
{
}

/* */

/* Switch defined in another translation unit. When non-zero, the public
   entry points in this file call the *_SSE2 implementations. */
extern BOOL g_Enable_SSE2;

/* */

/*
 * TT_FFT8
 *
 * Test entry point: runs a single 8-point transform and stores the
 * result reordered through the SH table.
 *
 *   ar, ai : input real / imaginary parts, 8 elements each
 *   zr, zi : output real / imaginary parts, 8 elements each
 *
 * On the SSE2 path the input is first copied into a 16-byte aligned
 * buffer; the scalar path reads ar / ai directly.
 */
void TT_FFT8(
	const FLOAT* ar,
	const FLOAT* ai,
	FLOAT*       zr,
	FLOAT*       zi)
{
	static const UINT8 SH[8] = { 0, 7, 6, 5, 4, 3, 2, 1 };

	/* raw result: real parts in Z[0..7], imaginary parts in Z[8..15] */
	ALIGN(16) FLOAT Z[16];

	FLOAT* const outRe = Z;
	FLOAT* const outIm = Z + 8;

	INT32 k;

	if (g_Enable_SSE2) {
		ALIGN(16) FLOAT A[16];

		memcpy(A,     ar, 8 * sizeof(FLOAT));
		memcpy(A + 8, ai, 8 * sizeof(FLOAT));

		FFT8_SSE2(1, A, A + 8, outRe, outIm);
	} else {
		FFT8(1, ar, ai, outRe, outIm);
	}

	for (k = 0; k < 8; k++) {
		const INT32 src = SH[k];
		zr[k] = outRe[src];
		zi[k] = outIm[src];
	}
}

/* */

/* Stand-alone FFT object behind the TT_*FFT test API. */
struct TT_FFT {

	/* pool that TT_SetupFFT allocates the scratch buffers from */
	MemoryPool_t Pool;

	/* transform state, initialised by TT_SetupFFT */
	FFT_N_t fft;

};

/*
 * TT_CreateFFT
 *
 * Allocates a zero-filled TT_FFT_t and initialises its memory pool.
 * Returns NULL when the allocation fails. The object is handed back to
 * TT_ReleaseFFT when no longer needed.
 */
TT_FFT_t* TT_CreateFFT(void)
{
	/* calloc returns the block already zeroed */
	TT_FFT_t* obj = (TT_FFT_t*)calloc(1, sizeof(TT_FFT_t));

	if (obj != NULL) {
		MemoryPool_Init(&(obj->Pool));
	}

	return obj;
}

/*
 * TT_SetupFFT
 *
 * Configures t for transforms of 1 << s points, using the shared tables
 * and the object's own pool. Returns TRUE on success, FALSE when
 * FFT_N_Create rejects s or cannot allocate.
 */
BOOL TT_SetupFFT(
	TT_FFT_t* t,
	INT32     s)
{
	return FFT_N_Create(&(t->Pool), s, &s_fft, &(t->fft)) ? TRUE : FALSE;
}

/*
 * TT_ExecuteFFT
 *
 * Runs the transform configured by TT_SetupFFT on (ar, ai) and stores
 * the result in (zr, zi). Uses the SSE2 implementation when
 * g_Enable_SSE2 is set, the scalar one otherwise.
 */
void TT_ExecuteFFT(
	TT_FFT_t*    t,
	const FLOAT* ar,
	const FLOAT* ai,
	FLOAT*       zr,
	FLOAT*       zi)
{
	FFT_N_t* state = &(t->fft);

	if (g_Enable_SSE2) {
		FFT_N_SSE2(state, ar, ai, zr, zi);
		return;
	}

	FFT_N(state, ar, ai, zr, zi);
}

/*
 * TT_ReleaseFFT
 *
 * Frees an object obtained from TT_CreateFFT. Passing NULL is allowed.
 *
 * NOTE(review): only the object itself is freed; nothing is called on
 * t->Pool. Confirm whether the pool needs its own teardown.
 */
void TT_ReleaseFFT(
	TT_FFT_t* t)
{
	/* free(NULL) is a no-op, so no guard is needed */
	free(t);
}

/* */

/* Stand-alone IMDCT object behind the TT_*IMDCT test API. */
struct TT_IMDCT {

	/* pool that TT_SetupIMDCT allocates the work buffers from */
	MemoryPool_t Pool;

	/* FFT stage; after TT_SetupIMDCT, d keeps a pointer to this member */
	FFT_N_t   fft;
	/* IMDCT state, initialised by TT_SetupIMDCT */
	IMDCT_N_t d;

};

/*
 * TT_CreateIMDCT
 *
 * Allocates a zero-filled TT_IMDCT_t and initialises its memory pool.
 * Returns NULL when the allocation fails. The object is handed back to
 * TT_ReleaseIMDCT when no longer needed.
 */
TT_IMDCT_t* TT_CreateIMDCT(void)
{
	/* calloc returns the block already zeroed */
	TT_IMDCT_t* obj = (TT_IMDCT_t*)calloc(1, sizeof(TT_IMDCT_t));

	if (obj != NULL) {
		MemoryPool_Init(&(obj->Pool));
	}

	return obj;
}

/*
 * TT_SetupIMDCT
 *
 * Configures t for IMDCT blocks of 1 << s: first the FFT stage for
 * 1 << (s - 2) points, then the IMDCT state that refers to it. The
 * IMDCT state is not touched when the FFT stage cannot be created.
 * Returns TRUE only when both steps succeed.
 */
BOOL TT_SetupIMDCT(
	TT_IMDCT_t* t,
	INT32       s)
{
	MemoryPool_t* pool = &(t->Pool);

	if (FFT_N_Create(pool, s - 2, &s_fft, &(t->fft)) &&
	    IMDCT_N_Create(pool, s, &s_imdct, &(t->fft), &(t->d))) {
		return TRUE;
	}

	return FALSE;
}

/*
 * TT_ExecuteIMDCT
 *
 * Runs the IMDCT configured by TT_SetupIMDCT on a and stores the result
 * in z. Uses the SSE2 implementation when g_Enable_SSE2 is set, the
 * scalar one otherwise.
 */
void TT_ExecuteIMDCT(
	TT_IMDCT_t*  t,
	const FLOAT* a,
	FLOAT*       z)
{
	IMDCT_N_t* state = &(t->d);

	if (g_Enable_SSE2) {
		IMDCT_N_Execute_SSE2(state, a, z);
		return;
	}

	IMDCT_N_Execute(state, a, z);
}

/*
 * TT_ReleaseIMDCT
 *
 * Frees an object obtained from TT_CreateIMDCT. Passing NULL is allowed.
 *
 * NOTE(review): only the object itself is freed; nothing is called on
 * t->Pool. Confirm whether the pool needs its own teardown.
 */
void TT_ReleaseIMDCT(
	TT_IMDCT_t* t)
{
	/* free(NULL) is a no-op, so no guard is needed */
	free(t);
}

/* */

/*
 * Concrete decoder behind the TransformDecoder_t interface.
 * The interface struct must stay the first member: this file casts
 * between TransformDecoder_t* and TransformDecoderImpl_t*.
 */
struct TransformDecoderImpl {
	TransformDecoder_t d;       /* public interface (holds Transform) */
	FFT_N_t   fft;              /* FFT stage; imdct keeps a pointer to it */
	IMDCT_N_t imdct;            /* IMDCT state for the chosen block size */
};

typedef struct TransformDecoderImpl TransformDecoderImpl_t;

/*
 * TransformImpl
 *
 * Scalar Transform callback: recovers the implementation object from
 * the interface pointer and runs IMDCT_N_Execute on x, writing to z.
 */
static void TransformImpl(
	TransformDecoder_t* d0,
	const FLOAT*        x,
	FLOAT*              z)
{
	TransformDecoderImpl_t* self = (TransformDecoderImpl_t*)d0;
	IMDCT_N_t* state = &(self->imdct);

	IMDCT_N_Execute(state, x, z);
}

/*
 * TransformImpl_SSE2
 *
 * SSE2 Transform callback: recovers the implementation object from the
 * interface pointer and runs IMDCT_N_Execute_SSE2 on x, writing to z.
 */
static void TransformImpl_SSE2(
	TransformDecoder_t* d0,
	const FLOAT*        x,
	FLOAT*              z)
{
	TransformDecoderImpl_t* self = (TransformDecoderImpl_t*)d0;
	IMDCT_N_t* state = &(self->imdct);

	IMDCT_N_Execute_SSE2(state, x, z);
}

/*
 * CreateTransformDecoder
 *
 * Builds a decoder for IMDCT blocks of n samples, taking all memory
 * from pool.
 *
 *   n : block size; must be a power of two from 1 << 6 to 1 << 13
 *
 * Returns the decoder through its interface pointer, or NULL when n is
 * not supported or an allocation fails. The Transform callback is bound
 * once, here, according to g_Enable_SSE2.
 */
TransformDecoder_t* CreateTransformDecoder(
	MemoryPool_t* pool,
	INT32         n)
{
	TransformDecoderImpl_t* impl;
	INT32 log2n = 6;

	/* find s with n == 1 << s among the supported sizes */
	while (log2n < 14 && n != (1 << log2n)) {
		log2n++;
	}

	if (log2n >= 14) {
		return NULL;
	}

	impl = (TransformDecoderImpl_t*)MemoryPool_Allocate(pool, sizeof(TransformDecoderImpl_t));
	if (impl == NULL) {
		return NULL;
	}

	memset(impl, 0, sizeof(TransformDecoderImpl_t));

	/* the FFT stage works on n/4 points, hence log2n - 2 */
	if (!FFT_N_Create(pool, log2n - 2, &s_fft, &(impl->fft))) {
		return NULL;
	}

	if (!IMDCT_N_Create(pool, log2n, &s_imdct, &(impl->fft), &(impl->imdct))) {
		return NULL;
	}

	impl->d.Transform = g_Enable_SSE2 ? TransformImpl_SSE2 : TransformImpl;

	return (TransformDecoder_t*)impl;
}

/* */

