//-----------------------------------------------------------------------------
// Voice synthesis based on f0, spectrogram and spectrogram of 
// excitation signal.
//-----------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "world.h"

namespace stand
{
namespace math
{
namespace dsp
{

//-----------------------------------------------------------------------------
// GetOneFrameSegment() builds the time-domain response of a single frame.
// The minimum phase spectrum obtained from the spectral envelope is 
// multiplied by the excitation spectrum of the same frame, and the product 
// is brought back to the time domain by an inverse FFT.
// Input:
//   f0                     : f0 contour (not referenced in this function)
//   spectrogram            : Spectrogram (WORLD assumes spectrogram by Star())
//   residual_spectrogram   : Spectrum of the excitation signal. Each frame 
//                            is read as [DC, re(1), im(1), re(2), im(2), 
//                            ..., Nyquist], fft_size values in total.
//   fft_size               : FFT size used for Star() and Platinum()
//   current_frame          : Frame number used for the synthesis
//   minimum_phase          : Struct for minimum phase analysis
//   inverse_real_fft       : Struct for inverse FFT
// Output:
//   y                      : Output signal (fft_size samples are written)
// Caution: 
//   minimum_phase and inverse_real_fft must be set up by the caller before 
//   this function is called; they are only used here, never allocated.
//-----------------------------------------------------------------------------
void GetOneFrameSegment(double *f0, double **spectrogram, 
  double **residual_spectrogram, int fft_size, int current_frame, 
  MinimumPhaseAnalysis *minimum_phase, InverseRealFFT *inverse_real_fft, 
  double *y)
{
  const double *envelope = spectrogram[current_frame];
  const double *excitation = residual_spectrogram[current_frame];
  const int nyquist = fft_size/2;

  // Half of the log envelope is handed to the minimum phase analysis.
  int bin = 0;
  while(bin < minimum_phase->fft_size/2+1)
  {
    minimum_phase->log_spectrum[bin] = log(envelope[bin])/2.0;
    bin++;
  }
  GetMinimumPhaseSpectrum(minimum_phase);

  // DC bin: only the real parts are combined, the imaginary part is zero.
  inverse_real_fft->spectrum[0][0] = 
    minimum_phase->minimum_phase_spectrum[0][0] * excitation[0];
  inverse_real_fft->spectrum[0][1] = 0;

  // Bins between DC and Nyquist: complex product (filter * source).
  for(bin = 1;bin < nyquist;bin++)
  {
    const double filter_re = minimum_phase->minimum_phase_spectrum[bin][0];
    const double filter_im = minimum_phase->minimum_phase_spectrum[bin][1];
    const double source_re = excitation[(bin-1)*2+1];
    const double source_im = excitation[bin*2];

    inverse_real_fft->spectrum[bin][0] = 
      filter_re * source_re - filter_im * source_im;
    inverse_real_fft->spectrum[bin][1] = 
      filter_re * source_im + filter_im * source_re;
  }

  // Nyquist bin: same treatment as the DC bin.
  inverse_real_fft->spectrum[nyquist][0] = 
    minimum_phase->minimum_phase_spectrum[nyquist][0] * 
    excitation[fft_size-1];
  inverse_real_fft->spectrum[nyquist][1] = 0;

  fft_execute(inverse_real_fft->inverse_fft);

  // The result of the inverse transform is scaled by 1/fft_size.
  for(int n = 0;n < fft_size;n++)
  {
    y[n] = inverse_real_fft->waveform[n] / (double)fft_size;
  }
}

//-----------------------------------------------------------------------------
// Synthesis() synthesizes the voice from three parameters. One response is 
// generated per pitch period with GetOneFrameSegment() and its first 
// 3*fft_size/4 samples are added to the output at the current position.
// Input:
//   f0                     : f0 contour. A value that is not positive 
//                            (0, negative or NaN) is replaced by DEFAULT_F0.
//   f0_length              : Length of f0
//   spectrogram            : Spectrogram (WORLD assumes spectrogram by Star())
//   residual_spectrogram   : Extracted spectrum of the excitation signal
//   fft_size               : FFT size used for Star() and Platinum()
//   frame_period           : Temporal interval for Dio(). It is divided by 
//                            1000 before being compared with the time in 
//                            seconds (presumably given in msec).
//   fs                     : Sampling frequency
//   y_length               : Length of Output (allocated in advance)
// Output:
//   y                      : Synthesized voice
// Caution: 
//   The responses are accumulated into y with +=, and y is never cleared 
//   here. y must therefore hold its intended initial contents (normally 
//   zeros) when this function is called.
//   Nothing is written to y if a length or rate parameter is not positive 
//   or if the working buffer cannot be allocated.
//-----------------------------------------------------------------------------
void Synthesis(double *f0, int f0_length, double **spectrogram, 
  double **residual_spectrogram, int fft_size, double frame_period, int fs, 
  int y_length, double *y)
{
  // Frame 0 is read unconditionally below, so an empty contour cannot be 
  // processed. The remaining checks reject values that cannot describe a 
  // signal and would lead to invalid frame or sample indices.
  // !(x > 0.0) is used so that NaN is rejected as well.
  if(f0_length <= 0 || y_length <= 0 || fft_size <= 0 || fs <= 0 ||
    !(frame_period > 0.0)) return;

  double *impulse_response = (double *)malloc(sizeof(double) * fft_size);
  if(impulse_response == NULL) return;

  MinimumPhaseAnalysis minimum_phase = {0};
  InitializeMinimumPhaseAnalysis(fft_size, &minimum_phase);
  InverseRealFFT inverse_real_fft = {0};
  InitializeInverseRealFFT(fft_size, &inverse_real_fft);

  // Number of samples of each response that are added to the output.
  const int segment_length = 3*fft_size/4;

  double current_time = 0.0;
  int current_position = 0;
  int current_frame = 0;
  for(;;)
  {
    for(int j = 0;j < fft_size;j++) impulse_response[j] = 0.0;

    GetOneFrameSegment(f0, spectrogram, residual_spectrogram, 
      fft_size, current_frame, &minimum_phase, &inverse_real_fft, 
      impulse_response);

    // Overlap-add, clipped at the end of the output buffer.
    for(int j = 0;j < segment_length;j++)
    {
      if(j+current_position >= y_length) break;
      y[j+current_position] += impulse_response[j];
    }

    // Update: advance by one pitch period. Only a positive f0 is used as 
    // it is; otherwise the time could stand still at a wrong place or run 
    // backwards, which would yield negative indices.
    const double frame_f0 = f0[current_frame];
    current_time += 1.0/(frame_f0 > 0.0 ? frame_f0 : DEFAULT_F0);
    current_frame = (int)(current_time/(frame_period/1000.0) + 0.5);
    current_position = (int)(current_time*(double)fs);
    if(segment_length+1+current_position >= y_length ||
      current_frame >= f0_length) break;
  }

  DestroyMinimumPhaseAnalysis(&minimum_phase);
  DestroyInverseRealFFT(&inverse_real_fft);
  free(impulse_response);
  return;
}

}
}
}
