#include "SamsungSolomonVoiceW_Int.h"
#include "SamsungSolomonVoiceW_Table.h"
#include "SamsungSolomonVoiceW_basic_op.h"
#include "SamsungSolomonVoiceW_FFT_functions.h"

#define TMP_DEBUG

#if(FLAG_SELECT_C_DSP == 1)
#include <stdio.h>
extern FILE *fp_inner8kout;
extern FILE /**fp_debug_FNLMSin_acc, */*fp_debug_FNLMSin_inner, *fp_debug_FNLMSin_outer, /**fp_debug_FNLMSout_acc, */*fp_debug_FNLMSout_inner, *fp_debug_FNLMSout_outr;
extern FILE /**fp_debug_RESout_acc, */*fp_debug_RESout_inner, *fp_debug_RESout_outer;
extern FILE *fp_debug_NSout_acc, *fp_debug_NSout_inner, *fp_debug_NSout_outer;

extern FILE *fp_debug_BFfbf_outer, *fp_debug_BFbm_outer, *fp_debug_BFgsc_outer;
#endif

#ifndef SRCB_FFT32_OPT
/*
 * Tx_c_fft_512_32b - in-place radix-2 complex transform on 32-bit data
 * (portable reference build).
 *
 * farray_ptr : DVTX_FFT_SIZE_512 ints laid out as interleaved
 *              (real, imag) pairs; overwritten with the result.
 * isign      : 1 selects the forward FFT, whose butterflies halve every
 *              output; any other value selects the inverse transform,
 *              which applies no per-stage scaling.
 */
void Tx_c_fft_512_32b(int * farray_ptr, short isign)
{
	short stage, pos, top, bot;	/* stage, sample and butterfly indices */
	short rev, bit;			/* bit-reversal bookkeeping */
	short half, span, step, tw;
	long long acc_re, acc_im;
	int hold, rot_re, rot_im;
	int top_re, top_im, bot_re, bot_im;

	/* Put the input into bit-reversed order, one complex pair at a time */
	for (pos = 0, rev = 0; pos < DVTX_FFT_SIZE_512 - 2; pos += 2)
	{
		if (rev > pos)
		{
			hold = farray_ptr[pos];
			farray_ptr[pos] = farray_ptr[rev];
			farray_ptr[rev] = hold;

			hold = farray_ptr[pos + 1];
			farray_ptr[pos + 1] = farray_ptr[rev + 1];
			farray_ptr[rev + 1] = hold;
		}

		/* Advance rev: strip leading set bits, then set the next one */
		for (bit = DVTX_SIZE_BY_TWO_256; rev >= bit; bit = DVTXOP_shr(bit, 1))
		{
			rev = DVTXOP_sub(rev, bit);
		}
		rev += bit;
	}

	if (isign == 1)
	{
		/* Forward FFT: every butterfly output is shifted right by one */
		for (stage = 0; stage < DVTX_NUM_STAGE_8; stage++)
		{
			half = DVTXOP_shl(2, stage);		/* FFT size */
			span = DVTXOP_shl(half, 1);		/* 2 * FFT size */
			step = tx_ii_table_512_swb[stage];	/* 2 * number of FFT's */

			for (pos = 0; pos < half; pos += 2)
			{
				tw = pos * step;	/* phase table index */

				for (top = pos; top < DVTX_FFT_SIZE_512; top += span)
				{
					bot = DVTXOP_add(top, half);	/* butterfly bottom */

					top_re = farray_ptr[top];
					top_im = farray_ptr[top + 1];
					bot_re = farray_ptr[bot];
					bot_im = farray_ptr[bot + 1];

					/* Rotate the bottom input by the phase-table entry */
					acc_re = DVTXOP_LL_sub(DVTXOP_LL_mult(bot_re, tx_phs_tbl_512_32b[tw]),
						DVTXOP_LL_mult(bot_im, tx_phs_tbl_512_32b[tw + 1]));
					acc_im = DVTXOP_LL_add(DVTXOP_LL_mult(bot_im, tx_phs_tbl_512_32b[tw]),
						DVTXOP_LL_mult(bot_re, tx_phs_tbl_512_32b[tw + 1]));

					rot_re = DVTXOP_L_round(acc_re);
					rot_im = DVTXOP_L_round(acc_im);

					farray_ptr[bot] = DVTXOP_L_shr(DVTXOP_L_sub(top_re, rot_re), 1);
					farray_ptr[bot + 1] = DVTXOP_L_shr(DVTXOP_L_sub(top_im, rot_im), 1);
					farray_ptr[top] = DVTXOP_L_shr(DVTXOP_L_add(top_re, rot_re), 1);
					farray_ptr[top + 1] = DVTXOP_L_shr(DVTXOP_L_add(top_im, rot_im), 1);
				}
			}
		}
	}
	else
	{
		/* Inverse FFT: conjugated rotation, no scaling */
		for (stage = 0; stage < DVTX_NUM_STAGE_8; stage++)
		{
			half = DVTXOP_shl(2, stage);		/* FFT size */
			span = DVTXOP_shl(half, 1);		/* 2 * FFT size */
			step = tx_ii_table_512_swb[stage];	/* 2 * number of FFT's */

			for (pos = 0; pos < half; pos += 2)
			{
				tw = pos * step;	/* phase table index */

				for (top = pos; top < DVTX_FFT_SIZE_512; top += span)
				{
					bot = DVTXOP_add(top, half);	/* butterfly bottom */

					top_re = farray_ptr[top];
					top_im = farray_ptr[top + 1];
					bot_re = farray_ptr[bot];
					bot_im = farray_ptr[bot + 1];

					acc_re = DVTXOP_LL_add(DVTXOP_LL_mult(bot_re, tx_phs_tbl_512_32b[tw]),
						DVTXOP_LL_mult(bot_im, tx_phs_tbl_512_32b[tw + 1]));
					acc_im = DVTXOP_LL_sub(DVTXOP_LL_mult(bot_im, tx_phs_tbl_512_32b[tw]),
						DVTXOP_LL_mult(bot_re, tx_phs_tbl_512_32b[tw + 1]));

					rot_re = DVTXOP_L_round(acc_re);
					rot_im = DVTXOP_L_round(acc_im);

					farray_ptr[bot] = DVTXOP_L_sub(top_re, rot_re);
					farray_ptr[bot + 1] = DVTXOP_L_sub(top_im, rot_im);
					farray_ptr[top] = DVTXOP_L_add(top_re, rot_re);
					farray_ptr[top + 1] = DVTXOP_L_add(top_im, rot_im);
				}
			}
		}
	}
}

/*
 * Tx_r_fft_512_32b - real-input FFT / IFFT built on the complex transform
 * Tx_c_fft_512_32b (portable reference build).
 *
 * farray_ptr : DVTX_FFT_SIZE_512 ints, transformed in place.  Elements 0
 *              and 1 are handled separately as the DC and foldover terms;
 *              the remaining elements are treated as interleaved
 *              (real, imag) pairs.
 * isign      : 1 runs the complex FFT first and then the split pass;
 *              any other value runs the inverse split pass first and
 *              then the complex IFFT.
 *
 * In the split pass, bin i is combined with its mirror bin
 * j = DVTX_FFT_SIZE_512 - i.  The sum/difference terms (ftmp1_*) are
 * promoted with DVTXOP_LL_deposit_h so they can be added to the 64-bit
 * twiddle products before the result is halved and rounded.
 */
void Tx_r_fft_512_32b(int * farray_ptr, short isign)
{

	int ftmp1_real, ftmp1_imag, ftmp2_real, ftmp2_imag;
	long long Lftmp1_real, Lftmp1_imag;
	short i, j;
	long long Ltmp1, Ltmp2;

	/* The FFT part */
	if (isign == 1)
	{
		/* Perform the complex FFT */
		Tx_c_fft_512_32b(farray_ptr, isign);

		/* First, handle the DC and foldover frequencies */
		ftmp1_real = *farray_ptr;
		ftmp2_real = *(farray_ptr + 1);
		*farray_ptr = DVTXOP_L_add(ftmp1_real, ftmp2_real);
		*(farray_ptr + 1) = DVTXOP_L_sub(ftmp1_real, ftmp2_real);

		/* Now, handle the remaining positive frequencies */
		for (i = 2, j = DVTX_FFT_SIZE_512 - i; i <= DVTX_SIZE_BY_TWO_256; i = i + 2, j = DVTX_FFT_SIZE_512 - i)
		{
			/* All four inputs are read before any output is written,
			 * so the final pass (where i == j) is still well defined. */
			ftmp1_real = DVTXOP_L_add(*(farray_ptr + i), *(farray_ptr + j));
			ftmp1_imag = DVTXOP_L_sub(*(farray_ptr + i + 1), *(farray_ptr + j + 1));
			ftmp2_real = DVTXOP_L_add(*(farray_ptr + i + 1), *(farray_ptr + j + 1));
			ftmp2_imag = DVTXOP_L_sub(*(farray_ptr + j), *(farray_ptr + i));

			/* Only the ftmp1 terms are needed in 64-bit form; the ftmp2
			 * terms enter solely through the twiddle products below. */
			Lftmp1_real = DVTXOP_LL_deposit_h(ftmp1_real);
			Lftmp1_imag = DVTXOP_LL_deposit_h(ftmp1_imag);

			/* Bin i, real part */
			Ltmp1 = DVTXOP_LL_sub(DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[i]), DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[i + 1]));
			*(farray_ptr + i) = DVTXOP_L_round(DVTXOP_LL_shr(DVTXOP_LL_add(Lftmp1_real, Ltmp1), 1));

			/* Bin i, imaginary part */
			Ltmp1 = DVTXOP_LL_add(DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[i]), DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[i + 1]));
			*(farray_ptr + i + 1) = DVTXOP_L_round(DVTXOP_LL_shr(DVTXOP_LL_add(Lftmp1_imag, Ltmp1), 1));

			/* Bin j, real part */
			Ltmp1 = DVTXOP_LL_add(DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[j]), DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[j + 1]));
			*(farray_ptr + j) = DVTXOP_L_round(DVTXOP_LL_shr(DVTXOP_LL_add(Lftmp1_real, Ltmp1), 1));

			/* Bin j, imaginary part */
			Ltmp1 = DVTXOP_LL_add(DVTXOP_LL_negate(DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[j])), DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[j + 1]));
			Ltmp2 = DVTXOP_LL_add(DVTXOP_LL_negate(Lftmp1_imag), Ltmp1);
			*(farray_ptr + j + 1) = DVTXOP_L_round(DVTXOP_LL_shr(Ltmp2, 1));
		}
	}
	else
	{
		/* First, handle the DC and foldover frequencies */
		ftmp1_real = *farray_ptr;
		ftmp2_real = *(farray_ptr + 1);
		*farray_ptr = DVTXOP_L_shr(DVTXOP_L_add(ftmp1_real, ftmp2_real), 1);
		*(farray_ptr + 1) = DVTXOP_L_shr(DVTXOP_L_sub(ftmp1_real, ftmp2_real), 1);

		/* Now, handle the remaining positive frequencies */
		for (i = 2, j = DVTX_FFT_SIZE_512 - i; i <= DVTX_SIZE_BY_TWO_256; i = i + 2, j = DVTX_FFT_SIZE_512 - i)
		{
			/* Same sum/difference terms as the forward pass, with the
			 * ftmp2 pair negated. */
			ftmp1_real = DVTXOP_L_add(*(farray_ptr + i), *(farray_ptr + j));
			ftmp1_imag = DVTXOP_L_sub(*(farray_ptr + i + 1), *(farray_ptr + j + 1));
			ftmp2_real = DVTXOP_L_negate(DVTXOP_L_add(*(farray_ptr + j + 1), *(farray_ptr + i + 1)));
			ftmp2_imag = DVTXOP_L_negate(DVTXOP_L_sub(*(farray_ptr + j), *(farray_ptr + i)));

			Lftmp1_real = DVTXOP_LL_deposit_h(ftmp1_real);
			Lftmp1_imag = DVTXOP_LL_deposit_h(ftmp1_imag);

			/* Bin i, real part */
			Ltmp1 = DVTXOP_LL_add(DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[i]), DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[i + 1]));
			*(farray_ptr + i) = DVTXOP_L_round(DVTXOP_LL_shr(DVTXOP_LL_add(Lftmp1_real, Ltmp1), 1));

			/* Bin i, imaginary part */
			Ltmp1 = DVTXOP_LL_sub(DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[i]), DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[i + 1]));
			*(farray_ptr + i + 1) = DVTXOP_L_round(DVTXOP_LL_shr(DVTXOP_LL_add(Lftmp1_imag, Ltmp1), 1));

			/* Bin j, real part */
			Ltmp1 = DVTXOP_LL_sub(DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[j]), DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[j + 1]));
			*(farray_ptr + j) = DVTXOP_L_round(DVTXOP_LL_shr(DVTXOP_LL_add(Lftmp1_real, Ltmp1), 1));

			/* Bin j, imaginary part */
			Ltmp1 = DVTXOP_LL_negate(DVTXOP_LL_add(DVTXOP_LL_mult(ftmp2_imag, tx_phs_tbl_512_32b[j]), DVTXOP_LL_mult(ftmp2_real, tx_phs_tbl_512_32b[j + 1])));
			Ltmp2 = DVTXOP_LL_add(DVTXOP_LL_negate(Lftmp1_imag), Ltmp1);
			*(farray_ptr + j + 1) = DVTXOP_L_round(DVTXOP_LL_shr(Ltmp2, 1));
		}

		/* Perform the complex IFFT */
		Tx_c_fft_512_32b(farray_ptr, isign);
	}
}
#else

#ifdef OPT_FFT_HOON
/*
 * First index of each of the 120 element swaps performed by the
 * OPT_FFT_HOON bit-reversal loop in Tx_c_fft_512_32b.  Entry n is
 * exchanged with the element at fft_512_32b_j[n].  The values index the
 * buffer viewed as an array of ae_f32x2 elements (one complex sample
 * each), not as individual ints.
 */
short fft_512_32b_i[120] = {
		1,2,3,4,5,6,7,8,9,10,
		11,12,13,14,15,17,18,19,20,21,
		22,23,25,26,27,28,29,30,31,33,
		34,35,37,38,39,41,42,43,44,45,
		46,47,49,50,51,53,54,55,57,58,
		59,61,62,63,65,67,69,70,71,73,
		74,75,77,78,79,81,83,85,86,87,
		89,91,93,94,95,97,99,101,103,105,
		107,109,110,111,113,115,117,119,121,123,
		125,127,131,133,135,137,139,141,143,147,
		149,151,155,157,159,163,167,171,173,175,
		179,183,187,191,199,203,207,215,223,239
};
/*
 * Swap partner for each entry of fft_512_32b_i (same position in both
 * tables).  Each value is the 8-bit bit-reversal of its partner, e.g.
 * 1 <-> 128, 17 <-> 136, 239 <-> 247, and is always the larger of the
 * two, so every pair is swapped exactly once.
 */
short fft_512_32b_j[120] = {
		128,64,192,32,160,96,224,16,144,80,
		208,48,176,112,240,136,72,200,40,168,
		104,232,152,88,216,56,184,120,248,132,
		68,196,164,100,228,148,84,212,52,180,
		116,244,140,76,204,172,108,236,156,92,
		220,188,124,252,130,194,162,98,226,146,
		82,210,178,114,242,138,202,170,106,234,
		154,218,186,122,250,134,198,166,230,150,
		214,182,118,246,142,206,174,238,158,222,
		190,254,193,161,225,145,209,177,241,201,
		169,233,217,185,249,197,229,213,181,245,
		205,237,221,253,227,211,243,235,251,247
};
#endif
#define HIFI3_OPT_ENABLE_SRIN
/*
 * Tx_c_fft_512_32b - in-place radix-2 complex FFT / IFFT on 32-bit data,
 * HiFi3 intrinsic build (selected when SRCB_FFT32_OPT is defined).
 *
 * farray_ptr : DVTX_FFT_SIZE_512 ints holding interleaved (real, imag)
 *              pairs; overwritten with the result.
 * isign      : 1 selects the forward FFT (every butterfly output is
 *              shifted right by one); any other value selects the inverse
 *              transform, which applies no per-stage shift.
 *
 * Each intrinsic section is preceded by a commented-out scalar loop that
 * it is meant to reproduce.  The intrinsic loops are hand-scheduled and
 * software-pipelined: a prologue issues the first loads/multiplies and
 * the loop body fetches data for the following pass while finishing the
 * current one, so the statement order is significant.
 */
void Tx_c_fft_512_32b(int * farray_ptr, short isign)
{
    short i, j, k, ii, jj, kk;// ji, kj;
    int ii8;
    //long long ftmp, ftmp_real, ftmp_imag;
    //int tmp, tmp1, tmp2;
#ifndef OPT_FFT_HOON
    /* Rearrange the input array in bit reversed order */
    for (i = 0, j = 0; i < DVTX_FFT_SIZE_512 - 2; i+=2)
    {
        if (j > i)
        {
        	///**********************Corresponding C code***************************
        	/***********************************************************************
            ftmp = *(farray_ptr + i);
            *(farray_ptr + i) = *(farray_ptr + j);
            *(farray_ptr + j) = (int)ftmp;
            ftmp = *(farray_ptr + i + 1);
            *(farray_ptr + i + 1) = *(farray_ptr + j + 1);
            *(farray_ptr + j + 1) = (int)ftmp;
            ***********************************************************************/
        	/* Swap the complex pairs at i and j with one 2x32-bit load/store each.
        	 * NOTE(review): AE_L32X2_I / AE_S32X2_I are the non-aligning forms, so
        	 * this presumably needs farray_ptr to be 8-byte aligned - confirm. */
        	ae_f32x2 ae_Ltmp1, ae_Ltmp2;
            ae_Ltmp1 = AE_L32X2_I((ae_f32x2 *)(farray_ptr + i), 0);
			ae_Ltmp2 = AE_L32X2_I((ae_f32x2 *)(farray_ptr + j), 0);
			AE_S32X2_I(ae_Ltmp1, (ae_f32x2 *)(farray_ptr + j), 0);
			AE_S32X2_I(ae_Ltmp2, (ae_f32x2 *)(farray_ptr + i), 0);
            ///********************************************************************

        }
#ifndef HIFI3_OPT_ENABLE_SRIN
        /* Scalar update of the bit-reversed counter j */
        k = DVTX_SIZE_BY_TWO_256;
        while (j >= k)
        {
            j = DVTXOP_sub(j, k);
            k = DVTXOP_shr(k, 1);
        }
        j += k;
#else
        /* NOTE(review): stands in for the scalar counter update above using a
         * single bit-reversed add; confirm the 0x800000U increment against the
         * HiFi3 ISA description of AE_ADDBRBA32. */
        j = AE_ADDBRBA32(j,0x800000U);       // bitrev
#endif
    }
#else
    /* Rearrange the input array in bit reversed order */
    /* Table-driven variant: perform the 120 precomputed swaps listed in
     * fft_512_32b_i / fft_512_32b_j, indexing the buffer per complex sample. */
    ae_f32x2 *ae_tptr = (ae_f32x2 *)farray_ptr;
    ae_f32x2 ae_Ltmp1;
    for (i = 0; i < 120; i++)
    {
		ae_Ltmp1 = AE_MOV32(ae_tptr[fft_512_32b_i[i]]);
		ae_tptr[fft_512_32b_i[i]] = AE_MOV32(ae_tptr[fft_512_32b_j[i]]);
		ae_tptr[fft_512_32b_j[i]] = AE_MOV32(ae_Ltmp1);
    }
#endif
    if (isign == 1) /* The FFT part */
    {
    	/* First stage: no twiddle multiply, each butterfly is an add/sub of two
    	 * neighbouring complex pairs followed by a right shift by one. */
    	///**********************Corresponding C code***************************
    	/***********************************************************************
		for(j=0; j<DVTX_FFT_SIZE_512; j+=4)
		{
			tmp1 = *(farray_ptr + j + 2);
			tmp2 = *(farray_ptr + j + 3);
			*(farray_ptr + j + 2) = DVTXOP_L_shr(DVTXOP_L_sub(*(farray_ptr + j), tmp1), 1);
			*(farray_ptr + j + 3) = DVTXOP_L_shr(DVTXOP_L_sub(*(farray_ptr + j + 1), tmp2), 1);
			*(farray_ptr + j) = DVTXOP_L_shr(DVTXOP_L_add(*(farray_ptr + j), tmp1), 1);
			*(farray_ptr + j + 1) = DVTXOP_L_shr(DVTXOP_L_add(*(farray_ptr + j + 1), tmp2), 1);
		}
    	***********************************************************************/
    	{
			ae_f32x2 ae_Ltmp1, ae_Ltmp2, ae_Ltmp3, ae_Ltmp4;
			ae_f32x2 ae_Ltmp5, ae_Ltmp6, ae_Ltmp7, ae_Ltmp8;
			ae_f32x2 *ae_Lp1, *ae_Lp3;
			ae_valign align1, align3;
			/* ae_Lp1 is the read cursor, ae_Lp3 the write cursor; both walk the
			 * same buffer, with the reads kept ahead of the writes. */
			ae_Lp3 = ae_Lp1 = (ae_f32x2 *)(&farray_ptr[0]);
			align1 = AE_LA64_PP(ae_Lp1);
			align3 = AE_ZALIGN64();

			//prologue
			AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);                     //load
			AE_LA32X2_IP(ae_Ltmp2, align1, ae_Lp1);
			ae_Ltmp3 = AE_SRAI32(AE_ADD32S(ae_Ltmp1, ae_Ltmp2), 1);    // shift
			/* Four butterflies (eight loads, eight stores) per pass.
			 * NOTE(review): the last pass still issues the look-ahead loads for a
			 * following pass, so it appears to read two vectors beyond the
			 * DVTX_FFT_SIZE_512 ints that are stored; the values are unused -
			 * confirm the memory after the buffer is readable. */
			for(j=0; j<DVTX_FFT_SIZE_512; j+=16)
			{
				AE_LA32X2_IP(ae_Ltmp4, align1, ae_Lp1);
				ae_Ltmp8 = AE_SRAI32(AE_SUB32S(ae_Ltmp1, ae_Ltmp2), 1);
				AE_LA32X2_IP(ae_Ltmp5, align1, ae_Lp1);
				ae_Ltmp6 = AE_SRAI32(AE_ADD32S(ae_Ltmp4, ae_Ltmp5), 1);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);                    // store
				ae_Ltmp7 = AE_SRAI32(AE_SUB32S(ae_Ltmp4, ae_Ltmp5), 1);
				AE_SA32X2_IP(ae_Ltmp8, align3, ae_Lp3);
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);                     //load
			    AE_SA32X2_IP(ae_Ltmp6, align3, ae_Lp3);
				AE_LA32X2_IP(ae_Ltmp2, align1, ae_Lp1);
				AE_SA32X2_IP(ae_Ltmp7, align3, ae_Lp3);
				ae_Ltmp3 = AE_SRAI32(AE_ADD32S(ae_Ltmp1, ae_Ltmp2), 1);    // shift
				AE_LA32X2_IP(ae_Ltmp4, align1, ae_Lp1);
				ae_Ltmp8 = AE_SRAI32(AE_SUB32S(ae_Ltmp1, ae_Ltmp2), 1);
				AE_LA32X2_IP(ae_Ltmp5, align1, ae_Lp1);
				ae_Ltmp6 = AE_SRAI32(AE_ADD32S(ae_Ltmp4, ae_Ltmp5), 1);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);                    // store
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);                     //load
				ae_Ltmp7 = AE_SRAI32(AE_SUB32S(ae_Ltmp4, ae_Ltmp5), 1);
				AE_SA32X2_IP(ae_Ltmp8, align3, ae_Lp3);
				AE_LA32X2_IP(ae_Ltmp2, align1, ae_Lp1);
				AE_SA32X2_IP(ae_Ltmp6, align3, ae_Lp3);
				ae_Ltmp3 = AE_SRAI32(AE_ADD32S(ae_Ltmp1, ae_Ltmp2), 1);    // shift
			    AE_SA32X2_IP(ae_Ltmp7, align3, ae_Lp3);
			}
			/* Finish the aligning-store sequence on the write cursor */
			AE_SA64POS_FP(align3, ae_Lp3);
    	}
    	///********************************************************************

		/* Remaining stages: full butterflies with a twiddle multiply */
		for (i = 1; i < DVTX_NUM_STAGE_8; i++)
		{						/* i is stage counter */
			jj = DVTXOP_shl(2, i);		/* FFT size */
			ii = DVTXOP_shl(1, 8-i);	/* 2 * tw_stage * number of FFT's */
			kk = DVTXOP_shl(jj, 1);	/* 2 * FFT size */

			/* j is the first int of each group of kk ints; the group's top half
			 * starts at j and its bottom half at j + jj. */
			for(j=0; j<DVTX_FFT_SIZE_512; j+=kk)
			{
				///**********************Corresponding C code***************************
				/***********************************************************************
				for(k=j; k<j+jj; k+=2)
				{
					ji = (k - j)*ii;
					kj = k + jj;
					ftmp_real = DVTXOP_LL_sub(DVTXOP_LL_mult(*(farray_ptr + kj), tx_phs_tbl_512_32b[ji]),
												DVTXOP_LL_mult(*(farray_ptr + kj + 1), tx_phs_tbl_512_32b[ji + 1]));
					ftmp_imag = DVTXOP_LL_add(DVTXOP_LL_mult(*(farray_ptr + kj + 1), tx_phs_tbl_512_32b[ji]),
												DVTXOP_LL_mult(*(farray_ptr + kj), tx_phs_tbl_512_32b[ji + 1]));
					tmp1 = DVTXOP_L_round(ftmp_real);
					tmp2 = DVTXOP_L_round(ftmp_imag);
					*(farray_ptr + kj) = DVTXOP_L_shr(DVTXOP_L_sub(*(farray_ptr + k), tmp1), 1);
					*(farray_ptr + kj + 1) = DVTXOP_L_shr(DVTXOP_L_sub(*(farray_ptr + k + 1), tmp2), 1);
					*(farray_ptr + k) = DVTXOP_L_shr(DVTXOP_L_add(*(farray_ptr + k), tmp1), 1);
					*(farray_ptr + k + 1) = DVTXOP_L_shr(DVTXOP_L_add(*(farray_ptr + k + 1), tmp2), 1);
				}
				***********************************************************************/

			    ae_f32x2 *ae_Lp1, *ae_Lp2, *ae_Lp3, *ae_Lp4, *ae_Lp5;
			    ae_f32x2 ae_Ltmp1, ae_Ltmp2, ae_Ltmp5,ae_Ltmp6;
			    ae_f32x2 ae_Ltmp7, ae_Ltmp8, ae_Ltmp9;
			    ae_f64 ae_LLtmp1, ae_LLtmp2;
			    ae_f64 ae_LLtmp3, ae_LLtmp4;
			    ae_valign align1, align2, align3, align4;
				/* Lp1/Lp3: read/write cursors on the top half,
				 * Lp2/Lp4: read/write cursors on the bottom half,
				 * Lp5: twiddle cursor, restarted at the table head for every group. */
				ae_Lp3 = ae_Lp1 = (ae_f32x2 *)(&farray_ptr[j]);
				ae_Lp4 = ae_Lp2 = (ae_f32x2 *)(&farray_ptr[j+jj]);
				ae_Lp5 = (ae_f32x2 *)(&tx_phs_tbl_512_32b[0]);
				align1 = AE_LA64_PP(ae_Lp1);
				align2 = AE_LA64_PP(ae_Lp2);
				align3 = AE_ZALIGN64();
				align4 = AE_ZALIGN64();
				/* Twiddle step passed to AE_L32X2_XP; with 8-byte table elements this
				 * moves ii twiddle pairs per butterfly (presumably a byte offset). */
				ii8 = ii*8;

				//Prologue
				AE_LA32X2_IP(ae_Ltmp2, align2, ae_Lp2);
				AE_L32X2_XP(ae_Ltmp5, ae_Lp5, ii8);
				ae_LLtmp1 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);

				/* Two butterflies per pass.
				 * NOTE(review): the loads at the bottom of the loop prefetch the next
				 * bottom sample and twiddle, so the final pass fetches one element
				 * beyond the group's bottom half and one twiddle beyond the last one
				 * used; neither value is consumed - confirm both addresses are
				 * readable (for the last group this is past the end of farray_ptr). */
				for( k=0; k<jj; k+=4)
				{
					AE_LA32X2_IP(ae_Ltmp7, align2, ae_Lp2);                //unroll
					ae_LLtmp2 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);
					AE_L32X2_XP(ae_Ltmp8, ae_Lp5, ii8);                    //unroll
					ae_LLtmp4 = AE_MULF32S_LH(ae_Ltmp7, ae_Ltmp8);         //unroll
					AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
					AE_MULSF32S_LL(ae_LLtmp1, ae_Ltmp2, ae_Ltmp5);
					ae_LLtmp3 = AE_MULF32S_HH(ae_Ltmp7, ae_Ltmp8);         //unroll
					AE_MULAF32S_HL(ae_LLtmp2, ae_Ltmp2, ae_Ltmp5);
					ae_Ltmp2 = AE_ROUND32X2F64SASYM(ae_LLtmp1, ae_LLtmp2);
					AE_MULSF32S_LL(ae_LLtmp3, ae_Ltmp7, ae_Ltmp8);
					ae_Ltmp5 = AE_SRAI32(AE_ADD32S(ae_Ltmp1, ae_Ltmp2), 1);
					AE_LA32X2_IP(ae_Ltmp9, align1, ae_Lp1);                //unroll
					AE_MULAF32S_HL(ae_LLtmp4, ae_Ltmp7, ae_Ltmp8);         //unroll
					ae_Ltmp6 = AE_SRAI32(AE_SUB32S(ae_Ltmp1, ae_Ltmp2), 1);
					AE_LA32X2_IP(ae_Ltmp2, align2, ae_Lp2);
					ae_Ltmp8 = AE_ROUND32X2F64SASYM(ae_LLtmp3, ae_LLtmp4); // unroll
					AE_SA32X2_IP(ae_Ltmp5, align3, ae_Lp3);
					ae_Ltmp7 = AE_SRAI32(AE_ADD32S(ae_Ltmp9, ae_Ltmp8), 1);// unroll
					AE_SA32X2_IP(ae_Ltmp6, align4, ae_Lp4);
					//unroll
					AE_SA32X2_IP(ae_Ltmp7, align3, ae_Lp3);
					AE_L32X2_XP(ae_Ltmp5, ae_Lp5, ii8);
					ae_Ltmp6 = AE_SRAI32(AE_SUB32S(ae_Ltmp9, ae_Ltmp8), 1);
					ae_LLtmp1 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
					AE_SA32X2_IP(ae_Ltmp6, align4, ae_Lp4);
				}
				/* Finish the aligning-store sequences on both write cursors */
				AE_SA64POS_FP(align3, ae_Lp3);
				AE_SA64POS_FP(align4, ae_Lp4);
				///********************************************************************
			}
		}
    }
    else /* The IFFT part */
    {
    	/* First stage of the inverse transform: same add/sub butterflies as the
    	 * forward path but without the right shift. */
    	///**********************Corresponding C code***************************
    	/***********************************************************************
		for(j=0; j<DVTX_FFT_SIZE_512; j+=4)
		{
			tmp1 = *(farray_ptr + j + 2);
			tmp2 = *(farray_ptr + j + 3);
			*(farray_ptr + j + 2) = DVTXOP_L_sub(*(farray_ptr + j), tmp1);
			*(farray_ptr + j + 3) = DVTXOP_L_sub(*(farray_ptr + j + 1), tmp2);
			*(farray_ptr + j) = DVTXOP_L_add(*(farray_ptr + j), tmp1);
			*(farray_ptr + j + 1) = DVTXOP_L_add(*(farray_ptr + j + 1), tmp2);
		}
		***********************************************************************/
    	{
			ae_f32x2 ae_Ltmp1, ae_Ltmp2, ae_Ltmp3;
			ae_f32x2 ae_Ltmp4, ae_Ltmp5, ae_Ltmp6;
			ae_f32x2 *ae_Lp1, *ae_Lp3;
			ae_valign align1, align3;
			ae_Lp3 = ae_Lp1 = (ae_f32x2 *)(&farray_ptr[0]);
			align1 = AE_LA64_PP(ae_Lp1);
			align3 = AE_ZALIGN64();
            //Prologue
			AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
			AE_LA32X2_IP(ae_Ltmp2, align1, ae_Lp1);
			ae_Ltmp3 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);

			/* Four butterflies per pass.
			 * NOTE(review): as in the forward first stage, the final pass issues
			 * two look-ahead loads beyond the stored range - confirm readable. */
			for(j=0; j<DVTX_FFT_SIZE_512; j+=16)
			{
				AE_LA32X2_IP(ae_Ltmp4, align1, ae_Lp1);             // unroll 2
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				ae_Ltmp3 = AE_SUB32S(ae_Ltmp1, ae_Ltmp2);
				AE_LA32X2_IP(ae_Ltmp5, align1, ae_Lp1);            // unroll 2
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				//unroll 2
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);            // unroll 3 & 4
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp4, ae_Ltmp5);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				AE_LA32X2_IP(ae_Ltmp2, align1, ae_Lp1);           // unroll 3 & 4
				ae_Ltmp6 = AE_SUB32S(ae_Ltmp4, ae_Ltmp5);
				AE_LA32X2_IP(ae_Ltmp4, align1, ae_Lp1);           // unroll 3 & 4
				AE_SA32X2_IP(ae_Ltmp6, align3, ae_Lp3);

				//unroll 3 & 4
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);
				AE_LA32X2_IP(ae_Ltmp5, align1, ae_Lp1);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				ae_Ltmp3 = AE_SUB32S(ae_Ltmp1, ae_Ltmp2);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp4, ae_Ltmp5);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				AE_LA32X2_IP(ae_Ltmp2, align1, ae_Lp1);
				ae_Ltmp6 = AE_SUB32S(ae_Ltmp4, ae_Ltmp5);
				AE_SA32X2_IP(ae_Ltmp6, align3, ae_Lp3);
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);
			}
			/* Finish the aligning-store sequence on the write cursor */
			AE_SA64POS_FP(align3, ae_Lp3);
    	}
		///********************************************************************

		/* Remaining inverse stages: conjugated twiddle, no scaling */
		for (i = 1; i < DVTX_NUM_STAGE_8; i++)
		{						/* i is stage counter */
			jj = DVTXOP_shl(2, i);		/* FFT size */
			kk = DVTXOP_shl(jj, 1);	/* 2 * FFT size */
			ii = DVTXOP_shl(1, 8-i);	/* 2 * tw_stage * number of FFT's */

			for(j=0; j<DVTX_FFT_SIZE_512; j+=kk)
			{
				///**********************Corresponding C code***************************
				/***********************************************************************
				for(k=j; k<j+jj; k+=2)
				{
					ji = (k - j)*ii;
					kj = k + jj;
					ftmp_real = DVTXOP_LL_add(DVTXOP_LL_mult(*(farray_ptr + kj), tx_phs_tbl_512_32b[ji]),
												DVTXOP_LL_mult(*(farray_ptr + kj + 1), tx_phs_tbl_512_32b[ji + 1]));
					ftmp_imag = DVTXOP_LL_sub(DVTXOP_LL_mult(*(farray_ptr + kj + 1), tx_phs_tbl_512_32b[ji]),
												DVTXOP_LL_mult(*(farray_ptr + kj), tx_phs_tbl_512_32b[ji + 1]));
					tmp1 = DVTXOP_L_round(ftmp_real);
					tmp2 = DVTXOP_L_round(ftmp_imag);
					*(farray_ptr + kj) = DVTXOP_L_sub(*(farray_ptr + k), tmp1);
					*(farray_ptr + kj + 1) = DVTXOP_L_sub(*(farray_ptr + k + 1), tmp2);
					*(farray_ptr + k) = DVTXOP_L_add(*(farray_ptr + k), tmp1);
					*(farray_ptr + k + 1) = DVTXOP_L_add(*(farray_ptr + k + 1), tmp2);
				}
				***********************************************************************/

			    ae_f32x2 *ae_Lp1, *ae_Lp2, *ae_Lp3, *ae_Lp4, *ae_Lp5;
			    ae_f32x2 ae_Ltmp1, ae_Ltmp2, ae_Ltmp5, ae_Ltmp6;
			    ae_f32x2 ae_Ltmp3;
			    ae_f64 ae_LLtmp1, ae_LLtmp2;
			    ae_valign align1, align2, align3, align4;
				/* Cursor roles match the forward stages: Lp1/Lp3 top half,
				 * Lp2/Lp4 bottom half, Lp5 twiddles. */
				ae_Lp3 = ae_Lp1 = (ae_f32x2 *)(&farray_ptr[j]);
				ae_Lp4 = ae_Lp2 = (ae_f32x2 *)(&farray_ptr[j+jj]);
				ae_Lp5 = (ae_f32x2 *)(&tx_phs_tbl_512_32b[0]);
				align1 = AE_LA64_PP(ae_Lp1);
				align2 = AE_LA64_PP(ae_Lp2);
				align3 = AE_ZALIGN64();
				align4 = AE_ZALIGN64();
				ii8 = ii*8;

				//Prologue
				AE_LA32X2_IP(ae_Ltmp2, align2, ae_Lp2);
				AE_L32X2_XP(ae_Ltmp5, ae_Lp5, ii8);
				ae_LLtmp1 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp2 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);

				/* Two butterflies per pass.
				 * NOTE(review): the trailing prefetch of the next bottom sample and
				 * twiddle also runs on the final pass, reading one element past the
				 * group's bottom half and one twiddle past the last one used -
				 * confirm both addresses are readable. */
				for( k=0; k<jj; k+=4)
				{
					AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
					AE_MULAF32S_LL(ae_LLtmp1, ae_Ltmp2, ae_Ltmp5);
					AE_MULSF32S_HL(ae_LLtmp2, ae_Ltmp2, ae_Ltmp5);
					ae_Ltmp2 = AE_ROUND32X2F64SASYM(ae_LLtmp1, ae_LLtmp2);
					ae_Ltmp5 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);
					ae_Ltmp6 = AE_SUB32S(ae_Ltmp1, ae_Ltmp2);
					AE_SA32X2_IP(ae_Ltmp5, align3, ae_Lp3);
					AE_LA32X2_IP(ae_Ltmp2, align2, ae_Lp2);            //unroll
					AE_L32X2_XP(ae_Ltmp5, ae_Lp5, ii8);                //unroll
					AE_SA32X2_IP(ae_Ltmp6, align4, ae_Lp4);

					//unroll
					ae_LLtmp1 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
					AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
					ae_LLtmp2 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);
					AE_MULAF32S_LL(ae_LLtmp1, ae_Ltmp2, ae_Ltmp5);
					AE_MULSF32S_HL(ae_LLtmp2, ae_Ltmp2, ae_Ltmp5);
					ae_Ltmp3 = AE_ROUND32X2F64SASYM(ae_LLtmp1, ae_LLtmp2);
					AE_LA32X2_IP(ae_Ltmp2, align2, ae_Lp2);
					ae_Ltmp5 = AE_ADD32S(ae_Ltmp1, ae_Ltmp3);
					AE_SA32X2_IP(ae_Ltmp5, align3, ae_Lp3);
					AE_L32X2_XP(ae_Ltmp5, ae_Lp5, ii8);
					ae_Ltmp6 = AE_SUB32S(ae_Ltmp1, ae_Ltmp3);
					ae_LLtmp1 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
					AE_SA32X2_IP(ae_Ltmp6, align4, ae_Lp4);
					ae_LLtmp2 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);
				}
				/* Finish the aligning-store sequences on both write cursors */
				AE_SA64POS_FP(align3, ae_Lp3);
				AE_SA64POS_FP(align4, ae_Lp4);
				///********************************************************************
			}
		}
    }
}

/*
 * Tx_r_fft_512_32b - real-input FFT / IFFT built on Tx_c_fft_512_32b,
 * HiFi3 intrinsic build (selected when SRCB_FFT32_OPT is defined).
 *
 * farray_ptr : DVTX_FFT_SIZE_512 ints, transformed in place.  Elements 0
 *              and 1 are handled separately as the DC and foldover terms.
 * isign      : 1 runs the complex FFT and then the split pass; any other
 *              value runs the inverse split pass and then the complex IFFT.
 *
 * The split pass pairs each bin read forwards from farray_ptr[2] with a
 * bin read backwards from the end of the buffer, two pairs per loop pass.
 * The loops are hand-scheduled (loads for the next pair are interleaved
 * with the arithmetic of the current one), so statement order matters.
 */
void Tx_r_fft_512_32b(int * farray_ptr, short isign)
{

#ifdef UNDO_CODE_CLEANUP
    /* Full scalar-style local set, kept only for builds that undo the cleanup */
    int ftmp1_real, ftmp1_imag, ftmp2_real, ftmp2_imag;
    long long Lftmp1_real, Lftmp1_imag;
    short i, j;
    long long Ltmp1, Ltmp2;
#else
    int ftmp1_real, ftmp2_real;
    short i;
#endif

    if (isign == 1) /* The FFT part */
    {
        /* Perform the complex FFT */
        Tx_c_fft_512_32b(farray_ptr, isign);

        /* First, handle the DC and foldover frequencies */
        ftmp1_real = *farray_ptr;
        ftmp2_real = *(farray_ptr + 1);
        *farray_ptr = DVTXOP_L_add(ftmp1_real, ftmp2_real);
        *(farray_ptr + 1) = DVTXOP_L_sub(ftmp1_real, ftmp2_real);

        /* Now, handle the remaining positive frequencies */
        ///**********************Corresponding C code***************************
        /***********************************************************************
        The scalar loop this section replaces is the forward split pass of the
        non-optimized Tx_r_fft_512_32b (the build without SRCB_FFT32_OPT):
        for (i = 2, j = DVTX_FFT_SIZE_512 - i; i <= DVTX_SIZE_BY_TWO_256; i = i + 2, j = DVTX_FFT_SIZE_512 - i)
        combining bin i with bin j through tx_phs_tbl_512_32b[i], [i + 1].
        ***********************************************************************/
        {
			ae_f32x2 *ae_Lp1, *ae_Lp2, *ae_Lp3, *ae_Lp4, *ae_Lp5;
			ae_f32x2 ae_Ltmp1, ae_Ltmp2, ae_Ltmp3, ae_Ltmp4, ae_Ltmp5;
			ae_f64 ae_LLtmp1, ae_LLtmp2, ae_LLtmp3, ae_LLtmp4, ae_LLtmp5, ae_LLtmp6;
			ae_valign align1, align2, align3, align4;
			/* Lp1/Lp3: forward read/write cursors starting at element 2.
			 * Lp2/Lp4: read/write cursors starting at the last element and used
			 *          with the _RIP (reverse) load/store forms.
			 * Lp5:     twiddle cursor starting at table element 2. */
			ae_Lp3 = ae_Lp1 = (ae_f32x2 *)(&farray_ptr[2]);
			ae_Lp4 = ae_Lp2 = (ae_f32x2 *)(&farray_ptr[DVTX_FFT_SIZE_512 - 1]);
			ae_Lp5 = (ae_f32x2 *)(&tx_phs_tbl_512_32b[2]);
			align1 = AE_LA64_PP(ae_Lp1);
			align2 = AE_LA64_PP(ae_Lp2);
			align3 = AE_ZALIGN64();
			align4 = AE_ZALIGN64();

            //Prologue
			AE_LA32X2_RIP(ae_Ltmp2, align2, ae_Lp2);
			AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
			/* Swap the two halves of the reverse-loaded vector
			 * (presumably to restore real/imag order - confirm) */
			ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp2, ae_Ltmp2);
			AE_L32X2_IP(ae_Ltmp5, ae_Lp5, 8);

			/* Two bin pairs per pass: i advances by 4 ints */
			for(i=2; i<=DVTX_SIZE_BY_TWO_256; i+=4)
			{
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);
				ae_Ltmp4 = AE_SUB32S(ae_Ltmp1, ae_Ltmp2);
				ae_LLtmp1 = AE_CVT64F32_H(ae_Ltmp3);
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);              //unroll
				ae_LLtmp2 = AE_CVT64F32_L(ae_Ltmp4);
				ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp3, AE_NEG32S(ae_Ltmp4));
				ae_LLtmp3 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
				AE_MULSF32S_LL(ae_LLtmp3, ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp4 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);
				AE_MULAF32S_HL(ae_LLtmp4, ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp5 = AE_ADD64S(ae_LLtmp1, ae_LLtmp3);
				AE_LA32X2_RIP(ae_Ltmp2, align2, ae_Lp2);           // unroll
				ae_LLtmp6 = AE_ADD64S(ae_LLtmp2, ae_LLtmp4);
				ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp2, ae_Ltmp2);        // unroll
				AE_L32X2_IP(ae_Ltmp5, ae_Lp5, 8);                  // unroll
				/* Halve in 64 bits, then round back to a 2x32-bit vector */
				ae_Ltmp3 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp5, 1), AE_SRAI64(ae_LLtmp6, 1));
				ae_LLtmp5 = AE_SUB64S(ae_LLtmp1, ae_LLtmp3);
				ae_LLtmp6 = AE_SUB64S(ae_LLtmp4, ae_LLtmp2);
				ae_Ltmp4 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp6, 1), AE_SRAI64(ae_LLtmp5, 1));
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);          //unroll
				AE_SA32X2_RIP(ae_Ltmp4, align4, ae_Lp4);

				//unroll
				ae_Ltmp4 = AE_SUB32S(ae_Ltmp1, ae_Ltmp2);
				ae_LLtmp1 = AE_CVT64F32_H(ae_Ltmp3);
				ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp3, AE_NEG32S(ae_Ltmp4));
				ae_LLtmp2 = AE_CVT64F32_L(ae_Ltmp4);
				ae_LLtmp3 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp4 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);
				AE_MULSF32S_LL(ae_LLtmp3, ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp5 = AE_ADD64S(ae_LLtmp1, ae_LLtmp3);
				AE_MULAF32S_HL(ae_LLtmp4, ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp6 = AE_ADD64S(ae_LLtmp2, ae_LLtmp4);
				ae_Ltmp3 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp5, 1), AE_SRAI64(ae_LLtmp6, 1));
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
				ae_LLtmp6 = AE_SUB64S(ae_LLtmp4, ae_LLtmp2);
				AE_LA32X2_RIP(ae_Ltmp2, align2, ae_Lp2);
				ae_LLtmp5 = AE_SUB64S(ae_LLtmp1, ae_LLtmp3);
				AE_L32X2_IP(ae_Ltmp5, ae_Lp5, 8);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp2, ae_Ltmp2);
				ae_Ltmp4 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp6, 1), AE_SRAI64(ae_LLtmp5, 1));
				AE_SA32X2_RIP(ae_Ltmp4, align4, ae_Lp4);
			}
			/* Finish the aligning-store sequences on both write cursors */
			AE_SA64POS_FP(align3, ae_Lp3);
			AE_SA64POS_FP(align4, ae_Lp4);
        }
		///********************************************************************
    }
    else /* The IFFT part */
    {
        /* First, handle the DC and foldover frequencies */
        ftmp1_real = *farray_ptr;
        ftmp2_real = *(farray_ptr + 1);
        *farray_ptr = DVTXOP_L_shr(DVTXOP_L_add(ftmp1_real, ftmp2_real), 1);
        *(farray_ptr + 1) = DVTXOP_L_shr(DVTXOP_L_sub(ftmp1_real, ftmp2_real), 1);

        /* Now, handle the remaining positive frequencies */
        ///**********************Corresponding C code***************************
        /***********************************************************************
        The scalar loop this section replaces is the inverse split pass of the
        non-optimized Tx_r_fft_512_32b (the build without SRCB_FFT32_OPT):
        for (i = 2, j = DVTX_FFT_SIZE_512 - i; i <= DVTX_SIZE_BY_TWO_256; i = i + 2, j = DVTX_FFT_SIZE_512 - i)
        combining bin i with bin j through tx_phs_tbl_512_32b[i], [i + 1].
        ***********************************************************************/
        {
			ae_f32x2 *ae_Lp1, *ae_Lp2, *ae_Lp3, *ae_Lp4, *ae_Lp5;
			ae_f32x2 ae_Ltmp1, ae_Ltmp2, ae_Ltmp3, ae_Ltmp4, ae_Ltmp5;
			ae_f64 ae_LLtmp1, ae_LLtmp2, ae_LLtmp3, ae_LLtmp4, ae_LLtmp5, ae_LLtmp6;
			ae_valign align1, align2, align3, align4;
			/* Cursor roles match the forward split pass: Lp1/Lp3 run forwards
			 * from element 2, Lp2/Lp4 run backwards from the last element,
			 * Lp5 walks the twiddle table from element 2. */
			ae_Lp3 = ae_Lp1 = (ae_f32x2 *)(&farray_ptr[2]);
			ae_Lp4 = ae_Lp2 = (ae_f32x2 *)(&farray_ptr[DVTX_FFT_SIZE_512 - 1]);
			ae_Lp5 = (ae_f32x2 *)(&tx_phs_tbl_512_32b[2]);
			align1 = AE_LA64_PP(ae_Lp1);
			align2 = AE_LA64_PP(ae_Lp2);
			align3 = AE_ZALIGN64();
			align4 = AE_ZALIGN64();
			//Prologue
			AE_LA32X2_RIP(ae_Ltmp2, align2, ae_Lp2);
			AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
			ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp2, ae_Ltmp2);
			/* Two bin pairs per pass; unlike the forward pass, the first twiddle
			 * of each pass is loaded at the top of the loop, not in the prologue. */
			for(i=2; i<=DVTX_SIZE_BY_TWO_256; i+=4)
			{

				AE_L32X2_IP(ae_Ltmp5, ae_Lp5, 8);
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);
				ae_Ltmp4 = AE_SUB32S(ae_Ltmp1, ae_Ltmp2);
				ae_Ltmp2 = AE_SEL32_LH(AE_NEG32S(ae_Ltmp3), ae_Ltmp4);
				ae_LLtmp3 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp1 = AE_CVT64F32_H(ae_Ltmp3);
				ae_LLtmp4 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp2 = AE_CVT64F32_L(ae_Ltmp4);
				AE_MULAF32S_LL(ae_LLtmp3, ae_Ltmp2, ae_Ltmp5);
				AE_MULSF32S_HL(ae_LLtmp4, ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp5 = AE_ADD64S(ae_LLtmp1, ae_LLtmp3);
				ae_LLtmp6 = AE_ADD64S(ae_LLtmp2, ae_LLtmp4);
				AE_LA32X2_RIP(ae_Ltmp2, align2, ae_Lp2);                 //unroll
				/* Halve in 64 bits, then round back to a 2x32-bit vector */
				ae_Ltmp3 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp5, 1), AE_SRAI64(ae_LLtmp6, 1));
				ae_LLtmp5 = AE_SUB64S(ae_LLtmp1, ae_LLtmp3);
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);                  //unroll
				ae_LLtmp6 = AE_SUB64S(ae_LLtmp4, ae_LLtmp2);
				AE_L32X2_IP(ae_Ltmp5, ae_Lp5, 8);                        //unroll
				ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp2, ae_Ltmp2);              //unroll
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				ae_Ltmp4 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp6, 1), AE_SRAI64(ae_LLtmp5, 1));
				ae_Ltmp3 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);                //unroll
				ae_Ltmp1 = AE_SUB32S(ae_Ltmp1, ae_Ltmp2);                //unroll
				AE_SA32X2_RIP(ae_Ltmp4, align4, ae_Lp4);

				//unroll
				ae_Ltmp2 = AE_SEL32_LH(AE_NEG32S(ae_Ltmp3), ae_Ltmp1);
				ae_LLtmp3 = AE_MULF32S_HH(ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp1 = AE_CVT64F32_H(ae_Ltmp3);
				AE_MULAF32S_LL(ae_LLtmp3, ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp4 = AE_MULF32S_LH(ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp2 = AE_CVT64F32_L(ae_Ltmp1);
				AE_MULSF32S_HL(ae_LLtmp4, ae_Ltmp2, ae_Ltmp5);
				ae_LLtmp5 = AE_ADD64S(ae_LLtmp1, ae_LLtmp3);
				ae_LLtmp6 = AE_ADD64S(ae_LLtmp2, ae_LLtmp4);
				ae_Ltmp3 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp5, 1), AE_SRAI64(ae_LLtmp6, 1));
				ae_LLtmp5 = AE_SUB64S(ae_LLtmp1, ae_LLtmp3);
				AE_LA32X2_RIP(ae_Ltmp2, align2, ae_Lp2);
				ae_LLtmp6 = AE_SUB64S(ae_LLtmp4, ae_LLtmp2);
				AE_SA32X2_IP(ae_Ltmp3, align3, ae_Lp3);
				ae_Ltmp4 = AE_ROUND32X2F64SASYM(AE_SRAI64(ae_LLtmp6, 1), AE_SRAI64(ae_LLtmp5, 1));
				AE_LA32X2_IP(ae_Ltmp1, align1, ae_Lp1);
				ae_Ltmp2 = AE_SEL32_LH(ae_Ltmp2, ae_Ltmp2);
				AE_SA32X2_RIP(ae_Ltmp4, align4, ae_Lp4);
			}
			/* Finish the aligning-store sequences on both write cursors */
			AE_SA64POS_FP(align3, ae_Lp3);
			AE_SA64POS_FP(align4, ae_Lp4);
        }
		///********************************************************************

        /* Perform the complex IFFT */
        Tx_c_fft_512_32b(farray_ptr, isign);
    }
}								/* end r_fft () */
#endif




#ifndef Tx_c_fft_OPT_DSP

/*
 * In-place complex FFT / IFFT on 16-bit fixed-point data (portable C path,
 * built when Tx_c_fft_OPT_DSP is not defined).
 *
 * farray_ptr    : interleaved buffer (real value at an even index, imaginary
 *                 value at the following odd index); overwritten in place.
 * isign         : 1 runs the forward butterflies, which halve every output
 *                 with a right shift by one; any other value runs the
 *                 inverse butterflies, which apply no per-stage shift.
 * num_point_fft : 64 or 256.  Any other value makes the function return
 *                 before the buffer is touched.
 */
void Tx_c_fft(short * farray_ptr, short isign, int num_point_fft)
{
	short src, dst, bit;			/* bit-reversal bookkeeping */
	short stage, offs, top, bot;		/* butterfly bookkeeping */
	short half, span, step, tw;
	short rot_re, rot_im, scratch;
	int acc_re, acc_im;
	int fft_size, fft_half_size, num_fft_stage;
	short *tx_ii_table_ptr, *tx_phs_tbl_ptr;

	/* Pick the geometry and the tables that belong to the requested size */
	switch (num_point_fft)
	{
	case 64:
		fft_size = DVTX_FFT_SIZE_64;
		fft_half_size = DVTX_FFT_SIZE_32;
		num_fft_stage = DVTX_NUM_STAGE_5;
		tx_ii_table_ptr = tx_ii_table_64;
		tx_phs_tbl_ptr = tx_phs_tbl_64;
		break;

	case 256:
		fft_size = DVTX_FFT_SIZE_256;
		fft_half_size = DVTX_FFT_SIZE_128;
		num_fft_stage = DVTX_NUM_STAGE_7;
		tx_ii_table_ptr = tx_ii_table_256_wb;
		tx_phs_tbl_ptr = tx_phs_tbl_256_wb;
		break;

	default:
		return;
	}

	/* Put the complex samples into bit-reversed order */
	for (src = 0, dst = 0; src < fft_size - 2; src += 2)
	{
		if (dst > src)
		{
			short held;

			held = farray_ptr[src];
			farray_ptr[src] = farray_ptr[dst];
			farray_ptr[dst] = held;

			held = farray_ptr[src + 1];
			farray_ptr[src + 1] = farray_ptr[dst + 1];
			farray_ptr[dst + 1] = held;
		}

		/* Advance dst to the next bit-reversed position */
		for (bit = fft_half_size; dst >= bit; bit = DVTXOP_shr(bit, 1))
		{
			dst = DVTXOP_sub(dst, bit);
		}
		dst += bit;
	}

	/* Butterfly stages; isign only changes the arithmetic inside a butterfly */
	for (stage = 0; stage < num_fft_stage; stage++)
	{
		half = DVTXOP_shl(2, stage);		/* distance from butterfly top to bottom */
		span = DVTXOP_shl(half, 1);		/* distance between successive butterflies */
		step = tx_ii_table_ptr[stage];	/* phase-table stride of this stage */

		for (offs = 0; offs < half; offs += 2)
		{
			tw = offs * step;		/* phase-table index */

			for (top = offs; top < fft_size; top += span)
			{
				bot = DVTXOP_add(top, half);

				if (isign == 1)
				{
					/* Forward: rotate the bottom sample, then halve sum and difference */
					acc_re = DVTXOP_L_sub(DVTXOP_L_mult(farray_ptr[bot], tx_phs_tbl_ptr[tw]),
						DVTXOP_L_mult(farray_ptr[bot + 1], tx_phs_tbl_ptr[tw + 1]));
					acc_im = DVTXOP_L_add(DVTXOP_L_mult(farray_ptr[bot + 1], tx_phs_tbl_ptr[tw]),
						DVTXOP_L_mult(farray_ptr[bot], tx_phs_tbl_ptr[tw + 1]));

					rot_re = DVTXOP_round(acc_re);
					rot_im = DVTXOP_round(acc_im);

					scratch = DVTXOP_sub(farray_ptr[top], rot_re);
					farray_ptr[bot] = DVTXOP_shr(scratch, 1);

					scratch = DVTXOP_sub(farray_ptr[top + 1], rot_im);
					farray_ptr[bot + 1] = DVTXOP_shr(scratch, 1);

					scratch = DVTXOP_add(farray_ptr[top], rot_re);
					farray_ptr[top] = DVTXOP_shr(scratch, 1);

					scratch = DVTXOP_add(farray_ptr[top + 1], rot_im);
					farray_ptr[top + 1] = DVTXOP_shr(scratch, 1);
				}
				else
				{
					/* Inverse: opposite rotation sign, no scaling */
					acc_re = DVTXOP_L_add(DVTXOP_L_mult(farray_ptr[bot], tx_phs_tbl_ptr[tw]),
						DVTXOP_L_mult(farray_ptr[bot + 1], tx_phs_tbl_ptr[tw + 1]));
					acc_im = DVTXOP_L_sub(DVTXOP_L_mult(farray_ptr[bot + 1], tx_phs_tbl_ptr[tw]),
						DVTXOP_L_mult(farray_ptr[bot], tx_phs_tbl_ptr[tw + 1]));

					rot_re = DVTXOP_round(acc_re);
					rot_im = DVTXOP_round(acc_im);

					farray_ptr[bot] = DVTXOP_sub(farray_ptr[top], rot_re);
					farray_ptr[bot + 1] = DVTXOP_sub(farray_ptr[top + 1], rot_im);
					farray_ptr[top] = DVTXOP_add(farray_ptr[top], rot_re);
					farray_ptr[top + 1] = DVTXOP_add(farray_ptr[top + 1], rot_im);
				}
			}
		}
	}
}
#else
#ifdef OPT_FFT_HOON
/*
 * Precomputed swap lists used by the OPT_FFT_HOON build of Tx_c_fft in place
 * of the run-time bit-reversal loop.  Entry n of an "_i" table and entry n of
 * the matching "_j" table are offsets into the interleaved short buffer;
 * Tx_c_fft exchanges the two shorts at offset i (and i + 1) with the two
 * shorts at offset j (and j + 1).
 */

/* 256-short buffer: first offset of each of the 56 swap pairs */
short Tx_c_fft_256_i[56] = {
		2,4,6,8,10,12,14,18,20,22,
		24,26,28,30,34,36,38,42,44,46,
		50,52,54,58,60,62,66,70,74,76,
		78,82,86,90,92,94,98,102,106,110,
		114,118,122,126,134,138,142,150,154,158,
		166,174,182,190,206,222
};
/* 256-short buffer: partner offset of each of the 56 swap pairs */
short Tx_c_fft_256_j[56] = {
		128,64,192,32,160,96,224,144,80,208,
		48,176,112,240,136,72,200,168,104,232,
		152,88,216,184,120,248,132,196,164,100,
		228,148,212,180,116,244,140,204,172,236,
		156,220,188,252,194,162,226,210,178,242,
		202,234,218,250,230,246
};
/* 64-short buffer: first offset of each of the 12 swap pairs */
short Tx_c_fft_64_i[12] = {
		2,4,6,10,12,14,18,22,26,30,
		38,46
};
/* 64-short buffer: partner offset of each of the 12 swap pairs */
short Tx_c_fft_64_j[12] = {
		32,16,48,40,24,56,36,52,44,60,
		50,58
};
#endif

/*
 * Tx_c_fft - in-place complex FFT / IFFT on interleaved 16-bit data, written
 * with ae_* / AE_* vector intrinsics (built when Tx_c_fft_OPT_DSP is defined).
 *
 * farray_ptr    : interleaved real/imaginary shorts, transformed in place.
 * isign         : 1 selects the forward path (every stage output is shifted
 *                 right by one); any other value selects the inverse path
 *                 (no per-stage shift).
 * num_point_fft : 64 or 256.  With OPT_FFT_HOON defined any other value
 *                 returns immediately; without it any other value falls
 *                 through to the 512 settings.
 *
 * The commented-out scalar loops in the body are the reference C code that
 * each intrinsic section replaces.  The locals ftmp_real, ftmp_imag, tmp,
 * tmp1, tmp2, ji and kj are referenced only by that commented-out code.
 *
 * NOTE(review): the twiddle table is always tx_phs_tbl_256_wb and the stride
 * is always DVTXOP_shl(1, 7-i); for the 512 fallback the stage counter reaches
 * 7, so the stride becomes 1 - confirm that the fallback is really supported.
 */
void Tx_c_fft(short * farray_ptr, short isign, int num_point_fft)
{
	short i, j, k, ii, jj, kk, ji, kj;
	int ftmp, ftmp_real, ftmp_imag;
	short tmp, tmp1, tmp2;
	int fft_size,fft_half_size,num_fft_stage;
	short *tx_phs_tbl_ptr;
	ae_f16x4 *ae_p1, *ae_p2, *ae_p3, *ae_p4;
	ae_f32x2 ae_Ltmp1, ae_Ltmp2;
	ae_f16x4 ae_tmp1, ae_tmp2, ae_tmp3, ae_tmp4;
	ae_valign align1, align2, align3, align4;
#ifndef OPT_FFT_HOON
	/* Generic build: choose the geometry, then bit-reverse with the classic loop */
	if(num_point_fft == 64)
	{
		fft_size = DVTX_FFT_SIZE_64;
		fft_half_size = DVTX_FFT_SIZE_32;
		num_fft_stage = DVTX_NUM_STAGE_5;
	}
	else if(num_point_fft == 256)
	{
		fft_size = DVTX_FFT_SIZE_256;
		fft_half_size = DVTX_FFT_SIZE_128;
		num_fft_stage = DVTX_NUM_STAGE_7;
	}
	else
	{
		/* Every other size is treated as 512 (see the review note in the header) */
		fft_size = DVTX_FFT_SIZE_512;
		fft_half_size = DVTX_FFT_SIZE_256;
		num_fft_stage = DVTX_NUM_STAGE_8;
	}

	tx_phs_tbl_ptr = &tx_phs_tbl_256_wb[0];
	/* Rearrange the input array in bit reversed order */
	for (i = 0, j = 0; i < fft_size - 2; i = i + 2)
	{
		if (j > i)
		{
			/* Swap the complex sample at offset i with the one at offset j */

			ftmp = *(farray_ptr + i);
			*(farray_ptr + i) = *(farray_ptr + j);
			*(farray_ptr + j) = (short)ftmp;
			ftmp = *(farray_ptr + i + 1);
			*(farray_ptr + i + 1) = *(farray_ptr + j + 1);
			*(farray_ptr + j + 1) = (short)ftmp;

		}
		k = fft_half_size;
		while (j >= k)
		{
			j = DVTXOP_sub(j, k);
			k = DVTXOP_shr(k, 1);
		}
		j += k;
	}
#else
	/* OPT_FFT_HOON build: bit reversal is done from precomputed swap lists */
	if(num_point_fft == 64)
	{
		fft_size = DVTX_FFT_SIZE_64;
		fft_half_size = DVTX_FFT_SIZE_32;
		num_fft_stage = DVTX_NUM_STAGE_5;

		for (i = 0; i < 12; i++)
		{
			int i_idx, j_idx;
			i_idx = Tx_c_fft_64_i[i];
			j_idx = Tx_c_fft_64_j[i];
			/* Swap the complex sample at i_idx with the one at j_idx */
			ftmp = *(farray_ptr + i_idx);
			*(farray_ptr + i_idx) = *(farray_ptr + j_idx);
			*(farray_ptr + j_idx) = (short)ftmp;
			ftmp = *(farray_ptr + i_idx + 1);
			*(farray_ptr + i_idx + 1) = *(farray_ptr + j_idx + 1);
			*(farray_ptr + j_idx + 1) = (short)ftmp;
		}
	}
	else if(num_point_fft == 256)
	{
		fft_size = DVTX_FFT_SIZE_256;
		fft_half_size = DVTX_FFT_SIZE_128;
		num_fft_stage = DVTX_NUM_STAGE_7;

		for (i = 0; i < 56; i++)
		{
			int i_idx, j_idx;
			i_idx = Tx_c_fft_256_i[i];
			j_idx = Tx_c_fft_256_j[i];
			/* Swap the complex sample at i_idx with the one at j_idx */
			ftmp = *(farray_ptr + i_idx);
			*(farray_ptr + i_idx) = *(farray_ptr + j_idx);
			*(farray_ptr + j_idx) = (short)ftmp;
			ftmp = *(farray_ptr + i_idx + 1);
			*(farray_ptr + i_idx + 1) = *(farray_ptr + j_idx + 1);
			*(farray_ptr + j_idx + 1) = (short)ftmp;
		}
	}
	else
		return;

	tx_phs_tbl_ptr = &tx_phs_tbl_256_wb[0];
#endif
	/* The FFT part */
	if (isign == 1)
	{
		/*
		 * Stage 0 is handled on its own, four shorts (two complex samples)
		 * per iteration, without any phase-table access.
		 */
		///**********************Corresponding C code****************************
		/***********************************************************************
		for(j=0; j<fft_size; j+=4)
		{
			tmp1 = *(farray_ptr + j + 2);
			tmp2 = *(farray_ptr + j + 3);
			*(farray_ptr + j + 2) = DVTXOP_shr(DVTXOP_sub(*(farray_ptr + j), tmp1), 1);
			*(farray_ptr + j + 3) = DVTXOP_shr(DVTXOP_sub(*(farray_ptr + j + 1), tmp2), 1);
			*(farray_ptr + j) = DVTXOP_shr(DVTXOP_add(*(farray_ptr + j), tmp1), 1);
			*(farray_ptr + j + 1) = DVTXOP_shr(DVTXOP_add(*(farray_ptr + j + 1), tmp2), 1);
		}
		***********************************************************************/
		ae_p3 = ae_p1 = (ae_f16x4 *)(&farray_ptr[0]);
		align1 = AE_LA64_PP(ae_p1);
		align3 = AE_ZALIGN64();
		for(j=0; j<fft_size; j+=4)
		{
			AE_LA16X4_IP(ae_tmp1, align1, ae_p1);
			ae_tmp2 = AE_SEL16_7632(ae_tmp1, ae_tmp1);
			ae_tmp3 = AE_SEL16_5410(ae_tmp1, AE_NEG16S(ae_tmp1));
			ae_tmp3 = AE_SRAI16(AE_ADD16S(ae_tmp3, ae_tmp2), 1);
			AE_SA16X4_IP(ae_tmp3, align3, ae_p3);
		}
		AE_SA64POS_FP(align3, ae_p3);
		///********************************************************************

		/* Remaining stages: the counter starts at 1 because stage 0 is done above */
		for (i = 1; i < num_fft_stage; i++)
		{						/* i is stage counter */
			jj = DVTXOP_shl(2, i);		/* FFT size */
			kk = DVTXOP_shl(jj, 1);	/* 2 * FFT size */
	//		ii = DVTXOP_shl(1, 8-i);	/* 2 * tw_stage * number of FFT's */
			ii = DVTXOP_shl(1, 7-i);
			for(j=0; j<fft_size; j+=kk)
			{
				///**********************Corresponding C code****************************
				/***********************************************************************
				for(k=j; k<j+jj; k+=2)
				{
					ji = (k - j)*ii;
					kj = k + jj;
					ftmp_real = DVTXOP_L_sub(DVTXOP_L_mult(*(farray_ptr + kj), tx_phs_tbl_ptr[ji]),
						DVTXOP_L_mult(*(farray_ptr + kj + 1), tx_phs_tbl_ptr[ji + 1]));
					ftmp_imag = DVTXOP_L_add(DVTXOP_L_mult(*(farray_ptr + kj + 1), tx_phs_tbl_ptr[ji]),
						DVTXOP_L_mult(*(farray_ptr + kj), tx_phs_tbl_ptr[ji + 1]));
					tmp1 = DVTXOP_round(ftmp_real);
					tmp2 = DVTXOP_round(ftmp_imag);
					*(farray_ptr + kj) = DVTXOP_shr(DVTXOP_sub(*(farray_ptr + k), tmp1), 1);
					*(farray_ptr + kj + 1) = DVTXOP_shr(DVTXOP_sub(*(farray_ptr + k + 1), tmp2), 1);
					*(farray_ptr + k) = DVTXOP_shr(DVTXOP_add(*(farray_ptr + k), tmp1), 1);
					*(farray_ptr + k + 1) = DVTXOP_shr(DVTXOP_add(*(farray_ptr + k + 1), tmp2), 1);
				}
				***********************************************************************/
				/* p1/p3 read and write the butterfly tops, p2/p4 the bottoms (jj shorts further on) */
				ae_p3 = ae_p1 = (ae_f16x4 *)(&farray_ptr[j]);
				ae_p4 = ae_p2 = (ae_f16x4 *)(&farray_ptr[j+jj]);
				align1 = AE_LA64_PP(ae_p1);
				align2 = AE_LA64_PP(ae_p2);
				align3 = AE_ZALIGN64();
				align4 = AE_ZALIGN64();
				/* Two butterflies per iteration; their twiddles sit at k*ii and (k+2)*ii */
				for(k=0; k<jj; k+=4)
				{
					ae_Ltmp1 = AE_L16X2M_I((ae_p16x2s*)(&tx_phs_tbl_ptr[k*ii]), 0);
					ae_Ltmp2 = AE_L16X2M_I((ae_p16x2s*)(&tx_phs_tbl_ptr[(k+2)*ii]), 0);
					AE_LA16X4_IP(ae_tmp1, align1, ae_p1);
					AE_LA16X4_IP(ae_tmp2, align2, ae_p2);
					/* NOTE(review): the <<8 presumably lifts the loaded twiddles to full 32-bit scale - confirm */
					ae_Ltmp1 = AE_SLAI32(ae_Ltmp1, 8);
					ae_Ltmp2 = AE_SLAI32(ae_Ltmp2, 8);
					ae_Ltmp1 = AE_MULFC32X16RAS_H(ae_Ltmp1, ae_tmp2);
					ae_Ltmp2 = AE_MULFC32X16RAS_L(ae_Ltmp2, ae_tmp2);
					ae_tmp2 = AE_ROUND16X4F32SASYM(ae_Ltmp1, ae_Ltmp2);
					ae_tmp3 = AE_SRAI16(AE_ADD16S(ae_tmp1, ae_tmp2), 1);
					ae_tmp4 = AE_SRAI16(AE_SUB16S(ae_tmp1, ae_tmp2), 1);
					AE_SA16X4_IP(ae_tmp3, align3, ae_p3);
					AE_SA16X4_IP(ae_tmp4, align4, ae_p4);
				}
				AE_SA64POS_FP(align3, ae_p3);
				AE_SA64POS_FP(align4, ae_p4);
				///********************************************************************
			}
		}

		/* The IFFT part */
	}
	else
	{
		/* Stage 0 of the inverse transform: same shape as the forward one, without the shift */
		///**********************Corresponding C code****************************
		/***********************************************************************
		for(j=0; j<fft_size; j+=4)
		{
			tmp1 = *(farray_ptr + j + 2);
			tmp2 = *(farray_ptr + j + 3);
			*(farray_ptr + j + 2) = DVTXOP_sub(*(farray_ptr + j), tmp1);
			*(farray_ptr + j + 3) = DVTXOP_sub(*(farray_ptr + j + 1), tmp2);
			*(farray_ptr + j) = DVTXOP_add(*(farray_ptr + j), tmp1);
			*(farray_ptr + j + 1) = DVTXOP_add(*(farray_ptr + j + 1), tmp2);
		}
		***********************************************************************/
		ae_p3 = ae_p1 = (ae_f16x4 *)(&farray_ptr[0]);
		align1 = AE_LA64_PP(ae_p1);
		align3 = AE_ZALIGN64();
		for(j=0; j<fft_size; j+=4)
		{
			AE_LA16X4_IP(ae_tmp1, align1, ae_p1);
			ae_tmp2 = AE_SEL16_7632(ae_tmp1, ae_tmp1);
			ae_tmp3 = AE_SEL16_5410(ae_tmp1, AE_NEG16S(ae_tmp1));
			ae_tmp3 = AE_ADD16S(ae_tmp3, ae_tmp2);
			AE_SA16X4_IP(ae_tmp3, align3, ae_p3);
		}
		AE_SA64POS_FP(align3, ae_p3);
		///********************************************************************

		/* Remaining inverse stages */
		for (i = 1; i < num_fft_stage; i++)
		{						/* i is stage counter */
			jj = DVTXOP_shl(2, i);		/* FFT size */
			kk = DVTXOP_shl(jj, 1);	/* 2 * FFT size */
			ii = DVTXOP_shl(1, 7-i);	/* 2 * tw_stage * number of FFT's */

			for(j=0; j<fft_size; j+=kk)
			{
				///**********************Corresponding C code****************************
				/***********************************************************************
				for(k=j; k<j+jj; k+=2)
				{
					ji = (k - j)*ii;
					kj = k + jj;
					ftmp_real = DVTXOP_L_add(DVTXOP_L_mult(*(farray_ptr + kj), tx_phs_tbl_ptr[ji]),
						DVTXOP_L_mult(*(farray_ptr + kj + 1), tx_phs_tbl_ptr[ji + 1]));
					ftmp_imag = DVTXOP_L_sub(DVTXOP_L_mult(*(farray_ptr + kj + 1), tx_phs_tbl_ptr[ji]),
						DVTXOP_L_mult(*(farray_ptr + kj), tx_phs_tbl_ptr[ji + 1]));
					tmp1 = DVTXOP_round(ftmp_real);
					tmp2 = DVTXOP_round(ftmp_imag);
					*(farray_ptr + kj) = DVTXOP_sub(*(farray_ptr + k), tmp1);
					*(farray_ptr + kj + 1) = DVTXOP_sub(*(farray_ptr + k + 1), tmp2);
					*(farray_ptr + k) = DVTXOP_add(*(farray_ptr + k), tmp1);
					*(farray_ptr + k + 1) = DVTXOP_add(*(farray_ptr + k + 1), tmp2);
				}
				***********************************************************************/
				ae_p3 = ae_p1 = (ae_f16x4 *)(&farray_ptr[j]);
				ae_p4 = ae_p2 = (ae_f16x4 *)(&farray_ptr[j+jj]);
				align1 = AE_LA64_PP(ae_p1);
				align2 = AE_LA64_PP(ae_p2);
				align3 = AE_ZALIGN64();
				align4 = AE_ZALIGN64();
				for(k=0; k<jj; k+=4)
				{
					/*
					 * Unlike the reference code above, the twiddles are read
					 * from the mirrored position 256 - index and negated below.
					 * NOTE(review): for k == 0 this reads table elements 256
					 * and 257 - confirm tx_phs_tbl_256_wb is long enough.
					 */
					ae_Ltmp1 = AE_L16X2M_I((ae_p16x2s*)(&tx_phs_tbl_ptr[256 - k*ii]), 0);
					ae_Ltmp2 = AE_L16X2M_I((ae_p16x2s*)(&tx_phs_tbl_ptr[256 - (k+2)*ii]), 0);


					AE_LA16X4_IP(ae_tmp1, align1, ae_p1);
					AE_LA16X4_IP(ae_tmp2, align2, ae_p2);
					ae_Ltmp1 = AE_NEG32S(AE_SLAI32(ae_Ltmp1, 8));
					ae_Ltmp2 = AE_NEG32S(AE_SLAI32(ae_Ltmp2, 8));
					ae_Ltmp1 = AE_MULFC32X16RAS_H(ae_Ltmp1, ae_tmp2);
					ae_Ltmp2 = AE_MULFC32X16RAS_L(ae_Ltmp2, ae_tmp2);
					ae_tmp2 = AE_ROUND16X4F32SASYM(ae_Ltmp1, ae_Ltmp2);
					ae_tmp3 = AE_ADD16S(ae_tmp1, ae_tmp2);
					ae_tmp4 = AE_SUB16S(ae_tmp1, ae_tmp2);
					AE_SA16X4_IP(ae_tmp3, align3, ae_p3);
					AE_SA16X4_IP(ae_tmp4, align4, ae_p4);
				}
				AE_SA64POS_FP(align3, ae_p3);
				AE_SA64POS_FP(align4, ae_p4);
				///********************************************************************
			}
		}
	}
}								/* end of c_fft () */

#endif

/* FFT */

#ifdef TMP_DEBUG
/*
 * 32-bit work buffer used by Tx_r_fft_512_32b_16b_input.  With TMP_DEBUG
 * defined it lives at file scope; otherwise that function declares a
 * zero-initialised local buffer of the same name and size instead.
 */
int fftbuf_32b[DVTX_M_FFT_LEN_WB];
#endif

/*
 * 16-bit wrapper around Tx_r_fft_512_32b: widens the buffer to 32 bits, runs
 * the 32-bit transform, then narrows the result back into farray_ptr.
 *
 * farray_ptr : DVTX_FFT_SIZE_512 shorts, overwritten with the result.
 * isign      : > 0  - input is shifted left by 7 on the way in and shifted
 *                     right by 7 (rounding variant) on the way out;
 *              <= 0 - input is placed with DVTXOP_L_deposit_h and the output
 *                     is taken with DVTXOP_round.
 *
 * Each conversion loop can be replaced by a helper routine through its
 * *_OPT_DSP switch (presumably equivalent to the loop it replaces - confirm).
 */
void Tx_r_fft_512_32b_16b_input(short * farray_ptr, short isign)
{
	short n;

#ifndef TMP_DEBUG
	int fftbuf_32b[DVTX_M_FFT_LEN_WB] = { 0, };
#endif

	if (isign <= 0)
	{
		/* Widen: 16-bit sample goes through DVTXOP_L_deposit_h */
#ifndef V32to16_OPT_DSP
		for (n = 0; n < DVTX_FFT_SIZE_512; n++)
			fftbuf_32b[n] = DVTXOP_L_deposit_h(farray_ptr[n]);
#else
		V32to16(farray_ptr,fftbuf_32b,DVTX_FFT_SIZE_512);
#endif

		Tx_r_fft_512_32b(fftbuf_32b, isign);

		/* Narrow: rounded conversion back to 16 bits */
#ifndef V16to32_OPT_DSP
		for (n = 0; n < DVTX_FFT_SIZE_512; n++)
			farray_ptr[n] = DVTXOP_round(fftbuf_32b[n]);
#else
		V16to32(fftbuf_32b,farray_ptr,DVTX_FFT_SIZE_512);
#endif
		return;
	}

	/* isign > 0: widen with a left shift by 7 */
#ifndef Left_Shift_32to16_OPT_DSP
	for (n = 0; n < DVTX_FFT_SIZE_512; n++)
		fftbuf_32b[n] = DVTXOP_L_shl((int)farray_ptr[n], 7);
#else
	Left_Shift_32to16(farray_ptr,fftbuf_32b,7, DVTX_FFT_SIZE_512);
#endif

	Tx_r_fft_512_32b(fftbuf_32b, isign);

	/* Undo the shift with rounding and keep the low 16 bits */
#ifndef Right_Shift_32to16_OPT_DSP
	for (n = 0; n < DVTX_FFT_SIZE_512; n++)
		farray_ptr[n] = DVTXOP_extract_l(DVTXOP_L_shr_r(fftbuf_32b[n], 7));
#else
	Right_Shift_32to16(fftbuf_32b, farray_ptr, 7,DVTX_FFT_SIZE_512 );
#endif
}

#ifndef Tx_r_fft_OPT_DSP
/*
 * Tx_r_fft - real FFT / IFFT on 16-bit data built on top of Tx_c_fft
 * (portable C path, built when Tx_r_fft_OPT_DSP is not defined).
 *
 * farray_ptr    : buffer of num_point_fft shorts, transformed in place.
 * isign         : 1 - run Tx_c_fft first, then combine the bins at offsets
 *                     i and (fft_size - i) into the real-input spectrum;
 *                 otherwise - apply the inverse combination first, then run
 *                     Tx_c_fft with the same isign.
 * num_point_fft : 64 or 256; any other value returns without touching the
 *                 buffer.
 *
 * Offsets 0 and 1 are handled separately ("DC and foldover frequencies").
 * The pair loop runs up to and including i == fft_half_size, where i and j
 * address the same bin; the j-side stores are then the ones that remain.
 */
void Tx_r_fft(short * farray_ptr, short isign, int num_point_fft)
{
	short ftmp1_real, ftmp1_imag, ftmp2_real, ftmp2_imag;
	int Lftmp1_real, Lftmp1_imag;
	short i, j;
	int Ltmp1, Ltmp2;
	int fft_size, fft_half_size;
	short *tx_phs_tbl_ptr;

	if (num_point_fft == 64)
	{
		fft_size = DVTX_FFT_SIZE_64;
		fft_half_size = DVTX_FFT_SIZE_32;
		tx_phs_tbl_ptr = tx_phs_tbl_64;
	}
	else if (num_point_fft == 256)
	{
		fft_size = DVTX_FFT_SIZE_256;
		fft_half_size = DVTX_FFT_SIZE_128;
		tx_phs_tbl_ptr = tx_phs_tbl_256_wb;
	}
	else
		return;

	/* The FFT part */
	if (isign == 1)
	{
		/* Perform the complex FFT */
		Tx_c_fft(farray_ptr, isign, num_point_fft);

		/* First, handle the DC and foldover frequencies */
		ftmp1_real = farray_ptr[0];
		ftmp2_real = farray_ptr[1];
		farray_ptr[0] = DVTXOP_add(ftmp1_real, ftmp2_real);
		farray_ptr[1] = DVTXOP_sub(ftmp1_real, ftmp2_real);

		/* Now, handle the remaining positive frequencies */
		for (i = 2, j = fft_size - i; i <= fft_half_size; i = i + 2, j = fft_size - i)
		{
			/* Sum/difference terms are formed before any store, so the
			 * stores below never feed back into this iteration. */
			ftmp1_real = DVTXOP_add(farray_ptr[i], farray_ptr[j]);
			ftmp1_imag = DVTXOP_sub(farray_ptr[i + 1], farray_ptr[j + 1]);
			ftmp2_real = DVTXOP_add(farray_ptr[i + 1], farray_ptr[j + 1]);
			ftmp2_imag = DVTXOP_sub(farray_ptr[j], farray_ptr[i]);

			Lftmp1_real = DVTXOP_L_deposit_h(ftmp1_real);
			Lftmp1_imag = DVTXOP_L_deposit_h(ftmp1_imag);

			Ltmp1 = DVTXOP_L_sub(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[i]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[i + 1]));
			farray_ptr[i] = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[i]), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[i + 1]));
			farray_ptr[i + 1] = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_imag, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[j]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[j + 1]));
			farray_ptr[j] = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_negate(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[j])), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[j + 1]));
			Ltmp2 = DVTXOP_L_add(DVTXOP_L_negate(Lftmp1_imag), Ltmp1);
			farray_ptr[j + 1] = DVTXOP_round(DVTXOP_L_shr(Ltmp2, 1));
		}
	}
	else
	{
		/* First, handle the DC and foldover frequencies (halved on this path) */
		ftmp1_real = farray_ptr[0];
		ftmp2_real = farray_ptr[1];
		farray_ptr[0] = DVTXOP_shr(DVTXOP_add(ftmp1_real, ftmp2_real), 1);
		farray_ptr[1] = DVTXOP_shr(DVTXOP_sub(ftmp1_real, ftmp2_real), 1);

		/* Now, handle the remaining positive frequencies */
		for (i = 2, j = fft_size - i; i <= fft_half_size; i = i + 2, j = fft_size - i)
		{
			ftmp1_real = DVTXOP_add(farray_ptr[i], farray_ptr[j]);
			ftmp1_imag = DVTXOP_sub(farray_ptr[i + 1], farray_ptr[j + 1]);
			ftmp2_real = DVTXOP_negate(DVTXOP_add(farray_ptr[j + 1], farray_ptr[i + 1]));
			ftmp2_imag = DVTXOP_negate(DVTXOP_sub(farray_ptr[j], farray_ptr[i]));

			Lftmp1_real = DVTXOP_L_deposit_h(ftmp1_real);
			Lftmp1_imag = DVTXOP_L_deposit_h(ftmp1_imag);

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[i]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[i + 1]));
			farray_ptr[i] = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_sub(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[i]), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[i + 1]));
			farray_ptr[i + 1] = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_imag, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_sub(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[j]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[j + 1]));
			farray_ptr[j] = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_negate(DVTXOP_L_add(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[j]), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[j + 1])));
			Ltmp2 = DVTXOP_L_add(DVTXOP_L_negate(Lftmp1_imag), Ltmp1);
			farray_ptr[j + 1] = DVTXOP_round(DVTXOP_L_shr(Ltmp2, 1));
		}
		/* Perform the complex IFFT */
		Tx_c_fft(farray_ptr, isign, num_point_fft);
	}
} /* end r_fft () */
#else
/*
 * Tx_r_fft - real FFT / IFFT on 16-bit data built on top of Tx_c_fft,
 * written with ae_* / AE_* vector intrinsics (built when Tx_r_fft_OPT_DSP is
 * defined).
 *
 * farray_ptr    : buffer of shorts, transformed in place.
 * isign         : 1 - run Tx_c_fft first, then combine the bins at offsets
 *                     i and (fft_size - i);
 *                 otherwise - combine first, then run Tx_c_fft.
 * num_point_fft : 64 selects the 64 settings; every other value is treated
 *                 as 256 (there is no early return in this build).
 *
 * A single phase table (tx_phs_tbl_256_wb) serves both sizes: table indices
 * are the bin offsets shifted left by tw_stage (2 for 64, 0 for 256).
 *
 * The commented-out scalar code inside each loop is the reference C code the
 * intrinsic section replaces.  ftmp1_imag, ftmp2_imag, Lftmp1_real,
 * Lftmp1_imag, Lftmp2_real, Lftmp2_imag, Ltmp1 and Ltmp2 are referenced only
 * by that commented-out code.
 */
void Tx_r_fft(short * farray_ptr, short isign, int num_point_fft)
{
	short ftmp1_real, ftmp1_imag, ftmp2_real, ftmp2_imag;
	int Lftmp1_real, Lftmp1_imag, Lftmp2_real, Lftmp2_imag;
	short i, j;
	int Ltmp1, Ltmp2;
	int fft_size,fft_half_size, tw_stage;
	short *tx_phs_tbl_ptr;
	/* Vector constants: an all-zero pair, and the rounding term added before the final >>17 */
	ae_int32x2 int32x2_zerozero =AE_MOVDA32X2 (0,0);
	ae_int32x2 int32x2_round = AE_MOVDA32X2 (0x10000,0x10000);

	if(num_point_fft == 64)
	{
		fft_size = DVTX_FFT_SIZE_64;
		fft_half_size = DVTX_FFT_SIZE_32;
		tw_stage = 2;
	}
	else
	{
		fft_size = DVTX_FFT_SIZE_256;
		fft_half_size = DVTX_FFT_SIZE_128;
		tw_stage = 0;
	}


	tx_phs_tbl_ptr = &tx_phs_tbl_256_wb[0];

	/* The FFT part */
	if (isign == 1)
	{
		/* Perform the complex FFT */

		Tx_c_fft(farray_ptr, isign, num_point_fft);

		/* First, handle the DC and foldover frequencies */
		ftmp1_real = *farray_ptr;
		ftmp2_real = *(farray_ptr + 1);
		*farray_ptr = DVTXOP_add(ftmp1_real, ftmp2_real);
		*(farray_ptr + 1) = DVTXOP_sub(ftmp1_real, ftmp2_real);

		/* Now, handle the remaining positive frequencies */
		for (i = 2, j = fft_size - i; i <= fft_half_size; i = i + 2, j = fft_size - i)
		{
			///**********************Corresponding C code****************************
			/***********************************************************************
			ftmp1_real = DVTXOP_add(*(farray_ptr + i), *(farray_ptr + j));
			ftmp1_imag = DVTXOP_sub(*(farray_ptr + i + 1), *(farray_ptr + j + 1));
			ftmp2_real = DVTXOP_add(*(farray_ptr + i + 1), *(farray_ptr + j + 1));
			ftmp2_imag = DVTXOP_sub(*(farray_ptr + j), *(farray_ptr + i));

			Lftmp1_real = DVTXOP_L_deposit_h(ftmp1_real);
			Lftmp1_imag = DVTXOP_L_deposit_h(ftmp1_imag);
			Lftmp2_real = DVTXOP_L_deposit_h(ftmp2_real);
			Lftmp2_imag = DVTXOP_L_deposit_h(ftmp2_imag);

			Ltmp1 = DVTXOP_L_sub(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[i<<tw_stage]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[(i<<tw_stage) + 1]));
			*(farray_ptr + i) = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[i<<tw_stage]), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[(i<<tw_stage) + 1]));
			*(farray_ptr + i + 1) = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_imag, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[j<<tw_stage]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[(j<<tw_stage) + 1]));
			*(farray_ptr + j) = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_negate(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[j<<tw_stage])), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[(j<<tw_stage) + 1]));
			Ltmp2 = DVTXOP_L_add(DVTXOP_L_negate(Lftmp1_imag), Ltmp1);
			*(farray_ptr + j + 1) = DVTXOP_round(DVTXOP_L_shr(Ltmp2, 1));
			***********************************************************************/
			ae_int32x2 int32x2_farray1, int32x2_farray2, int32x2_farray3, int32x2_farray4,int32x2_farray5;
			ae_int16x4 int16x4_tbl_swbi, int16x4_tbl_swbj;

			ae_valign align;
			ae_int16x4 *tx_phs_tbl_swb_ptri, *tx_phs_tbl_swb_ptrj;

			/* The four-short table loads below start two shorts before the twiddle pair of bin i / bin j */
			tx_phs_tbl_swb_ptri = (ae_int16x4 *)(&tx_phs_tbl_ptr[(i<<tw_stage)-2]);
			tx_phs_tbl_swb_ptrj = (ae_int16x4 *)(&tx_phs_tbl_ptr[(j<<tw_stage)-2]);


			/* Bin i and bin j as (real, imag) pairs, then moved to the upper 16 bits */
			int32x2_farray1 = AE_MOVDA32X2(*(farray_ptr + i), *(farray_ptr + i+ 1));
			int32x2_farray2 = AE_MOVDA32X2(*(farray_ptr + j), *(farray_ptr + j+ 1));

			int32x2_farray1 = AE_SLAI32(int32x2_farray1, 16);
			int32x2_farray2 = AE_SLAI32(int32x2_farray2, 16);

			align = AE_LA64_PP(tx_phs_tbl_swb_ptri);
			AE_LA16X4_IP(int16x4_tbl_swbi, align, tx_phs_tbl_swb_ptri);
			align = AE_LA64_PP(tx_phs_tbl_swb_ptrj);
		    AE_LA16X4_IP(int16x4_tbl_swbj, align, tx_phs_tbl_swb_ptrj);

			/* Lane-wise difference and sum of the two bins, regrouped for the complex multiply */
			int32x2_farray3 = int32x2_farray1 - int32x2_farray2;
			int32x2_farray1 = (int32x2_farray1 + int32x2_farray2);
			int32x2_farray2 = AE_SEL32_LH(int32x2_farray3,int32x2_farray3);

			int32x2_farray2 = AE_ADDSUB32(int32x2_zerozero,int32x2_farray2);
			int32x2_farray3 = AE_SEL32_LL(int32x2_farray1,int32x2_farray2)>>16;
			int32x2_farray4 = AE_SEL32_HH(int32x2_farray1,int32x2_farray2);

			int16x4_tbl_swbi = AE_SEL16_5410(int16x4_tbl_swbi, int16x4_tbl_swbi);
			int16x4_tbl_swbj = AE_SEL16_5410(int16x4_tbl_swbj, int16x4_tbl_swbj);

			/* Output for bin i */
			int32x2_farray5 = AE_MULC32X16_L(int32x2_farray3,int16x4_tbl_swbi)<<1;

			int32x2_farray5 = (int32x2_farray5 + int32x2_farray4 +int32x2_round)>>17;

			*(farray_ptr + i) = AE_MOVAD32_H(int32x2_farray5);
			*(farray_ptr + i + 1) = AE_MOVAD32_L(int32x2_farray5);

			/* Output for bin j, using the j-side table entry */
			int16x4_tbl_swbj=AE_SEL16_4321(int16x4_tbl_swbj,int16x4_tbl_swbj);

			int32x2_farray5 = AE_MULC32X16_L(int32x2_farray3,int16x4_tbl_swbj)<<1;
			int32x2_farray5 = AE_SEL32_LH(int32x2_farray5,int32x2_farray5);

			int32x2_farray5 = (AE_ADDSUB32(int32x2_farray5 , int32x2_farray4) +int32x2_round)>>17;
			*(farray_ptr + j) = AE_MOVAD32_H(int32x2_farray5);
			*(farray_ptr + j + 1) = AE_MOVAD32_L(int32x2_farray5);

			///********************************************************************
		}
	}
	else
	{
		/* First, handle the DC and foldover frequencies */
		ftmp1_real = *farray_ptr;
		ftmp2_real = *(farray_ptr + 1);
		*farray_ptr = DVTXOP_shr(DVTXOP_add(ftmp1_real, ftmp2_real), 1);
		*(farray_ptr + 1) = DVTXOP_shr(DVTXOP_sub(ftmp1_real, ftmp2_real), 1);

		/* Now, handle the remaining positive frequencies */
		for (i = 2, j = fft_size - i; i <= fft_half_size; i = i + 2, j = fft_size - i)
		{
			///**********************Corresponding C code****************************
			/***********************************************************************
			ftmp1_real = DVTXOP_add(*(farray_ptr + i), *(farray_ptr + j));
			ftmp1_imag = DVTXOP_sub(*(farray_ptr + i + 1), *(farray_ptr + j + 1));
			ftmp2_real = DVTXOP_negate(DVTXOP_add(*(farray_ptr + j + 1), *(farray_ptr + i + 1)));
			ftmp2_imag = DVTXOP_negate(DVTXOP_sub(*(farray_ptr + j), *(farray_ptr + i)));

			Lftmp1_real = DVTXOP_L_deposit_h(ftmp1_real);
			Lftmp1_imag = DVTXOP_L_deposit_h(ftmp1_imag);
			Lftmp2_real = DVTXOP_L_deposit_h(ftmp2_real);
			Lftmp2_imag = DVTXOP_L_deposit_h(ftmp2_imag);

			Ltmp1 = DVTXOP_L_add(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[i<<tw_stage]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[(i<<tw_stage) + 1]));
			*(farray_ptr + i) = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_sub(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[i<<tw_stage]), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[(i<<tw_stage) + 1]));
			*(farray_ptr + i + 1) = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_imag, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_sub(DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[j<<tw_stage]), DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[(j<<tw_stage) + 1]));
			*(farray_ptr + j) = DVTXOP_round(DVTXOP_L_shr(DVTXOP_L_add(Lftmp1_real, Ltmp1), 1));

			Ltmp1 = DVTXOP_L_negate(DVTXOP_L_add(DVTXOP_L_mult(ftmp2_imag, tx_phs_tbl_ptr[j<<tw_stage]), DVTXOP_L_mult(ftmp2_real, tx_phs_tbl_ptr[(j<<tw_stage) + 1])));
			Ltmp2 = DVTXOP_L_add(DVTXOP_L_negate(Lftmp1_imag), Ltmp1);
			*(farray_ptr + j + 1) = DVTXOP_round(DVTXOP_L_shr(Ltmp2, 1));
			***********************************************************************/
			ae_int32x2 int32x2_farray1, int32x2_farray2, int32x2_farray3, int32x2_farray4,int32x2_farray5;
			ae_int16x4 int16x4_tbl_swbi, int16x4_tbl_swbj;
			/* temp_farray_ptr1 / temp_farray_ptr2 are declared but never used in this loop body */
			ae_p16x2s *temp_farray_ptr1,*temp_farray_ptr2;
			ae_valign align;
			ae_int16x4 *tx_phs_tbl_swb_ptri, *tx_phs_tbl_swb_ptrj;


			/* The four-short table loads below start two shorts before the twiddle pair of bin i / bin j */
			tx_phs_tbl_swb_ptri = (ae_int16x4 *)(&tx_phs_tbl_ptr[(i<<tw_stage)-2]);
			tx_phs_tbl_swb_ptrj = (ae_int16x4 *)(&tx_phs_tbl_ptr[(j<<tw_stage)-2]);


			/* Bin i and bin j as (real, imag) pairs, then moved to the upper 16 bits */
			int32x2_farray1 = AE_MOVDA32X2(*(farray_ptr + i), *(farray_ptr + i+ 1));
			int32x2_farray2 = AE_MOVDA32X2(*(farray_ptr + j), *(farray_ptr + j+ 1));

			int32x2_farray1 = AE_SLAI32(int32x2_farray1, 16);
			int32x2_farray2 = AE_SLAI32(int32x2_farray2, 16);

			align = AE_LA64_PP(tx_phs_tbl_swb_ptri);
			AE_LA16X4_IP(int16x4_tbl_swbi, align, tx_phs_tbl_swb_ptri);

			align = AE_LA64_PP(tx_phs_tbl_swb_ptrj);
		    AE_LA16X4_IP(int16x4_tbl_swbj, align, tx_phs_tbl_swb_ptrj);

			/* Mixed sum/difference terms of the two bins, regrouped for the complex multiply */
			int32x2_farray3 = AE_ADDSUB32(int32x2_farray1, int32x2_farray2);
			int32x2_farray1 = AE_SUB32(int32x2_zerozero,AE_SUBADD32(int32x2_farray2, int32x2_farray1));
			int32x2_farray2 = AE_SEL32_LH(int32x2_farray1,int32x2_farray1);

			int32x2_farray4 = AE_SEL32_LH(int32x2_farray3,int32x2_farray3);
			int32x2_farray3 = AE_SEL32_LL(int32x2_farray2,int32x2_farray1)>>16;

			int16x4_tbl_swbi = AE_SEL16_5410(int16x4_tbl_swbi, int16x4_tbl_swbi);
			int16x4_tbl_swbj = AE_SEL16_5410(int16x4_tbl_swbj, int16x4_tbl_swbj);

			/* Output for bin i */
			int32x2_farray5 = AE_MULC32X16_H(int32x2_farray3,int16x4_tbl_swbi)<<1;

			int32x2_farray5 = (int32x2_farray5 + int32x2_farray4 +int32x2_round)>>17;
			int32x2_farray5=AE_SEL32_LH(int32x2_farray5,int32x2_farray5);


			*(farray_ptr + i) = AE_MOVAD32_H(int32x2_farray5);
			*(farray_ptr + i + 1) = AE_MOVAD32_L(int32x2_farray5);

			/* Output for bin j, using the j-side table entry */
			int32x2_farray4 = AE_SEL32_LH(int32x2_farray4,int32x2_farray4);
			int32x2_farray3 = AE_SEL32_LL(int32x2_farray1,int32x2_farray2)>>16;
			int16x4_tbl_swbj=AE_SEL16_5432(int16x4_tbl_swbj,int16x4_tbl_swbj);

			int32x2_farray5 = AE_MULC32X16_L(int32x2_farray3,int16x4_tbl_swbj)<<1;
			int32x2_farray5 = AE_ADDSUB32(int32x2_zerozero, int32x2_farray5);
			int32x2_farray5 = (AE_ADDSUB32(int32x2_farray5 , int32x2_farray4) +int32x2_round)>>17;


			*(farray_ptr + j) = AE_MOVAD32_H(int32x2_farray5);
			*(farray_ptr + j + 1) = AE_MOVAD32_L(int32x2_farray5);
			///********************************************************************
		}

		/* Perform the complex IFFT */
		Tx_c_fft(farray_ptr, isign, num_point_fft);
	}
}

#endif
	
/*
 * Assemble one narrow-band analysis frame in fft_buf->data_buffer and
 * transform it in place with Tx_r_fft(..., +1, 256).
 *
 * pTx     : 160 new input samples.
 * fft_buf : persistent state; data_buffer, WINDOW_OVERLAP, PRE_EMP and
 *           BLK_NORM are updated.
 *
 * Steps: copy the saved overlap to the front of the frame, append the
 * pre-emphasised new samples, remember the last raw sample and the new
 * overlap for the next call, record the block norm, apply the analysis
 * window, and run the FFT.  Rescaling of the saved overlap by the change in
 * block norm is not performed here.
 */
void FFT_function_NB(short* pTx, FFTBUF_NB *fft_buf)
{
	short n;
	short frame_len = 160;
	short delay = DVTX_DELAY_NB;
	short fft_len = DVTX_FFT_LEN_NB;
	short pre_emp_fac = DVTX_PRE_EMP_FAC;	/* PRE_EMP_FAC  -13107//-26214 */
	short *window = TRZ_WIN_256_NB;
	short *frame = fft_buf->data_buffer;
	short *overlap = fft_buf->WINDOW_OVERLAP;

	/* Front of the frame: samples kept from the previous call */
	for (n = 0; n < delay; n++)
	{
		frame[n] = overlap[n];
	}

	/* First new sample is pre-emphasised against the last sample of the previous call */
	frame[delay] = DVTXOP_add(pTx[0], DVTXOP_mult(pre_emp_fac, fft_buf->PRE_EMP));

	/* Remaining new samples are pre-emphasised against their predecessor */
#ifndef SRCB_1016_OPT
	for (n = 1; n < frame_len; n++)
	{
		frame[delay + n] = DVTXOP_add(pTx[n], DVTXOP_mult(pre_emp_fac, pTx[n - 1]));
	}
#else
	Pre_emp(&pTx[1], &frame[delay + 1], frame_len - 1, pre_emp_fac);
#endif

	fft_buf->PRE_EMP = pTx[frame_len - 1];

#ifndef KHW_OPTI_20191216_HIFI
	/* Zero whatever lies beyond the assembled samples, then save the next overlap */
	for (n = delay + frame_len; n < fft_len; n++)
	{
		frame[n] = 0;
	}

	for (n = 0; n < delay; n++)
	{
		overlap[n] = frame[frame_len + n];
	}
#else
	/* Save the next overlap (size argument is delay << 1) */
	__vec_memcpy(overlap,&frame[frame_len],(delay<<1));
#endif

	fft_buf->BLK_NORM = DVTXOP_block_norm(frame, 256, DVTX_FFT_HEADROOM_NB);

	/* Apply the analysis window to every assembled sample */
#ifndef Right_Shift_vecter_OPT_DSP
	for (n = 0; n < frame_len + delay; n++)
	{
		frame[n] = DVTXOP_mult_r(frame[n], window[n]);
	}
#else
	Multi_vecter(frame, window, frame_len + delay);
#endif

	Tx_r_fft(frame, +1, 256);
}

/*
 * FFT_function_WB - wide-band analysis front end.
 *
 * Reads 320 new samples from pTx, builds the analysis frame in
 * fft_buf->data_buffer (saved overlap followed by the pre-emphasised input),
 * stores the block norm, applies TRZ_WIN_512_WB and runs the forward
 * transform in place with Tx_r_fft_512_32b_16b_input().
 *
 * Updated state in fft_buf: PRE_EMP (last raw input sample), WINDOW_OVERLAP
 * (history for the next call) and BLK_NORM.
 */
void FFT_function_WB(short* pTx, FFTBUF_WB *fft_buf)
{
	const short frame_len = 320;
	const short hist_len = DVTX_DELAY_WB;
	const short fft_len = DVTX_FFT_LEN_WB;
	const short emph_coef = DVTX_PRE_EMP_FAC;	/* PRE_EMP_FAC  -13107//-26214 */

	short *window = TRZ_WIN_512_WB;
	short *emph_state = &fft_buf->PRE_EMP;
	short *blk_norm = &fft_buf->BLK_NORM;
	short *frame = fft_buf->data_buffer;
	short *overlap = fft_buf->WINDOW_OVERLAP;
	short n;

	/* Start of the frame is the history saved by the previous call (copied as is). */
	n = 0;
	while (n < hist_len)
	{
		frame[n] = overlap[n];
		n++;
	}

	/* Pre-emphasis; sample 0 is filtered against the stored previous input. */
	frame[hist_len] = DVTXOP_add(pTx[0], DVTXOP_mult(emph_coef, *emph_state));
#ifndef SRCB_1016_OPT
	for (n = 1; n < frame_len; n++)
		frame[hist_len + n] = DVTXOP_add(pTx[n], DVTXOP_mult(emph_coef, pTx[n - 1]));
#else
	Pre_emp(pTx + 1, frame + hist_len + 1, frame_len - 1, emph_coef);
#endif

	*emph_state = pTx[frame_len - 1];

#ifndef KHW_OPTI_20191216_HIFI
	/* Clear whatever lies beyond the assembled samples. */
	for (n = hist_len + frame_len; n < fft_len; n++)
		frame[n] = 0;

	/* Keep the samples that open the next frame. */
	for (n = 0; n < hist_len; n++)
		overlap[n] = frame[frame_len + n];
#else
	__vec_memcpy(overlap, frame + frame_len, (hist_len << 1));
#endif

	*blk_norm = DVTXOP_block_norm(frame, 512, DVTX_FFT_HEADROOM_WB);

	/* Analysis window over the assembled samples. */
#ifndef Multi_vecter_OPT_DSP
	for (n = 0; n < frame_len + hist_len; n++)
		frame[n] = DVTXOP_mult_r(frame[n], window[n]);
#else
	Multi_vecter(frame, window, frame_len + hist_len);
#endif

	/* Forward transform, in place. */
	Tx_r_fft_512_32b_16b_input(frame, +1);

	return;
}

/*
 * FFT_functions - run the analysis transform on every active input stream.
 *
 * When FRAME_param.TX_Outer_bandwidth equals DVTX_FRM_LEN_WB the inner
 * streams go through FFT_function_NB and the accelerometer / outer streams
 * through FFT_function_WB; otherwise only the two outer streams are
 * processed, both with FFT_function_NB.
 *
 * Each call updates the matching fftbuf_* entry of FRAME_buf in place.
 */
extern void FFT_functions(DVTX_ECNS_Cfg_t* DVTX_ECNS_vars)
{
	FRAMEStatus *frames = &DVTX_ECNS_vars->FRAME_buf;
	short k;

	if (DVTX_ECNS_vars->FRAME_param.TX_Outer_bandwidth != DVTX_FRM_LEN_WB)
	{
		/* Outer Tx 1, narrow-band state. */
		FFT_function_NB(DVTX_ECNS_vars->Speech_Tx_Outer_1, &frames->fftbuf_Outer_NB);

		/* Outer Rx reference (RxFFTfrm_clp_AEC), narrow-band state. */
		FFT_function_NB(DVTX_ECNS_vars->AEC_Outer_buf_1.RxFFTfrm_clp_AEC, &frames->fftbuf_RX_Outer_NB);

		/* The transform of AEC_Outer_buf_1.RxFFTfrm_AEC is currently disabled. */
		return;
	}

	/* ---- Inner streams (narrow-band transform) ---- */

	/* Inner Tx. */
	FFT_function_NB(DVTX_ECNS_vars->Speech_Tx_Inner, &frames->fftbuf_Inner);

	/* Inner Rx reference, RxFFTfrm_clp_AEC. */
	FFT_function_NB(DVTX_ECNS_vars->AEC_Inner_buf.RxFFTfrm_clp_AEC, &frames->fftbuf_RX_Inner);

	/* Inner Rx reference, RxFFTfrm_AEC. */
	FFT_function_NB(DVTX_ECNS_vars->AEC_Inner_buf.RxFFTfrm_AEC, &frames->fftbuf_RX2_Inner);

	/* ---- Wide-band transforms ---- */

	/* Accelerometer signal. */
	FFT_function_WB(DVTX_ECNS_vars->Speech_Tx_Acc, &frames->fftbuffer_Acc);

	/* The wide-band inner Tx transform is currently disabled. */

	/* Outer Tx 1; its spectrum is also copied to FFTbuf_Tx_Outer_WB. */
	FFT_function_WB(DVTX_ECNS_vars->Speech_Tx_Outer_1, &frames->fftbuf_Outer_1);
	for (k = 0; k < DVTX_FFT_LEN_WB; k++)
	{
		frames->FFTbuf_Tx_Outer_WB[k] = frames->fftbuf_Outer_1.data_buffer[k];
	}

	/* Outer Tx 2. */
	FFT_function_WB(DVTX_ECNS_vars->Speech_Tx_Outer_2, &frames->fftbuf_Outer_2);

	/* Outer Rx reference, RxFFTfrm_clp_AEC. */
	FFT_function_WB(DVTX_ECNS_vars->AEC_Outer_buf_1.RxFFTfrm_clp_AEC, &frames->fftbuf_RX_Outer);

	/* The transform of AEC_Outer_buf_1.RxFFTfrm_AEC is currently disabled. */

	return;
}

/*
 * IFFT_functions - inverse-transform the processed spectra back to the time
 * domain and undo the block scaling applied on the analysis side.
 *
 * Wide-band mode (FRAME_param.TX_Outer_bandwidth == DVTX_FRM_LEN_WB):
 *   - fftbuf_Outer_1.data_buffer is inverse-transformed with
 *     Tx_r_fft_512_32b_16b_input(..., -1) and then de-normalised by
 *     BLK_NORM + norm_shift_outer.
 *   - norm_shift_outer comes from DVTXOP_block_norm() when Mix_buf reports
 *     State_Car == 1, State_Wind == 1 or State > 1; otherwise it is 0.
 *   - FLAG_SELECT_C_DSP builds additionally process the inner (NB) buffer,
 *     the accelerometer buffer and the dbg_data_buffer_* debug spectra.
 *   - __SV_ANC_DEBUG__ builds additionally process fftbuf_ANC_Out and
 *     fftbuf_BF_Out.
 *
 * Otherwise (narrow-band mode): fftbuf_Outer_NB.data_buffer is
 * inverse-transformed with Tx_r_fft(..., -1, 256) and de-normalised by
 * BLK_NORM only (the extra shift is fixed at 0).
 *
 * NOTE(review): DVTXOP_block_norm() presumably scales the buffer in place and
 * returns the shift it applied - confirm against its definition.
 */
extern void IFFT_functions(DVTX_ECNS_Cfg_t* DVTX_ECNS_vars)
{
	short  BLK_NORM_inner, norm_shift_inner, FFT_LEN_inner, *data_buffer_inner;
	short  BLK_NORM_outer, norm_shift_outer, FFT_LEN_outer, *data_buffer_outer;
#if(FLAG_SELECT_C_DSP) // IFFT for inner doesn't need 
	/* Shift / buffer pairs for the debug spectra captured after each processing stage. */
	short norm_shift_outer_FNLMSin, *data_buffer_outer_FNLMSin, norm_shift_inner_FNLMSin, *data_buffer_inner_FNLMSin;
	short norm_shift_outer_FNLMSout, *data_buffer_outer_FNLMSout, norm_shift_inner_FNLMSout, *data_buffer_inner_FNLMSout;
	short norm_shift_outer_RESout, *data_buffer_outer_RESout, norm_shift_inner_RESout, *data_buffer_inner_RESout;
	short norm_shift_acc_NSout, *data_buffer_acc_NSout, norm_shift_outer_NSout, *data_buffer_outer_NSout, norm_shift_inner_NSout, *data_buffer_inner_NSout;

	short norm_shift_outer_BFfbf, *data_buffer_outer_BFfbf, norm_shift_outer_BFbm, *data_buffer_outer_BFbm, norm_shift_outer_BFgsc, *data_buffer_outer_BFgsc;

#endif
	/* NOTE(review): norm_shift_acc is declared but never assigned or read in this function. */
	short  BLK_NORM_acc, norm_shift_acc, FFT_LEN_acc, *data_buffer_acc;
	
	if (DVTX_ECNS_vars->FRAME_param.TX_Outer_bandwidth == DVTX_FRM_LEN_WB)
	{
		MixingStatus* Mix_buf = &DVTX_ECNS_vars->Mix_buf;

#if(FLAG_SELECT_C_DSP) // IFFT for inner doesn't need 
		/* Inner (NB) buffer and the debug spectra are only handled in this build. */
		BLK_NORM_inner = DVTX_ECNS_vars->FRAME_buf.fftbuf_Inner.BLK_NORM; // BLK_NORM_NB;
		FFT_LEN_inner = DVTX_FFT_LEN_NB;
		data_buffer_inner = DVTX_ECNS_vars->FRAME_buf.fftbuf_Inner.data_buffer; // data_buffer_NB

		data_buffer_outer_FNLMSin = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.dbg_data_buffer_FNLMSin;
		data_buffer_inner_FNLMSin = DVTX_ECNS_vars->FRAME_buf.fftbuf_Inner.dbg_data_buffer_FNLMSin;
		data_buffer_outer_FNLMSout = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.dbg_data_buffer_FNLMSout;
		data_buffer_inner_FNLMSout = DVTX_ECNS_vars->FRAME_buf.fftbuf_Inner.dbg_data_buffer_FNLMSout;
		data_buffer_outer_RESout = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.dbg_data_buffer_RESout;
		data_buffer_inner_RESout = DVTX_ECNS_vars->FRAME_buf.fftbuf_Inner.dbg_data_buffer_RESout;
		data_buffer_acc_NSout = DVTX_ECNS_vars->FRAME_buf.fftbuffer_Acc.dbg_data_buffer_NSout;
		data_buffer_outer_NSout = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.dbg_data_buffer_NSout;
		data_buffer_inner_NSout = DVTX_ECNS_vars->FRAME_buf.fftbuf_Inner.dbg_data_buffer_NSout;

		data_buffer_outer_BFfbf = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.dbg_data_buffer_BFfbf;
		data_buffer_outer_BFbm  = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.dbg_data_buffer_BFbm;
		data_buffer_outer_BFgsc = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.dbg_data_buffer_BFgsc;
		
#endif

		BLK_NORM_outer = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.BLK_NORM; // BLK_NORM_WB;
		FFT_LEN_outer = DVTX_FFT_LEN_WB;
		data_buffer_outer = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_1.data_buffer; // data_buffer_WB

		/* Re-normalise the spectra before the inverse transform only in these mixer states. */
		if (Mix_buf->State_Car == 1 || Mix_buf->State_Wind == 1 || (Mix_buf->State > 1))
		{
#if(FLAG_SELECT_C_DSP) // IFFT for inner doesn't need 
			norm_shift_inner = DVTXOP_block_norm(data_buffer_inner, FFT_LEN_inner, DVTX_IFFT_HEADROOM_NB);
			norm_shift_inner_FNLMSin = DVTXOP_block_norm(data_buffer_inner_FNLMSin, FFT_LEN_inner, DVTX_IFFT_HEADROOM_NB);
			norm_shift_outer_FNLMSin = DVTXOP_block_norm(data_buffer_outer_FNLMSin, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
			norm_shift_inner_FNLMSout = DVTXOP_block_norm(data_buffer_inner_FNLMSout, FFT_LEN_inner, DVTX_IFFT_HEADROOM_NB);
			norm_shift_outer_FNLMSout = DVTXOP_block_norm(data_buffer_outer_FNLMSout, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
			norm_shift_inner_RESout = DVTXOP_block_norm(data_buffer_inner_RESout, FFT_LEN_inner, DVTX_IFFT_HEADROOM_NB);
			norm_shift_outer_RESout = DVTXOP_block_norm(data_buffer_outer_RESout, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
			norm_shift_inner_NSout = DVTXOP_block_norm(data_buffer_inner_NSout, FFT_LEN_inner, DVTX_IFFT_HEADROOM_NB);
			norm_shift_outer_NSout = DVTXOP_block_norm(data_buffer_outer_NSout, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
			norm_shift_acc_NSout = DVTXOP_block_norm(data_buffer_acc_NSout, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);

			norm_shift_outer_BFfbf = DVTXOP_block_norm(data_buffer_outer_BFfbf, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
			norm_shift_outer_BFbm  = DVTXOP_block_norm(data_buffer_outer_BFbm,  FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
			norm_shift_outer_BFgsc = DVTXOP_block_norm(data_buffer_outer_BFgsc, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
			
#endif
			norm_shift_outer = DVTXOP_block_norm(data_buffer_outer, FFT_LEN_outer, DVTX_IFFT_HEADROOM_WB);
		}
		else
		{
			/* No extra normalisation: only the analysis-side BLK_NORM is undone later. */
			norm_shift_inner = 0;
			norm_shift_outer = 0;

#if(FLAG_SELECT_C_DSP)
			norm_shift_inner_FNLMSin = 0;
			norm_shift_outer_FNLMSin = 0;
			norm_shift_inner_FNLMSout = 0;
			norm_shift_outer_FNLMSout = 0;
			norm_shift_inner_RESout = 0;
			norm_shift_outer_RESout = 0;
			norm_shift_inner_NSout = 0;
			norm_shift_outer_NSout = 0;
			norm_shift_acc_NSout = 0;

			norm_shift_outer_BFfbf = 0;
			norm_shift_outer_BFbm = 0;
			norm_shift_outer_BFgsc = 0;

#endif
		}

#if(FLAG_SELECT_C_DSP) // IFFT for inner doesn't need 
		// IFFT for Inner (NB)  
		Tx_r_fft(data_buffer_inner, -1, 256);

		/* Accelerometer buffer: inverse transform, then undo BLK_NORM only
		 * (no additional norm shift is computed for this buffer). */
		BLK_NORM_acc = DVTX_ECNS_vars->FRAME_buf.fftbuffer_Acc.BLK_NORM; // BLK_NORM_WB;
		FFT_LEN_acc = DVTX_FFT_LEN_WB;
		data_buffer_acc = DVTX_ECNS_vars->FRAME_buf.fftbuffer_Acc.data_buffer; // data_buffer_WB
		Tx_r_fft_512_32b_16b_input(data_buffer_acc, -1);
		DVTXOP_block_denorm(data_buffer_acc, FFT_LEN_acc, (short)(BLK_NORM_acc));
#endif

		// IFFT for Outer (WB)
		Tx_r_fft_512_32b_16b_input(data_buffer_outer, -1);
#ifdef __SV_ANC_DEBUG__
		Tx_r_fft_512_32b_16b_input(DVTX_ECNS_vars->FRAME_buf.fftbuf_ANC_Out.data_buffer, -1);
		Tx_r_fft_512_32b_16b_input(DVTX_ECNS_vars->FRAME_buf.fftbuf_BF_Out.data_buffer, -1);
#endif

		/* Undo the scaling: total shift is the analysis BLK_NORM plus the shift
		 * chosen just before the inverse transform. */
#ifndef Right_Shift_vecter_OPT_DSP
#if(FLAG_SELECT_C_DSP) // IFFT for inner doesn't need 
		DVTXOP_block_denorm(data_buffer_inner, FFT_LEN_inner, (short)(BLK_NORM_inner + norm_shift_inner));

		/* Debug spectra: each one is inverse-transformed and de-normalised in turn. */
		Tx_r_fft(data_buffer_inner_FNLMSin, -1, 256);
		DVTXOP_block_denorm(data_buffer_inner_FNLMSin, FFT_LEN_inner, (short)(BLK_NORM_inner + norm_shift_inner_FNLMSin));
		Tx_r_fft_512_32b_16b_input(data_buffer_outer_FNLMSin, -1);
		DVTXOP_block_denorm(data_buffer_outer_FNLMSin, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer_FNLMSin));
		Tx_r_fft(data_buffer_inner_FNLMSout, -1, 256);
		DVTXOP_block_denorm(data_buffer_inner_FNLMSout, FFT_LEN_inner, (short)(BLK_NORM_inner + norm_shift_inner_FNLMSout));
		Tx_r_fft_512_32b_16b_input(data_buffer_outer_FNLMSout, -1);
		DVTXOP_block_denorm(data_buffer_outer_FNLMSout, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer_FNLMSout));
		Tx_r_fft(data_buffer_inner_RESout, -1, 256);
		DVTXOP_block_denorm(data_buffer_inner_RESout, FFT_LEN_inner, (short)(BLK_NORM_inner + norm_shift_inner_RESout));
		Tx_r_fft_512_32b_16b_input(data_buffer_outer_RESout, -1);
		DVTXOP_block_denorm(data_buffer_outer_RESout, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer_RESout));
		Tx_r_fft(data_buffer_inner_NSout, -1, 256);
		DVTXOP_block_denorm(data_buffer_inner_NSout, FFT_LEN_inner, (short)(BLK_NORM_inner + norm_shift_inner_NSout));
		Tx_r_fft_512_32b_16b_input(data_buffer_outer_NSout, -1);
		DVTXOP_block_denorm(data_buffer_outer_NSout, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer_NSout));
		Tx_r_fft_512_32b_16b_input(data_buffer_acc_NSout, -1);
		DVTXOP_block_denorm(data_buffer_acc_NSout, FFT_LEN_outer, (short)(BLK_NORM_acc + norm_shift_acc_NSout));

		Tx_r_fft_512_32b_16b_input(data_buffer_outer_BFfbf, -1);
		DVTXOP_block_denorm(data_buffer_outer_BFfbf, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer_BFfbf));
		Tx_r_fft_512_32b_16b_input(data_buffer_outer_BFbm, -1);
		DVTXOP_block_denorm(data_buffer_outer_BFbm, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer_BFbm));
		Tx_r_fft_512_32b_16b_input(data_buffer_outer_BFgsc, -1);
		DVTXOP_block_denorm(data_buffer_outer_BFgsc, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer_BFgsc));


#endif
		DVTXOP_block_denorm(data_buffer_outer, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer));
#else
		/* NOTE(review): in this build only the outer buffer is de-normalised; the
		 * inner and debug buffers are not handled here even if FLAG_SELECT_C_DSP
		 * is set - confirm the two switches are never enabled together. */
		//Right_Shift_vecter(data_buffer_inner, data_buffer_inner, (short)(BLK_NORM_inner + norm_shift_inner), FFT_LEN_inner);
		Right_Shift_vecter(data_buffer_outer, data_buffer_outer, (short)(BLK_NORM_outer + norm_shift_outer), FFT_LEN_outer);
#endif

#ifdef __SV_ANC_DEBUG__
		/* ANC / BF debug outputs are de-normalised by their own BLK_NORM only. */
		DVTXOP_block_denorm(DVTX_ECNS_vars->FRAME_buf.fftbuf_ANC_Out.data_buffer, FFT_LEN_outer , (short)(DVTX_ECNS_vars->FRAME_buf.fftbuf_ANC_Out.BLK_NORM));
		DVTXOP_block_denorm(DVTX_ECNS_vars->FRAME_buf.fftbuf_BF_Out.data_buffer, FFT_LEN_outer, (short)(DVTX_ECNS_vars->FRAME_buf.fftbuf_BF_Out.BLK_NORM ));
#endif
	}
	else
	{
		/* Narrow-band mode: only the outer NB buffer is processed. */
		BLK_NORM_outer = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_NB.BLK_NORM; // BLK_NORM_WB;
		FFT_LEN_outer = DVTX_FFT_LEN_NB;
		data_buffer_outer = DVTX_ECNS_vars->FRAME_buf.fftbuf_Outer_NB.data_buffer; // data_buffer_WB

		norm_shift_outer = 0;

		// IFFT for Outer (NB)
		Tx_r_fft(data_buffer_outer, -1, 256);

#ifndef Right_Shift_vecter_OPT_DSP
		DVTXOP_block_denorm(data_buffer_outer, FFT_LEN_outer, (short)(BLK_NORM_outer + norm_shift_outer));
#else
		Right_Shift_vecter(data_buffer_outer, data_buffer_outer, (short)(BLK_NORM_outer + norm_shift_outer), FFT_LEN_outer);
#endif
	}
	return;
}

/*
 * IOLA_function_NB - narrow-band synthesis: window, overlap-add, de-emphasis.
 *
 * pIn      : time-domain frame of DVTX_FFT_LEN_NB samples; modified in place.
 * pOut     : receives DVTX_FRM_LEN_NB output samples.
 * Ifft_buf : synthesis state; OVERLAP holds the tail carried to the next
 *            frame and DE_EMP the last output sample of this frame.
 */
void IOLA_function_NB(short* pIn, short* pOut, IFFTBUF_NB *Ifft_buf)
{
	const short frame_len = DVTX_FRM_LEN_NB;
	const short hist_len = DVTX_DELAY_NB;
	const short fft_len = DVTX_FFT_LEN_NB;
	const short deemph_coef = DVTX_DE_EMP_FAC;
	const int tail_len = fft_len - frame_len;

	short *window = TRZ_WIN_256_NB;
	short prev_out;
	short n;

	/* Synthesis window over the first frame_len + hist_len samples. */
#ifndef Multi_vecter_OPT_DSP
	n = 0;
	while (n < frame_len + hist_len)
	{
		pIn[n] = DVTXOP_mult_r(pIn[n], window[n]);
		n++;
	}
#else
	Multi_vecter(pIn, window, frame_len + hist_len);
#endif

	/* Add the tail kept from the previous frame onto the head of this one. */
	for (n = 0; n < tail_len; n++)
	{
		pIn[n] = DVTXOP_add(pIn[n], Ifft_buf->OVERLAP[n]);
	}

	/* Keep this frame's tail for the next call. */
	for (n = 0; n < tail_len; n++)
	{
		Ifft_buf->OVERLAP[n] = pIn[frame_len + n];
	}

	/* De-emphasis: each output adds the scaled previous output sample;
	 * the recursion is seeded with the stored sample of the previous frame. */
	prev_out = Ifft_buf->DE_EMP;
	for (n = 0; n < frame_len; n++)
	{
		pOut[n] = DVTXOP_add(pIn[n], DVTXOP_mult_r(deemph_coef, prev_out));
		prev_out = pOut[n];
	}
	Ifft_buf->DE_EMP = prev_out;

	return;
}
#ifdef IOLA_function_WB_Add_vector_OPT_DSP
/*
 * IOLA_function_WB_Add_vector - in-place element-wise add of two 16-bit
 * vectors: pIn_out[k] = pIn_out[k] + Ifft_buf[k] for k in [0, num).
 *
 * pIn_out  : first operand and destination (num shorts).
 * Ifft_buf : second operand (num shorts).
 * num      : element count.
 *
 * Whole groups of four elements go through the HiFi unaligned load / store
 * intrinsics and AE_ADD16S. Any remaining 1..3 elements are handled by a
 * scalar loop so the function never touches memory beyond num elements
 * (the vector loop used to run one extra group when num was not a multiple
 * of four).
 * NOTE(review): the scalar tail saturates to the 16-bit range, which is what
 * AE_ADD16S is understood to do - confirm against the HiFi ISA reference.
 */
void IOLA_function_WB_Add_vector(short* pIn_out, short* Ifft_buf, short num)
{
	int i;
	int wide_sum;
	ae_f16x4 data1, data2, dataadd;
	ae_valign align_pin, align_pfft, align_pout;
	ae_f16x4 *ptmpin, *ptmpfft, *ptmpout;
	ptmpin = (ae_f16x4 *)&pIn_out[0];
	ptmpfft = (ae_f16x4 *)&Ifft_buf[0];
	ptmpout = (ae_f16x4 *)&pIn_out[0];
	/* Prime the alignment registers for the two input streams and the output stream. */
	align_pin = AE_LA64_PP(ptmpin);
	align_pfft = AE_LA64_PP(ptmpfft);
	align_pout = AE_ZALIGN64();

	/* Vector part: only complete groups of four elements. */
	for	(i = 0; i + 4 <= num; i += 4)
	{
		AE_LA16X4_IP(data1, align_pin, ptmpin);
		AE_LA16X4_IP(data2, align_pfft, ptmpfft);
		dataadd = AE_ADD16S(data1, data2);
		AE_SA16X4_IP(dataadd, align_pout, ptmpout);
	}
	/* Flush whatever the store alignment register still holds. */
    AE_SA64POS_FP(align_pout, ptmpout);

	/* Scalar tail for the last num % 4 elements. */
	for (; i < num; i++)
	{
		wide_sum = (int)pIn_out[i] + (int)Ifft_buf[i];
		if (wide_sum > 32767)
			wide_sum = 32767;
		else if (wide_sum < -32768)
			wide_sum = -32768;
		pIn_out[i] = (short)wide_sum;
	}
}
#endif
/*
 * IOLA_function_WB - wide-band synthesis: window, overlap-add, de-emphasis.
 *
 * pIn      : time-domain frame of DVTX_FFT_LEN_WB samples; modified in place.
 * pOut     : receives DVTX_FRM_LEN_WB output samples.
 * Ifft_buf : synthesis state; OVERLAP holds the tail carried to the next
 *            frame and DE_EMP the last output sample of this frame.
 *
 * The build switches select vectorised helpers for the window multiply,
 * the overlap add and the tail copy.
 */
void IOLA_function_WB(short* pIn, short* pOut, IFFTBUF_WB *Ifft_buf)
{
	const short frame_len = DVTX_FRM_LEN_WB;
	const short hist_len = DVTX_DELAY_WB;
	const short fft_len = DVTX_FFT_LEN_WB;
	const short deemph_coef = DVTX_DE_EMP_FAC;

	short *window = TRZ_WIN_512_WB;
	short prev_out;
	short n;

	/* Synthesis window over the first frame_len + hist_len samples. */
#ifndef Multi_vecter_OPT_DSP
	n = 0;
	while (n < frame_len + hist_len)
	{
		pIn[n] = DVTXOP_mult_r(pIn[n], window[n]);
		n++;
	}
#else
	Multi_vecter(pIn, window, frame_len + hist_len);
#endif

	/* Add the tail kept from the previous frame onto the head of this one. */
#ifndef IOLA_function_WB_Add_vector_OPT_DSP
	for (n = 0; n < fft_len - frame_len; n++)
	{
		pIn[n] = DVTXOP_add(pIn[n], Ifft_buf->OVERLAP[n]);
	}
#else
	IOLA_function_WB_Add_vector(pIn, Ifft_buf->OVERLAP, fft_len - frame_len);
#endif

	/* Keep this frame's tail for the next call. */
#ifndef KHW_OPTI_20191216_HIFI
	for (n = 0; n < fft_len - frame_len; n++)
	{
		Ifft_buf->OVERLAP[n] = pIn[frame_len + n];
	}
#else
	__vec_memcpy(Ifft_buf->OVERLAP, pIn + frame_len, ((fft_len - frame_len) << 1));
#endif

	/* De-emphasis: each output adds the scaled previous output sample;
	 * the recursion is seeded with the stored sample of the previous frame. */
	prev_out = Ifft_buf->DE_EMP;
	for (n = 0; n < frame_len; n++)
	{
		pOut[n] = DVTXOP_add(pIn[n], DVTXOP_mult_r(deemph_coef, prev_out));
		prev_out = pOut[n];
	}
	Ifft_buf->DE_EMP = prev_out;

	return;
}

/*
 * IOLA_functions - run the synthesis overlap-add stage on the inverse
 * transformed frames.
 *
 * The main output is always written to OutSignal_Outer:
 *   - TX_Outer_bandwidth == DVTX_FRM_LEN_WB : fftbuf_Outer_1 through
 *     IOLA_function_WB with the Ifftbuf_Output state;
 *   - otherwise                             : fftbuf_Outer_NB through
 *     IOLA_function_NB with the Ifftbuf_Output_NB state.
 *
 * When FLAG_SELECT_C_DSP == 1 and Debug_File_Write_C == 1, and the
 * wide-band mode is active, the accelerometer, inner and per-stage debug
 * frames are also synthesised and dumped to the fp_* debug files.
 * With __SV_ANC_DEBUG__ the ANC and BF frames are synthesised into
 * OutSignal_ANC / OutSignal_BF.
 */
extern void IOLA_functions(DVTX_ECNS_Cfg_t* DVTX_ECNS_vars)
{
	short *pIn, *pOut;

	FRAMEStatus* FRAME_buf = &DVTX_ECNS_vars->FRAME_buf;
	IFFTBUF_WB* Ifftbuf_Output_WB;
	IFFTBUF_NB* Ifftbuf_Output_NB;

#if(FLAG_SELECT_C_DSP == 1)
	/* Scratch outputs and state pointers used only by the debug dumps. */
	short dbg_outer_WB[320] = { 0, };
	short dbg_inner_NB[160] = { 0, };
	IFFTBUF_WB* Ifftbuf_Acc;
	IFFTBUF_NB* Ifftbuf_inner_NB;
	IFFTBUF_WB* Ifftbuf_outer_WB;
#endif

	if (DVTX_ECNS_vars->FRAME_param.TX_Outer_bandwidth == DVTX_FRM_LEN_WB)
	{
		Ifftbuf_Output_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_Output;

		pIn = FRAME_buf->fftbuf_Outer_1.data_buffer; // data_buffer_WB;
		pOut = DVTX_ECNS_vars->OutSignal_Outer;
		IOLA_function_WB(pIn, pOut, Ifftbuf_Output_WB);
	}
	else
	{
		Ifftbuf_Output_NB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_Output_NB;

		pIn = FRAME_buf->fftbuf_Outer_NB.data_buffer; // data_buffer_NB;
		pOut = DVTX_ECNS_vars->OutSignal_Outer;
		IOLA_function_NB(pIn, pOut, Ifftbuf_Output_NB);
	}
	
#if(FLAG_SELECT_C_DSP == 1)
#if(Debug_File_Write_C == 1)
	// inner 8kHz ECNS out for debugging
	if (DVTX_ECNS_vars->FRAME_param.TX_Outer_bandwidth == DVTX_FRM_LEN_WB)
	{
		/* Accelerometer frame; the result overwrites Speech_Tx_Acc. */
		Ifftbuf_Acc = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_Acc;
		pIn = FRAME_buf->fftbuffer_Acc.data_buffer;
		pOut = DVTX_ECNS_vars->Speech_Tx_Acc;
		IOLA_function_WB(pIn, pOut, Ifftbuf_Acc);


		/*
		 * Inner (NB) frame. The NB synthesis must be given an IFFTBUF_NB
		 * state: use Ifftbuf_Output_NB, which the wide-band path above does
		 * not touch. (An IFFTBUF_WB pointer used to be passed here.)
		 */
		Ifftbuf_Output_NB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_Output_NB;
		pIn = FRAME_buf->fftbuf_Inner.data_buffer;
		pOut = DVTX_ECNS_vars->dbgOutput_buf_Outer;
		IOLA_function_NB(pIn, pOut, Ifftbuf_Output_NB);
		fwrite(pOut, sizeof(short), 160, fp_inner8kout);

		/* Per-stage debug dumps: each stage has its own overlap state. */
		Ifftbuf_inner_NB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_inner_FNLMSin;
		pIn = FRAME_buf->fftbuf_Inner.dbg_data_buffer_FNLMSin;
		IOLA_function_NB(pIn, dbg_inner_NB, Ifftbuf_inner_NB);
		fwrite(dbg_inner_NB, sizeof(short), 160, fp_debug_FNLMSin_inner);

		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_outer_FNLMSin;
		pIn = FRAME_buf->fftbuf_Outer_1.dbg_data_buffer_FNLMSin;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_FNLMSin_outer);

		Ifftbuf_inner_NB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_inner_FNLMSout;
		pIn = FRAME_buf->fftbuf_Inner.dbg_data_buffer_FNLMSout;
		IOLA_function_NB(pIn, dbg_inner_NB, Ifftbuf_inner_NB);
		fwrite(dbg_inner_NB, sizeof(short), 160, fp_debug_FNLMSout_inner);

		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_outer_FNLMSout;
		pIn = FRAME_buf->fftbuf_Outer_1.dbg_data_buffer_FNLMSout;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_FNLMSout_outr);

		Ifftbuf_inner_NB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_inner_RESout;
		pIn = FRAME_buf->fftbuf_Inner.dbg_data_buffer_RESout;
		IOLA_function_NB(pIn, dbg_inner_NB, Ifftbuf_inner_NB);
		fwrite(dbg_inner_NB, sizeof(short), 160, fp_debug_RESout_inner);

		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_outer_RESout;
		pIn = FRAME_buf->fftbuf_Outer_1.dbg_data_buffer_RESout;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_RESout_outer);

		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_acc_NSout;
		pIn = FRAME_buf->fftbuffer_Acc.dbg_data_buffer_NSout;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_NSout_acc);

		Ifftbuf_inner_NB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_inner_NSout;
		pIn = FRAME_buf->fftbuf_Inner.dbg_data_buffer_NSout;
		IOLA_function_NB(pIn, dbg_inner_NB, Ifftbuf_inner_NB);
		fwrite(dbg_inner_NB, sizeof(short), 160, fp_debug_NSout_inner);

		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_outer_NSout;
		pIn = FRAME_buf->fftbuf_Outer_1.dbg_data_buffer_NSout;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_NSout_outer);



		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_outer_BFfbf;
		pIn = FRAME_buf->fftbuf_Outer_1.dbg_data_buffer_BFfbf;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_BFfbf_outer);

		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_outer_BFbm;
		pIn = FRAME_buf->fftbuf_Outer_1.dbg_data_buffer_BFbm;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_BFbm_outer);

		Ifftbuf_outer_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_outer_BFgsc;
		pIn = FRAME_buf->fftbuf_Outer_1.dbg_data_buffer_BFgsc;
		IOLA_function_WB(pIn, dbg_outer_WB, Ifftbuf_outer_WB);
		fwrite(dbg_outer_WB, sizeof(short), 320, fp_debug_BFgsc_outer);
	}
#endif
#endif

#ifdef __SV_ANC_DEBUG__
	/* ANC and BF debug outputs, each with its own overlap state. */
	Ifftbuf_Output_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_ANC_Output;
	pIn = FRAME_buf->fftbuf_ANC_Out.data_buffer; // data_buffer_WB;
	pOut = DVTX_ECNS_vars->OutSignal_ANC;
	IOLA_function_WB(pIn, pOut, Ifftbuf_Output_WB);

	Ifftbuf_Output_WB = &DVTX_ECNS_vars->FRAME_buf.Ifftbuf_BF_Output;
	pIn = FRAME_buf->fftbuf_BF_Out.data_buffer; // data_buffer_WB;
	pOut = DVTX_ECNS_vars->OutSignal_BF;
	IOLA_function_WB(pIn, pOut, Ifftbuf_Output_WB);
#endif

	return;
}
