
#include "speech_fft.h"
#include "basic_op.h"








#ifndef OPT_KHW_BASIC_OP

/*
 * Destination offsets (in ints) for the reordering pass in sun_fft():
 * entry i is the offset in the output buffer at which the first pair of
 * input group i is stored.
 */
static const short fft_bitrev60[15] = {
	 0, 10, 20,
	 2, 12, 22,
	 4, 14, 24,
	 6, 16, 26,
	 8, 18, 28
};

/*
 * Twiddle table shared by the butterfly passes: 60 interleaved pairs
 * (120 shorts).  The butterflies read a pair as tw[0], tw[1] and step
 * through the table with pass-specific strides.
 * NOTE(review): the values look like Q15 samples of cos / -sin taken at
 * 6-degree steps around the unit circle - confirm against the generator.
 */
static const short fft_twiddles16000_240[120] = {
	/* pairs 0..14 */
	 32767,      0,
	 32588,  -3425,
	 32051,  -6813,
	 31165, -10126,
	 29936, -13328,
	 28379, -16384,
	 26510, -19260,
	 24353, -21926,
	 21927, -24352,
	 19261, -26509,
	 16385, -28378,
	 13329, -29934,
	 10127, -31164,
	  6815, -32051,
	  3426, -32588,

	/* pairs 15..29 */
	     0, -32767,
	 -3425, -32588,
	 -6813, -32051,
	-10126, -31165,
	-13328, -29936,
	-16384, -28379,
	-19260, -26510,
	-21926, -24353,
	-24352, -21927,
	-26509, -19261,
	-28378, -16385,
	-29934, -13329,
	-31164, -10127,
	-32051,  -6815,
	-32588,  -3426,

	/* pairs 30..44 */
	-32767,      0,
	-32588,   3425,
	-32051,   6813,
	-31165,  10126,
	-29936,  13328,
	-28379,  16384,
	-26510,  19260,
	-24353,  21926,
	-21927,  24352,
	-19261,  26509,
	-16385,  28378,
	-13329,  29934,
	-10127,  31164,
	 -6815,  32051,
	 -3426,  32588,

	/* pairs 45..59 */
	     0,  32767,
	  3425,  32588,
	  6813,  32051,
	 10126,  31165,
	 13328,  29936,
	 16384,  28379,
	 19260,  26510,
	 21926,  24353,
	 24352,  21927,
	 26509,  19261,
	 28378,  16385,
	 29934,  13329,
	 31164,  10127,
	 32051,   6815,
	 32588,   3426
};

#ifndef kf_bfly3_opt_hifi3_ZH
/*
 * kf_bfly3 - in-place radix-3 butterfly pass (portable / scalar HiFi3 build).
 *
 * Fout : buffer of interleaved int pairs, updated in place.  The pass walks
 *        4 groups of 30 ints; inside a group it combines the pairs found at
 *        offsets 0, m (10) and m2 (20) five times, stepping one pair each
 *        time.
 * _fwd : direction selector.  Nonzero: every input is first scaled with a
 *        multiply by 10923 and fwd becomes +1.  Zero: no scaling and fwd
 *        becomes -1.  The index arithmetic below (m + _fwd, tw1[_fwd], ...)
 *        only stays inside a pair when _fwd is 0 or 1.
 *
 * S_MUL, SSC_ADD, SSC_SUB, HALF_OF, FFT_MUL_16_32, SSC_MULT16x16 and
 * AE_MUL32X16_H0 are not defined in this file.
 * NOTE(review): 10923 and -28379 look like Q15 encodings of 1/3 and
 * -sin(60 deg) - confirm.
 */
static void kf_bfly3(int * Fout, short _fwd)
{
	short i, k; // addit = 0;
	const short m = 10;   /* offset (in ints) of the second butterfly input */
	const short m2 = 20;  /* offset (in ints) of the third butterfly input */
	const short *tw1, *tw2;

	int scratch_0, scratch_1, scratch_2, scratch_3, scratch_4, scratch_5;

	short fwd = (_fwd<<1)-1;  /* +1 when _fwd == 1, -1 when _fwd == 0 */
	short epi3 = -28379;
#ifdef	HW_HIFI3
	long long tmp_val,tmp_val2;
#endif

#ifdef HW_FFT_5
	/* HW_FFT_5 build: scale the whole 120-int buffer up front instead of
	 * per butterfly inside the main loop. */
	if(_fwd)
	{

		for(i=0;i<120;i=i+4)
		{
			/* An AE_MUL32X16_H0 based version of this scaling used to live
			 * here; the S_MUL form below replaced it. */
			Fout[i] = S_MUL(Fout[i], 10923);
			Fout[i+1] = S_MUL(Fout[i+1], 10923);
			Fout[i+2] = S_MUL(Fout[i+2], 10923);
			Fout[i+3] = S_MUL(Fout[i+3], 10923);


		}

	}
#endif



	for (i = 0; i<4; i++)
	{
		/* Both twiddle walks restart at the table origin for every group. */
		tw1 = tw2 = fft_twiddles16000_240;
		k = 5;
		do {

#ifndef HW_FFT_5
			/* Without HW_FFT_5 the forward scaling is applied to the three
			 * input pairs of this butterfly right before they are used. */
			if (_fwd) {
#ifndef	HW_HIFI3
				Fout[0] = S_MUL(Fout[0], 10923);
				Fout[1] = S_MUL(Fout[1], 10923);
				Fout[m] = S_MUL(Fout[m], 10923);
				Fout[m + 1] = S_MUL(Fout[m + 1], 10923);
				Fout[m2] = S_MUL(Fout[m2], 10923);
				Fout[m2 + 1] = S_MUL(Fout[m2 + 1], 10923);
#else
				tmp_val = AE_MUL32X16_H0(Fout[0], 10923);
				Fout[0] = (int)(tmp_val>>15);

				tmp_val = AE_MUL32X16_H0(Fout[1], 10923);
				Fout[1] = (int)(tmp_val>>15);

				tmp_val = AE_MUL32X16_H0(Fout[m], 10923);
				Fout[m] = (int)(tmp_val>>15);

				tmp_val = AE_MUL32X16_H0(Fout[m+1], 10923);
				Fout[m+1] = (int)(tmp_val>>15);

				tmp_val = AE_MUL32X16_H0(Fout[m2], 10923);
				Fout[m2] = (int)(tmp_val>>15);

				tmp_val = AE_MUL32X16_H0(Fout[m2+1], 10923);
				Fout[m2+1] = (int)(tmp_val>>15);
#endif
			}
#endif

			/* Multiply the pairs at m and m2 by their twiddle pairs.  The
			 * sign of the cross terms is selected by fwd; scratch_2/3 hold
			 * the product for m, scratch_4/5 the product for m2. */
#ifndef	HW_HIFI3
			scratch_2 = SSC_ADD(S_MUL(Fout[m], tw1[0]), (-1)*fwd * S_MUL(Fout[m + 1], tw1[1]));
			scratch_3 = SSC_ADD(S_MUL(Fout[m + 1 - _fwd], tw1[_fwd]), fwd*S_MUL(Fout[m + _fwd], tw1[1 - _fwd]));
			scratch_4 = SSC_ADD(S_MUL(Fout[m2], tw2[0]), (-1)*fwd * S_MUL(Fout[m2 + 1], tw2[1]));
			scratch_5 = SSC_ADD(S_MUL(Fout[m2 + 1 - _fwd], tw2[_fwd]), fwd*S_MUL(Fout[m2 + _fwd], tw2[1 - _fwd]));
#else
			tmp_val = AE_MUL32X16_H0(Fout[m], tw1[0]);
			tmp_val2 = AE_MUL32X16_H0(Fout[m+1], tw1[1]);
		//	scratch_2 = SSC_SUB((int)(tmp_val>>15) , fwd*(int)(tmp_val2>>15) );
			scratch_2 = SSC_SUB((int)(tmp_val>>15) , FFT_MUL_16_32(fwd,(int)(tmp_val2>>15))    );


			tmp_val = AE_MUL32X16_H0(Fout[m + 1 - _fwd], tw1[_fwd]);
			tmp_val2 = AE_MUL32X16_H0(Fout[m + _fwd], tw1[1 - _fwd]);
		//	scratch_3 = SSC_ADD((int)(tmp_val>>15) , fwd*(int)(tmp_val2>>15) );
			scratch_3 = SSC_ADD((int)(tmp_val>>15) , FFT_MUL_16_32(fwd,(int)(tmp_val2>>15)) );

			tmp_val = AE_MUL32X16_H0(Fout[m2], tw2[0]);
			tmp_val2 = AE_MUL32X16_H0(Fout[m2+1], tw2[1]);
	//		scratch_4 = SSC_SUB((int)(tmp_val>>15) , fwd*(int)(tmp_val2>>15) );
			scratch_4 = SSC_SUB((int)(tmp_val>>15) , FFT_MUL_16_32(fwd,(int)(tmp_val2>>15)) );

			tmp_val = AE_MUL32X16_H0(Fout[m2 + 1 - _fwd], tw2[_fwd]);
			tmp_val2 = AE_MUL32X16_H0(Fout[m2 + _fwd], tw2[1 - _fwd]);
	//		scratch_5 = SSC_ADD((int)(tmp_val>>15) , fwd*(int)(tmp_val2>>15) );
			scratch_5 = SSC_ADD((int)(tmp_val>>15) , FFT_MUL_16_32(fwd,(int)(tmp_val2>>15)) );
#endif

			/* Sum and difference of the two twiddled inputs.  The trailing
			 * "a-->b" notes record an earlier renumbering of the scratch
			 * variables. */
			scratch_0 = SSC_SUB(scratch_2, scratch_4); 
			scratch_1 = SSC_ADD(scratch_2, scratch_4); // 6-->1
								   
								   
			scratch_2 = SSC_ADD(scratch_3, scratch_5);	// 7-->2
			scratch_3 = SSC_SUB(scratch_3, scratch_5);   // 1-->3



			/* Twiddle strides (in shorts) for the next butterfly. */
			tw1 += 8;
			tw2 += 16;
			Fout[m] = Fout[0] - HALF_OF(scratch_1); // 6-->1
			Fout[m + 1] = Fout[1] - HALF_OF(scratch_2); // 7-->2
			/* Scale the difference terms by fwd * epi3. */
#ifndef	HW_HIFI3
			scratch_0 = S_MUL(scratch_0, fwd * epi3);
			scratch_3 = S_MUL(scratch_3, fwd * epi3); // 1-->3
#else
			tmp_val = AE_MUL32X16_H0(scratch_0, (SSC_MULT16x16(fwd,epi3)));
			scratch_0 = (int)(tmp_val>>15);
			tmp_val = AE_MUL32X16_H0(scratch_3, (SSC_MULT16x16(fwd,epi3))); // 1-->3
			scratch_3 = (int)(tmp_val>>15);
#endif

			/* Write the three outputs.  Fout[m]/Fout[m+1] still hold the
			 * intermediate value stored above, so the order matters here. */
			Fout[0] = SSC_ADD(Fout[0], scratch_1); // 6-->1
			Fout[1] = SSC_ADD(Fout[1], scratch_2); // 7-->2
			Fout[m2] = Fout[m] + scratch_3;  // 1-->3
			Fout[m2 + 1] = Fout[m + 1] - scratch_0;
			Fout[m] -= scratch_3; // 1-->3
			Fout[m + 1] += scratch_0;
			Fout = Fout + 2;  /* next pair inside the group */
		} while (--k);
		Fout = Fout + 20;  /* 5 * 2 + 20 = 30 ints: start of the next group */
	}
}
#else
/*
 * kf_bfly3 - in-place radix-3 butterfly pass, HiFi3 vector-intrinsic build
 * (selected when kf_bfly3_opt_hifi3_ZH is defined).
 *
 * Fout : buffer of interleaved int pairs, updated in place through 64-bit
 *        (two-int) loads and stores.  4 groups of 30 ints are processed,
 *        5 butterflies per group, using the pairs at offsets 0, m (10) and
 *        m2 (20).
 * _fwd : direction selector; fwd = 2 * _fwd - 1.  With HW_FFT_5 defined and
 *        _fwd nonzero, the whole 120-int buffer is first scaled by 10923.
 *
 * NOTE(review): the AE_* intrinsics and ae_* types come from the HiFi3
 * toolchain; their exact rounding/saturation behavior is not visible here.
 * NOTE(review): tmp_val is declared only under HW_HIFI3 but is used under
 * HW_FFT_5; flagv, tmpL1..tmpL4, the scratch_* ints and tmp_val2 are set or
 * declared but not otherwise used in this variant.
 */
static void kf_bfly3(int * Fout, short _fwd)
{
	short i, k; // addit = 0;
	const short m = 10;
	const short m2 = 20;
	const short *tw1, *tw2;
	ae_f32x2 tmpL1, tmpL2, tmpL3, tmpL4, flagv;
	ae_f32x2 *pt0, *pt1, *pt2, *pt3, *pt4, *pt5;
	ae_valign align0 ,align1, align2, align3, align4, align5;
	ae_f16x4 epi3v;

	int scratch_0, scratch_1, scratch_2, scratch_3, scratch_4, scratch_5;

	short fwd = (_fwd<<1)-1;  /* +1 when _fwd == 1, -1 when _fwd == 0 */
	short epi3 = -28379;
#ifdef	HW_HIFI3
	long long tmp_val,tmp_val2;
#endif

#ifdef HW_FFT_5
	/* Forward direction: scale all 120 ints by 10923 before the butterflies. */
	if(_fwd)
	{

		for(i=0;i<120;i=i+4)
		{
			tmp_val = AE_MUL32X16_H0(Fout[i], 10923);  // TODO: later, switch this to AE_MUL32X2_vector / AE_SRAI64X2_vector
			Fout[i] = (int)(tmp_val>>15);

			tmp_val = AE_MUL32X16_H0(Fout[i+1], 10923);
			Fout[i+1] = (int)(tmp_val>>15);

			tmp_val = AE_MUL32X16_H0(Fout[i+2], 10923);
			Fout[i+2] = (int)(tmp_val>>15);

			tmp_val = AE_MUL32X16_H0(Fout[i+3], 10923);
			Fout[i+3] = (int)(tmp_val>>15);
		}

	}
#endif


	/* Loop-invariant constants: fwd * epi3 and a (1, fwd) pair. */
	epi3v = AE_MOVDA16(SSC_MULT16x16(fwd,epi3));
	flagv = AE_MOVDA32X2(1,fwd);
	for (i = 0; i<4; i++)
	{
		ae_f32x2 S01,S23, S45, F01, F23, F45, S12, S03, tmpL, S30;
		ae_f16x4 Tw0,Tw1;

		/* Both twiddle walks restart at the table origin for every group. */
		tw1 = tw2 = fft_twiddles16000_240;
		k = 5;

		/* pt0..pt2 are the read cursors and pt3..pt5 the write cursors for
		 * the pairs at offsets 0, m and m2; each read/write pair starts at
		 * the same address. */
		pt0 = pt3 =  (ae_f32x2*)&Fout[0];
		pt1 = pt4 = (ae_f32x2*)&Fout[m];
		pt2 = pt5 = (ae_f32x2*)&Fout[m2];
		align0 = AE_LA64_PP(pt0);
		align1 = AE_LA64_PP(pt1);
		align2 = AE_LA64_PP(pt2);
		align3 = AE_ZALIGN64();
		align4 = AE_ZALIGN64();
		align5 = AE_ZALIGN64();


		do {



			/* Load one pair from each of the three input streams. */
			AE_LA32X2_IP(F01, align0, pt0);
			AE_LA32X2_IP(F23, align1, pt1);
			AE_LA32X2_IP(F45, align2, pt2);

			/* Twiddle pairs with the second component multiplied by fwd. */
			Tw0 = AE_MOVDA16X2(tw1[0], tw1[1]*fwd);   //  here
			Tw1 = AE_MOVDA16X2(tw2[0], tw2[1]*fwd);   //



			S23 = AE_MULFC32X16RAS_L(F23, Tw0);
			S45 = AE_MULFC32X16RAS_L(F45, Tw1);

            S03 = AE_SUB32(S23, S45);
            S12 = AE_ADD32(S23, S45);

            tmpL = AE_SRAI32(S12, 1);

			/* Twiddle strides (in shorts) for the next butterfly. */
			tw1 += 8;
			tw2 += 16;

			F23 = AE_SUB32(F01, tmpL);
			S03 = AE_MULFP32X16X2RAS_L(S03, epi3v);

			S30 = AE_SEL32_LH(S03, S03);

            F01 = AE_ADD32(F01, S12);
            F45 = AE_ADDSUB32(F23, S30);
            F23 = AE_SUBADD32(F23, S30);


            AE_SA32X2_IP(F01, align3, pt3);
            AE_SA32X2_IP(F23, align4, pt4);
            AE_SA32X2_IP(F45, align5, pt5);

//			Fout = Fout + 2;
		} while (--k);
		/* Flush the pending aligning stores for the three output streams. */
		AE_SA64POS_FP(align3, pt3);
		AE_SA64POS_FP(align4, pt4);
		AE_SA64POS_FP(align5, pt5);
		/* Fout is not advanced inside the inner loop here, so step a full
		 * group of 30 ints. */
		Fout = Fout + 30;
	}
}

#endif





#ifndef kf_bfly4_opt_hifi3_ZH

/*
 * kf_bfly4 - in-place radix-4 butterfly pass (portable / scalar HiFi3 build).
 *
 * Fout : buffer of interleaved int pairs, updated in place.  Fifteen
 *        butterflies are run, each combining the pairs at offsets 0,
 *        m (30), m2 (60) and m3 (90) and then stepping one pair forward.
 * fwd  : direction selector.  Nonzero: flag = +1 and every term is shifted
 *        right by 2.  Zero: flag = -1 and no shift is applied.
 *
 * S_MUL, SHR, SSC_ADD, SSC_SUB, SSC_PSHR, FFT_MUL_16_32 and AE_MUL32X16_H0
 * are not defined in this file.
 */
static void kf_bfly4(int *Fout, short fwd)
{
	const short *tw1, *tw2, *tw3;
	short flag = 1, shiftbit = 2;  /* forward defaults; overridden below when fwd == 0 */

	int scratch_0,scratch_1,scratch_2,scratch_3,scratch_4,scratch_5,scratch_6,scratch_7;

	short m = 30;
	const short m2 = (m << 1);  /* 60 */
	const short m3 = 90;
	short j;

#ifdef	HW_HIFI3
	long long tmp_val,tmp_val2;
#endif

	tw3 = tw2 = tw1 = fft_twiddles16000_240;

	j = m >> 1;  /* 15 butterflies */

	if (0 == fwd)
	{
		flag = -1;
		shiftbit = 0;

	}


	do {

		/* Multiply the pairs at m, m2 and m3 by their twiddle pairs (cross
		 * term sign selected by flag) and apply the direction-dependent
		 * shift.  scratch_0/1, 2/3 and 4/5 hold the three products. */
#ifndef HW_HIFI3
		scratch_0 = SHR(SSC_SUB(S_MUL(Fout[m], tw1[0]), flag*S_MUL(Fout[m + 1], tw1[1])), shiftbit);
		scratch_1 = SHR(SSC_ADD(S_MUL(Fout[m + 1], tw1[0]), flag*S_MUL(Fout[m], tw1[1])), shiftbit);
		scratch_2 = SHR(SSC_SUB(S_MUL(Fout[m2], tw2[0]), flag*S_MUL(Fout[m2 + 1], tw2[1])), shiftbit);
		scratch_3 = SHR(SSC_ADD(S_MUL(Fout[m2 + 1], tw2[0]), flag*S_MUL(Fout[m2], tw2[1])), shiftbit);
		scratch_4 = SHR(SSC_SUB(S_MUL(Fout[m3], tw3[0]), flag*S_MUL(Fout[m3 + 1], tw3[1])), shiftbit);

		scratch_5 = SHR(SSC_ADD(S_MUL(Fout[m3 + 1], tw3[0]), flag*S_MUL(Fout[m3], tw3[1])), shiftbit);
#else	
		tmp_val = AE_MUL32X16_H0(Fout[m], tw1[0]);
		tmp_val2 = AE_MUL32X16_H0(Fout[m+1], tw1[1]);
//		scratch_0 =  SHR(SSC_SUB((int)(tmp_val>>15),(flag)*((int)(tmp_val2>>15))),shiftbit);
		scratch_0 =  SHR(SSC_SUB((int)(tmp_val>>15),FFT_MUL_16_32(flag,((int)(tmp_val2>>15)))),shiftbit);


		tmp_val = AE_MUL32X16_H0(Fout[m+1], tw1[0]);
		tmp_val2 = AE_MUL32X16_H0(Fout[m], tw1[1]);
	//	scratch_1 =  SHR(SSC_ADD((int)(tmp_val>>15),(flag)*((int)(tmp_val2>>15))),shiftbit);
		scratch_1 =  SHR(SSC_ADD((int)(tmp_val>>15),FFT_MUL_16_32(flag,((int)(tmp_val2>>15)))),shiftbit);

		tmp_val = AE_MUL32X16_H0(Fout[m2], tw2[0]);
		tmp_val2 = AE_MUL32X16_H0(Fout[m2+1], tw2[1]);
//		scratch_2 =  SHR(SSC_SUB((int)(tmp_val>>15),(flag)*((int)(tmp_val2>>15))),shiftbit);
		scratch_2 =  SHR(SSC_SUB((int)(tmp_val>>15),FFT_MUL_16_32(flag,((int)(tmp_val2>>15)))),shiftbit);

		tmp_val = AE_MUL32X16_H0(Fout[m2+1], tw2[0]);
		tmp_val2 = AE_MUL32X16_H0(Fout[m2], tw2[1]);
//		scratch_3 =  SHR(SSC_ADD((int)(tmp_val>>15),(flag)*((int)(tmp_val2>>15))),shiftbit);
		scratch_3 =  SHR(SSC_ADD((int)(tmp_val>>15),FFT_MUL_16_32(flag,((int)(tmp_val2>>15)))),shiftbit);

		tmp_val = AE_MUL32X16_H0(Fout[m3], tw3[0]);
		tmp_val2 = AE_MUL32X16_H0(Fout[m3+1], tw3[1]);
//		scratch_4 =  SHR(SSC_SUB((int)(tmp_val>>15),(flag)*((int)(tmp_val2>>15))),shiftbit);
		scratch_4 =  SHR(SSC_SUB((int)(tmp_val>>15),FFT_MUL_16_32(flag,((int)(tmp_val2>>15)))),shiftbit);

		tmp_val = AE_MUL32X16_H0(Fout[m3+1], tw3[0]);
		tmp_val2 = AE_MUL32X16_H0(Fout[m3], tw3[1]);
//		scratch_5 =  SHR(SSC_ADD((int)(tmp_val>>15),(flag)*((int)(tmp_val2>>15))),shiftbit);
		scratch_5 =  SHR(SSC_ADD((int)(tmp_val>>15),FFT_MUL_16_32(flag,((int)(tmp_val2>>15)))),shiftbit);
#endif

		/* The untwiddled first input gets the same shift (via SSC_PSHR)
		 * before being combined with the m2 product.  The trailing
		 * "a-->b" notes record an earlier renumbering of the scratch
		 * variables. */
		Fout[0] = SSC_PSHR(Fout[0], shiftbit);
		Fout[1] = SSC_PSHR(Fout[1], shiftbit);
		scratch_6 = SSC_SUB(Fout[0], scratch_2);  // 10-->6
		scratch_7 = SSC_SUB(Fout[1], scratch_3);  // 11-->7
		Fout[0] = SSC_ADD(Fout[0], scratch_2);
		Fout[1] = SSC_ADD(Fout[1], scratch_3);

		/* Sum and difference of the m and m3 products. */
		scratch_2 = SSC_ADD(scratch_0, scratch_4);   //6-->2
		scratch_3 = SSC_ADD(scratch_1, scratch_5);   //7-->3
		scratch_0 = SSC_SUB(scratch_0, scratch_4);   //8-->0
		scratch_1 = SSC_SUB(scratch_1, scratch_5);   //9-->1

		/* Fout[m2] must be written before Fout[0] is updated again below,
		 * because both are derived from the current Fout[0]/Fout[1]. */
		Fout[m2] = SSC_SUB(Fout[0], scratch_2); //6-->2
		Fout[m2 + 1] = SSC_SUB(Fout[1], scratch_3); //7-->3
		/* Twiddle strides (in shorts) for the next butterfly. */
		tw1 += 2;
		tw2 += 4;
		tw3 += 6;
		Fout[0] = SSC_ADD(Fout[0], scratch_2); //6-->2
		Fout[1] = SSC_ADD(Fout[1], scratch_3); //7-->3
		/* Outputs at m and m3: the difference terms are swapped between the
		 * two components of the pair and signed by flag. */
#ifndef HW_HIFI3
		Fout[m] = scratch_6 + flag*scratch_1; // 10-->6   //9-->1
		Fout[m + 1] = scratch_7 - flag*scratch_0; // 11-->7  //8-->0
		Fout[m3] = scratch_6 - flag*scratch_1; // 10-->6   //9-->1
		Fout[m3 + 1] = scratch_7 + flag*scratch_0; // 11-->7  //8-->0
#else
		Fout[m] = scratch_6 + FFT_MUL_16_32(flag,scratch_1); // 10-->6   //9-->1
		Fout[m + 1] = scratch_7 - FFT_MUL_16_32(flag,scratch_0); // 11-->7  //8-->0
		Fout[m3] = scratch_6 - FFT_MUL_16_32(flag,scratch_1); // 10-->6   //9-->1
		Fout[m3 + 1] = scratch_7 + FFT_MUL_16_32(flag,scratch_0); // 11-->7  //8-->0
#endif
		Fout = Fout + 2;  /* next pair */
	} while (--j);
}
#else
/*
 * kf_bfly4 - in-place radix-4 butterfly pass, HiFi3 vector-intrinsic build
 * (selected when kf_bfly4_opt_hifi3_ZH is defined).
 *
 * Fout : buffer of interleaved int pairs, updated in place through 64-bit
 *        (two-int) loads and stores.  Fifteen butterflies are run over the
 *        pairs at offsets 0, m (30), m2 (60) and m3 (90).
 * fwd  : direction selector.  Nonzero: flag = +1, shiftbit = 2.
 *        Zero: flag = -1, shiftbit = 0.
 *
 * NOTE(review): the AE_* intrinsics and ae_* types come from the HiFi3
 * toolchain; their exact rounding/saturation behavior is not visible here.
 * NOTE(review): AE_MULP32X16X2_L is given the plain short `flag` as its
 * second operand while the other multiplies take an ae_f16x4 - confirm the
 * implicit conversion does what is intended.
 * NOTE(review): the scratch_* ints, tmp_val and tmp_val2 are declared but
 * not used in this variant.
 */
static void kf_bfly4(int *Fout, short fwd)
{
	const short *tw1, *tw2, *tw3;
	short flag = 1, shiftbit = 2;  /* forward defaults; overridden below when fwd == 0 */

	int scratch_0,scratch_1,scratch_2,scratch_3,scratch_4,scratch_5,scratch_6,scratch_7;

	short m = 30;
	const short m2 = (m << 1);  /* 60 */
	const short m3 = 90;
	short j;


	long long tmp_val,tmp_val2;
	ae_f32x2 *pt0, *pt1, *pt2,*pt3,*pt4,*pt5,*pt6, *pt7;
	ae_f32x2  F01, F23, F45, F67;
	ae_f32x2  S01, S23, S45, S67;
	ae_valign align0 ,align1, align2, align3, align4, align5 , align6, align7;
	ae_f16x4 Tw0,Tw1,Tw2;

	tw3 = tw2 = tw1 = fft_twiddles16000_240;

	j = m >> 1;  /* 15 butterflies */

	if (0 == fwd)
	{
		flag = -1;
		shiftbit = 0;

	}

	/* pt0..pt3 are the read cursors and pt4..pt7 the write cursors for the
	 * four input/output streams; each read/write pair starts at the same
	 * address. */
	pt0 = pt4 = (ae_f32x2*)&Fout[0];
	pt1 = pt5 = (ae_f32x2*)&Fout[m];
	pt2 = pt6 = (ae_f32x2*)&Fout[m2];
	pt3 = pt7 = (ae_f32x2*)&Fout[m3];
	align0 = AE_LA64_PP(pt0);
	align1 = AE_LA64_PP(pt1);
	align2 = AE_LA64_PP(pt2);
	align3 = AE_LA64_PP(pt3);
	align4 = AE_ZALIGN64();
	align5 = AE_ZALIGN64();
	align6 = AE_ZALIGN64();
	align7 = AE_ZALIGN64();
	do {

		/* Load one pair from each of the four input streams. */
		AE_LA32X2_IP(F01, align0, pt0);
		AE_LA32X2_IP(F23, align1, pt1);
		AE_LA32X2_IP(F45, align2, pt2);
		AE_LA32X2_IP(F67, align3, pt3);


		/* Twiddle pairs with the second component multiplied by flag. */
		Tw0 = AE_MOVDA16X2(tw1[0], tw1[1]*flag);
		Tw1 = AE_MOVDA16X2(tw2[0], tw2[1]*flag);
		Tw2 = AE_MOVDA16X2(tw3[0], tw3[1]*flag);

		S01 = AE_MULFC32X16RAS_L(F23, Tw0);
		S23 = AE_MULFC32X16RAS_L(F45, Tw1);
		S45 = AE_MULFC32X16RAS_L(F67, Tw2);
		/* Direction-dependent right shift of the three products ... */
		S01 = AE_SRAA32(S01, shiftbit);
		S23 = AE_SRAA32(S23, shiftbit);
		S45 = AE_SRAA32(S45, shiftbit);

		/* ... and of the untwiddled first input. */
		F01 = AE_SRAA32RS(F01, shiftbit);


		S67 = AE_SUB32(F01, S23);
		F01 = AE_ADD32(F01, S23);

		S23 = AE_ADD32(S01, S45);
		S01 = AE_SUB32(S01, S45);

        F45 = AE_SUB32(F01, S23);

		/* Twiddle strides (in shorts) for the next butterfly. */
		tw1 += 2;
		tw2 += 4;
		tw3 += 6;
		F01 = AE_ADD32(F01, S23);

		S01 = AE_SEL32_LH(S01, S01);
		S01 = AE_MULP32X16X2_L(S01, flag);

		F23 = AE_ADDSUB32(S67, S01);
		F67 = AE_SUBADD32(S67, S01);

        AE_SA32X2_IP(F01, align4, pt4);
        AE_SA32X2_IP(F23, align5, pt5);
        AE_SA32X2_IP(F45, align6, pt6);
        AE_SA32X2_IP(F67, align7, pt7);

	} while (--j);
	/* Flush the pending aligning stores for the four output streams. */
	AE_SA64POS_FP(align4, pt4);
	AE_SA64POS_FP(align5, pt5);
	AE_SA64POS_FP(align6, pt6);
	AE_SA64POS_FP(align7, pt7);
}
#endif


#ifndef kf_bfly5_opt_hifi3_ZH
/*
 * kf_bfly5 - in-place radix-5 butterfly pass (portable / scalar HiFi3 build).
 *
 * Fout : buffer of interleaved int pairs, updated in place.  Twelve groups
 *        of five consecutive pairs (10 ints each, 120 ints in total) are
 *        transformed one after another.
 * fwd  : direction selector.  Nonzero: inputs are first scaled with a
 *        multiply by 6554 and flag = +1.  Zero: no scaling and flag = -1.
 *        With HW_FFT_5 the scaling is applied to the whole buffer up front;
 *        otherwise it is applied per group inside the main loop.
 *
 * All stores use explicit indices and the cursor is advanced once per
 * group.  The earlier form `*Fout0++ = f(*Fout0)` read and modified Fout0
 * without an intervening sequence point, which is undefined behavior in C.
 *
 * S_MUL, SSC_ADD, SSC_SUB, SSC_MULT16x16, FFT_MUL_16_32 and AE_MUL32X16_H0
 * are not defined in this file.
 * NOTE(review): 6554 looks like Q15 1/5, and ya_x/yb_x like Q15 cos/sin of
 * 72 and 144 degrees - confirm.
 */
static void kf_bfly5(int * Fout, short fwd)
{
	int *Fout0;   /* cursor to the group currently being transformed */
	short i;
#ifndef HW_FFT_5
	short n;
	short flag = 1;
#else
	short flag = -1;
#endif
	int scratch_0,scratch_1,scratch_2,scratch_3,scratch_4,scratch_5,scratch_6,scratch_7,scratch_8,scratch_9,scratch_10,scratch_11;

	short ya_0, ya_1, yb_0, yb_1;
#ifdef	HW_HIFI3
	long long tmp_val,tmp_val2;
#endif
	ya_1 = -31164;
	yb_1 = -19261;
	ya_0 = 10127;
	yb_0 = -26509;

#ifndef HW_FFT_5
	if (!fwd)
	{
		flag = -1;
	}
#else
	if(fwd)
	{
		flag = 1;

		/* Forward direction: scale all 120 ints before the butterflies. */
		for(i=0;i<120;i=i+4)
		{
			Fout[i] = S_MUL(Fout[i], 6554);
			Fout[i+1] = S_MUL(Fout[i+1], 6554);
			Fout[i+2] = S_MUL(Fout[i+2], 6554);
			Fout[i+3] = S_MUL(Fout[i+3], 6554);
		}
	}
#endif

	Fout0 = Fout;

	i = 12;
	do {

#ifndef HW_FFT_5
		if (fwd)
		{
			/* Forward direction: restart from Fout (advanced at the end of
			 * each iteration) and scale the 10 ints of this group. */
			Fout0 = Fout;
			for (n = 0; n < 10; n++)
			{
#ifndef HW_HIFI3
				Fout0[n] = S_MUL(Fout0[n], 6554);
#else
				tmp_val = AE_MUL32X16_H0(Fout0[n], 6554);
				Fout0[n] = (int)(tmp_val>>15);
#endif
			}
		}
#endif
		/* First input pair, kept for the later output terms. */
		scratch_0 = Fout0[0];
		scratch_1 = Fout0[1];

		/* Sums and differences of inputs 1 & 4 and of inputs 2 & 3. */
		scratch_2 = SSC_ADD(Fout0[2], Fout0[8]);
		scratch_3 = SSC_ADD(Fout0[3], Fout0[9]);
		scratch_8 = SSC_SUB(Fout0[2], Fout0[8]);
		scratch_9 = SSC_SUB(Fout0[3], Fout0[9]);
		scratch_4 = SSC_ADD(Fout0[4], Fout0[6]);
		scratch_5 = SSC_ADD(Fout0[5], Fout0[7]);
		scratch_6 = SSC_SUB(Fout0[4], Fout0[6]);
		scratch_7 = SSC_SUB(Fout0[5], Fout0[7]);

		/* Output 0: first input plus both sums. */
		Fout0[0] = scratch_0 + scratch_2 + scratch_4;
		Fout0[1] = scratch_1 + scratch_3 + scratch_5;

#ifndef HW_HIFI3
		scratch_10 = scratch_0 + S_MUL(scratch_2, ya_0) + S_MUL(scratch_4, yb_0);
		scratch_11 = scratch_1 + S_MUL(scratch_3, ya_0) + S_MUL(scratch_5, yb_0);
		scratch_0 = scratch_0 + S_MUL(scratch_2, yb_0) + S_MUL(scratch_4, ya_0);
		scratch_1 = scratch_1 + S_MUL(scratch_3, yb_0) + S_MUL(scratch_5, ya_0);
#else
		tmp_val = AE_MUL32X16_H0(scratch_2,ya_0);
		tmp_val2 = AE_MUL32X16_H0(scratch_4,yb_0);
		scratch_10 = scratch_0+(int)(tmp_val>>15) +(int)(tmp_val2>>15);

		tmp_val = AE_MUL32X16_H0(scratch_3,ya_0);
		tmp_val2 = AE_MUL32X16_H0(scratch_5,yb_0);
		scratch_11 = scratch_1+(int)(tmp_val>>15) +(int)(tmp_val2>>15);

		tmp_val = AE_MUL32X16_H0(scratch_2,yb_0);
		tmp_val2 = AE_MUL32X16_H0(scratch_4,ya_0);
		scratch_0 = scratch_0+(int)(tmp_val>>15) +(int)(tmp_val2>>15);

		tmp_val = AE_MUL32X16_H0(scratch_3,yb_0);
		tmp_val2 = AE_MUL32X16_H0(scratch_5,ya_0);
		scratch_1 = scratch_1+(int)(tmp_val>>15) +(int)(tmp_val2>>15);
#endif

		/* Terms built from the differences; their sign follows flag, i.e.
		 * the transform direction. */
#ifndef HW_HIFI3
		scratch_2 = flag*(S_MUL(scratch_9, ya_1) + S_MUL(scratch_7, yb_1));
		scratch_3 = (-flag)*(S_MUL(scratch_8, ya_1) + S_MUL(scratch_6, yb_1));

		scratch_4 = flag*(S_MUL(scratch_7, ya_1)-S_MUL(scratch_9, yb_1));
		scratch_5 = flag*(S_MUL(scratch_8, yb_1)-S_MUL(scratch_6, ya_1));
#else
		tmp_val = AE_MUL32X16_H0(scratch_9,ya_1);
		tmp_val2 = AE_MUL32X16_H0(scratch_7,yb_1);
		scratch_2 = FFT_MUL_16_32(flag,((int)(tmp_val>>15)+(int)(tmp_val2>>15)));

		tmp_val = AE_MUL32X16_H0(scratch_8,ya_1);
		tmp_val2 = AE_MUL32X16_H0(scratch_6,yb_1);
		scratch_3 = FFT_MUL_16_32((-flag),((int)(tmp_val>>15)+(int)(tmp_val2>>15)));

		tmp_val = AE_MUL32X16_H0(scratch_7,ya_1);
		tmp_val2 = AE_MUL32X16_H0(scratch_9,yb_1);
		scratch_4 = FFT_MUL_16_32(flag,((int)(tmp_val>>15)-(int)(tmp_val2>>15)));

		tmp_val = AE_MUL32X16_H0(scratch_8,yb_1);
		tmp_val2 = AE_MUL32X16_H0(scratch_6,ya_1);
		scratch_5 = FFT_MUL_16_32(flag,((int)(tmp_val>>15)-(int)(tmp_val2>>15)));
#endif

		/* Outputs 1..4. */
		Fout0[2] = SSC_SUB(scratch_10, scratch_2);
		Fout0[3] = SSC_SUB(scratch_11, scratch_3);

		Fout0[4] = SSC_ADD(scratch_0, scratch_4);
		Fout0[5] = SSC_ADD(scratch_1, scratch_5);
		Fout0[6] = SSC_SUB(scratch_0, scratch_4);
		Fout0[7] = SSC_SUB(scratch_1, scratch_5);

		Fout0[8] = SSC_ADD(scratch_10, scratch_2);
		Fout0[9] = SSC_ADD(scratch_11, scratch_3);

		/* Step the cursor to the next group of five pairs. */
		Fout0 += 10;
#ifndef HW_FFT_5
		/* Fout only moves in the forward direction, where the cursor is
		 * re-seeded from it at the top of the loop. */
#ifndef HW_HIFI3
		Fout = Fout + 10*fwd;
#else
		Fout = Fout + SSC_MULT16x16(10,fwd);
#endif
#endif
	}while (--i);
}
#else
/*
 * kf_bfly5 - in-place radix-5 butterfly pass, HiFi3 vector-intrinsic build
 * (selected when kf_bfly5_opt_hifi3_ZH is defined).
 *
 * Fout : buffer of interleaved int pairs, updated in place through 64-bit
 *        (two-int) loads and stores.  Twelve groups of five consecutive
 *        pairs are transformed.
 * fwd  : direction selector.  Nonzero: the whole 120-int buffer is first
 *        scaled by 6554 and flag = +1.  Zero: no scaling, flag = -1.
 *
 * NOTE(review): the AE_* intrinsics and ae_* types come from the HiFi3
 * toolchain; their exact rounding/saturation behavior is not visible here.
 * NOTE(review): AE_MULP32X16X2_L is given the plain short `flag` / `-flag`
 * while flagv (an ae_f16x4 built from the same values) is never used.
 * NOTE(review): the scratch_* ints, tmp_val2, tmpS (live path), tmpLL1,
 * tmpLL2, data1, data2 and Fout0 (after the scaling loop) are unused.
 */
static void kf_bfly5(int * Fout, short fwd)
{
	int *Fout0; //,*Fout4;
	short i;

	short flag = -1;  /* inverse default; set to +1 below when fwd != 0 */

	int scratch_0,scratch_1,scratch_2,scratch_3,scratch_4,scratch_5,scratch_6,scratch_7,scratch_8,scratch_9,scratch_10,scratch_11;

	short ya_0, ya_1, yb_0, yb_1;

	long long tmp_val,tmp_val2;
	ae_valign align1, align2;

	ae_f32x2 tmpL1, tmpL2, tmpL3, tmpL4;
	ae_f32x2 *pt1, *pt2;
	ae_f16x4 tmpS, ya0_b0,yb0_a0,ya1_b1,yb1_a1, flagv;
	ae_f64   tmpLL1, tmpLL2;
	ya_1 = -31164;
	yb_1 = -19261;


	ya_0 = 10127;


	yb_0 = -26509;

	if(fwd)
	{
#if 1
		/* Live path: scalar scaling of all 120 ints, 4 per iteration. */
		flag = 1;
		Fout0 = Fout;
		i=30;
		do
		{
			tmp_val = AE_MUL32X16_H0(*Fout0, 6554);    // TODO: later, switch this to AE_MUL32X2_vector / AE_SRAI64X2_vector
			*Fout0++ = (int)(tmp_val>>15);
			tmp_val = AE_MUL32X16_H0(*Fout0, 6554);
			*Fout0++ = (int)(tmp_val>>15);
			tmp_val = AE_MUL32X16_H0(*Fout0, 6554);
			*Fout0++ = (int)(tmp_val>>15);
			tmp_val = AE_MUL32X16_H0(*Fout0, 6554);
			*Fout0++ = (int)(tmp_val>>15);
		} while (--i);

#else
		/* Disabled alternative: the same scaling done with vector
		 * loads/stores, two pairs per iteration. */
		flag = 1;
		i=30;
		pt1 = (ae_f32x2*)Fout;
		pt2 = (ae_f32x2*)Fout;
		align1 = AE_LA64_PP(pt1);
		align2 = AE_ZALIGN64();
		tmpS = AE_MOVDA16X2(6554, 6554);
		do
		{
			AE_LA32X2_IP(tmpL1, align1, pt1);
			tmpL1 = AE_MULFP32X16X2RAS_L(tmpL1, tmpS);
			AE_SA32X2_IP(tmpL1, align2, pt2);

			AE_LA32X2_IP(tmpL1, align1, pt1);
			tmpL1 = AE_MULFP32X16X2RAS_L(tmpL1, tmpS);
			AE_SA32X2_IP(tmpL1, align2, pt2);

		} while (--i);
		AE_SA64POS_FP(align2, pt2);
#endif


	}



	Fout0 = Fout;

	/* pt1 reads and pt2 writes the same buffer; each group is fully loaded
	 * before any of its outputs is stored. */
	pt1 = (ae_f32x2*)Fout;
	pt2 = (ae_f32x2*)Fout;
	align1 = AE_LA64_PP(pt1);
	align2 = AE_ZALIGN64();
	i = 12;
	/* Loop-invariant coefficient pairs. */
	ya0_b0 = AE_MOVDA16X2(ya_0, yb_0);
	yb0_a0 = AE_MOVDA16X2(yb_0, ya_0);
	ya1_b1 = AE_MOVDA16X2(ya_1, yb_1);
	yb1_a1 = AE_MOVDA16X2(yb_1, ya_1);
	flagv =  AE_MOVDA16X2(flag, -flag);
	do {
		ae_f32x2 F01,F23, F45, F67, F89;
		ae_f32x2 S01,S23, S45, S67, S89,S1011, S97, S86;
		ae_f32x2 S24, S35;
		int data1,data2;

		/* Load the five input pairs of this group. */
		AE_LA32X2_IP(F01, align1, pt1);
		AE_LA32X2_IP(F23, align1, pt1);
		AE_LA32X2_IP(F45, align1, pt1);
		AE_LA32X2_IP(F67, align1, pt1);
		AE_LA32X2_IP(F89, align1, pt1);

		/* Scalar reference for the vector adds/subs below:
		 *   scratch_0 = Fout0[0];
		 *   scratch_1 = Fout0[1];
		 *   scratch_2 = SSC_ADD(Fout0[2], Fout0[8]);
		 *   scratch_3 = SSC_ADD(Fout0[3], Fout0[9]);
		 *   scratch_8 = SSC_SUB(Fout0[2], Fout0[8]);
		 *   scratch_9 = SSC_SUB(Fout0[3], Fout0[9]);
		 *   scratch_4 = SSC_ADD(Fout0[4], Fout0[6]);
		 *   scratch_5 = SSC_ADD(Fout0[5], Fout0[7]);
		 *   scratch_6 = SSC_SUB(Fout0[4], Fout0[6]);
		 *   scratch_7 = SSC_SUB(Fout0[5], Fout0[7]);
		 */

		S01 = AE_MOV32(F01);
		S23 = AE_ADD32(F23, F89);
		S89 = AE_SUB32(F23, F89);
		S45 = AE_ADD32(F45, F67);
		S67 = AE_SUB32(F45, F67);


		/* Output 0: first input plus both sums. */
		F01 = AE_ADD32(F01, S23);
		F01 = AE_ADD32(F01, S45);
		AE_SA32X2_IP(F01, align2, pt2);



		/* Regroup the two sum pairs by component before the multiplies. */
		S24 = AE_SEL32_HH(S23, S45);
		S35 = AE_SEL32_LL(S23, S45);


        tmpL1 = AE_MULFP32X16X2RAS_L(S24, ya0_b0);
        tmpL2 = AE_MULFP32X16X2RAS_L(S35, ya0_b0);
        tmpL3 = AE_SEL32_HH(tmpL1, tmpL2);
        tmpL4 = AE_SEL32_LL(tmpL1, tmpL2);
        tmpL3 = AE_ADD32(tmpL3, tmpL4);
        S1011 = AE_ADD32(S01, tmpL3);



        tmpL1 = AE_MULFP32X16X2RAS_L(S24, yb0_a0);
        tmpL2 = AE_MULFP32X16X2RAS_L(S35, yb0_a0);
        tmpL3 = AE_SEL32_HH(tmpL1, tmpL2);
        tmpL4 = AE_SEL32_LL(tmpL1, tmpL2);
        tmpL3 = AE_ADD32(tmpL3, tmpL4);
        S01 = AE_ADD32(S01, tmpL3);



		/* Regroup the two difference pairs by component. */
		S97 = AE_SEL32_LL(S89, S67);
		S86 = AE_SEL32_HH(S89, S67);



        tmpL1 = AE_MULFP32X16X2RAS_L(S97, ya1_b1);
        tmpL1 = AE_MULP32X16X2_L(tmpL1, flag);
        tmpL2 = AE_MULFP32X16X2RAS_L(S86, ya1_b1);
        tmpL2 = AE_MULP32X16X2_L(tmpL2, -flag);
        tmpL3 = AE_SEL32_HH(tmpL1, tmpL2);
        tmpL4 = AE_SEL32_LL(tmpL1, tmpL2);
        S23 = AE_ADD32(tmpL3, tmpL4);

        tmpL1 = AE_MULFP32X16X2RAS_L(S97, yb1_a1);
        tmpL1 = AE_MULP32X16X2_L(tmpL1, -flag);
        tmpL2 = AE_MULFP32X16X2RAS_L(S86, yb1_a1);
        tmpL2 = AE_MULP32X16X2_L(tmpL2, flag);
        tmpL3 = AE_SEL32_HH(tmpL1, tmpL2);
        tmpL4 = AE_SEL32_LL(tmpL1, tmpL2);
        S45 = AE_SUB32(tmpL3, tmpL4);



		/* Outputs 1..4, stored in order behind output 0. */
		tmpL1 = AE_SUB32(S1011, S23);
		AE_SA32X2_IP(tmpL1, align2, pt2);

		tmpL1 = AE_ADD32(S01, S45);
		AE_SA32X2_IP(tmpL1, align2, pt2);

		tmpL1 = AE_SUB32(S01, S45);
		AE_SA32X2_IP(tmpL1, align2, pt2);

		tmpL1 = AE_ADD32(S1011, S23);
		AE_SA32X2_IP(tmpL1, align2, pt2);


	}while (--i);
}
#endif



//FILE *fp_check;
/*
 * sun_fft - reorder the input into the output buffer, then run the radix-5,
 * radix-3 and radix-4 butterfly passes on the output buffer in place.
 *
 * fin  : 120 input ints, read sequentially as 15 groups of 4 pairs.
 * fout : 120 output ints.  Pair q (0..3) of group i is stored at
 *        fout[fft_bitrev60[i] + 30 * q]; the butterflies then overwrite
 *        fout in place.  fin is read while fout is written in scattered
 *        order, so the two buffers are expected to be distinct.
 * dir  : passed unchanged to kf_bfly5/kf_bfly3/kf_bfly4 as their direction
 *        selector (nonzero = forward with scaling, zero = inverse).
 *
 * Each preprocessor path declares only the locals it needs.  The earlier
 * HW_FOR_GCC path declared `short i` in the same scope as an existing
 * `int i`, which does not compile.
 */
void sun_fft(const int *fin,int *fout,short dir)
{
	const short *bitrev = fft_bitrev60;

#ifndef  fft_bitrev_opt_hifi3_ZH
#ifndef HW_FOR_GCC
	const int *_fin = fin;
	int *_fout;
	int i;

	i = 15;
	do{
		/* Four pairs of this group go to offsets +0, +30, +60 and +90. */
		_fout = fout+(*bitrev++);
		*_fout++ = *_fin++;
		*_fout = *_fin++;
		_fout = _fout+29;

		*_fout++ = *_fin++;
		*_fout = *_fin++;
		_fout = _fout+29;

		*_fout++ = *_fin++;
		*_fout = *_fin++;
		_fout = _fout+29;
		*_fout++ = *_fin++;
		*_fout = *_fin++;
	}while(--i);
#else
	short i;

	/* Indexed form of the same reordering. */
	for (i=0;i<15;i++)
	{
		fout[bitrev[i]] = fin[(i<<3)];
		fout[bitrev[i]+1] = fin[(i<<3)+1];

		fout[bitrev[i]+30] = fin[(i<<3)+2];
		fout[bitrev[i]+31] = fin[(i<<3)+3];

		fout[bitrev[i]+60] = fin[(i<<3)+4];
		fout[bitrev[i]+61] = fin[(i<<3)+5];

		fout[bitrev[i]+90] = fin[(i<<3)+6];
		fout[bitrev[i]+91] = fin[(i<<3)+7];
	}
#endif
#else
	{
	   /* HiFi3 form: move one pair per 64-bit access; 15 ae_f32x2
	    * elements correspond to 30 ints.
	    * NOTE(review): this assumes fin and fout + bitrev[i] are suitably
	    * aligned for direct ae_f32x2 access - confirm. */
	   ae_f32x2 *pfin, *pfout;
	   pfin = (ae_f32x2*)fin;
	   short i = 15;
	   do{
		   pfout = (ae_f32x2*)(fout+(*bitrev++));

		   *pfout = *pfin++;
		   pfout += 15;

		   *pfout = *pfin++;
		   pfout += 15;

		   *pfout = *pfin++;
		   pfout += 15;

		   *pfout = *pfin++;
		   pfout += 15;

	   }while(--i);
	}
#endif


	kf_bfly5(fout,dir);	
	kf_bfly3(fout,dir);
	kf_bfly4(fout, dir);

}

#endif
