
#include "pitch.h"
#include "pitch_util.h"
#include "basic_op.h"
#include "math_op.h"


//extern short pitch_buf[156];

#ifndef HW_PITCH_OFF2

//#ifdef HW_DATASIZE
//extern int shared_buf[390];   // 312+78(156)
//#endif

#ifdef OPT_DATASIZE2
/* Shared scratch buffer, defined elsewhere. In this file it is referenced only
   by search_pitch_candidate(), which (when HW_DATASIZE is also defined) places
   its 4x-decimated y buffer at &pre_shared[312]. */
extern int pre_shared[360];
#endif

#ifdef  remove_doubling_loop_BX
/*
 * remove_doubling_loop - HiFi3 version of the yy_lookup fill loop used by
 * no_doubling_get_gain().
 *
 * The scalar loop this replaces is kept below for reference: for every lag i
 * in 1..maxperiod it updates a running sum
 *     yy += x[-i]*x[-i] - x[N-i]*x[N-i]
 * and stores yy >> fixed_flr into yy_lookup[i].
 *
 * x         : signal pointer; samples at negative indices and at x[N-i] are read
 * yy_lookup : output table, written starting at yy_lookup[1]
 * maxperiod : last lag to fill
 * yy        : initial (unshifted) running sum
 * fixed_flr : NOTE(review): not used by the vector code below, which shifts by
 *             the literal 2; the only caller visible in this file passes 2
 * N         : window length used to locate the sample leaving the window
 *
 * Always returns 0.
 *
 * NOTE(review): the loop advances i by 4 and issues two AE_SA32X2_IP stores per
 * iteration, so it appears to write lags in groups of four; if maxperiod is not
 * a multiple of 4 it would presumably write past yy_lookup[maxperiod] - confirm.
 */
int remove_doubling_loop(short *x,int*yy_lookup,short maxperiod,int yy,int fixed_flr,int N)
{
// Scalar reference implementation:
//	  int i;
//	  for (i=1;i<=maxperiod;i++)
//	   {
//	      yy = yy+SSC_MULT16x16(x[-i],x[-i])-SSC_MULT16x16(x[N-i],x[N-i]);
//		  yy_lookup[i] = yy>>fixed_flr;
//	   }
//	   return 0;
//##########################################################################################
		 int i;
	     // Input streams start at x[-1] and x[N-1]; output stream starts at yy_lookup[1].
	     ae_int16x4 *  t_vec_1_16x4 = (ae_int16x4 *)(&x[-1]);
	     ae_int16x4 * t_vec_2_16x4 = (ae_int16x4 *)(&x[N-1]);
	     ae_int32x2 * t_vec_3_32x2=  (ae_int32x2 *)(&yy_lookup[1]);
	     ae_int16x4 t_vec_1_16x4_v,t_vec_2_16x4_v;
	     ae_valign    align1,align2,align3;
	     // Prime the alignment registers for the two load streams and the store stream.
	     align1 = AE_LA64_PP(t_vec_1_16x4);
	     align2 = AE_LA64_PP(t_vec_2_16x4);
	     align3 = AE_ZALIGN64();

	     ae_int32x2 tmp1,tmp2,tmp3,tmp4,tmp_err1,tmp_err2,tmp_err3,tmp_err4,tmp_yy,tmp_err1_h,tmp_err2_h;
	     // Running sum is carried in the first lane of tmp_yy; the other lane is 0.
	     tmp_yy=AE_MOVDA32X2( yy, 0);
	     for(i=1;i<=maxperiod;i=i+4)
	     {
	  		// Load four samples from each stream and square them.
	  		AE_LA16X4_RIP(t_vec_1_16x4_v,align1,t_vec_1_16x4);
	  		AE_LA16X4_RIP(t_vec_2_16x4_v,align2,t_vec_2_16x4);
	  	    AE_MUL16X4(tmp1,tmp2,t_vec_1_16x4_v, t_vec_1_16x4_v);
	  	    AE_MUL16X4(tmp3,tmp4,t_vec_2_16x4_v, t_vec_2_16x4_v);
	  		// Per-lag energy deltas (entering sample squared minus leaving sample squared).
	  		tmp_err1=AE_SUB32(tmp1,tmp3);
	  		tmp_err2=AE_SUB32(tmp2,tmp4);

	  		tmp_err1_h=AE_ADD32(tmp_yy,tmp_err1);// high lane of tmp_err1_h is the 1st result
	  		tmp_err2_h=AE_ADD32_HL_LH(tmp_err1_h,tmp_err1_h);// high lane of tmp_err2_h is the 2nd result
	  		tmp1=AE_SEL32_HH(tmp_err1_h,tmp_err2_h);
	  		// Shift amount is the literal 2 here, not the fixed_flr argument.
	  		tmp1 = AE_SRAA32(tmp1, 2);
	  		AE_SA32X2_IP(tmp1, align3, t_vec_3_32x2);

	  		tmp_err3 = AE_SEL32_HL(tmp_err2_h, tmp_yy);//
	  		tmp_err3=AE_ADD32(tmp_err3,tmp_err2); // high lane of tmp_err3 is the 3rd result
	  		tmp_err4=AE_ADD32_HL_LH(tmp_err3,tmp_err3);// presumably the 4th result (original comment repeated "3rd")
	  		tmp2=AE_SEL32_HH(tmp_err3,tmp_err4);
	  		// Update tmp_yy: carry the unshifted running sum into the next iteration.
	  		tmp_yy=AE_SEL32_LH(tmp2, 0);

	  		tmp2 = AE_SRAA32(tmp2, 2);
	  		AE_SA32X2_IP(tmp2, align3, t_vec_3_32x2);
	     }
	     // Flush whatever the aligning store stream still holds.
	     AE_SA64POS_FP(align3, t_vec_3_32x2);
	     return 0;
	}
#endif



#ifdef HW_CODESIZE
/*
 * dual_inner_prod - correlate x with two other vectors in one pass.
 *
 * Writes sum(x[n]*y01[n]) to *xy1 and sum(x[n]*y02[n]) to *xy2 over a fixed
 * window of 60 samples (15 groups of four in the HiFi3 build).
 *
 * The enclosing #ifdef HW_CODESIZE means the nested "#ifndef HW_CODESIZE"
 * prototype (the one taking N) can never be selected in this file; it is kept
 * only so the source layout stays recognisable.
 */
#ifndef HW_CODESIZE
void dual_inner_prod(const short *x, const short *y01, const short *y02,
	short N, int *xy1, int *xy2)

#else
void dual_inner_prod(const short *x, const short *y01, const short *y02,
		int *xy1, int *xy2)
#endif
{
	short n;
#ifndef HW_HIFI3
	/* Portable path: two scalar accumulators over the fixed 60-sample window. */
	int acc_a = 0;
	int acc_b = 0;

	for (n = 0; n < 60; n++)
	{
		acc_a += SSC_MULT16x16(x[n], y01[n]);
		acc_b += SSC_MULT16x16(x[n], y02[n]);
	}

	*xy1 = acc_a;
	*xy2 = acc_b;
#else
	/* HiFi3 path: three aligning load streams, four samples per step. */
	ae_int16x4 *px = (ae_int16x4 *)(x);
	ae_int16x4 *pa = (ae_int16x4 *)(y01);
	ae_int16x4 *pb = (ae_int16x4 *)(y02);

	ae_int16x4 vx;
	ae_int16x4 va;
	ae_int16x4 vb;

	ae_valign ux = AE_LA64_PP(px);
	ae_valign ua = AE_LA64_PP(pa);
	ae_valign ub = AE_LA64_PP(pb);

	ae_int32x4 acc_a = 0;
	ae_int32x4 acc_b = 0;

	/* 15 groups of 4 samples == 60 samples. */
	for (n = 15; n > 0; n--)
	{
		AE_LA16X4_IP(vx, ux, px);
		AE_LA16X4_IP(va, ua, pa);
		AE_LA16X4_IP(vb, ub, pb);
		AE_MULA16X4_vector(acc_a, vx, va);
		AE_MULA16X4_vector(acc_b, vx, vb);
	}

	/* Reduce the four lanes of each accumulator to a scalar. */
	*xy1 = AE_INT32X4_RADD(acc_a);
	*xy2 = AE_INT32X4_RADD(acc_b);
#endif
}

#endif





/*
 * find_best_pitch - pick the two lags with the highest normalised correlation.
 *
 * xcorr      : correlation per lag, max_pitch entries
 * y          : signal the lags index into; y[0 .. len+max_pitch-1] is read
 * len        : window length used for the sliding energy
 * max_pitch  : number of lags to examine
 * best_pitch : out, [0] = best lag, [1] = runner-up (initialised to 0 and 1)
 * maxcorr    : largest correlation value; sets the scaling shift and the
 *              acceptance floor (maxcorr>>4)
 *
 * Candidates are ranked by num/energy, compared by cross-multiplication so no
 * division is needed. The window energy starts at 1 plus the sum of squares of
 * y[0..len-1] and is slid by one sample per lag.
 *
 * NOTE(review): the HW_HIFI3_VECTOR path dereferences y through an
 * ae_int16x4 pointer directly - presumably y must be suitably aligned; confirm.
 */
#ifndef HW_CODESIZE
static void find_best_pitch(int *xcorr, short *y, short len,
	short max_pitch, short *best_pitch, int maxcorr)
#else
void find_best_pitch(int *xcorr, short *y, short len,
	short max_pitch, short *best_pitch, int maxcorr)
#endif
{
   short top_num[2] = { -1, -1 };
   int top_den[2] = { 0, 0 };
   int floor_corr;
   int energy;
   short norm_shift;
   short lag, n;
#ifdef HW_HIFI3_VECTOR
   ae_int16x4 *y_vec = (ae_int16x4 *)(y);
   ae_int32x4 energy_vec = 0;
#endif

   /* Shift that brings the correlations into 16-bit range. */
#if defined(HW_CODESIZE) && !defined(VC_PROJ)
   norm_shift = (16-AE_NSAZ32_L(maxcorr));
#else
   norm_shift = EC_ILOG(maxcorr)-15;
#endif

   /* Lags whose correlation does not exceed this floor are not considered. */
   floor_corr = maxcorr>>4;

   best_pitch[0] = 0;
   best_pitch[1] = 1;

   /* Energy of the first window, biased by 1. */
#ifndef HW_HIFI3_VECTOR
   energy = 1;
   for (n = 0; n < len; n++)
   {
      energy += SSC_MULT16x16(y[n], y[n]);
   }
#else
   /* Four samples per step ... */
   for (n = 0; n < (len>>2); n++)
   {
      AE_MULA16X4_vector(energy_vec, *y_vec, *y_vec);
      y_vec++;
   }

   energy = AE_INT32X4_RADD(energy_vec);
   energy++;

   /* ... then the samples that did not fill a whole group of four. */
   for (n = n<<2; n < len; n++)
   {
      energy += SSC_MULT16x16(y[n], y[n]);
   }
#endif

   for (lag = 0; lag < max_pitch; lag++)
   {
      if (xcorr[lag] > floor_corr)
      {
         short c16, num;

         c16 = EXTRACT16(SSC_VSHR(xcorr[lag], norm_shift));
         num = SSC_MULT16x16_Q15(c16, c16);

         /* num/energy > top_num[k]/top_den[k], written without division. */
         if (SSC_MULT16x32_Q15(num, top_den[1]) > SSC_MULT16x32_Q15(top_num[1], energy))
         {
            if (!(SSC_MULT16x32_Q15(num, top_den[0]) > SSC_MULT16x32_Q15(top_num[0], energy)))
            {
               /* Beats the runner-up only. */
               top_num[1] = num;
               top_den[1] = energy;
               best_pitch[1] = lag;
            }
            else
            {
               /* New best: the previous best becomes the runner-up. */
               top_num[1] = top_num[0];
               top_den[1] = top_den[0];
               best_pitch[1] = best_pitch[0];
               top_num[0] = num;
               top_den[0] = energy;
               best_pitch[0] = lag;
            }
         }
      }

      /* Slide the energy window by one sample. */
      energy += SSC_MULT16x16(y[lag+len], y[lag+len]) - SSC_MULT16x16(y[lag], y[lag]);
   }
}


/*
 * downsample_enc - scale, low-pass and 2:1 decimate a 32-bit signal into
 * 16-bit samples.
 *
 * The scalar paths compute
 *     x_lp[0] = SHR(HALF(HALF(x[1]) + x[0]), shift)
 *     x_lp[i] = SHR(HALF(HALF(x[2i-1] + x[2i+1]) + x[2i]), shift)   for i >= 1
 * where shift is derived from the largest magnitude in the input (clamped to
 * at least 1) and is never negative.
 *
 * Without HW_SAVESTACK: x holds len input samples and len>>1 outputs are
 * written.
 *   - Scalar path: the do/while body runs before its counter is tested, so
 *     len>>1 must be at least 2.
 *   - pitch_downsample_opt_hifi3_TLF path: the main loop ends only when the
 *     counter reaches exactly 0 in steps of 4, so (len>>1)-4 must be a positive
 *     multiple of 4. shift is raised by 2 after the first output has been
 *     written. NOTE(review): the intrinsic sequence appears to form
 *     (x[2i-1] + 2*x[2i] + x[2i+1]) >> (shift+2), which may round differently
 *     from the nested HALF() form - confirm against the scalar path.
 *
 * With HW_SAVESTACK: the input is split over x and x2; the loops use the
 * literal sizes 192 and 120 and `len` is not used. The maximum search uses
 * MAX_PITCH_PERIOD for the length of x (presumably 192 - confirm).
 *
 * pt_x1, pt_x2 and pt_x3 are declared but never used.
 */
#ifndef HW_SAVESTACK
void downsample_enc(int *x, short *x_lp, short len)
#else
void downsample_enc(int *x,int *x2, short *x_lp, short len)
#endif
{
   short i;
   int tmp1, tmp2;
   short *tmp_x_lp = x_lp;
   short shift;

   int *pt_x1, *pt_x2, *pt_x3;



   /* Largest magnitude over the whole input (both halves under HW_SAVESTACK). */
#ifndef HW_SAVESTACK
   int maxabs = speech_maxabs32(x, len);
#else
   int maxabs;
   tmp1 = speech_maxabs32(x, MAX_PITCH_PERIOD);
   tmp2 = speech_maxabs32(x2, 120);
   maxabs = SPEECH_MAX(tmp1,tmp2);
#endif


   /* Keep the log/normalisation below away from a zero argument. */
   if (maxabs<1) 
	   maxabs=1;

   /* Scaling shift derived from the position of the leading bit of maxabs. */
#ifndef HW_CODESIZE
    shift = EC_ILOG(maxabs)-9;
#else
#ifndef VC_PROJ
    shift = (22-AE_NSAZ32_L(maxabs));
#else
    shift = EC_ILOG(maxabs)-9;
#endif
#endif

   /* Never scale up. */
   if (shift<0) shift=0;







#ifndef HW_SAVESTACK
#ifndef pitch_downsample_opt_hifi3_TLF
   /* First output has no left neighbour. */
   *tmp_x_lp++ = SHR(HALF(HALF(x[1])+x[0]), shift);

   x++;

      /* Remaining (len>>1)-1 outputs; each step consumes two inputs and peeks
         at a third. */
      i = (len>>1)-1;
      do{
   	   tmp1 = *x++;
   	   tmp2 = *x++;
   	   *tmp_x_lp++ = SHR(HALF(HALF(tmp1+(*x))+tmp2), shift);
      }while(--i);
#else
        /* First output is produced by the scalar formula with the original shift. */
        *tmp_x_lp++ = SHR(HALF(HALF(x[1])+x[0]), shift);
	    /* Main loop covers (len>>1)-4 outputs, four per iteration; the last
	       three are produced one at a time after the loop. */
	    i = (len>>1)-4;
	    /* The vector code applies the whole scaling in one shift. */
	    shift+=2;
		ae_int32x2 *ptin1, *ptin2;
		ae_int32x2 d_tmp11, d_tmp12, d_tmp13, d_tmpout1, d_tmpout2, d_tmpout;
		ae_int16x4 *ptout;
		ae_int16x4 tmpout16;
		ae_valign align1, align2, alignout;
		ptout = (ae_int16x4 *)tmp_x_lp;
		alignout = AE_ZALIGN64();
		/* Two input streams offset by one sample: x[1].. and x[2].. */
		ptin1 = (ae_int32x2 *)&x[1];
		ptin2 = (ae_int32x2 *)&x[2];
		align1 = AE_LA64_PP(ptin1);
		align2 = AE_LA64_PP(ptin2);

	 do{
		 /* Outputs 1 and 2 of this group. */
		 AE_LA32X2_IP(d_tmp11,align1,ptin1);
		 AE_LA32X2_IP(d_tmp12,align2,ptin2);
		 d_tmp13 = AE_ADD32_HL_LH(d_tmp11, d_tmp12);
		 d_tmpout1 = AE_ADD32_HL_LH(d_tmp13, d_tmp13);
		 AE_LA32X2_IP(d_tmp11,align1,ptin1);
		 AE_LA32X2_IP(d_tmp12,align2,ptin2);
		 d_tmp13 = AE_ADD32_HL_LH(d_tmp11, d_tmp12);
		 d_tmpout2 = AE_ADD32_HL_LH(d_tmp13, d_tmp13);
		 d_tmpout = AE_SRAA32(AE_SEL32_LL(d_tmpout1, d_tmpout2), shift);
		 /* Outputs 3 and 4 of this group. */
		 AE_LA32X2_IP(d_tmp11,align1,ptin1);
		 AE_LA32X2_IP(d_tmp12,align2,ptin2);
		 d_tmp13 = AE_ADD32_HL_LH(d_tmp11, d_tmp12);
		 d_tmpout1 = AE_ADD32_HL_LH(d_tmp13, d_tmp13);
		 AE_LA32X2_IP(d_tmp11,align1,ptin1);
		 AE_LA32X2_IP(d_tmp12,align2,ptin2);
		 d_tmp13 = AE_ADD32_HL_LH(d_tmp11, d_tmp12);
		 d_tmpout2 = AE_ADD32_HL_LH(d_tmp13, d_tmp13);
		 d_tmpout1 = AE_SRAA32(AE_SEL32_LL(d_tmpout1, d_tmpout2), shift);
		 /* Pack the four 32-bit results to 16 bits and store them. */
		 tmpout16 = AE_CVT16X4(d_tmpout, d_tmpout1);
		 AE_SA16X4_IP(tmpout16,alignout,ptout);
	 }while(i-=4);

	 /* Flush the aligning store before switching to single-sample stores. */
	 AE_SA64POS_FP(alignout, ptout);
	 /* Three trailing outputs, one 16-bit store each. */
	 AE_LA32X2_IP(d_tmp11,align1,ptin1);
	 AE_LA32X2_IP(d_tmp12,align2,ptin2);
	 d_tmp13 = AE_ADD32_HL_LH(d_tmp11, d_tmp12);
	 d_tmpout1 = AE_SRAA32(AE_ADD32_HL_LH(d_tmp13, d_tmp13), shift);
	 AE_S16_0_IP(AE_CVT16X4(0,d_tmpout1), ptout, 2);

	 AE_LA32X2_IP(d_tmp11,align1,ptin1);
	 AE_LA32X2_IP(d_tmp12,align2,ptin2);
	 d_tmp13 = AE_ADD32_HL_LH(d_tmp11, d_tmp12);
	 d_tmpout1 = AE_SRAA32(AE_ADD32_HL_LH(d_tmp13, d_tmp13), shift);
	 AE_S16_0_IP(AE_CVT16X4(0,d_tmpout1), ptout, 2);

	 AE_LA32X2_IP(d_tmp11,align1,ptin1);
	 AE_LA32X2_IP(d_tmp12,align2,ptin2);
	 d_tmp13 = AE_ADD32_HL_LH(d_tmp11, d_tmp12);
	 d_tmpout1 = AE_SRAA32(AE_ADD32_HL_LH(d_tmp13, d_tmp13), shift);
	 AE_S16_0_IP(AE_CVT16X4(0,d_tmpout1), ptout, 2);

	 AE_SA64POS_FP(alignout, ptout);
#endif

#else
   /* First output has no left neighbour. */
   *tmp_x_lp++ = SHR(HALF(HALF(x[1])+x[0]), shift);

   /* Outputs 1..95 come entirely from x. */
   for(i=1;i<(192>>1);i++)
   {
	   x_lp[i] = SHR(HALF(HALF(x[(2*i)-1]+x[(2*i)+1])+x[2*i]),shift);
   }

   /* Output 96 straddles the seam: left tap from the end of x, centre and
      right taps from the start of x2 (i == 96 here). */
   x_lp[i] = SHR(HALF(HALF(x[(2*i)-1]+x2[1])+x2[0]),shift);

   /* Outputs 97..155 come entirely from x2. */
   for(i=1;i<(120>>1);i++)
   {
	   x_lp[96+i] = SHR(HALF(HALF(x2[(2*i)-1]+x2[(2*i)+1])+x2[2*i]),shift);
   }


#endif




}


#ifndef HW_DATASIZE
/* File-scope work buffers for search_pitch_candidate(). When HW_DATASIZE is
   defined that function declares buffers of the same names and sizes locally
   instead (y_lp4 may then live in pre_shared). */
short x_lp4[30];	/* 4x-decimated x, len>>2 entries */
short y_lp4[66];	/* 4x-decimated y, lag>>2 entries */
int xcorr[73];	/* correlation per lag, max_pitch>>1 entries */
#endif

/*
 * search_pitch_candidate - two-stage (coarse, then fine) pitch lag search.
 *
 * x_lp      : target signal; the fine stage reads x_lp[0 .. (len>>1)-1]
 * y         : signal the lags index into; read up to roughly (len+max_pitch)>>1
 * len       : window length parameter (the stages use len>>2 and len>>1)
 * max_pitch : lag range parameter (the stages use max_pitch>>2 and max_pitch>>1)
 * pitch     : out, receives 2*best_lag - offset, where offset in {-1,0,1}
 *             comes from comparing the correlation at best_lag-1/best_lag+1
 *
 * Stage 1 takes every second sample of x_lp and y into x_lp4 / y_lp4,
 * cross-correlates them and keeps the two best lags. Stage 2 re-evaluates the
 * correlation on x_lp / y only for lags within +/-2 of twice those candidates.
 *
 * The work buffers have fixed sizes, so callers must keep
 *   len>>2 <= 30, (len+max_pitch)>>2 <= 66 and max_pitch>>1 <= 73.
 *
 * With pitch_search_loop_opt_hifi3_ZH the decimation starts its loads at
 * x_lp[-1] and y[-1], so one sample before each pointer must be readable, and
 * the 4-wide stores are issued with the non-aligning AE_S16X4_IP (presumably
 * requires 8-byte aligned x_lp4 / y_lp4 - confirm).
 */
void search_pitch_candidate(const short *x_lp, short *y, short len, short max_pitch, short *pitch)
{
	short i, j, lag;
	short best_pitch[2]={0,0};
#ifndef AFTER_190327_OPTI
	short offset;
#else
	short offset = 0;
#endif
	short c3 = 22938; //QCONST16(.7f,15);

	int maxcorr;

#ifdef HW_DATASIZE
	short x_lp4[30];	//len>>2
#ifndef OPT_DATASIZE2
	short y_lp4[66];	//lag>>2
#else
	short *y_lp4 = (short *)(&pre_shared[312]);
#endif
	int xcorr[73];	//max_pitch>>1
#endif
	lag = len+max_pitch;

	/* Decimate by a further factor of two: keep the even-indexed samples. */
#ifndef pitch_search_loop_opt_hifi3_ZH
	for (j=0;j<len>>2;j++)
	{
		x_lp4[j] = x_lp[j<<1];
	}
	for (j = 0;j<lag>>2;j++)
		y_lp4[j] = y[j<<1];
#else
	{
		ae_int16x4 *pt0, *pt1;
		ae_int16x4 tmp1, tmp2;
		ae_valign align0;
		pt0 = (ae_int16x4 *)(&x_lp[-1]);
		pt1 = (ae_int16x4 *)x_lp4;
		align0 = AE_LA64_PP(pt0);
		/* Each pass stores FOUR outputs, so it may only run while a whole
		   group of four still fits. The previous bound (j < len>>2) let the
		   last pass write past the end of x_lp4 whenever len>>2 was not a
		   multiple of 4 (e.g. 30) and left the scalar tail loop unreachable. */
		for (j=0;j+4<=(len>>2);j+= 4)
		{
			AE_LA16X4_IP(tmp1,align0, pt0);
			AE_LA16X4_IP(tmp2,align0, pt0);
			tmp1 = AE_SEL16_6420(tmp1, tmp2);
			AE_S16X4_IP(tmp1, pt1, 8);
		}
		/* Leftover outputs that do not fill a group of four. */
		for ( ;j<len>>2;j++)
			x_lp4[j] = x_lp[j<<1];


		pt0 = (ae_int16x4 *)(&y[-1]);
		pt1 = (ae_int16x4 *)y_lp4;
		align0 = AE_LA64_PP(pt0);
		/* Same bound as above, for y_lp4 (lag>>2 entries, e.g. 66). */
		for (j=0;j+4<=(lag>>2);j+= 4)
		{
			AE_LA16X4_IP(tmp1,align0, pt0);
			AE_LA16X4_IP(tmp2,align0, pt0);
			tmp1 = AE_SEL16_6420(tmp1, tmp2);
			AE_S16X4_IP(tmp1, pt1, 8);
		}
		for ( ;j<lag>>2;j++)
			y_lp4[j] = y[j<<1];

	}
#endif

	/* Coarse search with 4x decimation */

	maxcorr = speech_pitch_xcorr_c(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
	find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch, maxcorr);

	/* Finer search with 2x decimation */
	maxcorr=1;
	for (i=0;i<max_pitch>>1;i++)
	{
		int sum=0;
#ifdef HW_HIFI3_VECTOR
		ae_int32x4 sum_32x4;
		ae_int16x4 *x_lp_16x4;
		ae_int16x4 *y_16x4;
		ae_int16x4 y_16x4_v;
		ae_valign align1;
#endif

		/* Lags that are not near a coarse candidate keep a zero correlation. */
		xcorr[i] = 0;
#ifndef HW_HIFI3
		if (ABS(i-2*best_pitch[0])>2 && ABS(i-2*best_pitch[1])>2)
			continue;
#else
		if (ABS(i-(best_pitch[0]<<1))>2 && ABS(i-(best_pitch[1]<<1))>2)
			continue;
#endif


#ifndef HW_HIFI3_VECTOR
		for (j=0;j<len>>1;j++)
			sum += SSC_MULT16x16(x_lp[j],y[i+j]);
#else

		/* Four products per step; x_lp is read through a direct vector
		   dereference, y+i through an aligning load. */
		sum_32x4 = 0;
		x_lp_16x4 = (ae_int16x4 *)(x_lp);
		y_16x4 = (ae_int16x4 *)(&y[i]);
		align1 = AE_LA64_PP(y_16x4);


		for (j=0;j<len>>3;j++)
		{
			AE_LA16X4_IP(y_16x4_v,align1,y_16x4);
			AE_MULA16X4_vector(sum_32x4, *x_lp_16x4, y_16x4_v);
			x_lp_16x4++;
		}

		sum = AE_INT32X4_RADD(sum_32x4);
#endif



		sum = sum>>1;
		/* Stored value is floored at -1; maxcorr tracks the largest sum seen. */
		xcorr[i] = SPEECH_MAX(-1, sum);
		maxcorr = SPEECH_MAX(maxcorr, sum);
	}

	find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch, maxcorr);

	/* Refine by pseudo-interpolation */
	if (best_pitch[0]>0 && best_pitch[0]<(max_pitch>>1)-1)
	{
		int a, b, c;

		/* Correlation just below, at, and just above the best lag. */
		a = xcorr[best_pitch[0]-1];
		b = xcorr[best_pitch[0]];
		c = xcorr[best_pitch[0]+1];
		if ((c-a) > SSC_MULT16x32_Q15(c3,b-a)) offset = 1;
		else if ((a-c) > SSC_MULT16x32_Q15(c3,b-c)) offset = -1;
#ifndef AFTER_190327_OPTI
		else offset = 0;
#endif
	}
#ifndef AFTER_190327_OPTI
	else
	{
		offset = 0;
	}
#endif

	/* Back to the resolution of x_lp/y: twice the lag, minus the refinement. */
	*pitch = (best_pitch[0]<<1)-offset;
}

/* Per-divisor multiplier table indexed by k in no_doubling_get_gain(): the
   secondary lag examined for divisor k is (2*second_check[k]*T0 + k)/(2*k).
   Stored as char only when both HW_DATASIZE and OPT_DATASIZE2 are defined,
   otherwise as short. */
#if defined(HW_DATASIZE) && defined(OPT_DATASIZE2)
static const char second_check[8] = {0, 0, 3, 2, 3, 2, 5, 2};
#else
static const short second_check[8] = {0, 0, 3, 2, 3, 2, 5, 2};
#endif

#ifndef HW_DATASIZE
/* Energy-per-lag table filled and read by no_doubling_get_gain(); when
   HW_DATASIZE is defined that function uses a local array of the same name
   and size instead. */
int yy_lookup[97];
#endif

/*
 * no_doubling_get_gain - test sub-multiples of a pitch estimate and return the
 * pitch gain of the lag finally chosen.
 *
 * x           : signal buffer; after the internal "x += maxperiod>>1" samples
 *               at negative indices down to -(maxperiod>>1) are read
 * maxperiod   : largest lag, halved on entry
 * minperiod   : smallest lag, halved on entry; the un-halved value is the
 *               floor applied to *T0_ on exit
 * N           : window length, halved on entry (only a parameter when
 *               HW_CODESIZE is not defined; otherwise fixed at 60, not halved)
 * T0_         : in/out, pitch estimate; halved on entry, written back as
 *               2*T + offset and clamped to at least minperiod
 * prev_period : previous pitch, halved on entry
 * prev_gain   : previous gain, lowers the acceptance threshold for candidates
 *               within 1 (full) or 2 (half) of prev_period
 *
 * Returns pg = best_xy / best_yy as a Q15 fraction (Q15_ONE when
 * best_yy <= best_xy), limited to the normalised gain g of the chosen lag.
 *
 * Steps: (1) energy table yy_lookup[lag] for lag 0..maxperiod; (2) gain g0 at
 * T0; (3) for k = 2..7 evaluate T1 = round(T0/k) together with a secondary lag
 * T1b and accept T1 when its gain exceeds a threshold derived from g0;
 * (4) +/-1 refinement from the correlation at T-1, T, T+1.
 *
 * yy_lookup has 97 entries, so maxperiod>>1 must not exceed 96.
 */
#ifndef HW_CODESIZE
short no_doubling_get_gain(short *x, short maxperiod, short minperiod,
	short N, short *T0_, short prev_period, short prev_gain)
#else
short no_doubling_get_gain(short *x, short maxperiod, short minperiod,
	short *T0_, short prev_period, short prev_gain)
#endif

{
   short k, i, T, T0;
   short T1, T1b;
   short g, g0;
   short pg;
   int xy,xx,yy,xy2;

   int best_xy, best_yy;
   short offset;
   short minperiod0;

   int xcorr[3];

#ifdef HW_DATASIZE
   int yy_lookup[97];
#endif

   int x2y2,t;
   short sh;
   short qconst07 = 22938;

   /* Right shift applied to every correlation/energy before it is combined. */
   const short fixed_flr = 2;

#ifdef HW_CODESIZE
   short N=60;
#endif

#ifdef HW_HIFI3_VECTOR
	ae_int16x4 *t_vec_1_16x4;
	ae_int16x4 *t_vec_2_16x4;
	ae_int16x4 t_vec_1_16x4_v;
	ae_int16x4 t_vec_2_16x4_v;
	ae_int32x4 result_1_32x4;
	ae_int32x4 result_2_32x4;

	ae_valign align1;
	ae_valign align2;

#endif

   /* Remember the caller's minimum before everything is halved. */
   minperiod0 = minperiod;
   maxperiod >>= 1;
   minperiod >>= 1;
   *T0_ >>= 1;
   prev_period >>= 1;
#ifndef HW_CODESIZE
   N >>= 1;
#endif


   /* Move past the history so that x[-lag] is addressable. */
   x += maxperiod;
   if (*T0_>=maxperiod)
	   *T0_=maxperiod-1;

   T = T0 = *T0_;

   /* xx = <x,x>, xy = <x, x delayed by T0>. */
#ifndef HW_CODESIZE
   dual_inner_prod(x, x, x-T0, N, &xx, &xy);
#else
   dual_inner_prod(x, x, x-T0, &xx, &xy);
#endif

   xy = xy>>fixed_flr;
   /* The running energy below starts from the unshifted xx. */
   yy=xx;

   xx = xx>>fixed_flr;


   yy_lookup[0] = xx;

   /* yy_lookup[i] = energy of the window delayed by i, shifted by fixed_flr. */
#ifdef  remove_doubling_loop_BX
  remove_doubling_loop(x,yy_lookup, maxperiod, yy, fixed_flr, N);
#else
#ifndef HW_HIFI3_VECTORx
   for (i=1;i<=maxperiod;i++)
   {
      yy = yy+SSC_MULT16x16(x[-i],x[-i])-SSC_MULT16x16(x[N-i],x[N-i]);
	  yy_lookup[i] = yy>>fixed_flr;
   }
#else
   /* No vector implementation exists for this switch: defining
      HW_HIFI3_VECTORx leaves yy_lookup[1..] unfilled. */
#endif
#endif

   yy = yy_lookup[T0];
   best_xy = xy;
   best_yy = yy;

   /* Gain at T0: xy * rsqrt(xx*yy), saturated to 16 bits. */
#ifndef HW_CODESIZEx
   x2y2 = 1+HALF(SSC_MULT32x32_Q31(xx<<fixed_flr,yy<<fixed_flr));

#else
   x2y2 = 1+((int)(((long long)(AE_MUL32_HH(xx<<fixed_flr,yy<<fixed_flr))>>32)));
#endif

#ifndef HW_CODESIZE
   sh = speech_ilog2(x2y2)>>1;
#else

#ifndef VC_PROJ
   sh = (30-AE_NSAZ32_L(x2y2))>>1;
#else
   sh = (EC_ILOG(x2y2)-1)>>1;
#endif

#endif
   t = SSC_VSHR(x2y2, (sh-7)<<1);

   t = SSC_VSHR(SSC_MULT16x32_Q15(speech_rsqrt_norm(t), xy<<fixed_flr),sh+1);

#ifndef HW_HIFI3
   g = g0 = SPEECH_MAX(SPEECH_MIN(t,32767),-32768);
#else
   g = g0 = AE_SAT16X4_scalar(t);
#endif


   /* The k == 2 candidate is set up here, outside the loop. */
   T1 = ((T0<<1)+2)>>2;
   k=2;
   /* Pushing k past 7 skips the loop altogether when even T0/2 is too short. */
   if (T1 < minperiod)
		k += 6;

   T1b = T0;
   if (T1+T0<=maxperiod)
	   T1b += T1;


   /* Look for any pitch at T/k */
   while(k<=7)
   {
		short g1;
		short cont=0;
		short thresh;

		if(k!=2)
		{
			T1 = ((T0<<1)+k)/(k<<1);
			if (T1 < minperiod)
				break;
			/* Secondary lag whose correlation is added to that of T1. */
#ifndef HW_HIFI3
			T1b = (2*second_check[k]*T0+k)/(2*k);
#else
#ifndef OPT_DATASIZE2
			T1b = (SSC_MULT16x16((second_check[k]<<1),T0)+k)/(k<<1);
#else
			T1b = (SSC_MULT16x16((((short)second_check[k])<<1),T0)+k)/(k<<1);
#endif

#endif
		}


#ifndef HW_CODESIZE
		dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
#else
		dual_inner_prod(x, &x[-T1], &x[-T1b], &xy, &xy2);
#endif

		xy = xy>>fixed_flr;
		xy2 = xy2>>fixed_flr;

		xy += xy2;
		yy = yy_lookup[T1] + yy_lookup[T1b];


		x2y2 = 1+SSC_MULT32x32_Q31(xx<<fixed_flr,yy<<fixed_flr);

#ifndef HW_CODESIZE
		sh = speech_ilog2(x2y2)>>1;
#else
#ifndef VC_PROJ
		sh = (30-AE_NSAZ32_L(x2y2))>>1;
#else
		/* Must be taken from x2y2, as before the loop; this line used to
		   pass the sample pointer x to EC_ILOG. */
		sh = (EC_ILOG(x2y2)-1)>>1;
#endif
#endif

#ifndef HW_HIFI3
		t = SSC_VSHR(x2y2, 2*(sh-7));
#else
		t = SSC_VSHR(x2y2, ((sh-7)<<1));
#endif

		t = SSC_VSHR(SSC_MULT16x32_Q15(speech_rsqrt_norm(t), xy<<fixed_flr),sh+1);
#ifndef HW_HIFI3
		g1 = SPEECH_MAX(SPEECH_MIN(t,32767),-32768);
#else
		g1 = AE_SAT16X4_scalar(t);
#endif


		/* Continuity bonus: candidates close to the previous pitch get a
		   lower threshold. */
		if (ABS(T1-prev_period)<=1)
			cont = prev_gain;
#ifndef HW_HIFI3
		else if (ABS(T1-prev_period)<=2 && 5*k*k < T0)
#else
		else if (ABS(T1-prev_period)<=2 && SSC_MULT16x16(SSC_MULT16x16(5,k),k) < T0)
#endif
			cont = HALF(prev_gain);
		else
			cont = 0;

		thresh = SPEECH_MAX(9830, SSC_MULT16x16_Q15(qconst07,g0)-cont);
		/* Bias against very high pitch (very short period) to avoid false-positives
			due to short-term correlation */
		/* NOTE(review): for positive minperiod, T1 < 2*minperiod implies
		   T1 < 3*minperiod, so the second branch below can never be taken.
		   Left as is because reordering would change which lags are accepted. */

#ifndef HW_HIFI3
		if (T1<3*minperiod)
			thresh = SPEECH_MAX(13107, SSC_MULT16x16_Q15(27853,g0)-cont);
		else if (T1<2*minperiod)
			thresh = SPEECH_MAX(16384, SSC_MULT16x16_Q15(29491,g0)-cont);
#else
		if (T1<SSC_MULT16x16(3,minperiod))
			thresh = SPEECH_MAX(13107, SSC_MULT16x16_Q15(27853,g0)-cont);
		else if (T1<(minperiod<<1))
			thresh = SPEECH_MAX(16384, SSC_MULT16x16_Q15(29491,g0)-cont);

#endif
		if (g1 > thresh)
		{
			best_xy = xy;
			best_yy = yy;
			T = T1;
			g = g1;
		}

		k++;
	}



	best_xy = SPEECH_MAX(0, best_xy);
	/* best_xy >= 0 here, so the division below is never reached with
	   best_yy == 0. */
	if (best_yy <= best_xy)
		pg = Q15_ONE;
	else
	{

#ifndef HW_FRAC_DIV32_AIROHA
		pg = SHR(frac_div32(best_xy,best_yy,fixed_flr),16);
#else
		pg = frac_div32(best_xy,best_yy);
#endif

	}

	/* Correlation at T-1, T and T+1 for the final +/-1 refinement. */
#ifndef HW_HIFI3_VECTOR
	for (k=0;k<3;k++)
	{
		short T1 = T+k-1;
		xy = 0;
		for (i=0;i<N;i++)
			xy += SSC_MULT16x16(x[i], x[i-T1]);

		xcorr[k] = (xy>>fixed_flr);

	}
#else

#ifndef remove_doubling_opt_hifi3_ZH
	/* One vector dot product per lag. */
	for (k=0;k<3;k++)
	{
		short T1 = T+k-1;
		result_2_32x4 = 0;
		t_vec_1_16x4 = (ae_int16x4 *)(x);
		t_vec_2_16x4 = (ae_int16x4 *)(&x[-T1]);

		align1 = AE_LA64_PP(t_vec_1_16x4);
		align2 = AE_LA64_PP(t_vec_2_16x4);



		for (i=0;i<N;i=i+4)
		{
			AE_LA16X4_IP(t_vec_1_16x4_v,align1,t_vec_1_16x4);
			AE_LA16X4_IP(t_vec_2_16x4_v,align2,t_vec_2_16x4);
			AE_MULA16X4_vector(result_2_32x4, t_vec_1_16x4_v, t_vec_2_16x4_v);
		}
		xy = AE_INT32X4_RADD(result_2_32x4);


		xcorr[k] = (xy>>fixed_flr);

	}
#else
	/* All three lags in a single pass over x. The local names xx, align1 and
	   align2 declared here shadow the outer ones for this scope only. */
	{
		ae_int16x4 *ptx, *pt1, *pt2, *pt3, xx, x1, x2, x3;
		ae_valign alignx,align1, align2, align3;
		ae_int32x4 xcorr1,xcorr2, xcorr3;
		int tmpL;
        ptx = (ae_int16x4 *)(x);
        pt1 = (ae_int16x4 *)(&x[-T +1]);
        pt2 = (ae_int16x4 *)(&x[-T]);
        pt3 = (ae_int16x4 *)(&x[-T- 1]);

		alignx = AE_LA64_PP(ptx);
		align1 = AE_LA64_PP(pt1);
		align2 = AE_LA64_PP(pt2);
		align3 = AE_LA64_PP(pt3);
		xcorr1 = 0;
		xcorr2 = 0;
		xcorr3 = 0;
		for (i=0;i<N;i=i+4)
		{
			AE_LA16X4_IP(xx,alignx,ptx);
			AE_LA16X4_IP(x1,align1,pt1);
			AE_LA16X4_IP(x2,align2,pt2);
			AE_LA16X4_IP(x3,align3,pt3);
			AE_MULA16X4_vector(xcorr1, xx, x1);
			AE_MULA16X4_vector(xcorr2, xx, x2);
			AE_MULA16X4_vector(xcorr3, xx, x3);
		}
		tmpL = AE_INT32X4_RADD(xcorr1);
		xcorr[0] = (tmpL>>fixed_flr);
		tmpL = AE_INT32X4_RADD(xcorr2);
		xcorr[1] = (tmpL>>fixed_flr);
		tmpL = AE_INT32X4_RADD(xcorr3);
		xcorr[2] = (tmpL>>fixed_flr);

	}

#endif

#endif

	/* Lean towards the neighbour whose correlation is clearly the larger. */
	if ((xcorr[2]-xcorr[0]) > SSC_MULT16x32_Q15(qconst07,xcorr[1]-xcorr[0]))
		offset = 1;
	else if ((xcorr[0]-xcorr[2]) > SSC_MULT16x32_Q15(qconst07,xcorr[1]-xcorr[2]))
		offset = -1;
	else
		offset = 0; 

	/* The returned gain never exceeds the normalised gain of the chosen lag. */
	if (pg > g) pg = g;
#ifndef HW_HIFI3
	*T0_ = 2*T+offset;
#else
	*T0_ = (T<<1)+offset;
#endif
	if (*T0_<minperiod0) *T0_=minperiod0;

	return pg;
}
#endif
