
#include "vector_quant.h"
#include "basic_op.h"
#include "math_op.h"
#include "speech_cwrs.h"
#include "config.h"



/*
 * exp_rotation1 - apply one in-place rotation sweep to X with a given stride.
 *
 * Each step takes the pair (x1, x2) = (X[i], X[i+stride]) and replaces it with
 *     X[i]        = (c*x1 - s*x2) >> 15
 *     X[i+stride] = (c*x2 + s*x1) >> 15
 * A forward sweep visits i = 0 .. len-stride-1, then a backward sweep visits
 * i = len-2*stride-1 .. 0.
 *
 * X      : vector rotated in place; indices 0 .. len-1 are accessed
 * len    : number of elements in X
 * stride : distance between the two elements of a pair
 * c, s   : rotation coefficients; every product is scaled back by 15 bits
 *
 * Both operands are always loaded into locals before either store.  The
 * former OPT_KHW_20191106x variant wrote "*Xptr++ = f(*Xptr)" (and the same
 * with "--"), which reads and modifies Xptr without sequencing and is
 * undefined behaviour; with the operands loaded first that variant is
 * identical to the default one, so a single body is kept.
 *
 * The HW_ALG_QUANT countdown loops are guarded so that a non-positive trip
 * count is skipped, exactly as the counted for-loops skip it.
 */
#ifndef HW_CODESIZE
static void exp_rotation1(short *X, short len, short stride, short c, short s)
#else
void exp_rotation1(short *X, short len, short stride, short c, short s)
#endif
{
	short i;
	short *Xptr;
	Xptr = X;

	/* Forward sweep. */
#ifndef HW_ALG_QUANT
	for (i=0;i<len-stride;i++)
	{
		short x1, x2;
		x1 = Xptr[0];
		x2 = Xptr[stride];
		Xptr[stride] = EXTRACT16(SHR(SSC_MULT16x16(c,x2) + SSC_MULT16x16(s,x1), 15));
		*Xptr++      = EXTRACT16(SHR(SSC_MULT16x16(c,x1) - SSC_MULT16x16(s,x2), 15));
	}
#else
	i = len-stride;
	if (i > 0)
	{
		do
		{
			short x1, x2;
			x1 = Xptr[0];
			x2 = Xptr[stride];
			Xptr[stride] = EXTRACT16(SHR(SSC_MULT16x16(c,x2) + SSC_MULT16x16(s,x1), 15));
			*Xptr++      = EXTRACT16(SHR(SSC_MULT16x16(c,x1) - SSC_MULT16x16(s,x2), 15));
		} while (--i);
	}
#endif

	/* Backward sweep starts at the last index whose partner was not the
	   final pair of the forward sweep. */
#ifndef HW_CODESIZE
	Xptr = &X[len-2*stride-1];
#else
	Xptr = &X[len-(stride<<1)-1];
#endif

#ifndef HW_ALG_QUANT
	for (i=len-2*stride-1;i>=0;i--)
	{
		short x1, x2;
		x1 = Xptr[0];
		x2 = Xptr[stride];
		Xptr[stride] = EXTRACT16(SHR(SSC_MULT16x16(c,x2) + SSC_MULT16x16(s,x1), 15));
		*Xptr--      = EXTRACT16(SHR(SSC_MULT16x16(c,x1) - SSC_MULT16x16(s,x2), 15));
	}
#else
	i = len-(stride<<1);
	if (i > 0)
	{
		do
		{
			short x1, x2;
			x1 = Xptr[0];
			x2 = Xptr[stride];
			Xptr[stride] = EXTRACT16(SHR(SSC_MULT16x16(c,x2) + SSC_MULT16x16(s,x1), 15));
			*Xptr--      = EXTRACT16(SHR(SSC_MULT16x16(c,x1) - SSC_MULT16x16(s,x2), 15));
		} while (--i);
	}
#endif
}






#ifndef OPT_KHW_20191106
/*
 * exp_rotation - derive a rotation angle from (len, K) and run the rotation
 * sweeps over X in place.
 *
 * X   : vector to rotate, len elements
 * len : vector length
 * K   : pulse count; when 2*K >= len the vector is left untouched
 * dir : non-zero runs the stride-1 sweep first and then the wide-stride
 *       sweep; zero runs them in the opposite order with the sign of the
 *       second coefficient flipped
 *
 * The wide-stride sweep is only run for len > 7; its stride is 3, 4 or 5
 * depending on whether len exceeds 15 / 23.
 */
#ifndef HW_CODESIZE
static void exp_rotation(short *X, short len, short K, short dir)
#else
void exp_rotation(short *X, short len, short K, short dir)
#endif
{
   short cos_t, sin_t, gain, theta;
   short wide_stride;

   if (len <= (K<<1))
      return;

   /* gain = len/(len+10*K) scaled by 2^15, theta = gain*gain/2 */
#ifdef OPT_KHW
#ifdef HW_HIFI3
   gain = ((int)len<<15)/(len+SSC_MULT16x16(10,K));
#else
   gain = ((int)len<<15)/(len+10*K);
#endif
   theta = SHR(SSC_MULT16x16(gain,gain),16);
#else
   gain = speech_div((int)len<<15 ,(int)(len+10*K));
   theta = HALF(SSC_MULT16x16_Q15(gain,gain));
#endif

   cos_t = speech_cos_norm(theta);
   sin_t = speech_cos_norm(SSC_SUB(Q15_ONE,theta)); /* sin(theta) */

   /* Only consulted when len > 7. */
   if (len>23)
      wide_stride = 5;
   else if (len>15)
      wide_stride = 4;
   else
      wide_stride = 3;

   if (dir)
   {
      exp_rotation1(X, len, 1, cos_t, -sin_t);
      if (len>7)
         exp_rotation1(X, len, wide_stride, sin_t, -cos_t);
   }
   else
   {
      if (len>7)
         exp_rotation1(X, len, wide_stride, sin_t, cos_t);
      exp_rotation1(X, len, 1, cos_t, sin_t);
   }
}
#endif



#ifndef alg_quant_opt_hifi3_WQ


/*
 * alg_quant - pick a K-pulse integer vector for X and pass it to
 * encode_pulses().
 *
 * X        : input vector, N elements; modified in place (rotated when the
 *            rotation is enabled, and overwritten with a single pulse in the
 *            "too small" case below)
 * N        : vector length; the local work arrays hold 24 entries, so N must
 *            not exceed 24
 * K        : number of pulses to place
 * enc      : entropy-coder state handed to encode_pulses()
 * exp_flag : (HW_EXPROTATION_LAST builds only) non-zero enables the rotation
 *
 * Steps:
 *   1. optional rotation of X;
 *   2. strip the signs of X into signx[], clear iy[] and y[];
 *   3. when K > N/2, a pre-search that scales X by a reciprocal of its sum
 *      and truncates to get a first set of pulses;
 *   4. if far too many pulses remain, put them all in position 0;
 *   5. a greedy loop that adds the remaining pulses one at a time;
 *   6. restore the signs and encode iy[].
 *
 * y[] holds twice the pulse count of each position, iy[] the count itself.
 */
#ifndef HW_EXPROTATION_LAST
void alg_quant(short *X, short N, short K, ec_enc *enc)
#else
void alg_quant(short *X, short N, short K, ec_enc *enc, short exp_flag)
#endif
{
	short s;

#ifndef HW_AIROHA_ALG_QUANT
	int sum;
#else
	short sum;
#endif

	int xy;
//	short yy;
	int yy;
	short i, j;
	short pulsesLeft;
	short y[24];
	short iy[24];
	short signx[24];


#ifdef HW_ALG_QUANT
	/* Walking pointers used instead of indexing in the HW_ALG_QUANT loops. */
	short *pX = X;
	short *py = y;
	short *piy = iy;
	short *psignx = signx;
#endif

#ifdef HW_HIFI3_VECTOR
	/* Vector views of iy, X and y plus accumulators for the vector loops.
	   NOTE(review): the casts presumably require these arrays to be aligned
	   for the vector type - confirm against the target ABI. */
	ae_int16x4 *p_tmp1_16x4 = (ae_int16x4 *)(iy);
	ae_int16x4 *p_tmp2_16x4 = (ae_int16x4 *)(X);
	ae_int16x4 *p_tmp3_16x4 = (ae_int16x4 *)(y);
	ae_int16x4 tmp1_16x4 = 0;
	ae_int16x4 tmp2_16x4 = 0;

	ae_int32x4 tmp1_32x4 = 0;
	ae_int32x4 tmp2_32x4 = 0;
	ae_int32x4 tmp3_32x4;
#endif


	/* Step 1: rotation. With OPT_KHW_20191106 the body of exp_rotation()
	   for dir=1 is written out inline instead of being called. */
#ifndef HW_EXPROTATION_LAST 
	exp_rotation(X, N, K, 1);
#else
	if(exp_flag)
	{

#ifndef OPT_KHW_20191106
		exp_rotation(X, N, K,1);
#else
	short c, side, gain, theta;

	if ((K<<1)<N)
	{
		gain = ((int)N<<15)/(N+10*K);
		theta = SHR(SSC_MULT16x16(gain,gain),16);

		c = speech_cos_norm(theta);
		side = speech_cos_norm(SSC_SUB(Q15_ONE,theta)); /*  sin(theta) */

		exp_rotation1(X, N, 1, c, -side);

		if (N>7)
		{
			exp_rotation1(X, N, (N>>3)+2, side, -c);
		}
	}
#endif


	}
#endif


	/* Get rid of the sign */
	sum = 0;

	/* Step 2: signx[j] is +1 for strictly positive X[j] and -1 otherwise
	   (zero counts as negative); X[j] is made non-negative. */
#ifndef HW_ALG_QUANT
	j=0;
	do{
		if (X[j]>0)
			signx[j]=1;
		else {
			signx[j]=-1;
			X[j]=-X[j];
		}
		iy[j] = 0;
		y[j] = 0;
	}while(++j<N);
#else
	j = N;


	/* Same loop with a down-counter; it leaves pX, psignx, piy and py one
	   past element N-1, which the sign-restore loop at the end relies on. */
	do{


		if (*pX>0)
			*psignx++=1;
		else {
			*psignx++=-1;
			*pX=(-(*pX));
		}

//////////////////////////////////////////////////
//		*psignx++=(*pX>0);
//		*pX = ABS(*pX);
///////////////////////////////////////////////////
		*piy++ = 0;
		*py++ = 0;

		pX++;
	}while(--j);



#endif

#ifndef HW_HIFI3_VECTORx
	xy = yy = 0;
#endif

	pulsesLeft = K;
	/* Do a pre-search by projecting on the pyramid */
	if (K > (N>>1))
	{
		short rcp;

		j=0; 
#ifndef HW_HIFI3_VECTOR
		/* sum of |X|; the HW_AIROHA_ALG_QUANT build shifts every term right
		   by N first so that the total fits the short accumulator
		   (presumably - confirm the intended range of N). */
		do{

#ifndef HW_AIROHA_ALG_QUANT
			sum += X[j];
#else
			sum += (X[j]>>N);
#endif
		}while(++j<N);
#else
		do{
			tmp1_16x4 = AE_INT16X4_SRA(*p_tmp2_16x4,N);
			tmp2_16x4 = AE_ADD16S_vector(tmp2_16x4,tmp1_16x4); // shift each lane right by N, then accumulate

			p_tmp2_16x4++;

			j = j+4;
		}while(j<N);

		sum = AE_INT16X4_RADD(tmp2_16x4);

		/* rewind the X view for the projection loop below */
		p_tmp2_16x4 = (ae_int16x4 *)(X);
#endif



/////////////////////////////////////
//		test = sum>>N;
/////////////////////////////////////
		/* If X is too small, just replace it with a pulse at 0 */
		if (sum <= K)
		{
			X[0] = 16384;
			j=1;
			do
				X[j]=0;
			while(++j<N);
#ifndef HW_AIROHA_ALG_QUANT
			sum = 16384;
#else
			/* NOTE(review): this is 0 for N >= 15 and is used as a divisor
			   below; the same build also evaluates 1<<(15-N). Confirm that
			   N stays below 15 in this configuration. */
			sum = (16384>>N);
#endif
		}






		/* rcp is (K-1)/sum in a fixed-point scale matching the Q15 multiply
		   in the projection loop. */
#ifndef HW_AIROHA_ALG_QUANT
		rcp = EXTRACT16(SSC_MULT16x32_Q16(K-1, speech_rcp(sum)));
#else

#ifndef HW_HIFI3
		rcp = (K-1)*(1<<(15-N))/sum;
#else
		rcp = SSC_MULT16x16((K-1),(1<<(15-N)))/sum;
#endif
#endif

#ifndef HW_HIFI3_VECTOR
		/* First guess iy[j] = X[j]*rcp; accumulate yy = sum(iy^2) and
		   xy = sum(X*iy), store 2*iy in y[], and count the pulses used. */
		j=0; 
		do{
			/* It's really important to round *towards zero* here */
			iy[j] = SSC_MULT16x16_Q15(X[j],rcp);
			y[j] = (short)iy[j];
			yy += SSC_MULT16x16(y[j],y[j]);
			xy += SSC_MULT16x16(X[j],y[j]);
	//		y[j] *= 2;
			y[j] = y[j]<<1;
			pulsesLeft -= iy[j];
		}while(++j<N);
#else

		/* Vector form of the projection, four elements per iteration
		   (presumably equivalent to the scalar loop in the other arm). */
		tmp1_16x4 = rcp;

		j=0;
		do
		{
			tmp3_32x4 = AE_MUL16X4_vector(*p_tmp2_16x4,tmp1_16x4);
			*p_tmp1_16x4 =	(ae_int16x4)(AE_INT32X4_SRAI32(tmp3_32x4, 15));
			*p_tmp3_16x4 = *p_tmp1_16x4;

			AE_MULA16X4_vector(tmp1_32x4, *p_tmp1_16x4, *p_tmp1_16x4);
			AE_MULA16X4_vector(tmp2_32x4, *p_tmp2_16x4, *p_tmp1_16x4);


			*p_tmp3_16x4 = (ae_int16x4)(AE_INT32X4_SLAI32((ae_int32x4)(*p_tmp3_16x4), 1));
		//	AE_INT16X4_SLAI(*p_tmp3_16x4,1);

			pulsesLeft = pulsesLeft-AE_INT16X4_RADD(*p_tmp1_16x4);

			j = j+4;
			p_tmp1_16x4++;
			p_tmp2_16x4++;
			p_tmp3_16x4++;

		}
		while(j<N);

		yy = AE_INT32X4_RADD(tmp1_32x4);
		xy = AE_INT32X4_RADD(tmp2_32x4);
#endif


	}

	/* Step 4: too many pulses left for the greedy loop - add them all to
	   position 0. y[0] already holds 2*iy[0], so tmp*y[0] is the cross term
	   of (iy[0]+tmp)^2. */
	if (pulsesLeft > N+3)
	{
		short tmp = (short)pulsesLeft;
		yy += SSC_MULT16x16(tmp, tmp);
		yy += SSC_MULT16x16(tmp, y[0]);
		iy[0] += pulsesLeft;
		pulsesLeft=0;
	}
	/* s is the pulse step added per iteration; it is always 1 here. */
	s = 1;
	/* Step 5: greedy search. For each remaining pulse, try every position
	   and keep the one with the largest Rxy/Ryy, compared by
	   cross-multiplication so that no division is needed. */
	for (i=0;i<pulsesLeft;i++)
	{
		short best_id;
		short best_num = -VERY_LARGE16;   
		short rshift;
		short best_den = 0;


#ifdef HW_ALG_QUANT
		pX = X;
		py = y;
#endif



		/* Shift that keeps xy+X[j] inside 16 bits; it grows with the number
		   of pulses placed so far. */
		rshift = EC_ILOG(K-pulsesLeft+i+1);


		best_id = 0;
		/* The +1 of (y+1)^2 = y^2 + 2y + 1 is the same for every candidate,
		   so it is added once up front. */
		yy++;

#ifndef HW_ALG_QUANT
		j=0;
#else
		j = N;
#endif

		do{
			short Rxy, Ryy;   // original author's debug note: 1343, ctx.i==8, when i == 2.
#ifndef WWQ_OPT_ALG_QUANT_NOT_BIT_EXACT

#ifndef HW_ALG_QUANT
			Rxy = EXTRACT16(SHR(SSC_ADD(xy, EXTEND32(X[j])),rshift));
			Ryy = SSC_ADD(yy, y[j]);
#else
			Rxy = EXTRACT16(SHR(SSC_ADD(xy, EXTEND32(*pX++)),rshift));
			Ryy = SSC_ADD(yy, *py++);
#endif

			Rxy = SSC_MULT16x16_Q15(Rxy,Rxy);


#else
			/* NOTE(review): this arm indexes X[j]/y[j] and never advances
			   pX/py. With HW_ALG_QUANT also defined, j counts N..1, so X[N]
			   is read and X[0] is skipped, and pX stays at X, which the
			   sign-restore loop below does not expect. Confirm the two
			   macros are never enabled together. */
			int Ltmp;
			//Rxy = EXTRACT16(SHR(SSC_ADD(xy, EXTEND32(X[j])),rshift));
			Ltmp = SSC_VSHR(SSC_ADD(xy, EXTEND32(X[j])),rshift);
			Ryy = SSC_ADD(yy, y[j]);


#ifndef WWQ_CORRECTION
			Rxy = (short)(((((long long)2*Ltmp*Ltmp)>>23)+1)>>1);
#else
			Rxy = (short)(((long long)2*Ltmp*Ltmp + 0x7ffffe)>>24);
#endif


#endif
			/* best_den*Rxy > Ryy*best_num  <=>  Rxy/Ryy > best_num/best_den */
			if (SSC_MULT16x16(best_den, Rxy) > SSC_MULT16x16(Ryy, best_num))
			{
				best_den = Ryy;
				best_num = Rxy;
#ifndef HW_ALG_QUANT
				best_id = j;
#else
				/* j counts down from N, so N-j is the element index */
				best_id = N-j;
#endif
			}
		}

#ifndef HW_ALG_QUANT
		while (++j<N);
#else
		while (--j);
#endif
		/* Commit the winning position. */
		xy = SSC_ADD(xy, EXTEND32(X[best_id]));
		yy = SSC_ADD(yy, y[best_id]);
		y[best_id] += (s<<1);
		iy[best_id]++;
	}

	/* Put the original sign back */

#ifndef HW_ALG_QUANT
	j=0;
	do{
		X[j] = SSC_MULT16x16(signx[j],X[j]);
		if (signx[j] < 0)
			iy[j] = -iy[j];
	}while(++j<N);
#else


	/* Walks backwards from one past the end: pX, psignx and piy are expected
	   to point at element N on entry (left there by the loops above). */
	j=N;
	do{
		pX--;
		psignx--;
		piy--;


		*pX = SSC_MULT16x16(*psignx,*pX);
		*piy = SSC_MULT16x16(*psignx,*piy);


	}while(--j);

#endif

	encode_pulses(iy, N, K, enc);
//	ec_enc_uint_1(enc,icwrs(N,iy),code_pulses(N,K));


}


#else
/*
 * alg_quant (alg_quant_opt_hifi3_WQ build) - pick a K-pulse integer vector
 * for X and pass it to encode_pulses(), using vector intrinsics.
 *
 * X        : input vector, N elements; modified in place
 * N        : vector length; the work arrays hold 24 entries
 * K        : number of pulses to place
 * enc      : entropy-coder state handed to encode_pulses()
 * exp_flag : non-zero enables the rotation of X before the search
 *
 * Each "#if 0" block keeps a scalar loop and the "#else" arm holds the
 * intrinsic code that is compiled instead; the two are presumably
 * equivalent (the intrinsics' exact semantics are not visible in this file).
 * The vector loops advance four elements at a time until j >= N, so they
 * touch indices up to the next multiple of four.
 *
 * Unlike the scalar build, y[] here holds the pulse counts themselves (not
 * twice the count), which is why explicit factors of 2 appear below and why
 * y[] is what gets encoded.
 */
void alg_quant(short *X, short N, short K, ec_enc *enc, short exp_flag)
{
	short sum;
	short yy;
	int xy;
	short i, j;
	short pulsesLeft;
	short y[24], signx[24];
	short *pX, *py, *psignx;

	ae_f16x4 *ae_pX, *ae_psign, *ae_py;
	ae_f16x4 ae_tmp, ae_tmp1, ae_tmp2, ae_tmp3, ae_tmp4;
	ae_f32x2 ae_Ltmp, ae_Ltmp1, ae_Ltmp2, ae_Ltmp3, ae_Ltmp4;
	xtbool2 ae_bool2;

	/* Rotation; with OPT_KHW_20191106 the dir=1 case of exp_rotation() is
	   written out inline instead of being called. */
	if(exp_flag)
	{
#ifndef OPT_KHW_20191106
		exp_rotation(X, N, K,1);
#else
		short c, side, gain, theta;

		if ((K<<1)<N)
		{
			gain = ((int)N<<15)/(N+10*K);
			theta = SHR(SSC_MULT16x16(gain,gain),16);

			c = speech_cos_norm(theta);
			side = speech_cos_norm(SSC_SUB(Q15_ONE,theta)); /*  sin(theta) */

			exp_rotation1(X, N, 1, c, -side);

			if (N>7)
			{
				exp_rotation1(X, N, (N>>3)+2, side, -c);
			}
		}
#endif
	}

	/* Get rid of the sign */
#if 0
	pX = X;
	py = y;
	psignx = signx;
	j = N;
	do{
		if (*pX>0)
			*psignx++=1;
		else {
			*psignx++=-1;
			*pX=(-(*pX));
		}
		*py++ = 0;
		pX++;
	}while(--j);
#else
	{
			/* Per group of four: sign lanes start at +1 and are replaced by
			   -1 where data <= 0; |data| is written back to X, the signs go
			   to signx[] and zeros to y[]. */
			ae_int16x4 zeroV, data, tmp, signV, neg1V;
			ae_int16x4 *pt0, *pt1, *pt2, *pt3, *pt4;
			ae_valign align0, align1;
			xtbool4 bool4;
			pt0 = (ae_int16x4*)X;
			pt1 = (ae_int16x4*)signx;
			pt3 = (ae_int16x4*)y;
			pt4 = (ae_int16x4*)X;
		    j=0;
		    zeroV = AE_MOV16(0);
		    neg1V = AE_MOV16(-1);
			align0 = AE_LA64_PP(pt0);
			align1 = AE_ZALIGN64();
		    do{
		        signV = AE_MOV16(1);
		    	AE_LA16X4_IP(data, align0, pt0);
		    	bool4 = AE_LE16(data, zeroV);
		    	AE_MOVT16X4(signV, neg1V, bool4);
		    	tmp = AE_ABS16S(data);
		    	AE_SA16X4_IP(tmp, align1, pt4);
		    	AE_S16X4_IP(signV,pt1, 8);
		    	AE_S16X4_IP(zeroV,pt3, 8);
			    j += 4;
		    }while(j<N);
		    /* NOTE(review): the aligning store stream through align1 is never
		       flushed after the loop. That is presumably harmless only when X
		       is aligned for the vector type (the plain vector loads of X
		       further down suggest it is) - confirm. */
		}
#endif




	xy = 0;
	yy = 0;
	pulsesLeft = K;

	/* Do a pre-search by projecting on the pyramid */
	if (K > (N>>1))
	{
		short rcp;

#if 0
		sum = 0;
		j=0;
		do{
			sum += (X[j]>>N);
		}while(++j<N);
#else
		/* sum of (X[j] >> N), accumulated per lane and then reduced */
		ae_pX = (ae_f16x4 *)&X[0];
		ae_tmp3 = AE_ZERO16();
		for(j=0; j<N; j+=4)
		{
			AE_L16X4_IP(ae_tmp1, ae_pX, 8);
			ae_tmp1 = AE_SRAA16S(ae_tmp1, N);
			ae_tmp3 = AE_ADD16S(ae_tmp3, ae_tmp1);
		}
		sum = AE_INT16X4_RADD(ae_tmp3);
#endif

		/* If X is too small, just replace it with a pulse at 0 */
		if (sum <= K)
		{
			X[0] = 16384;
			j=1;
			do{ X[j]=0; } while(++j<N);
			/* NOTE(review): 0 for N >= 15, and used as a divisor on the next
			   statement - confirm the range of N for this build. */
			sum = (16384>>N);
		}

		rcp = SSC_MULT16x16((K-1),(1<<(15-N)))/sum;


#if 0
		for(j=0; j<N; j++)
		{
			/* It's really important to round *towards zero* here */
			y[j] = (short)((MULT16_16(X[j],rcp) + 0)>>15);
			yy += MULT16_16(y[j],y[j]);
			xy += MULT16_16(X[j],y[j]);
			pulsesLeft -= y[j];
		}
#else
		/* First guess y = X*rcp, with xy = sum(X*y) and yy = sum(y*y)
		   accumulated in two 32-bit register pairs each. */
		ae_pX = (ae_f16x4 *)&X[0];
		ae_py = (ae_f16x4 *)&y[0];
		ae_Ltmp1 = AE_ZERO32();
		ae_Ltmp2 = AE_ZERO32();
		ae_Ltmp3 = AE_ZERO32();
		ae_Ltmp4 = AE_ZERO32();


	//	ae_Ltmp = AE_MOVDA32(16384);
		/* rounding offset added to each product; zero, i.e. no rounding */
		ae_Ltmp = AE_MOVDA32(0);

		ae_tmp3 = AE_ZERO16();
		ae_tmp4 = AE_MOVDA16(rcp);
		for(j=0; j<N; j+=4)
		{
			ae_f32x2 a, b;
			AE_L16X4_IP(ae_tmp1, ae_pX, 8);
			AE_MULF16X4SS(a, b, ae_tmp1, ae_tmp4);
			a = AE_ADD32S(a, ae_Ltmp);
			b = AE_ADD32S(b, ae_Ltmp);
			ae_tmp2 = AE_TRUNC16X4F32(a, b);
			AE_MULA16X4(ae_Ltmp1, ae_Ltmp2, ae_tmp1, ae_tmp2);
			AE_MULA16X4(ae_Ltmp3, ae_Ltmp4, ae_tmp2, ae_tmp2);
			pulsesLeft = pulsesLeft-AE_INT16X4_RADD(ae_tmp2);
			AE_S16X4_IP(ae_tmp2, ae_py, 8);
		}
		ae_Ltmp1 = AE_ADD32S(ae_Ltmp1, ae_Ltmp2);
		ae_Ltmp3 = AE_ADD32S(ae_Ltmp3, ae_Ltmp4);
		xy = AE_INT32X2_RADD(ae_Ltmp1);
		yy = AE_INT32X2_RADD(ae_Ltmp3);
#endif


	}

	/* Too many pulses left for the greedy loop: add them all to position 0.
	   tmp*tmp + tmp*2*y[0] is the change in y[0]^2. */
	if (pulsesLeft > N+3)
	{
		short tmp = (short)pulsesLeft;
		yy += SSC_MULT16x16(tmp, tmp);
		yy += SSC_MULT16x16(tmp, 2*y[0]);
		y[0] += pulsesLeft;
		pulsesLeft=0;
	}

	/* Greedy search: one pulse per iteration, placed where Rxy/Ryy is
	   largest (compared by cross-multiplication). */
	for (i=0;i<pulsesLeft;i++)
	{
		short best_id;
		short best_num = -VERY_LARGE16;
		short rshift;
		short best_den = 0;
		pX = X;
		py = y;
		rshift = EC_ILOG(K-pulsesLeft+i+1);
		best_id = 0;
		/* constant +1 of (y+1)^2, common to every candidate */
		yy++;


#if 0
		j = N;
		do{
			short Rxy, Ryy;   // original author's debug note: 1343, ctx.i==8, when i == 2.
			Rxy = EXTRACT16(SHR(ADD(xy, EXTEND32(*pX++)),rshift));
			Ryy = ADD(yy, 2*(*py++));
			Rxy = MULT16_16_Q15(Rxy,Rxy);

			if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num))
			{
				best_den = Ryy;
				best_num = Rxy;
				best_id = N-j;
			}
		}while (--j);
#else
		/* Rxy and Ryy are computed for four candidates at once, then the
		   four lanes are compared one by one in index order: lane 3 is
		   treated as element j and lane 0 as element j+3. */
		ae_pX = (ae_f16x4 *)&X[0];
		ae_py = (ae_f16x4 *)&y[0];
		ae_Ltmp4 = AE_MOVDA32(xy);
		ae_tmp4 = AE_MOVDA16(yy);
		for(j=0; j<N; j+=4)
		{
			short Rxy0, Ryy0, Rxy1, Ryy1;
			AE_L16X4_IP(ae_tmp1, ae_pX, 8);
			AE_L16X4_IP(ae_tmp2, ae_py, 8);
			ae_Ltmp1 = AE_SEXT32X2D16_32(ae_tmp1);
			ae_Ltmp2 = AE_SEXT32X2D16_10(ae_tmp1);
			ae_Ltmp1 = AE_SRAA32(AE_ADD32S(ae_Ltmp1, ae_Ltmp4), rshift);
			ae_Ltmp2 = AE_SRAA32(AE_ADD32S(ae_Ltmp2, ae_Ltmp4), rshift);
			ae_tmp1 = AE_SAT16X4(ae_Ltmp1, ae_Ltmp2);
			ae_tmp2 = AE_ADD16S(AE_SLAI16S(ae_tmp2, 1), ae_tmp4);
			ae_tmp1 = AE_MULFP16X4S(ae_tmp1, ae_tmp1);
			Rxy0 = AE_MOVAD16_3(ae_tmp1);
			Ryy0 = AE_MOVAD16_3(ae_tmp2);
			Rxy1 = AE_MOVAD16_2(ae_tmp1);
			Ryy1 = AE_MOVAD16_2(ae_tmp2);
			if (SSC_MULT16x16(best_den, Rxy0) > SSC_MULT16x16(Ryy0, best_num))
			{
				best_den = Ryy0;
				best_num = Rxy0;
				best_id = j;
			}
			if (SSC_MULT16x16(best_den, Rxy1) > SSC_MULT16x16(Ryy1, best_num))
			{
				best_den = Ryy1;
				best_num = Rxy1;
				best_id = j+1;
			}
			Rxy0 = AE_MOVAD16_1(ae_tmp1);
			Ryy0 = AE_MOVAD16_1(ae_tmp2);
			Rxy1 = AE_MOVAD16_0(ae_tmp1);
			Ryy1 = AE_MOVAD16_0(ae_tmp2);
			if (SSC_MULT16x16(best_den, Rxy0) > SSC_MULT16x16(Ryy0, best_num))
			{
				best_den = Ryy0;
				best_num = Rxy0;
				best_id = j+2;
			}
			if (SSC_MULT16x16(best_den, Rxy1) > SSC_MULT16x16(Ryy1, best_num))
			{
				best_den = Ryy1;
				best_num = Rxy1;
				best_id = j+3;
			}
		}
#endif


		/* Commit the winning position. */
		xy = SSC_ADD(xy, EXTEND32(X[best_id]));
		yy = SSC_ADD(yy, 2*y[best_id]);
		y[best_id] += 1;
	}




	/* Put the original sign back */
#if 0
	j=0;
	pX = X;
	py = y;
	psignx = signx;
	do{
		*pX = MULT16_16(*psignx,*pX);
		*py = MULT16_16(*psignx,*py);
		pX++;
		psignx++;
		py++;
	}while(++j<N);
#else
	/* X and y are each multiplied lane-wise by signx and stored back. */
	ae_pX = (ae_f16x4 *)&X[0];
	ae_py = (ae_f16x4 *)&y[0];
	ae_psign = (ae_f16x4 *)&signx[0];
	for(i=0; i<N; i+=4)
	{
		ae_tmp1 = AE_L16X4_I(ae_pX, 0);
		ae_tmp2 = AE_L16X4_I(ae_py, 0);
		AE_L16X4_IP(ae_tmp3, ae_psign, 8);
		AE_MUL16X4(ae_Ltmp1, ae_Ltmp2, ae_tmp1, ae_tmp3);
		ae_tmp1 = AE_SAT16X4(ae_Ltmp1, ae_Ltmp2);
		AE_MUL16X4(ae_Ltmp1, ae_Ltmp2, ae_tmp2, ae_tmp3);
		ae_tmp2 = AE_SAT16X4(ae_Ltmp1, ae_Ltmp2);
		AE_S16X4_IP(ae_tmp1, ae_pX, 8);
		AE_S16X4_IP(ae_tmp2, ae_py, 8);
	}
#endif

	encode_pulses(y, N, K, enc);
}

#endif


/*
 * normalise_residual - scale the integer pulse vector iy into X using the
 * inverse square root of its energy.
 *
 * iy  : pulse vector, N elements (read only)
 * X   : output vector, N elements
 * N   : vector length; the loops always process at least one element
 * Ryy : sum of squares of iy
 *
 * Ryy is first shifted by an even amount, 2*(half_log-7), where half_log is
 * half of its integer log2, before being passed to speech_rsqrt_norm().
 * Every product gain*iy[i] is then shifted right by half_log+1 with rounding.
 */
static void normalise_residual(short *iy, short *X, short N, int Ryy)
{
	short idx, half_log, gain;
	int mant;

#ifdef HW_HIFI3_VECTOR
	ae_int16x4 *pulse_vec;
	ae_int16x4 gain_vec;
	ae_int16x4 *out_vec;
	ae_int32x4 prod_vec;
	ae_int32x4 round_vec;
#endif

	/* half of the integer log2 of the energy */
#ifdef HW_CODESIZE
#ifdef VC_PROJ
	half_log = (EC_ILOG(Ryy)-1)>>1;
#else
	half_log = (30-AE_NSAZ32_L(Ryy))>>1;
#endif
#else
	half_log = speech_ilog2(Ryy)>>1;
#endif

#ifdef HW_HIFI3
	mant = SSC_VSHR(Ryy, (half_log-7)<<1);
#else
	mant = SSC_VSHR(Ryy, 2*(half_log-7));
#endif

#ifdef HW_HIFI3_VECTOR
	gain_vec = speech_rsqrt_norm(mant);
	pulse_vec = (ae_int16x4 *)(iy);
	/* rounding constant for the shift by half_log+1 */
	round_vec = (1<<half_log);
	out_vec = (ae_int16x4 *)(X);
	idx = 0;
	do
	{
		prod_vec = AE_MUL16X4_vector(gain_vec, *pulse_vec);
		prod_vec = AE_INT32X4_ADD(prod_vec, round_vec);
		*out_vec = (ae_int16x4)(AE_INT32X4_SRAA32(prod_vec, (half_log+1)));
		pulse_vec++;
		out_vec++;
		idx = idx+4;
	}
	while (idx<N);
#else
	gain = speech_rsqrt_norm(mant);
	idx = 0;
	do
	{
		X[idx] = EXTRACT16(SSC_PSHR(SSC_MULT16x16(gain, iy[idx]), half_log+1));
		idx++;
	}
	while (idx < N);
#endif
}




/*
 * alg_unquant - decode a K-pulse vector and rebuild X from it.
 *
 * X        : output vector, N elements
 * N        : vector length; the local pulse buffer holds 24 entries
 * K        : number of pulses
 * dec      : entropy-decoder state handed to decode_pulses()
 * exp_flag : (HW_EXPROTATION_LAST builds only) non-zero enables the final
 *            rotation
 *
 * The pulses are decoded, their energy is summed, normalise_residual()
 * scales them into X, and finally the rotation is applied with dir = 0.
 * With OPT_KHW_20191111 the dir = 0 rotation is written out inline.
 *
 * NOTE(review): this function is gated on OPT_KHW_20191111 while
 * exp_rotation() is compiled out by OPT_KHW_20191106 - confirm the two
 * macros are always defined together.
 */
#ifdef HW_EXPROTATION_LAST
void alg_unquant(short *X, short N, short K, ec_dec *dec,short exp_flag)
#else
void alg_unquant(short *X, short N, short K, ec_dec *dec)
#endif
{
	short pulses[24];
	short idx;
	int energy;

#ifdef HW_HIFI3_VECTOR
	ae_int16x4 *pulse_vec;
	ae_int32x4 energy_vec = 0;
#endif

	decode_pulses(pulses, N, K, dec);

	/* energy = sum of squared pulses */
#ifdef HW_HIFI3_VECTOR
	pulse_vec = (ae_int16x4 *)pulses;
	idx = 0;
	do
	{
		AE_MULA16X4_vector(energy_vec, *pulse_vec, *pulse_vec);
		pulse_vec++;
		idx = idx+4;
	}
	while (idx<N);
	energy = AE_INT32X4_RADD(energy_vec);
#else
	energy = 0;
	idx = 0;
	do
	{
		energy += SSC_MULT16x16(pulses[idx], pulses[idx]);
		idx++;
	}
	while (idx < N);
#endif

	normalise_residual(pulses, X, N, energy);

#ifdef HW_EXPROTATION_LAST
	if (exp_flag)
	{
#ifdef OPT_KHW_20191111
		short c, s, gain, theta;

		if ((K<<1)<N)
		{
			gain = ((int)N<<15)/(N+10*K);
			theta = SHR(SSC_MULT16x16(gain,gain),16);

			c = speech_cos_norm(theta);
			s = speech_cos_norm(SSC_SUB(Q15_ONE,theta)); /* sin(theta) */

			/* wide-stride sweep first, then the stride-1 sweep */
			if (N>7)
			{
				exp_rotation1(X, N, (N>>3)+2, s, c);
			}
			exp_rotation1(X, N, 1, c, s);
		}
#else
		exp_rotation(X, N, K,0);
#endif
	}
#else
	exp_rotation(X, N, K, 0);
#endif
}


/*
 * renormalise_vector - rescale X in place by the inverse square root of its
 * energy.
 *
 * X : vector to rescale, N elements
 * N : vector length
 *
 * The energy is EPSILON plus the sum of squares of X.  It is shifted by
 * 2*(half_log-7), where half_log is half of its integer log2, and passed to
 * speech_rsqrt_norm(); every product gain*X[i] is then shifted right by
 * half_log+1 with rounding.
 *
 * The HW_HIFI3_VECTOR arm processes N>>2 groups of four elements, so when N
 * is not a multiple of four the trailing elements are neither counted in the
 * energy nor rescaled.  The scalar arm walks a pointer until it equals X+N.
 */
void renormalise_vector(short *X, short N)
{
	int half_log, mant;
	short gain;

#ifdef HW_HIFI3_VECTOR
	int energy;
	ae_int16x4 *vec = (ae_int16x4 *)(X);
	ae_int16x4 gain_vec;
	ae_int32x4 acc_vec = 0;
	ae_int32x4 prod_vec;
	short grp;

	for (grp=0; grp<(N>>2); grp++)
	{
		AE_MULA16X4_vector(acc_vec, *vec, *vec);
		vec++;
	}

	energy = AE_INT32X4_RADD(acc_vec);
	energy = energy+EPSILON;
#else
	int energy = EPSILON;
	short *cur = X;

	do
	{
		energy += SSC_MULT16x16(*cur, *cur);
		cur++;
	}
	while (cur != X+N);
#endif

	/* half of the integer log2 of the energy */
#ifdef HW_CODESIZE
#ifdef VC_PROJ
	half_log = ((EC_ILOG(energy)-1)>>1);
#else
	half_log = ((30-AE_NSAZ32_L(energy))>>1);
#endif
#else
	half_log = (speech_ilog2(energy)>>1);
#endif

#ifdef HW_HIFI3
	mant = SSC_VSHR(energy, (half_log-7)<<1);
#else
	mant = SSC_VSHR(energy, 2*(half_log-7));
#endif

#ifdef HW_HIFI3_VECTOR
	gain_vec = speech_rsqrt_norm(mant);
	/* the accumulator is reused as the rounding constant */
	acc_vec = (1<<half_log);
	vec = (ae_int16x4 *)(X);

	for (grp=0; grp<(N>>2); grp++)
	{
		prod_vec = AE_MUL16X4_vector(gain_vec, *vec);
		prod_vec = AE_INT32X4_ADD(prod_vec, acc_vec);
		*vec = (ae_int16x4)(AE_INT32X4_SRAA32(prod_vec,(half_log+1)));
		vec++;
	}
#else
	gain = speech_rsqrt_norm(mant);
	cur = X;

	do
	{
		*cur = EXTRACT16(SSC_PSHR(SSC_MULT16x16(gain, *cur), half_log+1));
		cur++;
	}
	while (cur != X+N);
#endif
}




