#include "pitch_util.h"

#ifndef HW_PITCH_OFF2




/*
 * xcorr_kernel - accumulate four adjacent-lag correlation sums in one pass.
 *
 * For k = 0..3 the kernel adds x[n] * y[n + k] into the k-th accumulator.
 * The accumulators are NOT cleared here; the caller supplies their starting
 * values.
 *
 * Parameters:
 *   x    - first input sequence (read only).
 *   y    - second input sequence (read only); it is read 3 elements further
 *          than x because of the k offset.
 *   sum / sum_32x4
 *        - the four accumulators: an int[4] in the scalar build, a single
 *          ae_int32x4 in the HW_HIFI3_VECTOR build.
 *   len  - nominal number of samples to correlate (see notes below).
 *
 * Build variants:
 *   - default           : hand-unrolled scalar code.
 *   - HW_HIFI3_VECTOR   : vector intrinsics, same unrolling.
 *   - OPT_CODESIZE_X    : compact nested loop (scalar `sum` only).
 *
 * NOTE(review): both unrolled variants run (len >> 2) groups of four samples
 * followed by a fixed two-sample tail, i.e. they consume 4*(len>>2) + 2
 * samples of x. That equals len only when len % 4 == 2, whereas the
 * OPT_CODESIZE_X variant consumes exactly len samples. Confirm that callers
 * always pass a length of that form.
 *
 * NOTE(review): the main loop is a do/while, so its body runs at least once.
 * With len < 4 the counter starts at 0 and `--j` does not reach zero on the
 * first pass, so the loop keeps running far past the buffers. Callers must
 * pass len >= 4.
 *
 * NOTE(review): the OPT_CODESIZE_X branch refers to `sum`, which is only a
 * parameter name when HW_HIFI3_VECTOR is NOT defined; the two options do not
 * appear to be combinable.
 */
#ifndef HW_HIFI3_VECTOR
static void xcorr_kernel(const short *x, const short *y, int *sum, short len)
#else
static void xcorr_kernel(const short *x, const short *y, ae_int32x4 *sum_32x4, short len)
#endif

{
#ifndef OPT_CODESIZE_X
	short j;

#ifndef HW_HIFI3_VECTOR
	/* Scalar path: keep a sliding window of four y samples in locals. */
	short tmp;
	short y_0, y_1, y_2, y_3;
	y_3=0;
	/* Preload y[0..2]; the fourth window slot is filled inside the loop. */
	y_0=*y++;
	y_1=*y++;
	y_2=*y++;
#else

	/* Vector path: holds the current x sample for the multiply-accumulate. */
	ae_int16x4 tmp_16x4;

	/* Four read cursors into y, starting at offsets 0, 1, 2 and 3. */
	ae_int16x4 *y0_16x4 = (ae_int16x4 *)y;
	ae_int16x4 *y1_16x4 = (ae_int16x4 *)(&y[1]);
	ae_int16x4 *y2_16x4 = (ae_int16x4 *)(&y[2]);
	ae_int16x4 *y3_16x4 = (ae_int16x4 *)(&y[3]);

	/* Values loaded through the four cursors above. */
	ae_int16x4 y0_16x4_v;
	ae_int16x4 y1_16x4_v;
	ae_int16x4 y2_16x4_v;
	ae_int16x4 y3_16x4_v;

//	ae_int32x4 *sum_32x4 = (ae_int32x4 *)sum;

	/*
	 * One alignment register per cursor, initialised before the first
	 * AE_LA16X4_IP load on that cursor (presumably required because the
	 * cursors are not vector-aligned - confirm against the HiFi ISA docs).
	 */
	ae_valign align1 = AE_LA64_PP(y0_16x4);
	ae_valign align2 = AE_LA64_PP(y1_16x4);
	ae_valign align3 = AE_LA64_PP(y2_16x4);
	ae_valign align4 = AE_LA64_PP(y3_16x4);

#endif


	/* Number of four-sample groups; must be >= 1 (see header note). */
	j = (len>>2);
	do{
#ifndef HW_HIFI3_VECTOR
		/*
		 * Four samples of x per iteration. Each step loads one new y
		 * sample into the window slot that has just become the oldest,
		 * so the y_* names rotate by one position from step to step
		 * instead of the values being copied between variables.
		 */
		tmp = *x++;
		y_3=*y++;


		/* Window order here: y_0, y_1, y_2, y_3 (oldest to newest). */
		sum[0] += SSC_MULT16x16(tmp,y_0);
		sum[1] += SSC_MULT16x16(tmp,y_1);
		sum[2] += SSC_MULT16x16(tmp,y_2);
		sum[3] += SSC_MULT16x16(tmp,y_3);

		/* Window order here: y_1, y_2, y_3, y_0. */
		tmp=*x++;
		y_0=*y++;
		sum[0] += SSC_MULT16x16(tmp,y_1);
		sum[1] += SSC_MULT16x16(tmp,y_2);
		sum[2] += SSC_MULT16x16(tmp,y_3);
		sum[3] += SSC_MULT16x16(tmp,y_0);


		/* Window order here: y_2, y_3, y_0, y_1. */
		tmp=*x++;
		y_1=*y++;
		sum[0] += SSC_MULT16x16(tmp,y_2);
		sum[1] += SSC_MULT16x16(tmp,y_3);
		sum[2] += SSC_MULT16x16(tmp,y_0);
		sum[3] += SSC_MULT16x16(tmp,y_1);

		/* Window order here: y_3, y_0, y_1, y_2. */
		tmp=*x++;
		y_2=*y++;
		sum[0] += SSC_MULT16x16(tmp,y_3);
		sum[1] += SSC_MULT16x16(tmp,y_0);
		sum[2] += SSC_MULT16x16(tmp,y_1);
		sum[3] += SSC_MULT16x16(tmp,y_2);

#else

		/*
		 * Load one vector through each cursor; every cursor is
		 * post-incremented by the load, so the four windows stay one
		 * sample apart.
		 */
		AE_LA16X4_IP(y0_16x4_v,align1,y0_16x4);
		AE_LA16X4_IP(y1_16x4_v,align2,y1_16x4);
		AE_LA16X4_IP(y2_16x4_v,align3,y2_16x4);
		AE_LA16X4_IP(y3_16x4_v,align4,y3_16x4);

		/*
		 * One x sample per y window. Assigning a short to ae_int16x4
		 * presumably replicates it into every lane, and
		 * AE_MULA16X4_vector presumably multiply-accumulates lane-wise
		 * into *sum_32x4 - TODO confirm both against their definitions.
		 */
		tmp_16x4 = *x++;
		AE_MULA16X4_vector(*sum_32x4, tmp_16x4, y0_16x4_v);
		tmp_16x4 = *x++;
		AE_MULA16X4_vector(*sum_32x4, tmp_16x4, y1_16x4_v);
		tmp_16x4 = *x++;
		AE_MULA16X4_vector(*sum_32x4, tmp_16x4, y2_16x4_v);
		tmp_16x4 = *x++;
		AE_MULA16X4_vector(*sum_32x4, tmp_16x4, y3_16x4_v);

#endif

	}while(--j);


	/* Fixed tail: two more samples of x, using the same scheme as above. */
#ifndef HW_HIFI3_VECTOR
	tmp=*x++;
	y_3=*y++;
	sum[0] += SSC_MULT16x16(tmp,y_0);
	sum[1] += SSC_MULT16x16(tmp,y_1);
	sum[2] += SSC_MULT16x16(tmp,y_2);
	sum[3] += SSC_MULT16x16(tmp,y_3);

	tmp=*x++;
	y_0=*y++;
	sum[0] += SSC_MULT16x16(tmp,y_1);
	sum[1] += SSC_MULT16x16(tmp,y_2);
	sum[2] += SSC_MULT16x16(tmp,y_3);
	sum[3] += SSC_MULT16x16(tmp,y_0);
#else



	/* Only the first two cursors are needed for the two tail samples. */
	AE_LA16X4_IP(y0_16x4_v,align1,y0_16x4);
	AE_LA16X4_IP(y1_16x4_v,align2,y1_16x4);

	tmp_16x4 = *x++;
	AE_MULA16X4_vector(*sum_32x4, tmp_16x4, y0_16x4_v);
	tmp_16x4 = *x++;
	AE_MULA16X4_vector(*sum_32x4, tmp_16x4, y1_16x4_v);

#endif



#else

	/*
	 * Code-size variant: one plain inner-product loop per lag k, summing
	 * exactly len products x[l] * y[l + k] and adding the total to sum[k].
	 */
	int ltmp;
	int k, l;
	for (k = 0; k < 4; k++)
	{
		ltmp = 0;
		for (l = 0; l < len; l++)
		{
			ltmp += SSC_MULT16x16(x[l], y[l + k]);
		}
		sum[k] += ltmp;
#ifdef WWQ_CORRECTION
		/* Optional build-time correction: shift the updated sum right by one. */
		sum[k] >>= 1;
#endif
	}


#endif
}






/*
 * speech_pitch_xcorr_c - cross-correlate _x against shifted copies of _y.
 *
 * Lags are processed in groups of four: for each group starting at lag g,
 * xcorr_kernel() is run on (_x, _y + g) with zeroed accumulators and the four
 * results are stored in xcorr[g .. g + 3].
 *
 * Parameters:
 *   _x        - reference sequence (read only).
 *   _y        - sequence searched for the lag (read only).
 *   xcorr     - output; receives 4 * (max_pitch >> 2) values.
 *   len       - correlation length, passed unchanged to xcorr_kernel().
 *   max_pitch - number of lags requested.
 *
 * Returns the largest value stored in xcorr, but never less than 1 because
 * the running maximum starts at 1 (assuming SPEECH_MAX yields the larger of
 * its two operands).
 *
 * Only complete groups of four lags are evaluated. When max_pitch is not a
 * multiple of 4, the trailing (max_pitch & 3) entries of xcorr are left
 * untouched. When max_pitch < 4 nothing is written and 1 is returned.
 */
int speech_pitch_xcorr_c(const short *_x, const short *_y, int *xcorr, short len, short max_pitch)
{
	short groups;
	int maxcorr = 1;
	const short *tmp_y = _y;

#ifdef HW_HIFI3_VECTOR
	/* The vector build stores each group of four results with one 4-lane write. */
	ae_int32x4 *p_xcorr_32x4 = (ae_int32x4 *)xcorr;
#endif

	/*
	 * Counted loop instead of do/while: the previous form always executed
	 * the body once and then pre-decremented the counter, so a max_pitch
	 * below 4 (counter starting at 0) made the counter go negative and the
	 * loop ran on, writing far beyond xcorr and reading beyond _y.
	 */
	for (groups = (short)(max_pitch >> 2); groups > 0; --groups) {
		int sum[4] = {0, 0, 0, 0};
		int tmp_maxcorr;
#ifndef HW_HIFI3_VECTOR
		xcorr_kernel(_x, tmp_y, sum, len);
#else
		/*
		 * NOTE(review): sum is a plain int[4] viewed as ae_int32x4;
		 * confirm that its alignment satisfies what the vector type needs.
		 */
		ae_int32x4 *p_sum32x4 = (ae_int32x4 *)sum;
		xcorr_kernel(_x, tmp_y, p_sum32x4, len);
#endif
		/* Next group starts four lags further into _y. */
		tmp_y += 4;

		/* Fold the four new sums into the running maximum. */
		tmp_maxcorr = SPEECH_MAX(sum[0], sum[1]);
		tmp_maxcorr = SPEECH_MAX(tmp_maxcorr, sum[2]);
		tmp_maxcorr = SPEECH_MAX(tmp_maxcorr, sum[3]);
		maxcorr = SPEECH_MAX(maxcorr, tmp_maxcorr);

#ifndef HW_HIFI3_VECTOR
		*xcorr++ = sum[0];
		*xcorr++ = sum[1];
		*xcorr++ = sum[2];
		*xcorr++ = sum[3];
#else
		*p_xcorr_32x4 = *p_sum32x4;
		p_xcorr_32x4++;
#endif
	}

	/*
	 * No remainder pass exists: lags past the last full group of four are
	 * not computed (see the function header).
	 */
	return maxcorr;
}

/*
static __inline long ByteSwap1(unsigned long val) 
{ 
	int temp;
	__asm{ 
		eor temp, val, val, ror #16
		bic temp, temp, #0x00FF0000 
		mov val, val, ror #8
		eor val, val, temp, lsr #8
	}
return val; 
}
*/
/*
static __inline __asm long ByteSwap1(unsigned long val)
{
    eor r1, r0, r0, ror #16
    bic r1, r1, #0x00FF0000 
    mov r0, r0, ror #8
    eor r0, r0, r1, lsr #8
    bx lr
}
*/

#endif
