#include "SamsungSolomonVoiceW_Int.h"

#if(FLAG_SELECT_C_DSP == 1)
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#endif

//#define MULT16_16(a,b) ((a)*(b))

//#ifndef  FLAG_DSP_ASM_ON
/*
 * Evaluates C + ((A * B) >> 16) without needing a 64-bit product.
 *
 * B is split into its upper part (B >> 16) and its lower 16 bits; each part
 * is multiplied by A separately, so both partial products fit in 32 bits as
 * long as A is an unsigned 16-bit value (as the filter coefficients are).
 *
 * The low-half term is unsigned, so the whole expression has unsigned type;
 * callers store the result in an int32_t.
 *
 * Every argument is parenthesized so that expression arguments expand with
 * the intended precedence. A and B are each evaluated twice: do not pass
 * arguments that have side effects.
 */
#define WEBRTC_SPL_SCALEDIFF32(A, B, C) \
	((C) + ((B) >> 16) * (A) + (((uint32_t)(0x0000FFFF & (B)) * (A)) >> 16))
//#endif

/*
 * Coefficients for the two 3-stage allpass chains used by the resample-by-2
 * routines in this file. Each value is applied through
 * WEBRTC_SPL_SCALEDIFF32, which shifts the product right by 16 bits, so a
 * coefficient c acts as the fraction c / 65536.
 */
const uint16_t kResampleAllpass1[3] = {3284, 24441, 49528};
const uint16_t kResampleAllpass2[3] = {12199, 37471, 60255};

/*
 * Multiply-accumulate used by the allpass stages: c + ((a * b) >> 16).
 * Both names currently expand to the same macro.
 */
#define MUL_ACCUM_1(a, b, c) WEBRTC_SPL_SCALEDIFF32(a, b, c)
#define MUL_ACCUM_2(a, b, c) WEBRTC_SPL_SCALEDIFF32(a, b, c)

#ifndef  OPT_WebRtcSpl_DownsampleBy2
/*
 * Halve the sample rate of a 16-bit signal with two parallel 3-stage
 * allpass chains (portable C implementation).
 *
 * in        - input samples, consumed in pairs: the first sample of each
 *             pair feeds the chain built on kResampleAllpass2, the second
 *             feeds the chain built on kResampleAllpass1.
 * len       - number of input samples. len >> 1 output samples are
 *             produced; a trailing unpaired sample is not read.
 * out       - receives len >> 1 samples.
 * filtState - eight words of filter memory, loaded on entry and stored back
 *             on exit. Words 0..3 belong to the first chain, 4..7 to the
 *             second.
 */
void WebRtcSpl_DownsampleBy2(const int16_t* in, uint32_t len, int16_t* out, int32_t* filtState) {

	int32_t tmp1, tmp2, diff, in32, out32;
	uint32_t i;

	int32_t state0 = filtState[0];
	int32_t state1 = filtState[1];
	int32_t state2 = filtState[2];
	int32_t state3 = filtState[3];
	int32_t state4 = filtState[4];
	int32_t state5 = filtState[5];
	int32_t state6 = filtState[6];
	int32_t state7 = filtState[7];

	for (i = (len >> 1); i > 0; i--) {
		// lower allpass filter
		// Scale by 2^10 with a multiplication: left-shifting a negative
		// sample would be undefined behaviour.
		in32 = (int32_t)(*in++) * (1 << 10);
		diff = in32 - state1;
		tmp1 = MUL_ACCUM_1(kResampleAllpass2[0], diff, state0);
		state0 = in32;
		diff = tmp1 - state2;
		tmp2 = MUL_ACCUM_2(kResampleAllpass2[1], diff, state1);
		state1 = tmp1;
		diff = tmp2 - state3;
		state3 = MUL_ACCUM_2(kResampleAllpass2[2], diff, state2);
		state2 = tmp2;

		// upper allpass filter
		in32 = (int32_t)(*in++) * (1 << 10);
		diff = in32 - state5;
		tmp1 = MUL_ACCUM_1(kResampleAllpass1[0], diff, state4);
		state4 = in32;
		diff = tmp1 - state6;
		tmp2 = MUL_ACCUM_1(kResampleAllpass1[1], diff, state5);
		state5 = tmp1;
		diff = tmp2 - state7;
		state7 = MUL_ACCUM_2(kResampleAllpass1[2], diff, state6);
		state6 = tmp2;

		// add two allpass outputs, divide by two and round:
		// one bit of the shift averages the two chains, the other ten
		// undo the input scaling; 1024 is half of 2^11.
		out32 = (state3 + state7 + 1024) >> 11;

		// limit amplitude to prevent wrap-around, and write to output array
		*out++ = DVTXOP_saturate(out32);
	}

	filtState[0] = state0;
	filtState[1] = state1;
	filtState[2] = state2;
	filtState[3] = state3;
	filtState[4] = state4;
	filtState[5] = state5;
	filtState[6] = state6;
	filtState[7] = state7;
}
#else
//void WebRtcSpl_DownsampleBy2(const int16_t* in, uint32_t len, int16_t* out, int32_t* filtState);
/*
 * Halve the sample rate of a 16-bit signal (DSP-intrinsic implementation).
 *
 * Both 3-stage allpass chains are processed together in two-lane vectors.
 * The lane written by the first argument of AE_MOVDA32X2 and read back with
 * AE_MOVAD32_H carries states 0..3 and the kResampleAllpass2 coefficients;
 * the other lane carries states 4..7 and the kResampleAllpass1 coefficients.
 *
 * in        - input samples, read two at a time through a 16x2 vector load.
 *             NOTE(review): the cast to a pair pointer presumably requires
 *             `in` to be 32-bit aligned - confirm against the ISA reference.
 * len       - number of input samples; len >> 1 outputs are produced.
 * out       - receives len >> 1 samples.
 * filtState - eight words of filter memory, loaded on entry and stored back
 *             on exit.
 */
void WebRtcSpl_DownsampleBy2(const int16_t* in, uint32_t len, int16_t* out, int32_t* filtState)
{
	ae_p16x2s * pdata;
	ae_int32x2 S04, S15, S26, S37, k00, k11, k22, diff, d, tmp1, tmp2, t1, t2;
	ae_f64 tmpll1, tmpll2;
	int32_t out32;
	// Same width as len: a 16-bit counter would truncate len >> 1 for large
	// buffers and could even start negative, skipping the loop entirely.
	uint32_t i;

	register int32_t state0 = filtState[0];
	register int32_t state1 = filtState[1];
	register int32_t state2 = filtState[2];
	register int32_t state3 = filtState[3];
	register int32_t state4 = filtState[4];
	register int32_t state5 = filtState[5];
	register int32_t state6 = filtState[6];
	register int32_t state7 = filtState[7];


	// Pair up the matching state of each chain so one vector op updates both.
	S04 = AE_MOVDA32X2(state0, state4);
	S15 = AE_MOVDA32X2(state1, state5);
	S26 = AE_MOVDA32X2(state2, state6);
	S37 = AE_MOVDA32X2(state3, state7);

	// Per-stage coefficient pairs, laid out to match the state vectors.
	k00 = AE_MOVDA32X2(kResampleAllpass2[0], kResampleAllpass1[0]);
	k11 = AE_MOVDA32X2(kResampleAllpass2[1], kResampleAllpass1[1]);
	k22 = AE_MOVDA32X2(kResampleAllpass2[2], kResampleAllpass1[2]);

	pdata = (ae_p16x2s*)in;
	for (i = (len >> 1); i > 0; i--) {
		// Fetch one input pair and advance to the next.
		// NOTE(review): the load presumably delivers the samples already
		// scaled by 2^8, so the extra shift by 2 would match the 2^10 input
		// scaling of the C implementation - confirm.
		d = AE_L16X2M_I(pdata, 0);
		pdata += 1;
		d = AE_SLAI32S(d, 2);

		// Stage 1: tmp1 = S04 + ((k00 * (d - S15)) >> 16), per lane.
		diff = AE_SUB32(d, S15);
		tmpll1 = AE_MUL32_HH(k00, diff);
		tmpll1 = AE_SRAI64(tmpll1, 16);
		tmpll2 = AE_MUL32_LL(k00, diff);
		tmpll2 = AE_SRAI64(tmpll2, 16);
		t1 = AE_MOVINT32X2_FROMF64(tmpll1);
		t2 = AE_MOVINT32X2_FROMF64(tmpll2);
		tmp1 = AE_SEL32_LL(t1, t2);
		tmp1 = AE_ADD32(tmp1, S04);
		S04 = AE_MOV32(d);

		// Stage 2: tmp2 = S15 + ((k11 * (tmp1 - S26)) >> 16), per lane.
		diff = AE_SUB32(tmp1, S26);
		tmpll1 = AE_MUL32_HH(k11, diff);
		tmpll1 = AE_SRAI64(tmpll1, 16);
		tmpll2 = AE_MUL32_LL(k11, diff);
		tmpll2 = AE_SRAI64(tmpll2, 16);
		t1 = AE_MOVINT32X2_FROMF64(tmpll1);
		t2 = AE_MOVINT32X2_FROMF64(tmpll2);
		tmp2 = AE_SEL32_LL(t1, t2);
		tmp2 = AE_ADD32(tmp2, S15);
		S15 = AE_MOV32(tmp1);

		// Stage 3: S37 = S26 + ((k22 * (tmp2 - S37)) >> 16), per lane.
		diff = AE_SUB32(tmp2, S37);
		tmpll1 = AE_MUL32_HH(k22, diff);
		tmpll1 = AE_SRAI64(tmpll1, 16);
		tmpll2 = AE_MUL32_LL(k22, diff);
		tmpll2 = AE_SRAI64(tmpll2, 16);
		t1 = AE_MOVINT32X2_FROMF64(tmpll1);
		t2 = AE_MOVINT32X2_FROMF64(tmpll2);
		S37 = AE_SEL32_LL(t1, t2);
		S37 = AE_ADD32(S37, S26);
		S26 = AE_MOV32(tmp2);

		// Pull both chain outputs back to scalars; these also serve as the
		// final state3/state7 values stored after the loop.
		state3 = AE_MOVAD32_H(S37);
		state7 = AE_MOVAD32_L(S37);

		// Average the two chains, undo the input scaling, and round.
		out32 = (state3 + state7 + 1024) >> 11;

		*out++ = AE_SAT16X4_scalar(out32);

	}
	// Unpack the remaining vector state for write-back.
	state0 = AE_MOVAD32_H(S04);
	state4 = AE_MOVAD32_L(S04);
	state1 = AE_MOVAD32_H(S15);
	state5 = AE_MOVAD32_L(S15);
	state2 = AE_MOVAD32_H(S26);
	state6 = AE_MOVAD32_L(S26);
	filtState[0] = state0;
	filtState[1] = state1;
	filtState[2] = state2;
	filtState[3] = state3;
	filtState[4] = state4;
	filtState[5] = state5;
	filtState[6] = state6;
	filtState[7] = state7;
}

#endif


#ifndef  OPT_WebRtcSpl_UpsampleBy2
/*
 * Double the sample rate of a 16-bit signal with two parallel 3-stage
 * allpass chains (portable C implementation).
 *
 * in        - input samples.
 * len       - number of input samples.
 * out       - receives 2 * len samples: for every input sample, first the
 *             output of the chain built on kResampleAllpass1, then the
 *             output of the chain built on kResampleAllpass2.
 * filtState - eight words of filter memory, loaded on entry and stored back
 *             on exit. Words 0..3 belong to the first chain, 4..7 to the
 *             second.
 */
void WebRtcSpl_UpsampleBy2(const int16_t* in, uint32_t len, int16_t* out, int32_t* filtState) {
	int32_t tmp1, tmp2, diff, in32, out32;
	uint32_t i;

	int32_t state0 = filtState[0];
	int32_t state1 = filtState[1];
	int32_t state2 = filtState[2];
	int32_t state3 = filtState[3];
	int32_t state4 = filtState[4];
	int32_t state5 = filtState[5];
	int32_t state6 = filtState[6];
	int32_t state7 = filtState[7];

	for (i = len; i > 0; i--) {
		// lower allpass filter
		// Scale by 2^10 with a multiplication: left-shifting a negative
		// sample would be undefined behaviour.
		in32 = (int32_t)(*in++) * (1 << 10);
		diff = in32 - state1;
		tmp1 = MUL_ACCUM_1(kResampleAllpass1[0], diff, state0);
		state0 = in32;
		diff = tmp1 - state2;
		tmp2 = MUL_ACCUM_1(kResampleAllpass1[1], diff, state1);
		state1 = tmp1;
		diff = tmp2 - state3;
		state3 = MUL_ACCUM_2(kResampleAllpass1[2], diff, state2);
		state2 = tmp2;

		// round; limit amplitude to prevent wrap-around; write to output array
		out32 = (state3 + 512) >> 10;
		*out++ = DVTXOP_saturate(out32);

		// upper allpass filter (same scaled input sample as above)
		diff = in32 - state5;
		tmp1 = MUL_ACCUM_1(kResampleAllpass2[0], diff, state4);
		state4 = in32;
		diff = tmp1 - state6;
		tmp2 = MUL_ACCUM_2(kResampleAllpass2[1], diff, state5);
		state5 = tmp1;
		diff = tmp2 - state7;
		state7 = MUL_ACCUM_2(kResampleAllpass2[2], diff, state6);
		state6 = tmp2;

		// round; limit amplitude to prevent wrap-around; write to output array
		out32 = (state7 + 512) >> 10;
		*out++ = DVTXOP_saturate(out32);
	}

	filtState[0] = state0;
	filtState[1] = state1;
	filtState[2] = state2;
	filtState[3] = state3;
	filtState[4] = state4;
	filtState[5] = state5;
	filtState[6] = state6;
	filtState[7] = state7;
}
#else
/*
 * Double the sample rate of a 16-bit signal (DSP-intrinsic implementation).
 *
 * Both 3-stage allpass chains are processed together in two-lane vectors.
 * The lane written by the first argument of AE_MOVDA32X2 and read back with
 * AE_MOVAD32_H carries states 0..3 and the kResampleAllpass1 coefficients;
 * the other lane carries states 4..7 and the kResampleAllpass2 coefficients.
 *
 * in        - input samples.
 * len       - number of input samples.
 * out       - receives 2 * len samples: for every input sample, first the
 *             output derived from state3, then the one derived from state7.
 * filtState - eight words of filter memory, loaded on entry and stored back
 *             on exit.
 */
void WebRtcSpl_UpsampleBy2(const int16_t* in, uint32_t len, int16_t* out, int32_t* filtState)
{
	ae_int32x2 S04, S15, S26, S37, k00, k11, k22, diff, d, tmp1, tmp2, t1, t2;
	ae_f64 tmpll1, tmpll2;
	int32_t out32, in32;
	// Same width as len: a 16-bit counter would go negative for
	// len > 32767 and the loop would never run.
	uint32_t i;

	register int32_t state0 = filtState[0];
	register int32_t state1 = filtState[1];
	register int32_t state2 = filtState[2];
	register int32_t state3 = filtState[3];
	register int32_t state4 = filtState[4];
	register int32_t state5 = filtState[5];
	register int32_t state6 = filtState[6];
	register int32_t state7 = filtState[7];


	// Pair up the matching state of each chain so one vector op updates both.
	S04 = AE_MOVDA32X2(state0, state4);
	S15 = AE_MOVDA32X2(state1, state5);
	S26 = AE_MOVDA32X2(state2, state6);
	S37 = AE_MOVDA32X2(state3, state7);

	// Per-stage coefficient pairs, laid out to match the state vectors.
	k00 = AE_MOVDA32X2(kResampleAllpass1[0], kResampleAllpass2[0]);
	k11 = AE_MOVDA32X2(kResampleAllpass1[1], kResampleAllpass2[1]);
	k22 = AE_MOVDA32X2(kResampleAllpass1[2], kResampleAllpass2[2]);

	for (i = len; i > 0; i--) {
		// Both chains are driven by the same sample, scaled by 2^10.
		in32 = (int32_t)(*in++) << 10;
		d = AE_MOVDA32X2(in32, in32);

		// Stage 1: tmp1 = S04 + ((k00 * (d - S15)) >> 16), per lane.
		diff = AE_SUB32(d, S15);
		tmpll1 = AE_MUL32_HH(k00, diff);
		tmpll1 = AE_SRAI64(tmpll1, 16);
		tmpll2 = AE_MUL32_LL(k00, diff);
		tmpll2 = AE_SRAI64(tmpll2, 16);
		t1 = AE_MOVINT32X2_FROMF64(tmpll1);
		t2 = AE_MOVINT32X2_FROMF64(tmpll2);
		tmp1 = AE_SEL32_LL(t1, t2);
		tmp1 = AE_ADD32(tmp1, S04);
		S04 = AE_MOV32(d);

		// Stage 2: tmp2 = S15 + ((k11 * (tmp1 - S26)) >> 16), per lane.
		diff = AE_SUB32(tmp1, S26);
		tmpll1 = AE_MUL32_HH(k11, diff);
		tmpll1 = AE_SRAI64(tmpll1, 16);
		tmpll2 = AE_MUL32_LL(k11, diff);
		tmpll2 = AE_SRAI64(tmpll2, 16);
		t1 = AE_MOVINT32X2_FROMF64(tmpll1);
		t2 = AE_MOVINT32X2_FROMF64(tmpll2);
		tmp2 = AE_SEL32_LL(t1, t2);
		tmp2 = AE_ADD32(tmp2, S15);
		S15 = AE_MOV32(tmp1);

		// Stage 3: S37 = S26 + ((k22 * (tmp2 - S37)) >> 16), per lane.
		diff = AE_SUB32(tmp2, S37);
		tmpll1 = AE_MUL32_HH(k22, diff);
		tmpll1 = AE_SRAI64(tmpll1, 16);
		tmpll2 = AE_MUL32_LL(k22, diff);
		tmpll2 = AE_SRAI64(tmpll2, 16);
		t1 = AE_MOVINT32X2_FROMF64(tmpll1);
		t2 = AE_MOVINT32X2_FROMF64(tmpll2);
		S37 = AE_SEL32_LL(t1, t2);
		S37 = AE_ADD32(S37, S26);
		S26 = AE_MOV32(tmp2);

		// Pull both chain outputs back to scalars; these also serve as the
		// final state3/state7 values stored after the loop.
		state3 = AE_MOVAD32_H(S37);
		state7 = AE_MOVAD32_L(S37);

		// Undo the input scaling with rounding, then emit one sample per chain.
		out32 = (state3 + 512) >> 10;

		*out++ = AE_SAT16X4_scalar(out32);

		out32 = (state7 + 512) >> 10;

		*out++ = AE_SAT16X4_scalar(out32);


	}
	// Unpack the remaining vector state for write-back.
	state0 = AE_MOVAD32_H(S04);
	state4 = AE_MOVAD32_L(S04);
	state1 = AE_MOVAD32_H(S15);
	state5 = AE_MOVAD32_L(S15);
	state2 = AE_MOVAD32_H(S26);
	state6 = AE_MOVAD32_L(S26);
	filtState[0] = state0;
	filtState[1] = state1;
	filtState[2] = state2;
	filtState[3] = state3;
	filtState[4] = state4;
	filtState[5] = state5;
	filtState[6] = state6;
	filtState[7] = state7;
}
#endif
