
#include "ssc_pulsealloc.h"
#include "basic_op.h"
#include "ssc_modes.h"
#include "copy.h"
#include "rc_encode.h"
#include "rc_decode.h"
#include "config.h"

//#include <stdio.h>

#define ALLOC_STEPS 6


/* Per-band tables (16 entries, index = band number) read by compute_offset()
 * at the bottom of this file:
 *   en_start[i]    - level that oldBandE[i] must exceed before band i gets any
 *                    extra allocation offset.
 *   offset_unit[i] - amount added to offsets[i] for every started step of 500
 *                    by which oldBandE[i] exceeds en_start[i].
 * The units of both tables are not visible here. */
//                          0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15                 
const short en_start[16]    = {7800,7800,7800,7800,7600,7600,7600,7600,7500,7500,7300,7300,7250,7250,7250,7250};
const short offset_unit[16] = {3,      3,   3,   3,   4,   4,   4,   5,   9,   9,  12,  14,  16,  28,  29,  30};




/* The three tables below are not referenced anywhere in the visible part of
 * this file. NOTE(review): judging by the names only, they look like per-band
 * mean energies, per-band bit caps and an inverse-CDF table for the allocation
 * trim, used from other translation units - TODO confirm. */
const char eMeans[16] = {103,100, 92, 85, 81, 77, 72, 70, 78, 75, 73, 71, 78, 74, 69, 72};
const short _cap[16] = {249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 514, 514, 514, 1028, 1028, 1482};
const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0};


#ifdef OPT_KHW_20191106
/* Pre-multiplied allocation table used by calculate_bitalloc() when
 * OPT_KHW_20191106 is defined: 11 rows of 16 bands each, addressed as
 * band_allocation_opti[(row<<4)+band]. It replaces the run-time product
 * N*m->allocVectors[row*len+band] of the non-optimised build.
 * Row 5 sums to 1943, which is the constant calculate_bitalloc() seeds its
 * first-step psum with. */
static const unsigned short band_allocation_opti[176] =
{
	0,		0,		0,		0,		0,		0,		0,		0,		0,		0,		0,		0,		0,		0,		0,		0,
	90,		80,		75,		69,		63,		56,		49,		40,		34,		29,		40,		36,		20,		0,		0,		0,
	110,	100,	90,		84,		78,		71,		65,		58,		51,		45,		78,		64,		52,		80,		48,		0,
	118,	110,	103,	93,		86,		80,		75,		70,		65,		59,		106,	94,		80,		124,	92,		90,
	126,	119,	112,	104,	95,		89,		83,		78,		72,		66,		120,	108,	94,		156,	128,	150,
	134,	127,	120,	114,	103,	97,		91,		85,		78,		72,		132,	120,	108,	188,	164,	210,
	144,	137,	130,	124,	113,	107,	101,	95,		88,		82,		152,	140,	128,	228,	204,	270,
	152,	145,	138,	132,	123,	117,	111,	105,	98,		92,		172,	160,	148,	268,	244,	330,
	162,	155,	148,	142,	133,	127,	121,	115,	108,	102,	192,	180,	168,	308,	284,	390,
	172,	165,	158,	152,	143,	137,	131,	125,	118,	112,	212,	200,	188,	348,	324,	450,
	200,	200,	200,	200,	200,	200,	200,	200,	198,	193,	376,	366,	356,	692,	672,	978,
};
#endif


#ifndef HW_24BIT
/* Convert a pseudo-pulse index into a pulse count.
 * Indices below 8 map to themselves. From 8 upward the low three bits form a
 * mantissa (8..15) and the remaining bits an exponent, so the sequence
 * continues 8,9,...,15,16,18,...,30,32,36,... */
short num_pulses(short i)
{
	int mantissa;
	int exponent;

	if (i < 8)
		return i;

	mantissa = 8 + (i & 7);
	exponent = (i >> 3) - 1;
	return mantissa << exponent;
}
#else
/* Convert a pseudo-pulse index into a pulse count, limited for the 24-bit
 * hardware build.
 *
 * i : pseudo-pulse index. Below 8 it maps to itself; from 8 upward the low
 *     three bits are a mantissa (8..15) and the remaining bits an exponent.
 * N : selects the upper limit through N>>3 (0..7 -> 176, 8..15 -> 16,
 *     16..23 -> 7, 24..31 -> 5). Values outside 0..31 are clamped to the
 *     nearest table entry instead of indexing past the table.
 *
 * Returns min(pulse count, limit). The pulse count is evaluated in int and
 * limited before narrowing, so large indices can no longer wrap to a
 * negative short. Both OPT_KHW_20191106 variants computed this same minimum,
 * so a single implementation is used. */
short num_pulses(short i,short N)
{
	static const short YLPVQ24BitKTable[4] = {176,16,7,5};
	int idx = N>>3;
	int cap;
	int k;

	if (idx < 0)
		idx = 0;
	else if (idx > 3)
		idx = 3;
	cap = YLPVQ24BitKTable[idx];

	if (i < 8)
	{
		k = i;
	}
	else
	{
		int shift = (i>>3)-1;
		/* The mantissa is at least 8, so a shift of 12 or more already
		   exceeds the short range and therefore any limit in the table. */
		k = (shift >= 12) ? cap : ((8 + (i&7)) << shift);
	}

	return (short)((k < cap) ? k : cap);
}
#endif








#ifndef compute_allocation_opt_hifi3_ZH
/*
 * Turn two candidate per-band allocations into the final per-band split
 * between PVQ bits (bits[]) and fine-energy bits (ebits[]).
 *
 * Steps, in order:
 *   1. Six-step bisection (ALLOC_STEPS) over an interpolation factor in
 *      [0, 1<<ALLOC_STEPS] so that the summed allocation
 *      bits1[j] + (factor*bits2[j] >> ALLOC_STEPS) does not exceed total.
 *   2. Commit the interpolated allocation at the found factor into bits[].
 *   3. Walk down from the top band deciding, one band at a time, whether
 *      coding stops there; each decision is one range-coder symbol.
 *   4. Spread the bits that are still left over the coded bands, apply cap[],
 *      and split each band's bits into fine-energy bits and PVQ bits.
 *
 * m             mode description; eBands[] and logN[] are read.
 * end           number of bands processed (bands 0..end-1).
 * skip_start    band index at or below which the skip loop stops.
 * bits1, bits2  base allocation and interpolation increment per band.
 * thresh        per-band minimum for a band to receive PVQ bits.
 * cap           per-band upper limit; excess is carried to the next band.
 * total         bit budget (same units as bits1/bits2; presumably 1/8 bit
 *               given the BITRES shifts - TODO confirm).
 * _balance      out: excess left after the last coded band.
 * bits          out: per-band bits left after removing the fine-energy bits.
 * ebits         out: per-band fine-energy bit count.
 * fine_priority out: per-band 0/1 flag.
 * ec            range coder context for the skip flag.
 * prev          -10000 makes the skip flag be decoded (rc_dec_bit_logp);
 *               any other value makes it be encoded, and prev is compared
 *               against the band index to pick the skip threshold factor.
 *
 * Returns codedBands, the number of bands that remain coded.
 */
static short cal_bitsTopulses(const SpeechMode *m, short end, short skip_start,
	const short *bits1, const short *bits2, const short *thresh, const short *cap, short total, short *_balance,
	short *bits, short *ebits, short *fine_priority, ec_ctx *ec, short prev)
{
	int lo, hi;
	int psum;
	short i, j;
	short codedBands=-1;
	short alloc_floor;
	short left, percoeff;
	short done;
	short balance;
	short start = 0;

	alloc_floor = 8; //1<<BITRES;
	lo = 0;
	hi = 1<<ALLOC_STEPS;

	
	/* Bisection on the interpolation factor between bits1 and bits1+bits2. */
	for (i=0;i<ALLOC_STEPS;i++)
	{
		short mid = (lo+hi)>>1;
		short tmp;
		psum = 0;
		done = 0;
		/* Scan from the top band down; "done" latches once a band reaches its
		   threshold, after which every lower band is counted in full. */
		for (j=end;j--;)
		{
#ifndef HW_HIFI3
			tmp = bits1[j] + (mid*(int)bits2[j]>>ALLOC_STEPS);
#else
			tmp = bits1[j] + (SSC_MULT16x16(mid,bits2[j])>>ALLOC_STEPS);
#endif
			if (tmp >= thresh[j] || done)
			{
				done = 1;
				/* Note: the full interpolated value is added; no cap is applied here. */
				psum += tmp;
			} else {
				/* Below threshold: the band costs at most alloc_floor. */
				if(tmp >= alloc_floor) psum += alloc_floor;
			}
		}
		if (psum > total) hi = mid;
		else lo = mid;
	}




	psum = 0;
	done = 0;

	/* Commit the allocation for the factor "lo" found above, using the same
	   top-down threshold rule as in the search. */
	for (j=end;j--;)
	{
#ifndef HW_HIFI3
		short tmp = bits1[j] + (lo*bits2[j]>>ALLOC_STEPS);
#else
		short tmp = bits1[j] + (SSC_MULT16x16(lo,bits2[j])>>ALLOC_STEPS);
#endif
		if (tmp < thresh[j] && !done)
		{
			if (tmp >= alloc_floor) tmp = alloc_floor;
			else tmp = 0;
		} else
			done = 1;

		bits[j] = tmp;
		psum += tmp;
	}

//////////////////////////////////////////////////////////////////////////////////////////////////////////

	/* Band skipping: starting at the top band, decide whether coding stops
	   here (break) or the band is skipped and the loop moves one band down. */
	for (codedBands=end;;codedBands--)
	{
		short band_width, band_bits, rem;
		j = codedBands-1;

		if(j<=skip_start)
		{
			/* Give the bit we reserved to end skipping back. */
			total += 8;
			break;
		}
		/* Estimate what band j would get if the unused bits were spread
		   evenly over the coefficients of bands 0..codedBands-1. */
		left = total-psum;
		percoeff = left/(m->eBands[codedBands]); // #if 0   -m->eBands[start]); #endif
#ifndef HW_HIFI3
		left -= m->eBands[codedBands]*percoeff;
#else
		left -= SSC_MULT16x16(m->eBands[codedBands],percoeff);
#endif
		rem = SPEECH_MAX(left-(m->eBands[j]),0);
		band_width = m->eBands[codedBands]-m->eBands[j];
#ifndef HW_HIFI3
		band_bits = (int)(bits[j] + percoeff*band_width + rem);
#else
		band_bits = (int)(bits[j] + SSC_MULT16x16(percoeff,band_width) + rem);
#endif

		/* A skip flag is only coded when the band could hold at least
		   max(thresh[j], alloc_floor+8) bits. */
		if (band_bits >= SPEECH_MAX(thresh[j], alloc_floor+8))
		{

			if(prev == -10000)    // Decoder path. On the Kalimba, prev can be shared between encoder and decoder, so a separate variable should be used instead of prev.
			{
				if(rc_dec_bit_logp(ec, 1)) break;
			}
			else
			{
#ifndef HW_HIFI3
				if (codedBands<=2 || (band_bits > ((j<prev?7:9)*band_width<<1) && j<=(end-1)))    //    j<=(end-1) appears to always be true.
#else
				if (codedBands<=2 || (band_bits > (SSC_MULT16x16((j<prev?7:9),band_width<<1)) && j<=(end-1)))    //    j<=(end-1) appears to always be true.
#endif
				{
					rc_enc_bit_logp_1(ec, 1, 1);
					break;
				}
				rc_enc_bit_logp_1(ec, 0, 1);
			}



			/*We used a bit to skip this band.*/
			psum += 8;
			band_bits -= 8;
		}
		/*Reclaim the bits originally allocated to this band.*/
		psum -= bits[j];
		if (band_bits >= alloc_floor)
		{
			/*If we have enough for a fine energy bit per channel, use it.*/
			psum += alloc_floor;
			bits[j] = alloc_floor;
		} else {
			/*Otherwise this band gets nothing at all.*/
			bits[j] = 0;
		}
	}

	/* Allocate the remaining bits */
	left = total-psum;

	/* NOTE(review): divides by m->eBands[codedBands]; this is only safe if
	   that entry is non-zero for every codedBands reachable here - confirm. */
	percoeff = left/(m->eBands[codedBands]);
#ifndef HW_HIFI3
	left -= (m->eBands[codedBands])*percoeff;
#else
	left -= SSC_MULT16x16((m->eBands[codedBands]),percoeff);
#endif
	balance = 0;


	/* Per coded band: add its share of the leftover bits, clip to cap[j]
	   (carrying the excess forward in "balance"), then split the result
	   into fine-energy bits and PVQ bits. */
	for (j=start;j<codedBands;j++)
	{
		short N0 = m->eBands[j+1]-m->eBands[j];
		short N, den;
		short offset;
		short NClogN;
		short excess, bit;
		short tmp = (int)SPEECH_MIN(left, N0);
#ifndef HW_HIFI3
		bits[j] += ((int)percoeff*N0);
#else
		bits[j] += (SSC_MULT16x16(percoeff,N0));
#endif
		bits[j] += tmp;
		left -= tmp;

		N=N0<<2;
		bit = (int)bits[j]+balance;

        excess = SPEECH_MAX(bit-cap[j],0);
        bits[j] = bit-excess;
        /* den is simply N here (no stereo term in this codec). */
		den=N;
#ifndef HW_HIFI3
        NClogN = den*(m->logN[N0>>1] + 16); //NClogN = den*(m->logN[j] + logM);
        /* Offset for the number of fine bits by log2(N)/2 + FINE_OFFSET
        compared to their "fair share" of total/N */

        offset = (NClogN>>1)-den*FINE_OFFSET;

		/* Raise the offset when the band has fewer than 16*den (resp. 24*den) bits. */
		if (bits[j] + offset < den<<4 )
			offset += NClogN>>2;
		else if (bits[j] + offset < den*24)
			offset += NClogN>>3;
#else
        NClogN = SSC_MULT16x16(den,(m->logN[N0>>1] + 16)); //NClogN = den*(m->logN[j] + logM);
        offset = (NClogN>>1)-SSC_MULT16x16(den,FINE_OFFSET);

		if (bits[j] + offset < den<<4 ) 
			offset += NClogN>>2;
		else if (bits[j] + offset < SSC_MULT16x16(den,24))
			offset += NClogN>>3;
#endif

        /* Divide with rounding */



		ebits[j] = SPEECH_MAX(0, (bits[j] + offset + (den<<2)) / (den<<BITRES));
        /* Make sure not to bust */
        if (ebits[j] > (bits[j]>>BITRES))
        ebits[j] = bits[j] >> BITRES;
        /* More than that is useless because that's about as far as PVQ can go */
        ebits[j] = SPEECH_MIN(ebits[j], MAX_FINE_BITS);
#ifndef HW_HIFI3
        fine_priority[j] = ebits[j]*(den<<BITRES) >= bits[j]+offset;
#else
        fine_priority[j] = SSC_MULT16x16(ebits[j],(den<<BITRES)) >= bits[j]+offset;
#endif
        /* What is left in bits[j] after removing the fine-energy bits. */
        bits[j] -= ebits[j]<<BITRES;
		if(excess > 0)
		{
			/* Convert as much of the capped excess as possible into extra
			   fine-energy bits; the rest is carried to the next band. */
			short extra_fine, extra_bits;
			extra_fine = SPEECH_MIN(excess>>(BITRES),MAX_FINE_BITS-ebits[j]);
			ebits[j] += extra_fine;
			extra_bits = extra_fine<<BITRES;
			fine_priority[j] = extra_bits >= excess-balance;
			excess -= extra_bits;
		}
		balance = excess;
	}

   *_balance = balance;
   /* Skipped bands: whatever they were given becomes fine-energy bits only. */
   for (;j<end;j++)
   {
      ebits[j] = bits[j] >> BITRES;
      bits[j] = 0;
      fine_priority[j] = ebits[j]<1;
   }
   return codedBands;

}





/*
 * Compute the bit allocation of one frame.
 *
 * Builds per-band thresholds and the allocation tilt (trim_offset), searches
 * the allocation table for the two neighbouring rows whose total brackets the
 * budget, and hands both rows to cal_bitsTopulses() for interpolation, band
 * skipping and the fine-energy split.
 *
 * m             mode description (eBands, allocVectors, nbEBands,
 *               nbAllocVectors are read).
 * end           number of bands processed; the local arrays hold 16 entries,
 *               so end must not exceed 16.
 * offsets       per-band extra allocation added on top of the table value.
 * cap           per-band upper limit, also used as the upper interpolation
 *               row when the search runs past the last table row.
 * alloc_trim    tilt control; 7 gives no tilt.
 * total         bit budget; negative values are treated as 0.
 * balance, pulses, ebits, fine_priority, ec, prev
 *               passed through unchanged to cal_bitsTopulses().
 *
 * Returns the number of coded bands reported by cal_bitsTopulses().
 *
 * Fixes relative to the previous revision:
 *  - the first-step sum (psum) is now accumulated under
 *    "#ifdef OPT_KHW_20191106", as in the HiFi3 variant; it was guarded by
 *    "#ifndef", which left the optimised build comparing the bare constant
 *    1943 against the budget and made the other build reference an
 *    undeclared variable;
 *  - when the search ends above the last table row, cap[j] is used as the
 *    upper interpolation row instead of reading past the end of the table;
 *  - the optimised build bounds the search by the row count of
 *    band_allocation_opti, matching the HiFi3 variant.
 */
short calculate_bitalloc(const SpeechMode *m, short end, short *offsets, const short *cap, short alloc_trim, short total, short *balance, short *pulses, short *ebits, short *fine_priority, ec_ctx *ec, short prev)
{
	short lo, hi, mid, j;
	short nbVectors;           /* number of rows in the allocation table in use */
	short codedBands;
	short bits1[16];
	short bits2[16];
	short thresh[16];
	short trim_offset[16];
	short skip_start = 0;
	int psum;
#ifndef OPT_KHW_20191106
	short len = m->nbEBands;

	nbVectors = m->nbAllocVectors;
	psum = 0;
#else
	nbVectors = (short)((sizeof(band_allocation_opti)/sizeof(band_allocation_opti[0]))>>4);
	/* Sum of row 5 of band_allocation_opti; trim and offsets are added below. */
	psum = 1943;
#endif

	total = SPEECH_MAX(total, 0);
	/* Reserve the bit that ends band skipping; cal_bitsTopulses gives it back. */
	total -= 8;

	/* Bands 0..9 have width 1. */
	for (j=0;j<10;j++)
	{
		/* Below this threshold, we're sure not to allocate any PVQ bits */
		thresh[j] = 8;
		/* Tilt of the allocation curve */
#ifndef HW_HIFI3
		trim_offset[j] = (alloc_trim-7)*(end-j-1)>>1;
#else
		trim_offset[j] = SSC_MULT16x16((alloc_trim-7),(end-j-1))>>1;
#endif

#ifdef OPT_KHW_20191106
		psum = psum + trim_offset[j]+offsets[j];
#endif
	}
	/* Remaining bands: threshold and tilt scale with the band width. */
	for (;j<end;j++)
	{
#ifndef HW_HIFI3
		thresh[j] = (6*(m->eBands[j+1]-m->eBands[j]));
		trim_offset[j] = (m->eBands[j+1]-m->eBands[j])*(alloc_trim-7)*(end-j-1)>>1;
#else
		thresh[j] = SSC_MULT16x16(6,(m->eBands[j+1]-m->eBands[j]));
		trim_offset[j] = SSC_MULT16x16(SSC_MULT16x16((m->eBands[j+1]-m->eBands[j]),(alloc_trim-7)),(end-j-1))>>1;
#endif

#ifdef OPT_KHW_20191106
		psum = psum + trim_offset[j]+offsets[j];
#endif
	}

	lo = 1;
	hi = nbVectors - 1;

	/* First search step, always at row 5, using a plain (unclamped,
	   un-thresholded) sum. */
#ifndef OPT_KHW_20191106
	mid = 5;
	for (j=end;j--;)
	{
		short bitsj;
		short N = m->eBands[j+1]-m->eBands[j];
#ifndef HW_HIFI3
		bitsj = N*m->allocVectors[mid*len+j];
#else
		bitsj = SSC_MULT16x16(N,(short)(m->allocVectors[SSC_MULT16x16(mid,len)+j]));
#endif
		bitsj = bitsj + trim_offset[j];
		bitsj += offsets[j];

		psum += bitsj;
	}
#endif
	if (psum > total)
		hi = 4;
	else
		lo = 6;

	/* Binary search for the highest row whose total still fits the budget. */
	while (lo <= hi)
	{
		short done = 0;
		short bitsj;
#ifndef OPT_KHW_20191106
		short N;
#endif
		psum = 0;
		mid = (lo+hi) >> 1;
		for (j=end;j--;)
		{
#ifndef OPT_KHW_20191106
			N = m->eBands[j+1]-m->eBands[j];
#ifndef HW_HIFI3
			bitsj = N*m->allocVectors[mid*len+j];
#else
			bitsj = SSC_MULT16x16(N,(short)(m->allocVectors[SSC_MULT16x16(mid,len)+j]));
#endif
#else
			bitsj = band_allocation_opti[(mid<<4)+j];
#endif

			bitsj = SPEECH_MAX(0, bitsj + trim_offset[j]);
			bitsj += offsets[j];

			/* Top-down: once one band reaches its threshold every lower band
			   counts in full; before that a band costs at most 8. */
			if (bitsj >= thresh[j] || done)
			{
				done = 1;
				psum += bitsj;
			} else {
				if (bitsj >= 8)
					psum += 8;
			}
		}
		if (psum > total) hi = mid - 1;
		else lo = mid + 1;
	}

	/* lo = last row that fits, hi = first row that does not. */
	hi = lo--;

	for (j=0;j<end;j++)
	{
		short bits1j, bits2j;

#ifndef OPT_KHW_20191106
		short N = m->eBands[j+1]-m->eBands[j];
#ifndef HW_HIFI3
		bits1j = N*m->allocVectors[lo*len+j];
		/* Past the last row there is nothing to read: interpolate towards cap. */
		bits2j = (hi >= nbVectors) ? cap[j] : N*m->allocVectors[hi*len+j];
#else
		bits1j = SSC_MULT16x16(N,(short)m->allocVectors[SSC_MULT16x16(lo,len)+j]);
		/* Past the last row there is nothing to read: interpolate towards cap. */
		bits2j = (hi >= nbVectors) ? cap[j] : SSC_MULT16x16(N,(short)m->allocVectors[SSC_MULT16x16(hi,len)+j]);
#endif
#else
		bits1j = band_allocation_opti[(lo<<4)+j];
		/* Past the last row there is nothing to read: interpolate towards cap. */
		bits2j = (hi >= nbVectors) ? cap[j] : band_allocation_opti[(hi<<4)+j];
#endif
		if (bits1j>0) bits1j = SPEECH_MAX(0, bits1j + trim_offset[j]);
		if (bits2j>0) bits2j = SPEECH_MAX(0, bits2j + trim_offset[j]);

		if (lo > 0)
			bits1j += offsets[j];
		bits2j += offsets[j];
		/* Band skipping must not go below the highest boosted band. */
		if (offsets[j]>0)
			skip_start = j;

		/* cal_bitsTopulses expects the lower row and the increment to the upper row. */
		bits2j = bits2j-bits1j;
		bits1[j] = bits1j;
		bits2[j] = bits2j;
	}
	codedBands = cal_bitsTopulses(m, end, skip_start, bits1, bits2, thresh, cap,total, balance, pulses, ebits, fine_priority, ec, prev );

	return codedBands;
}

#else
//FILE *fp_check;
/* Per-band widths used by the HiFi3 variant below in place of
 * m->eBands[j+1]-m->eBands[j] (16 bands). */
const short eBandsdiff[] = {1,1,1,1,1,1,1,1,1,1,2,2,2,4,4,6};




/*
 * HiFi3 variant of cal_bitsTopulses: same stages as the scalar version, with
 * the bisection loop vectorised four bands at a time.
 *
 * Steps, in order:
 *   1. Six-step bisection (ALLOC_STEPS) over an interpolation factor in
 *      [0, 1<<ALLOC_STEPS] so that the summed allocation
 *      bits1[j] + (factor*bits2[j] >> ALLOC_STEPS) does not exceed total.
 *   2. Commit the interpolated allocation at the found factor into bits[].
 *   3. Walk down from the top band deciding, one band at a time, whether
 *      coding stops there; each decision is one range-coder symbol.
 *   4. Spread the bits that are still left over the coded bands, apply cap[],
 *      and split each band's bits into fine-energy bits and PVQ bits.
 *
 * m             mode description; eBands[] and logN[] are read.
 * end           number of bands processed (bands 0..end-1).
 * skip_start    band index at or below which the skip loop stops.
 * bits1, bits2  base allocation and interpolation increment per band.
 * thresh        per-band minimum for a band to receive PVQ bits.
 * cap           per-band upper limit; excess is carried to the next band.
 * total         bit budget.
 * _balance      out: excess left after the last coded band.
 * bits          out: per-band bits left after removing the fine-energy bits.
 * ebits         out: per-band fine-energy bit count.
 * fine_priority out: per-band 0/1 flag.
 * ec            range coder context for the skip flag.
 * prev          -10000 makes the skip flag be decoded (rc_dec_bit_logp);
 *               any other value makes it be encoded, and prev is compared
 *               against the band index to pick the skip threshold factor.
 *
 * Returns codedBands, the number of bands that remain coded.
 *
 * NOTE(review): align1/align2 are passed to AE_LA16X4_RIP without any
 * AE_LA64_PP priming in this function, whereas calculate_bitalloc() in this
 * file primes its alignment registers before every such loop - confirm
 * whether priming is required here.
 * NOTE(review): the unrolled loop consumes four bands per iteration and tests
 * only j>0, so it appears to assume that end is a multiple of 4 - confirm.
 * pt3, pt4, toffv, offv, align3 and align4 are declared but not used.
 */
static short cal_bitsTopulses(const SpeechMode *m, short end, short skip_start,
	const short *bits1, const short *bits2, const short *thresh, const short *cap, short total, short *_balance,
	short *bits, short *ebits, short *fine_priority, ec_ctx *ec, short prev)
{
	int lo, hi;
	int psum;
	short i, j;
	short codedBands=-1;
	short alloc_floor;
	short left, percoeff;
	short done;
	short balance;
	short start = 0;
	ae_int16x4 *pt1, *pt2, *pt3, *pt4, bits1v, bits2v, toffv, offv, tmps, midv;
	ae_int32x2  tmpL1, tmpL2;
	ae_valign align1, align2, align3, align4 ;

	alloc_floor = 8; //1<<BITRES;
	lo = 0;
	hi = 1<<ALLOC_STEPS;

	/* Bisection on the interpolation factor between bits1 and bits1+bits2. */
	for (i=0;i<ALLOC_STEPS;i++)
	{
		short mid = (lo+hi)>>1;
		short tmp;
		psum = 0;
		done = 0;


		/* Start at the top band and load downwards, four bands per load. */
		pt1 = (ae_int16x4*)(&bits1[end -1]);
		pt2 = (ae_int16x4*)(&bits2[end -1]);
		midv = AE_MOV16(mid);
		for (j=end -1;j>0;)
		{

			AE_LA16X4_RIP(bits1v, align1, pt1);
		    AE_LA16X4_RIP(bits2v, align2, pt2);
		    /* mid*bits2 as 32-bit products; shifting left by 10 and keeping the
		       upper 16 bits amounts to a right shift by 6 (ALLOC_STEPS). */
		    AE_MUL16X4(tmpL1, tmpL2, midv, bits2v);
			tmpL1= AE_SLAI32S(tmpL1, 10);
			tmpL2= AE_SLAI32S(tmpL2, 10);
			tmps = AE_TRUNC16X4F32(tmpL1, tmpL2);

			tmps = AE_ADD16(tmps, bits1v);
			/* The four lanes are consumed from lane 3 down to lane 0 while j
			   decreases; the same threshold rule as the scalar code is applied. */
			tmp = AE_MOVAD16_3(tmps);
			if (tmp >= thresh[j] || done)
			{
				done = 1;
				psum += tmp;
			} else {
				if(tmp >= alloc_floor) psum += alloc_floor;
			}
			j-=1;

			tmp = AE_MOVAD16_2(tmps);
			if (tmp >= thresh[j] || done)
			{
				done = 1;
				psum += tmp;
			} else {
				if(tmp >= alloc_floor) psum += alloc_floor;
			}
			j-=1;

			tmp = AE_MOVAD16_1(tmps);
			if (tmp >= thresh[j] || done)
			{
				done = 1;
				psum += tmp;
			} else {
				if(tmp >= alloc_floor) psum += alloc_floor;
			}
			j-=1;

			tmp = AE_MOVAD16_0(tmps);
			if (tmp >= thresh[j] || done)
			{
				done = 1;
				psum += tmp;
			} else {
				if(tmp >= alloc_floor) psum += alloc_floor;
			}
			j-=1;
		}
		if (psum > total) hi = mid;
		else lo = mid;
	}
/*
    fp_check = fopen("E:\\2019_BT_codec\\190305_speech\\test_vector\\x_out.txt","a+");
    fprintf(fp_check, "%d\n", lo);
    fclose(fp_check);
*/


	psum = 0;
	done = 0;

	/* Commit the allocation for the factor "lo" found above (scalar code). */
	for (j=end;j--;)
	{
#ifndef HW_HIFI3
		short tmp = bits1[j] + (lo*bits2[j]>>ALLOC_STEPS);
#else
		short tmp = bits1[j] + (SSC_MULT16x16(lo,bits2[j])>>ALLOC_STEPS);
#endif
		if (tmp < thresh[j] && !done)
		{
			if (tmp >= alloc_floor) tmp = alloc_floor;
			else tmp = 0;
		} else
			done = 1;

		bits[j] = tmp;
		psum += tmp;
	}

//////////////////////////////////////////////////////////////////////////////////////////////////////////

	/* Band skipping: starting at the top band, decide whether coding stops
	   here (break) or the band is skipped and the loop moves one band down. */
	for (codedBands=end;;codedBands--)
	{
		short band_width, band_bits, rem;
		j = codedBands-1;

		if(j<=skip_start)
		{
			/* Give the bit we reserved to end skipping back. */
			total += 8;
			break;
		}
		/* Estimate what band j would get if the unused bits were spread
		   evenly over the coefficients of bands 0..codedBands-1. */
		left = total-psum;
		percoeff = left/(m->eBands[codedBands]); // #if 0   -m->eBands[start]); #endif
#ifndef HW_HIFI3
		left -= m->eBands[codedBands]*percoeff;
#else
		left -= SSC_MULT16x16(m->eBands[codedBands],percoeff);
#endif
		rem = SPEECH_MAX(left-(m->eBands[j]),0);
		band_width = m->eBands[codedBands]-m->eBands[j];
#ifndef HW_HIFI3
		band_bits = (int)(bits[j] + percoeff*band_width + rem);
#else
		band_bits = (int)(bits[j] + SSC_MULT16x16(percoeff,band_width) + rem);
#endif

		/* A skip flag is only coded when the band could hold at least
		   max(thresh[j], alloc_floor+8) bits. */
		if (band_bits >= SPEECH_MAX(thresh[j], alloc_floor+8))
		{

			if(prev == -10000)    // Decoder path. On the Kalimba, prev can be shared between encoder and decoder, so a separate variable should be used instead of prev.
			{
				if(rc_dec_bit_logp(ec, 1)) break;
			}
			else
			{
#ifndef HW_HIFI3
				if (codedBands<=2 || (band_bits > ((j<prev?7:9)*band_width<<1) && j<=(end-1)))    //    j<=(end-1) appears to always be true.
#else
				if (codedBands<=2 || (band_bits > (SSC_MULT16x16((j<prev?7:9),band_width<<1)) && j<=(end-1)))    //    j<=(end-1) appears to always be true.
#endif
				{
					rc_enc_bit_logp_1(ec, 1, 1);
					break;
				}
				rc_enc_bit_logp_1(ec, 0, 1);
			}



			/*We used a bit to skip this band.*/
			psum += 8;
			band_bits -= 8;
		}
		/*Reclaim the bits originally allocated to this band.*/
		psum -= bits[j];
		if (band_bits >= alloc_floor)
		{
			/*If we have enough for a fine energy bit per channel, use it.*/
			psum += alloc_floor;
			bits[j] = alloc_floor;
		} else {
			/*Otherwise this band gets nothing at all.*/
			bits[j] = 0;
		}
	}

	/* Allocate the remaining bits */
	left = total-psum;

	/* NOTE(review): divides by m->eBands[codedBands]; this is only safe if
	   that entry is non-zero for every codedBands reachable here - confirm. */
	percoeff = left/(m->eBands[codedBands]);
#ifndef HW_HIFI3
	left -= (m->eBands[codedBands])*percoeff;
#else
	left -= SSC_MULT16x16((m->eBands[codedBands]),percoeff);
#endif
	balance = 0;


	/* Per coded band: add its share of the leftover bits, clip to cap[j]
	   (carrying the excess forward in "balance"), then split the result
	   into fine-energy bits and PVQ bits. */
	for (j=start;j<codedBands;j++)
	{
		/* Band width taken from the eBandsdiff table instead of m->eBands. */
		short N0 = eBandsdiff[j];
		short N, den;
		short offset;
		short NClogN;
		short excess, bit;
		short tmp = (int)SPEECH_MIN(left, N0);
#ifndef HW_HIFI3
		bits[j] += ((int)percoeff*N0);
#else
		bits[j] += (SSC_MULT16x16(percoeff,N0));
#endif
		bits[j] += tmp;
		left -= tmp;

		N=N0<<2;
		bit = (int)bits[j]+balance;

        excess = SPEECH_MAX(bit-cap[j],0);
        bits[j] = bit-excess;
        /* den is simply N here (no stereo term in this codec). */
		den=N;
#ifndef HW_HIFI3
        NClogN = den*(m->logN[N0>>1] + 16); //NClogN = den*(m->logN[j] + logM);
        /* Offset for the number of fine bits by log2(N)/2 + FINE_OFFSET
        compared to their "fair share" of total/N */

        offset = (NClogN>>1)-den*FINE_OFFSET;

		if (bits[j] + offset < den<<4 )
			offset += NClogN>>2;
		else if (bits[j] + offset < den*24)
			offset += NClogN>>3;
#else
        NClogN = SSC_MULT16x16(den,(m->logN[N0>>1] + 16)); //NClogN = den*(m->logN[j] + logM);
        offset = (NClogN>>1)-SSC_MULT16x16(den,FINE_OFFSET);

		/* Raise the offset when the band has fewer than 16*den (resp. 24*den) bits. */
		if (bits[j] + offset < den<<4 )
			offset += NClogN>>2;
		else if (bits[j] + offset < SSC_MULT16x16(den,24))
			offset += NClogN>>3;
#endif

        /* Divide with rounding */



		ebits[j] = SPEECH_MAX(0, (bits[j] + offset + (den<<2)) / (den<<BITRES));
        /* Make sure not to bust */
        if (ebits[j] > (bits[j]>>BITRES))
        ebits[j] = bits[j] >> BITRES;
        /* More than that is useless because that's about as far as PVQ can go */
        ebits[j] = SPEECH_MIN(ebits[j], MAX_FINE_BITS);
#ifndef HW_HIFI3
        fine_priority[j] = ebits[j]*(den<<BITRES) >= bits[j]+offset;
#else
        fine_priority[j] = SSC_MULT16x16(ebits[j],(den<<BITRES)) >= bits[j]+offset;
#endif
        /* What is left in bits[j] after removing the fine-energy bits. */
        bits[j] -= ebits[j]<<BITRES;
		if(excess > 0)
		{
			/* Convert as much of the capped excess as possible into extra
			   fine-energy bits; the rest is carried to the next band. */
			short extra_fine, extra_bits;
			extra_fine = SPEECH_MIN(excess>>(BITRES),MAX_FINE_BITS-ebits[j]);
			ebits[j] += extra_fine;
			extra_bits = extra_fine<<BITRES;
			fine_priority[j] = extra_bits >= excess-balance;
			excess -= extra_bits;
		}
		balance = excess;
	}

   *_balance = balance;
   /* Skipped bands: whatever they were given becomes fine-energy bits only. */
   for (;j<end;j++)
   {
      ebits[j] = bits[j] >> BITRES;
      bits[j] = 0;
      fine_priority[j] = ebits[j]<1;
   }
   return codedBands;

}





/*
 * HiFi3 variant of calculate_bitalloc: compute the bit allocation of one
 * frame.
 *
 * Builds per-band thresholds and the allocation tilt (trim_offset), searches
 * the allocation table for the two neighbouring rows whose total brackets the
 * budget (four bands per vector load), and hands both rows to
 * cal_bitsTopulses().
 *
 * m             mode description (allocVectors, nbEBands, nbAllocVectors).
 * end           number of bands processed; at most 16 (size of the local
 *               arrays).
 *               NOTE(review): the unrolled loops consume four bands per
 *               iteration and test only j>0, so they appear to assume that
 *               end is a multiple of 4 - confirm.
 * offsets       per-band extra allocation added on top of the table value.
 * cap           per-band upper limit, also used as the upper interpolation
 *               row when the search runs past the last table row.
 * alloc_trim    tilt control; 7 gives no tilt.
 * total         bit budget; negative values are treated as 0.
 * balance, pulses, ebits, fine_priority, ec, prev
 *               passed through unchanged to cal_bitsTopulses().
 *
 * Returns the number of coded bands reported by cal_bitsTopulses().
 *
 * Fixes relative to the previous revision:
 *  - without OPT_KHW_20191106 the row offsets index/index1 used by the final
 *    loop are now computed from lo and hi after the search; index1 was never
 *    assigned and index held the last probed row instead of lo;
 *  - when the search ends above the last table row, cap[j] is used as the
 *    upper interpolation row instead of reading past the end of the table;
 *  - unused locals (tmp_table, N in the search loop) removed.
 */
short calculate_bitalloc(const SpeechMode *m, short end, short *offsets, const short *cap, short alloc_trim, short total, short *balance, short *pulses, short *ebits, short *fine_priority, ec_ctx *ec, short prev)
{
#ifndef OPT_KHW_20191106
	short lo, hi, len, j;
#else
	short lo, hi, j;
#endif
	short nbVectors;           /* number of rows in the allocation table in use */
	short codedBands;
	short bits1[16];
	short bits2[16];
	short thresh[16];
	short trim_offset[16];
	short start = 0;

	short skip_start = 0;

#ifndef OPT_KHW_20191106
	int index, index1;
#endif

#ifdef OPT_KHW_20191106
	/* Sum of row 5 of band_allocation_opti; trim and offsets are added below. */
	int psum = 1943;
#endif

	ae_int16x4 *pt1, *pt3, *pt4, Nv, data, toffv, offv, tmps, zerov;

	ae_int32x2  tmpL1, tmpL2;
	ae_valign align1, align3, align4 ;
#ifndef OPT_KHW_20191106
	ae_int16x4 *pt2;
	ae_valign align2;
#endif
	xtbool4 bool4;

	total = SPEECH_MAX(total, 0);
#ifndef OPT_KHW_20191106
	len = m->nbEBands;
	nbVectors = m->nbAllocVectors;
#else
	nbVectors = (short)((sizeof(band_allocation_opti)/sizeof(band_allocation_opti[0]))>>4);
#endif
	/* Reserve the bit that ends band skipping; cal_bitsTopulses gives it back. */
	total -= 8;
	/* Bands 0..9 have width 1. */
	for (j=start;j<10;j++)
	{
		/* Below this threshold, we're sure not to allocate any PVQ bits */
		thresh[j] = 8;
		/* Tilt of the allocation curve */

		trim_offset[j] = SSC_MULT16x16((alloc_trim-7),(end-j-1))>>1;

#ifdef OPT_KHW_20191106
		psum = psum + trim_offset[j]+offsets[j];
#endif

	}
	/* Remaining bands: threshold and tilt scale with the band width. */
	for (;j<end;j++)
	{

		thresh[j] = SSC_MULT16x16(6,eBandsdiff[j]);
		trim_offset[j] = SSC_MULT16x16(SSC_MULT16x16(eBandsdiff[j],(alloc_trim-7)),(end-j-1))>>1;
#ifdef OPT_KHW_20191106
		psum = psum + trim_offset[j]+offsets[j];
#endif
	}

	lo = 1;
	hi = nbVectors - 1;


#ifndef OPT_KHW_20191106
	{
		/* First search step, always at row 5, using a plain sum. */
		int psum = 0;
		short mid = 5;
		pt1 = (ae_int16x4*)(&m->allocVectors[SSC_MULT16x16(mid,len) +end -1]);
	    pt2 = (ae_int16x4*)(&eBandsdiff[end -1]);
		pt3 = (ae_int16x4*)(&trim_offset[end -1]);
		pt4 = (ae_int16x4*)(&offsets[end -1]);
		align1 = AE_LA64_PP(pt1);
		align2 = AE_LA64_PP(pt2);
		align3 = AE_LA64_PP(pt3);
		align4 = AE_LA64_PP(pt4);

		for (j=end -1;j>0;j-=4)
		{
			AE_LA16X4_RIP(data, align1, pt1);
			AE_LA16X4_RIP(Nv, align2, pt2);
			AE_LA16X4_RIP(toffv,align3, pt3);
			AE_LA16X4_RIP(offv,align4, pt4);
			AE_MUL16X4(tmpL1, tmpL2, Nv, data);

			/* Saturating shift by 16, then keep the upper halves: the
			   32-bit products narrowed to 16 bits. */
			tmpL1= AE_SLAI32S(tmpL1, 16);
			tmpL2= AE_SLAI32S(tmpL2, 16);

			tmps = AE_TRUNC16X4F32(tmpL1, tmpL2);
			tmps = AE_ADD16(tmps, toffv);
			tmps = AE_ADD16(tmps, offv);
			psum += AE_INT16X4_RADD(tmps);
		}

		if (psum > total) hi = mid - 1;
		else lo = mid + 1;
#else
		if (psum > total)
			hi = 4;
		else
			lo = 6;
#endif



		/* Binary search for the highest row whose total still fits the budget. */
		while (lo <= hi)
		{
			short done = 0;
			short bitsj;
#ifdef OPT_KHW_20191106
			short mid;
#endif

			psum = 0;
			mid = (lo+hi) >> 1;
#ifndef OPT_KHW_20191106
			pt1 = (ae_int16x4*)(&m->allocVectors[SSC_MULT16x16(mid,len) +end -1]);
			pt2 = (ae_int16x4*)(&eBandsdiff[end -1]);
#else
			pt1 = (ae_int16x4*)(&band_allocation_opti[(mid<<4)+end-1]);
#endif
			pt3 = (ae_int16x4*)(&trim_offset[end -1]);
			pt4 = (ae_int16x4*)(&offsets[end -1]);
			align1 = AE_LA64_PP(pt1);
#ifndef OPT_KHW_20191106
			align2 = AE_LA64_PP(pt2);
#endif
			align3 = AE_LA64_PP(pt3);
			align4 = AE_LA64_PP(pt4);
			zerov = AE_MOV16(0);
			for (j=end -1;j>0;)
			{
				AE_LA16X4_RIP(data, align1, pt1);
#ifndef OPT_KHW_20191106
				AE_LA16X4_RIP(Nv, align2, pt2);
#endif
				AE_LA16X4_RIP(toffv,align3, pt3);
				AE_LA16X4_RIP(offv,align4, pt4);

/*
   Intrinsic prototypes kept from the original notes:
     AE_MUL16X4(out ae_int32x2 d0, out ae_int32x2 d1, in ae_int16x4 d2, in ae_int16x4 d3)
     AE_SLAI32S(out ae_f32x2 d, in ae_f32x2 d0, in immediate sa)
         2-way saturating shift left (arithmetic) by immediate
     AE_TRUNC16X4F32(out ae_f16x4 d, in ae_f32x2 dl, in ae_f32x2 dh)
         implemented as AE_SEL16I d, dl, dh, 7
*/
#ifndef OPT_KHW_20191106
				AE_MUL16X4(tmpL1, tmpL2, Nv, data);
				tmpL1= AE_SLAI32S(tmpL1, 16);
				tmpL2= AE_SLAI32S(tmpL2, 16);

				tmps = AE_TRUNC16X4F32(tmpL1, tmpL2);

				/* max(0, table + trim) per lane, then add the offsets. */
				tmps = AE_ADD16(tmps, toffv);
				bool4 = AE_LT16(tmps, zerov);
				AE_MOVT16X4(tmps, zerov, bool4);
				tmps = AE_ADD16(tmps, offv);

				bitsj = AE_MOVAD16_3(tmps);
#else
				/* max(0, table + trim) per lane, then add the offsets. */
				tmps = AE_ADD16(data, toffv);
				bool4 = AE_LT16(tmps, zerov);
				AE_MOVT16X4(tmps, zerov, bool4);
				tmps = AE_ADD16(tmps, offv);

				bitsj = AE_MOVAD16_3(tmps);
#endif

				/* Lanes 3..0 are consumed while j decreases. Once one band
				   reaches its threshold every lower band counts in full;
				   before that a band costs at most 8. */
				if (bitsj >= thresh[j] || done)
				{
					done = 1;
					psum += bitsj;
				} else {
					if (bitsj >= 8)
						psum += 8;
				}
				j-=1;
				bitsj = AE_MOVAD16_2(tmps);
				if (bitsj >= thresh[j] || done)
				{
					done = 1;
					psum += bitsj;
				} else {
					if (bitsj >= 8)
						psum += 8;
				}
				j-=1;
				bitsj = AE_MOVAD16_1(tmps);
				if (bitsj >= thresh[j] || done)
				{
					done = 1;
					psum += bitsj;
				} else {
					if (bitsj >= 8)
						psum += 8;
				}

				j-=1;
				bitsj = AE_MOVAD16_0(tmps);
				if (bitsj >= thresh[j] || done)
				{
					done = 1;
					psum += bitsj;
				} else {
					if (bitsj >= 8)
						psum += 8;
				}
				j-=1;

			}
			if (psum > total) hi = mid - 1;
			else lo = mid + 1;
		}

#ifndef OPT_KHW_20191106
	}
#endif
	/* lo = last row that fits, hi = first row that does not. */
	hi = lo--;

#ifndef OPT_KHW_20191106
	/* Row offsets of the two interpolation rows in m->allocVectors. */
	index  = SSC_MULT16x16(lo,len);
	index1 = SSC_MULT16x16(hi,len);
#endif

	for (j=start;j<end;j++)
	{
		short bits1j, bits2j;

#ifndef OPT_KHW_20191106
		short N = eBandsdiff[j];

		bits1j = SSC_MULT16x16(N,(short)m->allocVectors[index+j]);
		/* Past the last row there is nothing to read: interpolate towards cap. */
		bits2j = (hi >= nbVectors) ? cap[j] : SSC_MULT16x16(N,(short)m->allocVectors[index1+j]);
#else
		bits1j = band_allocation_opti[(lo<<4)+j];
		/* Past the last row there is nothing to read: interpolate towards cap. */
		bits2j = (hi >= nbVectors) ? cap[j] : band_allocation_opti[(hi<<4)+j];
#endif
		if (bits1j>0) bits1j = SPEECH_MAX(0, bits1j + trim_offset[j]);
		if (bits2j>0) bits2j = SPEECH_MAX(0, bits2j + trim_offset[j]);

		if (lo > 0)
			bits1j += offsets[j];
		bits2j += offsets[j];
		/* Band skipping must not go below the highest boosted band. */
		if (offsets[j]>0)
			skip_start = j;


		/* cal_bitsTopulses expects the lower row and the increment to the upper row. */
		bits2j = bits2j-bits1j;
		bits1[j] = bits1j;
		bits2[j] = bits2j;
	}
	codedBands = cal_bitsTopulses(m, end, skip_start, bits1, bits2, thresh, cap,total, balance, pulses, ebits, fine_priority, ec, prev );

	return codedBands;
}
#endif

void compute_offset(short *oldBandE, short *offsets)
{
	short i;
	short offset_en;

	for(i=0; i<16; i++)
	{
		offsets[i] = 0;
		offset_en = oldBandE[i]-en_start[i];

		while(offset_en>0)
		{
			offsets[i] = offsets[i] + offset_unit[i];
			offset_en = offset_en-500;
		}
	}

}
