/*****************************************************
                CRN_CircullantD.c
******************************************************/

#include <stdbool.h>
#include <stdlib.h>

#include "CRN_CircullantD_impl.h"

// PCMdump debug
//#define DBGDUMP
#ifdef DBGDUMP
// Debug-only declarations, compiled in only when DBGDUMP is defined (see the
// commented-out #define above). For every dumped stage there is a file name
// and a FILE handle. Nothing in this section opens or writes the files.
// NOTE(review): the FILE* handles below have external linkage (not static).
#include <stdio.h>
// Encoder stages 1..5.
static char filedbgdump_enc1[] = "dbg_enc1.dat";
static char filedbgdump_enc2[] = "dbg_enc2.dat";
static char filedbgdump_enc3[] = "dbg_enc3.dat";
static char filedbgdump_enc4[] = "dbg_enc4.dat";
static char filedbgdump_enc5[] = "dbg_enc5.dat";
FILE *fdbgdump_enc1;
FILE *fdbgdump_enc2;
FILE *fdbgdump_enc3;
FILE *fdbgdump_enc4;
FILE *fdbgdump_enc5;

// GRU layers 1..2.
static char filedbgdump_gru1[] = "dbg_gru1.dat";
static char filedbgdump_gru2[] = "dbg_gru2.dat";
FILE *fdbgdump_gru1;
FILE *fdbgdump_gru2;

// Decoder stages 1..5.
static char filedbgdump_dec1[] = "dbg_dec1.dat";
static char filedbgdump_dec2[] = "dbg_dec2.dat";
static char filedbgdump_dec3[] = "dbg_dec3.dat";
static char filedbgdump_dec4[] = "dbg_dec4.dat";
static char filedbgdump_dec5[] = "dbg_dec5.dat";
FILE *fdbgdump_dec1;
FILE *fdbgdump_dec2;
FILE *fdbgdump_dec3;
FILE *fdbgdump_dec4;
FILE *fdbgdump_dec5;
#endif

//#define QTWD 31
//static const ComplexInt twd[256]={
//#include"twd256.dat"
//};

// Wide signed integer used for intermediate products that may not fit in int.
typedef long long INT64;

// Classic min/max macros. Each argument can be evaluated twice, so do not
// pass expressions with side effects (e.g. i++).
#define MAX(a,b) (((a)>(b)) ? (a):(b))
#define MIN(a,b) (((a)<(b)) ? (a):(b))

// Fixed-point scaling exponents used as shift amounts in this file.
// NOTE(review): presumably the number of fractional bits of the network
// input, internal activations and output respectively - confirm.
#define Q_INPUT 8
#define Q_DATA 10
#define Q_OUTPUT 15

// Scaling exponents of the stored weights: products of data and weights are
// shifted right by these amounts to return to the data format.
#define Q_WEIGHT 13
#define Q_GRU_WEIGHT 12 // 13 //12

/******** BN *********/
// Per-element scale factors; the initializer values come from the included
// .dat file. NOTE(review): presumably consumed by CRN_CircullantD_BN - confirm.
static const short BN1_scale_fx[CRN_INPUT_LEN*2] = {              // format Q_WEIGHT
#include "CRN_CircullantD_BNcoeffs_scale_fx.dat"
};
// Per-element offsets matching BN1_scale_fx (same element count).
static const short BN1_offset_fx[CRN_INPUT_LEN*2] = {             // format Q_DATA
#include "CRN_CircullantD_BNcoeffs_offset_fx.dat"
};

/******** CONV *********/
// Encoder convolution parameters. Every weight table is laid out as
// [output channel][input channel][kernel tap]; every bias table holds one
// value per output channel. The values are pulled in from the .dat files.

// Encoder layer 1: CRN_INPUT_CH -> CRN_CONV1_OUT_CH channels.
static const short enc1_W[CRN_CONV1_OUT_CH][CRN_INPUT_CH][CRN_CONV1_KERNEL_LEN]={     // format Q_WEIGHT
#include "CRN_CircullantD_enc1_W.dat"
};
static const short enc1_b[CRN_CONV1_OUT_CH]={                                         // format Q_DATA
#include "CRN_CircullantD_enc1_B.dat"
};

// Encoder layer 2: CRN_CONV1_OUT_CH -> CRN_CONV2_OUT_CH channels.
static const short enc2_W[CRN_CONV2_OUT_CH][CRN_CONV1_OUT_CH][CRN_CONV2_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_enc2_W.dat"
};
static const short enc2_b[CRN_CONV2_OUT_CH]={                                         // format Q_DATA
#include "CRN_CircullantD_enc2_B.dat"
};

// Encoder layer 3: CRN_CONV2_OUT_CH -> CRN_CONV3_OUT_CH channels.
static const short enc3_W[CRN_CONV3_OUT_CH][CRN_CONV2_OUT_CH][CRN_CONV3_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_enc3_W.dat"
};
static const short enc3_b[CRN_CONV3_OUT_CH]={                                         // format Q_DATA
#include "CRN_CircullantD_enc3_B.dat"
};

// Encoder layer 4: CRN_CONV3_OUT_CH -> CRN_CONV4_OUT_CH channels.
static const short enc4_W[CRN_CONV4_OUT_CH][CRN_CONV3_OUT_CH][CRN_CONV4_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_enc4_W.dat"
};
static const short enc4_b[CRN_CONV4_OUT_CH]={                                         // format Q_DATA
#include "CRN_CircullantD_enc4_B.dat"
};

// Encoder layer 5: CRN_CONV4_OUT_CH -> CRN_CONV5_OUT_CH channels.
static const short enc5_W[CRN_CONV5_OUT_CH][CRN_CONV4_OUT_CH][CRN_CONV5_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_enc5_W.dat"
};
static const short enc5_b[CRN_CONV5_OUT_CH]={                                         // format Q_DATA
#include "CRN_CircullantD_enc5_B.dat"
};

/******** GRU *********/
//static signed char gru1_Dinp[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
//#include "CRN_CircullantD_gru1_Dinp.dat"
//};
//static signed char gru1_Drec[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
//#include "CRN_CircullantD_gru1_Drec.dat"
//};
//static ComplexShort gru1_Winp[CRN_GRU_STATE_LEN+1]={                            // format Q_WEIGHT
//#include "CRN_CircullantD_gru1_Winp.dat"
//};
//static ComplexShort gru1_Wrec[CRN_GRU_STATE_LEN+1]={                            // format Q_WEIGHT
//#include "CRN_CircullantD_gru1_Wrec.dat"
//};
// GRU layer 1 parameters, initialized from the included .dat files.
// NOTE(review): the *_idx tables look like permutation indices and the
// B/G/S tables like three weight vectors of a structured matrix - confirm
// against the GRU code that consumes them.

// Index table for the input ("krn") path. Not const, unlike the weights.
static short gru1_P_krn_idx[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
#include "CRN_CircullantD_gru1_P_krn_idx.dat"
};
// Input ("krn") path weights, 2*CRN_GRU_STATE_LEN entries each.
static const short gru1_W_krn_B[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru1_W_krn_B.dat"
};
static const short gru1_W_krn_G[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru1_W_krn_G.dat"
};
static const short gru1_W_krn_S[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru1_W_krn_S.dat"
};
// Index table for the recurrent ("rec") path. Not const, unlike the weights.
static short gru1_P_rec_idx[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
#include "CRN_CircullantD_gru1_P_rec_idx.dat"
};
// Recurrent ("rec") path weights, 2*CRN_GRU_STATE_LEN entries each.
static const short gru1_W_rec_B[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru1_W_rec_B.dat"
};
static const short gru1_W_rec_G[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru1_W_rec_G.dat"
};
static const short gru1_W_rec_S[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru1_W_rec_S.dat"
};
// Biases of the input path (CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN entries)
// and of the recurrent path (CRN_GRU_STATE_LEN entries).
static const short gru1_Bkrn[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={                   // format Q_DATA
#include "CRN_CircullantD_gru1_b_krn.dat"
};
static const short gru1_Brec[CRN_GRU_STATE_LEN]={                                     // format Q_DATA
#include "CRN_CircullantD_gru1_b_rec.dat"
};

//static signed char gru2_Dinp[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
//#include "CRN_CircullantD_gru2_Dinp.dat"
//};
//static signed char gru2_Drec[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
//#include "CRN_CircullantD_gru2_Drec.dat"
//};
//static ComplexShort gru2_Winp[CRN_GRU_STATE_LEN+1]={                            // format Q_WEIGHT
//#include "CRN_CircullantD_gru2_Winp.dat"
//};
//static ComplexShort gru2_Wrec[CRN_GRU_STATE_LEN+1]={                            // format Q_WEIGHT
//#include "CRN_CircullantD_gru2_Wrec.dat"
//};
// GRU layer 2 parameters, initialized from the included .dat files. The set
// of tables and their sizes mirrors the gru1_* tables.
// NOTE(review): the *_idx tables look like permutation indices and the
// B/G/S tables like three weight vectors of a structured matrix - confirm
// against the GRU code that consumes them.

// Index table for the input ("krn") path. Not const, unlike the weights.
static short gru2_P_krn_idx[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
#include "CRN_CircullantD_gru2_P_krn_idx.dat"
};
// Input ("krn") path weights, 2*CRN_GRU_STATE_LEN entries each.
static const short gru2_W_krn_B[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru2_W_krn_B.dat"
};
static const short gru2_W_krn_G[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru2_W_krn_G.dat"
};
static const short gru2_W_krn_S[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru2_W_krn_S.dat"
};
// Index table for the recurrent ("rec") path. Not const, unlike the weights.
static short gru2_P_rec_idx[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={
#include "CRN_CircullantD_gru2_P_rec_idx.dat"
};
// Recurrent ("rec") path weights, 2*CRN_GRU_STATE_LEN entries each.
static const short gru2_W_rec_B[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru2_W_rec_B.dat"
};
static const short gru2_W_rec_G[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru2_W_rec_G.dat"
};
static const short gru2_W_rec_S[CRN_GRU_STATE_LEN+CRN_GRU_STATE_LEN]={                            // format Q_WEIGHT
#include "CRN_CircullantD_gru2_W_rec_S.dat"
};
// Biases of the input path (CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN entries)
// and of the recurrent path (CRN_GRU_STATE_LEN entries).
static const short gru2_Bkrn[CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN]={                   // format Q_DATA
#include "CRN_CircullantD_gru2_b_krn.dat"
};
static const short gru2_Brec[CRN_GRU_STATE_LEN]={                                     // format Q_DATA
#include "CRN_CircullantD_gru2_b_rec.dat"
};

/******** DECONV *********/
// Transposed-convolution parameters for decoder stages 5, 4 and 3.
// Naming: the table names count down (dec5 first) while the included files
// count up (dec1 first), so dec5_* is loaded from the "dec1" files, dec4_*
// from "dec2" and dec3_* from "dec3".
// The weight tables only have half as many output channels (first dimension)
// as the bias tables have entries.
// NOTE(review): presumably the other half of the output channels is the
// negated copy produced by the PosNeg ReLU - confirm.
// These tables are not const, unlike the encoder tables.
static short dec5_W[CRN_CONV4_OUT_CH/2][CRN_DECONV5_OUT_CH][CRN_DECONV5_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_dec1_W.dat"
};
static short dec5_b[CRN_CONV4_OUT_CH]={                                             // format Q_DATA
#include "CRN_CircullantD_dec1_B.dat"
};

static short dec4_W[CRN_CONV3_OUT_CH/2][CRN_DECONV4_OUT_CH][CRN_DECONV4_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_dec2_W.dat"
};
static short dec4_b[CRN_CONV3_OUT_CH]={                                             // format Q_DATA
#include "CRN_CircullantD_dec2_B.dat"
};

static short dec3_W[CRN_CONV2_OUT_CH/2][CRN_DECONV3_OUT_CH][CRN_DECONV3_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_dec3_W.dat"
};
static short dec3_b[CRN_CONV2_OUT_CH]={                                             // format Q_DATA
#include "CRN_CircullantD_dec3_B.dat"
};

//static const short dec5_W_depth[CRN_DECONV5_OUT_CH][CRN_DECONV5_KERNEL_LEN]={             // format Q_WEIGHT
//#include "CRN_CircullantD_dec1_W_depth.dat"
//};
//static const short dec5_W_point[CRN_CONV4_OUT_CH][CRN_DECONV5_OUT_CH]={                   // format Q_WEIGHT
//#include "CRN_CircullantD_dec1_W_point.dat"
//};
//static const short dec5_b[CRN_CONV4_OUT_CH]={                                             // format Q_DATA
//#include "CRN_CircullantD_dec1_B.dat"
//};
//
//static const short dec4_W_depth[CRN_DECONV4_OUT_CH][CRN_DECONV4_KERNEL_LEN]={             // format Q_WEIGHT
//#include "CRN_CircullantD_dec2_W_depth.dat"
//};
//static const short dec4_W_point[CRN_CONV3_OUT_CH][CRN_DECONV4_OUT_CH]={                   // format Q_WEIGHT
//#include "CRN_CircullantD_dec2_W_point.dat"
//};
//static const short dec4_b[CRN_CONV3_OUT_CH]={                                             // format Q_DATA
//#include "CRN_CircullantD_dec2_B.dat"
//};
//
//static const short dec3_W_depth[CRN_DECONV3_OUT_CH][CRN_DECONV3_KERNEL_LEN]={             // format Q_WEIGHT
//#include "CRN_CircullantD_dec3_W_depth.dat"
//};
//static const short dec3_W_point[CRN_CONV2_OUT_CH][CRN_DECONV3_OUT_CH]={                   // format Q_WEIGHT
//#include "CRN_CircullantD_dec3_W_point.dat"
//};
//static const short dec3_b[CRN_CONV2_OUT_CH]={                                             // format Q_DATA
//#include "CRN_CircullantD_dec3_B.dat"
//};

// Transposed-convolution parameters for decoder stages 2 and 1, laid out as
// [output channel][input channel][kernel tap]. As with the other decoder
// tables, the file numbering is reversed: dec2_* comes from the "dec4" files
// and dec1_* from the "dec5" files.
static const short dec2_W[CRN_CONV1_OUT_CH][CRN_DECONV2_OUT_CH][CRN_DECONV2_KERNEL_LEN]={ // format Q_WEIGHT
#include "CRN_CircullantD_dec4_W.dat"
};
static const short dec2_b[CRN_CONV1_OUT_CH]={                                             // format Q_DATA
#include "CRN_CircullantD_dec4_B.dat"
};

// The last decoder stage has a single output channel.
static const short dec1_W[1][CRN_DECONV1_OUT_CH][CRN_DECONV1_KERNEL_LEN]={                // format Q_WEIGHT
#include "CRN_CircullantD_dec5_W.dat"
};
static const short dec1_b[1]={                                                            // format Q_DATA
#include "CRN_CircullantD_dec5_B.dat"
};

// File-private, statically allocated network instance (zero-initialized as
// every object with static storage duration).
static CRN_network_T CRN_network_mem;

// Element-wise affine normalization:
//   out[i] = ((in[i] * BN_scale[i]) >> (Q_INPUT+Q_WEIGHT-Q_DATA)) + BN_offset[i]
// The shift converts the Q_INPUT x Q_WEIGHT product to the Q_DATA format of
// the offset.
//   out       - destination, n ints
//   in        - source samples, n shorts
//   BN_scale  - per-element scale, n shorts
//   BN_offset - per-element offset, n shorts
//   n         - element count; must be >= 1 (the first element is always
//               processed, even when n <= 0)
static void CRN_CircullantD_BN(int* out, short* in, const short* BN_scale, const short* BN_offset, int n)
{
    int i = 0;
    do
    {
        const int scaled = ((int)in[i] * BN_scale[i]) >> (Q_INPUT + Q_WEIGHT - Q_DATA);
        out[i] = scaled + BN_offset[i];
        ++i;
    } while (i < n);
}

//static void CRN_memset64(INT64* out, int d, int n)
//{  do{*out++=d;}while(--n>0);}
// Fill out[0..n-1] with the value d.
// n must be >= 1: the first element is always written, even when n <= 0.
static void CRN_memset32(int* out, int d, int n)
{
    int i = 0;
    do
    {
        out[i] = d;
        ++i;
    } while (i < n);
}

//#ifdef USE_XTENSA_OPT
//extern void CRN_conv_relu(int* out, INT64* in, int n);
//#else
//static void CRN_conv_relu(int* out, INT64* in, int n)
//{
//    do{
//        int ds=(int)((*in++)>>Q_WEIGHT);
//        if (ds<0) ds=0;
//        *out++=ds;
//    }while(--n>0);
//}
//#endif

// Scale the accumulators down by Q_WEIGHT bits and apply ReLU:
//   out[i] = max(in[i] >> Q_WEIGHT, 0)
//   out - destination, n ints
//   in  - accumulators, n ints
//   n   - element count; must be >= 1 (the first element is always
//         processed, even when n <= 0)
static void CRN_conv_relu32(int* out, int* in, int n)
{
    int i = 0;
    do
    {
        const int scaled = in[i] >> Q_WEIGHT;
        out[i] = (scaled < 0) ? 0 : scaled;
        ++i;
    } while (i < n);
}
#define CONV1_SLICE_LEN 2


#ifdef USEOPT
extern void fx_CRN_conv1_relu_strided_conv(int* inout, int(*in)[CRN_INPUT_LEN], short(*W)[CRN_CONV1_KERNEL_LEN],
    int slice_num, int nout);
#else
// Accumulate a 5-tap, stride-2 convolution of two input channels into inout:
//   inout[m] += sum_{k=0..4} in[0][2*m+k]*W[0][k] + sum_{k=0..4} in[1][2*m+k]*W[1][k]
// for m = 0 .. nout-1.
//   inout     - accumulator, nout ints, updated in place
//   in        - two consecutive input rows; elements 0 .. 2*nout+2 of each are read
//   W         - two consecutive 5-tap kernels, one per input row
//   slice_num - unused by this implementation (two channels are hard-wired)
//   nout      - number of output samples
// inout is expected not to overlap the input rows.
static void fx_CRN_conv1_relu_strided_conv(int* inout, int (*in)[CRN_INPUT_LEN], short (*W)[CRN_CONV1_KERNEL_LEN],
                                                        int slice_num, int nout)
{
    const int   *row0 = in[0];
    const int   *row1 = in[1];
    const short *k0   = W[0];
    const short *k1   = W[1];

    for (int m = 0; m < nout; m++)
    {
        // 5-sample window starting at the stride-2 position of output m
        const int *x0 = row0 + 2 * m;
        const int *x1 = row1 + 2 * m;

        int acc = inout[m];
        acc += x0[0]*k0[0] + x0[1]*k0[1] + x0[2]*k0[2] + x0[3]*k0[3] + x0[4]*k0[4];
        acc += x1[0]*k1[0] + x1[1]*k1[1] + x1[2]*k1[2] + x1[3]*k1[3] + x1[4]*k1[4];
        inout[m] = acc;
    }
}
#endif

// Encoder layer 1: stride-2 convolution + bias + ReLU over all output channels.
//   outbuf    - destination; buf is viewed as rows of CRN_CONV1_OUT_BUFLEN ints,
//               one row per output channel (outbuf->ch rows); the first
//               CRN_CONV1_OUT_LEN entries of each row are written
//   inbuf     - source; buf is viewed as rows of CRN_INPUT_LEN ints, one row
//               per input channel (inbuf->ch rows, expected to be a multiple
//               of CONV1_SLICE_LEN)
//   W         - weights [output channel][input channel][tap], format Q_WEIGHT
//   bias      - one bias per output channel, format Q_DATA
//   Wlen      - unused here (the kernel length is fixed by the strided kernel)
//   scratch64 - accumulator workspace of at least CRN_CONV1_OUT_LEN ints
static void fx_CRN_conv1_relu(CRN_BUF_T* outbuf, CRN_BUF_T* inbuf, short (*W)[CRN_INPUT_CH][CRN_CONV1_KERNEL_LEN],
                                        short* bias, int Wlen, int* scratch64)
{
    int (*out)[CRN_CONV1_OUT_BUFLEN]=outbuf->buf;
    int out_ch=outbuf->ch;

    int (*in)[CRN_INPUT_LEN]=inbuf->buf;
    int in_ch=inbuf->ch;

    (void)Wlen;

    for (int out_ch_idx=0;out_ch_idx<out_ch;out_ch_idx++)
    {
        // Preload the accumulators with the bias raised to the accumulator
        // format (Q_DATA+Q_WEIGHT). A multiplication is used instead of '<<'
        // because left-shifting a negative value is undefined behaviour in C.
        CRN_memset32(scratch64,bias[out_ch_idx]*(1<<Q_WEIGHT),CRN_CONV1_OUT_LEN);  // !!! Note : combine with first slice

        // Walk over ALL input channels, CONV1_SLICE_LEN at a time. slice_cnt is
        // a channel index, so the bound is in_ch (as in the other layers); the
        // former bound in_ch/CONV1_SLICE_LEN skipped channel pairs for in_ch >= 4.
        for (int slice_cnt=0;slice_cnt<in_ch;slice_cnt+=CONV1_SLICE_LEN)
        {
            // stride 2 convolution over slice CONV1_SLICE_LEN
            fx_CRN_conv1_relu_strided_conv(scratch64,in+slice_cnt,W[out_ch_idx]+slice_cnt,CONV1_SLICE_LEN,CRN_CONV1_OUT_LEN);
        }

        // Back to Q_DATA and clamp negatives to zero.
        CRN_conv_relu32(out[out_ch_idx],scratch64,CRN_CONV1_OUT_LEN);  // !!! Note : combine with the last slice
    }
}

// Number of input channels handled by one call of the strided kernels.
#define CONV2_SLICE_LEN 2

#ifdef USEOPT
// Optimized implementation provided by another translation unit.
extern void fx_CRN_conv2_relu_strided_conv(int* inout, int(*in)[CRN_CONV1_OUT_BUFLEN], short(*W)[CRN_CONV2_KERNEL_LEN],
    int slice_num, int nout);
#else
// Accumulate a 3-tap, stride-2 convolution of two input channels into inout.
// With x = in[c] and h = W[c] (c = 0,1), summed over both channels:
//   inout[0]      += x[0]*h[1]    + x[1]*h[2]                 (left edge, tap 0 sees an implicit zero)
//   inout[j]      += x[2j-1]*h[0] + x[2j]*h[1] + x[2j+1]*h[2] (interior outputs)
//   inout[last]   += x[2j-1]*h[0] + x[2j]*h[1]                (right edge, tap 2 sees an implicit zero)
//   inout     - accumulator, updated in place
//   in        - two consecutive input rows
//   W         - two consecutive 3-tap kernels, one per input row
//   slice_num - unused by this implementation (two channels are hard-wired)
//   nout      - number of output samples
// NOTE(review): the interior loop emits outputs in pairs, so nout looks like
// it has to be even (and >= 2) for exactly nout outputs to be written - confirm.
static void fx_CRN_conv2_relu_strided_conv(int* inout, int (*in)[CRN_CONV1_OUT_BUFLEN], short (*W)[CRN_CONV2_KERNEL_LEN],
                                                        int slice_num, int nout)
{
//    int inputlen=nout*2;

    // pointers
    int *pin0=in[0], *pin1=in[1];                         // pointers in registers

    // registers                                                                // data in registers
    // sN_d0..sN_d4: sliding 5-sample window of channel N; sN_h0..sN_h2: its taps
    int s0_d0,s0_d1,s0_d2,s0_d3,s0_d4;
    int s1_d0,s1_d1,s1_d2,s1_d3,s1_d4;
    short s0_h0=W[0][0],s0_h1=W[0][1],s0_h2=W[0][2];
    short s1_h0=W[1][0],s1_h1=W[1][1],s1_h2=W[1][2];

    // preroll: load the first two samples of each channel
    /*s0_d2=0;*/s0_d3=*pin0++;s0_d4=*pin0++;
    /*s1_d2=0;*/s1_d3=*pin1++;s1_d4=*pin1++;

    // first output: only taps 1 and 2 contribute
    int q0=0, q1;
    q0 += (int)s0_d3*s0_h1+(int)s0_d4*s0_h2;
    q0 += (int)s1_d3*s1_h1+(int)s1_d4*s1_h2;
    *inout++ += q0;                                                            // store 1st out smpl

    // interior: two outputs per iteration, four new samples per channel
    for (int m=1;m<nout-1;m+=2)
    {
        // data shifting and load new data
        // (d0 is the last sample of the previous window; d2 is shared by both outputs)
        s0_d0=s0_d4; s0_d1=*pin0++; s0_d2=*pin0++; s0_d3=*pin0++; s0_d4=*pin0++;
        s1_d0=s1_d4; s1_d1=*pin1++; s1_d2=*pin1++; s1_d3=*pin1++; s1_d4=*pin1++;

        q0=*inout;
        q0 += (int)s0_d0*s0_h0+(int)s0_d1*s0_h1+(int)s0_d2*s0_h2;
        q0 += (int)s1_d0*s1_h0+(int)s1_d1*s1_h1+(int)s1_d2*s1_h2;
        q1=*(inout+1);
        q1 += (int)s0_d2*s0_h0+(int)s0_d3*s0_h1+(int)s0_d4*s0_h2;
        q1 += (int)s1_d2*s1_h0+(int)s1_d3*s1_h1+(int)s1_d4*s1_h2;

        *inout++ = q0;
        *inout++ = q1;
    }

    // postroll: last output, only taps 0 and 1 contribute
    // data shifting and load new data
    s0_d0=s0_d4; s0_d1=*pin0++;// s0_d2=0;
    s1_d0=s1_d4; s1_d1=*pin1++;// s1_d2=0;
    q0=*inout;
    q0 += (int)s0_d0*s0_h0+(int)s0_d1*s0_h1; //+(INT64)s0_d2*s0_h2;
    q0 += (int)s1_d0*s1_h0+(int)s1_d1*s1_h1; //+(INT64)s1_d2*s1_h2;
    *inout++ = q0;                                                                                                      // store

}
#endif

// Encoder layer 2: stride-2 convolution + bias + ReLU over all output channels.
//   outbuf    - destination; buf is viewed as rows of CRN_CONV2_OUT_LEN ints,
//               one row per output channel (outbuf->ch rows)
//   inbuf     - source; buf is viewed as rows of CRN_CONV1_OUT_BUFLEN ints,
//               one row per input channel (inbuf->ch rows, expected to be a
//               multiple of CONV2_SLICE_LEN)
//   W         - weights [output channel][input channel][tap], format Q_WEIGHT
//   bias      - one bias per output channel, format Q_DATA
//   Wlen      - unused here (the kernel length is fixed by the strided kernel)
//   scratch64 - accumulator workspace of at least CRN_CONV2_OUT_LEN ints
static void fx_CRN_conv2_relu(CRN_BUF_T* outbuf, CRN_BUF_T* inbuf, short (*W)[CRN_CONV1_OUT_CH][CRN_CONV2_KERNEL_LEN],
                                        short* bias, int Wlen, int* scratch64)
{
    int (*out)[CRN_CONV2_OUT_LEN]=outbuf->buf;
    int out_ch=outbuf->ch;

    int (*in)[CRN_CONV1_OUT_BUFLEN]=inbuf->buf;
    int in_ch=inbuf->ch;

    (void)Wlen;

    for (int out_ch_idx=0;out_ch_idx<out_ch;out_ch_idx++)
    {
        // Preload the accumulators with the bias raised to the accumulator
        // format (Q_DATA+Q_WEIGHT). A multiplication is used instead of '<<'
        // because left-shifting a negative value is undefined behaviour in C.
        CRN_memset32(scratch64,bias[out_ch_idx]*(1<<Q_WEIGHT),CRN_CONV2_OUT_LEN);  // !!! Note : combine with first slice

        for (int slice_cnt=0;slice_cnt<in_ch;slice_cnt+=CONV2_SLICE_LEN)
        {
            // stride 2 convolution over slice CONV2_SLICE_LEN
            fx_CRN_conv2_relu_strided_conv(scratch64,in+slice_cnt,W[out_ch_idx]+slice_cnt,CONV2_SLICE_LEN,CRN_CONV2_OUT_LEN);
        }

        // Back to Q_DATA and clamp negatives to zero.
        CRN_conv_relu32(out[out_ch_idx],scratch64,CRN_CONV2_OUT_LEN);  // !!! Note : combine with the last slice
    }
}

// Number of input channels handled by one call of the strided kernels.
#define CONV3_SLICE_LEN 2

#ifdef USEOPT
// Optimized implementation provided by another translation unit.
extern void fx_CRN_conv35_relu_strided_conv(int* inout, int(*in), short(*W)[CRN_CONV3_KERNEL_LEN],
    int slice_num, int nout);
#else
// Accumulate a 3-tap, stride-2 convolution of two input channels into inout
// (shared by encoder layers 3..5). The two channels are stored back to back:
// channel 0 at in[0..2*nout-1], channel 1 at in[2*nout..4*nout-1].
// With x = one channel and h = its kernel, summed over both channels:
//   inout[0]    += x[0]*h[0]    + x[1]*h[1]  + x[2]*h[2]
//   inout[j]    += x[2j]*h[0]   + x[2j+1]*h[1] + x[2j+2]*h[2]   (interior outputs)
//   inout[last] += x[2j]*h[0]   + x[2j+1]*h[1]                  (tap 2 sees an implicit zero)
//   inout     - accumulator, updated in place
//   in        - first sample of channel 0
//   W         - two consecutive 3-tap kernels, one per channel
//   slice_num - unused by this implementation (two channels are hard-wired)
//   nout      - number of output samples; also fixes the channel length 2*nout
// NOTE(review): the interior loop emits outputs in pairs, so nout looks like
// it has to be even (and >= 2) for exactly nout outputs to be written - confirm.
static void fx_CRN_conv35_relu_strided_conv(int* inout, int (*in), short (*W)[CRN_CONV3_KERNEL_LEN],
                                                        int slice_num, int nout)
{
    int inputlen=nout*2;

    // pointers
    int *pin0=in, *pin1=in+inputlen;                         // pointers in registers

    // registers                                                                // data in registers
    // sN_d0..sN_d4: sliding 5-sample window of channel N; sN_h0..sN_h2: its taps
    int s0_d0,s0_d1,s0_d2,s0_d3,s0_d4;
    int s1_d0,s1_d1,s1_d2,s1_d3,s1_d4;
    short s0_h0=W[0][0],s0_h1=W[0][1],s0_h2=W[0][2];
    short s1_h0=W[1][0],s1_h1=W[1][1],s1_h2=W[1][2];

    // preroll: load the first three samples of each channel
    s0_d2=*pin0++;s0_d3=*pin0++;s0_d4=*pin0++;
    s1_d2=*pin1++;s1_d3=*pin1++;s1_d4=*pin1++;

    // first output: full 3-tap window at the start of the row
    int q0=*inout, q1;
    q0 += (int)s0_d2*s0_h0+(int)s0_d3*s0_h1+(int)s0_d4*s0_h2;
    q0 += (int)s1_d2*s1_h0+(int)s1_d3*s1_h1+(int)s1_d4*s1_h2;
    *inout++ = q0;                                                            // store 1st out smpl

    // interior: two outputs per iteration, four new samples per channel
    for (int m=1;m<nout-1;m+=2)
    {
        // data shifting and load new data
        // (d0 is the last sample of the previous window; d2 is shared by both outputs)
        s0_d0=s0_d4; s0_d1=*pin0++; s0_d2=*pin0++; s0_d3=*pin0++; s0_d4=*pin0++;
        s1_d0=s1_d4; s1_d1=*pin1++; s1_d2=*pin1++; s1_d3=*pin1++; s1_d4=*pin1++;

        q0=*inout;
        q0 += (int)s0_d0*s0_h0+(int)s0_d1*s0_h1+(int)s0_d2*s0_h2;
        q0 += (int)s1_d0*s1_h0+(int)s1_d1*s1_h1+(int)s1_d2*s1_h2;
        q1=*(inout+1);
        q1 += (int)s0_d2*s0_h0+(int)s0_d3*s0_h1+(int)s0_d4*s0_h2;
        q1 += (int)s1_d2*s1_h0+(int)s1_d3*s1_h1+(int)s1_d4*s1_h2;

        *inout++ = q0;
        *inout++ = q1;
    }

    // postroll: last output, only taps 0 and 1 contribute
    // data shifting and load new data
    s0_d0=s0_d4; s0_d1=*pin0++;
    s1_d0=s1_d4; s1_d1=*pin1++;
    q0=*inout;
    q0 += (int)s0_d0*s0_h0+(int)s0_d1*s0_h1;
    q0 += (int)s1_d0*s1_h0+(int)s1_d1*s1_h1;
    *inout++ = q0;                                                                                                      // store
}
#endif

// Encoder layers 3..5: stride-2 convolution + bias + ReLU over all output
// channels, working on flat (channel-major) buffers.
//   outbuf    - destination; outbuf->ch rows of outbuf->len ints
//   inbuf     - source; inbuf->ch rows of inbuf->len ints (inbuf->ch is
//               expected to be a multiple of CONV3_SLICE_LEN)
//   W         - flat weights indexed as [output channel][input channel][Wlen],
//               format Q_WEIGHT
//   bias      - one bias per output channel, format Q_DATA
//   Wlen      - kernel length used as the stride inside W
//   scratch64 - accumulator workspace of at least outbuf->len ints
// The strided kernel derives the input row length as 2*outbuf->len, so
// inbuf->len is expected to equal 2*outbuf->len.
static void fx_CRN_conv35_relu(CRN_BUF_T* outbuf, CRN_BUF_T* inbuf, short *W,
                                        short* bias, int Wlen, int* scratch64)
{
    int *out=outbuf->buf;
    int out_binlen=outbuf->len;
    int out_ch=outbuf->ch;

    int *in=inbuf->buf;
    int in_binlen=inbuf->len;
    int in_ch=inbuf->ch;

    for (int out_ch_idx=0;out_ch_idx<out_ch;out_ch_idx++)
    {
        // Kernels of this output channel: in_ch kernels of Wlen taps each.
        short *W_out=W+(Wlen*in_ch*out_ch_idx);

        // Preload the accumulators with the bias raised to the accumulator
        // format (Q_DATA+Q_WEIGHT). A multiplication is used instead of '<<'
        // because left-shifting a negative value is undefined behaviour in C.
        CRN_memset32(scratch64,bias[out_ch_idx]*(1<<Q_WEIGHT),out_binlen);  // !!! Note : combine with first slice

        for (int slice_cnt=0;slice_cnt<in_ch;slice_cnt+=CONV3_SLICE_LEN)
        {
            // stride 2 convolution over slice CONV3_SLICE_LEN
            // (the cast names the kernel length the callee's prototype uses instead of a bare 3)
            fx_CRN_conv35_relu_strided_conv(scratch64,in+slice_cnt*in_binlen,(short (*)[CRN_CONV3_KERNEL_LEN])(W_out+Wlen*slice_cnt),CONV3_SLICE_LEN,out_binlen);
        }

        // Back to Q_DATA and clamp negatives to zero.
        CRN_conv_relu32(out+out_ch_idx*out_binlen,scratch64,out_binlen);  // !!! Note : combine with the last slice
    }
}

#ifdef USEOPT
// Optimized implementation provided by another translation unit.
extern void fx_CRN_deconv53_relu_strided_conv(int* inout, int(*in), short(*W)[CRN_CONV3_KERNEL_LEN],
   int slice_num, int inputlen);
#else
// Accumulate a 3-tap, stride-2 transposed convolution of two input channels
// into inout (shared by decoder stages 5..3). The two channels are stored
// back to back: channel 0 at in[0..inputlen-1], channel 1 right after it.
// With x = one channel and h = its kernel, summed over both channels:
//   inout[0]    += x[0]*h[2]
//   inout[2k]   += x[k-1]*h[0] + x[k]*h[2]     (even outputs, k >= 1)
//   inout[2k+1] += x[k]*h[1]                   (odd outputs)
//   inout     - accumulator, 2*inputlen ints, updated in place
//   in        - first sample of channel 0
//   W         - two consecutive 3-tap kernels, one per channel
//   slice_num - unused by this implementation (two channels are hard-wired)
//   inputlen  - samples per input channel
// NOTE(review): the main loop consumes two input samples per iteration and
// the tail one more, so inputlen looks like it has to be even - confirm.
static void fx_CRN_deconv53_relu_strided_conv(int* inout, int (*in), short (*W)[CRN_CONV3_KERNEL_LEN],
                                                        int slice_num, int inputlen)
{
    // pointers
    int *pin0=in, *pin1=in+inputlen;                         // pointers in registers

    // registers                                                                // data in registers
    // sN_d0..sN_d2: three most recent samples of channel N; sN_h0..sN_h2: its taps
     int s0_d0,s0_d1,s0_d2;
     int s1_d0,s1_d1,s1_d2;
     short s0_h0=W[0][0],s0_h1=W[0][1],s0_h2=W[0][2];
     short s1_h0=W[1][0],s1_h1=W[1][1],s1_h2=W[1][2];

    // preroll: first sample of each channel
    s0_d2=*pin0++;    s1_d2=*pin1++;

    // 0th sample
    int acc=*inout;
    acc+=(int)s0_d2*s0_h2+(int)s1_d2*s1_h2;
    *inout++ = acc;

    // 1st sample
    acc=*inout;
    acc+=(int)s0_d2*s0_h1+(int)s1_d2*s1_h1;
    *inout++ = acc;

    int acc0,acc1,acc2,acc3;

    // main loop: two new input samples -> four output samples per iteration
    for (int m=2;m<inputlen;m+=2)
    {
        // data shifting and load new data
        s0_d0=s0_d2; s0_d1=*pin0++; s0_d2=*pin0++;
        s1_d0=s1_d2; s1_d1=*pin1++; s1_d2=*pin1++;

        // load accumulated data
           acc0=*inout;
           acc1=*(inout+1);
           acc2=*(inout+2);
           acc3=*(inout+3);

        // even smpls: previous sample * tap 0 + current sample * tap 2
        acc0+=(int)s0_d0*s0_h0+(int)s0_d1*s0_h2+(int)s1_d0*s1_h0+(int)s1_d1*s1_h2;
        acc2+=(int)s0_d1*s0_h0+(int)s0_d2*s0_h2+(int)s1_d1*s1_h0+(int)s1_d2*s1_h2;

        // odd smpls: current sample * tap 1
        acc1+=(int)s0_d1*s0_h1+(int)s1_d1*s1_h1;
        acc3+=(int)s0_d2*s0_h1+(int)s1_d2*s1_h1;

        *inout++=acc0;
        *inout++=acc1;
        *inout++=acc2;
        *inout++=acc3;
    }

    // tail: the last input sample produces the final even/odd output pair
    acc0 = *inout;
    acc1 = *(inout + 1);


    s0_d0 = s0_d2; s0_d1 = *pin0++;
    s1_d0 = s1_d2; s1_d1 = *pin1++;

	acc0 += (int)s0_d0*s0_h0 + (int)s0_d1*s0_h2 + (int)s1_d0*s1_h0 + (int)s1_d1*s1_h2;
	acc1 += (int)s0_d1*s0_h1 + (int)s1_d1*s1_h1;

    *inout++ = acc0;
    *inout++ = acc1;
}
#endif

//#ifdef USE_XTENSA_OPT
//extern void CRN_PosNeg_relu(int* out0, int* out1, INT64* in, int n);
//#else
//static void CRN_PosNeg_relu(int* out0, int* out1, INT64* in, int n)
//{
//    do{
//        int ds0=(int)((*in++)>>Q_WEIGHT);
//        int ds1=-ds0;
//        if (ds0<0) ds0=0;
//        *out0++=ds0;
//        if (ds1<0) ds1=0;
//        *out1++=ds1;
//    }while(--n>0);
//}
//#endif

#if 0 //def USEOPT
extern void CRN_PosNeg_relu32(int* out0, int* out1, int* in, int n);
#else
// Scale the accumulators down by Q_WEIGHT bits and split them into a
// positive and a negative rectified part:
//   v       = in[i] >> Q_WEIGHT
//   out0[i] = max( v, 0)
//   out1[i] = max(-v, 0)
//   out0, out1 - destinations, n ints each
//   in         - accumulators, n ints
//   n          - element count; must be >= 1 (the first element is always
//                processed, even when n <= 0)
static void CRN_PosNeg_relu32(int* out0, int* out1, int* in, int n)
{
	int i = 0;
	do {
		const int v = in[i] >> Q_WEIGHT;
		out0[i] = (v > 0) ? v : 0;
		out1[i] = (v < 0) ? -v : 0;
		++i;
	} while (i < n);
}
#endif

// Decoder stages 5..3: stride-2 transposed convolution + bias, followed by a
// positive/negative ReLU split. Only outbuf->ch/2 channels are computed;
// computed channel c yields two output rows: row c receives max(v,0) and row
// outbuf->ch/2 + c receives max(-v,0).
//   outbuf    - destination; outbuf->ch rows of outbuf->len ints
//   inbuf     - source; inbuf->ch rows of inbuf->len ints (inbuf->ch is
//               expected to be a multiple of CONV3_SLICE_LEN)
//   W         - flat weights indexed as [computed channel][input channel][Wlen],
//               format Q_WEIGHT
//   bias      - biases, format Q_DATA; only the first outbuf->ch/2 entries are read
//   Wlen      - kernel length used as the stride inside W
//   scratch64 - accumulator workspace of at least outbuf->len ints
static void fx_CRN_DeconvPosNeg543_relu(CRN_BUF_T* outbuf, CRN_BUF_T* inbuf, short *W,
                                        short* bias, int Wlen, int* scratch64)
{
    int *out=outbuf->buf;
    int out_binlen=outbuf->len;
    int out_ch=outbuf->ch;

    int *in=inbuf->buf;
    int in_binlen=inbuf->len;
    int in_ch=inbuf->ch;

    for (int out_ch_idx=0;out_ch_idx<out_ch/2;out_ch_idx++)
    {
        // Kernels of this computed channel: in_ch kernels of Wlen taps each.
        short *W_out=W+(Wlen*in_ch*out_ch_idx);

        // Preload the accumulators with the bias raised to the accumulator
        // format (Q_DATA+Q_WEIGHT). A multiplication is used instead of '<<'
        // because left-shifting a negative value is undefined behaviour in C.
        CRN_memset32(scratch64,bias[out_ch_idx]*(1<<Q_WEIGHT),out_binlen);  // !!! Note : combine with first slice

        for (int slice_cnt=0;slice_cnt<in_ch;slice_cnt+=CONV3_SLICE_LEN)
        {
            // stride 2 transposed convolution over slice CONV3_SLICE_LEN
            // (the cast names the kernel length the callee's prototype uses instead of a bare 3)
            fx_CRN_deconv53_relu_strided_conv(scratch64,in+slice_cnt*in_binlen, (short(*)[CRN_CONV3_KERNEL_LEN])(W_out+Wlen*slice_cnt),CONV3_SLICE_LEN,in_binlen);
        }

        // Positive part -> row out_ch_idx, negated part -> row out_ch/2+out_ch_idx.
        CRN_PosNeg_relu32(out+out_ch_idx*out_binlen,out+(out_ch/2+out_ch_idx)*out_binlen,scratch64,out_binlen);  // !!! Note : combine with the last slice
    }
}




#ifdef USEOPT
// Optimized implementation provided by another translation unit.
// NOTE(review): this prototype sizes W with CRN_CONV3_KERNEL_LEN while the C
// version below uses CRN_CONV2_KERNEL_LEN - harmless only if both are equal; confirm.
extern void fx_CRN_deconv2_relu_strided_conv(int* inout, int (*in), short (*W)[CRN_CONV3_KERNEL_LEN],
   int slice_num, int inputlen);
#else
// Accumulate a 3-tap, stride-2 transposed convolution of two input channels
// into inout (decoder stage 2). The two channels are stored back to back:
// channel 0 at in[0..inputlen-1], channel 1 right after it.
// With x = one channel and h = its kernel, summed over both channels:
//   inout[0]    += x[0]*h[2]
//   inout[2k]   += x[k-1]*h[0] + x[k]*h[2]     (even outputs, k >= 1)
//   inout[2k+1] += x[k]*h[1]                   (odd outputs)
// Unlike the 5..3 variant, the tail writes only one (even) output sample.
//   inout     - accumulator, updated in place
//   in        - first sample of channel 0
//   W         - two consecutive 3-tap kernels, one per channel
//   slice_num - unused by this implementation (two channels are hard-wired)
//   inputlen  - distance between the two channels; also bounds the main loop
// NOTE(review): the number of input samples consumed depends on the parity of
// inputlen (loop bound inputlen-2, step 2) - confirm the intended lengths.
static void fx_CRN_deconv2_relu_strided_conv(int* inout, int (*in), short (*W)[CRN_CONV2_KERNEL_LEN],
                                                        int slice_num, int inputlen)
{
    // pointers
    int *pin0=in, *pin1=in+inputlen;                       // pointers in registers

    // registers                                                                        // data in registers
    // sN_d0..sN_d2: three most recent samples of channel N; sN_h0..sN_h2: its taps
    int s0_d0,s0_d1,s0_d2;
    int s1_d0,s1_d1,s1_d2;
    short s0_h0=W[0][0],s0_h1=W[0][1],s0_h2=W[0][2];
    short s1_h0=W[1][0],s1_h1=W[1][1],s1_h2=W[1][2];

    // preroll: first sample of each channel
    s0_d2=*pin0++;    s1_d2=*pin1++;

    // 0th sample
    int acc=*inout;
	acc+=(int)s0_d2*s0_h2+(int)s1_d2*s1_h2;
    *inout++ = acc;

    // 1st sample
    acc=*inout;
	acc+=(int)s0_d2*s0_h1+(int)s1_d2*s1_h1;
    *inout++ = acc;

    // main loop: two new input samples -> four output samples per iteration
    for (int m=1;m<inputlen-2;m+=2)
    {
        // data shifting and load new data
        s0_d0=s0_d2; s0_d1=*pin0++; s0_d2=*pin0++;
        s1_d0=s1_d2; s1_d1=*pin1++; s1_d2=*pin1++;

        // load accumulated data
        int acc0=*inout;
        int acc1=*(inout+1);
        int acc2=*(inout+2);
        int acc3=*(inout+3);

        // even smpls: previous sample * tap 0 + current sample * tap 2
		acc0+=(int)s0_d0*s0_h0+(int)s0_d1*s0_h2+(int)s1_d0*s1_h0+(int)s1_d1*s1_h2;
		acc2+=(int)s0_d1*s0_h0+(int)s0_d2*s0_h2+(int)s1_d1*s1_h0+(int)s1_d2*s1_h2;

        // odd smpls: current sample * tap 1
		acc1+=(int)s0_d1*s0_h1+(int)s1_d1*s1_h1;
		acc3+=(int)s0_d2*s0_h1+(int)s1_d2*s1_h1;

        *inout++=acc0;
        *inout++=acc1;
        *inout++=acc2;
        *inout++=acc3;
    }

    // tail: one more input sample, one final even output
    // data shifting and load new data
    s0_d0=s0_d2; s0_d1=*pin0++;
    s1_d0=s1_d2; s1_d1=*pin1++;

    // load accumulated data
    int acc0=*inout;

    // even smpls
	acc0+=(int)s0_d0*s0_h0+(int)s0_d1*s0_h2+(int)s1_d0*s1_h0+(int)s1_d1*s1_h2;

    *inout++=acc0;
}
#endif

// Decoder stage 2: stride-2 transposed convolution + bias + ReLU over all
// output channels.
//   outbuf    - destination; buf is viewed as rows of CRN_DECONV1_OUT_BUFLEN
//               ints, one row per output channel; outbuf->len entries of
//               each row are written
//   inbuf     - source; buf is viewed as rows of CRN_DECONV2_OUT_LEN ints,
//               one row per input channel (inbuf->ch rows, expected to be a
//               multiple of CONV2_SLICE_LEN)
//   W         - weights [output channel][input channel][tap], format Q_WEIGHT
//   bias      - one bias per output channel, format Q_DATA
//   Wlen      - unused here (the kernel length is fixed by the strided kernel)
//   scratch64 - accumulator workspace of at least outbuf->len ints
// The strided kernel locates the second channel of a slice inbuf->len ints
// after the first, so inbuf->len is expected to equal the row length
// CRN_DECONV2_OUT_LEN.
static void fx_CRN_deconv2_relu(CRN_BUF_T* outbuf, CRN_BUF_T* inbuf, short (*W)[CRN_DECONV2_OUT_CH][CRN_DECONV2_KERNEL_LEN],
                                        short* bias, int Wlen, int* scratch64)
{
    int (*out)[CRN_DECONV1_OUT_BUFLEN]=outbuf->buf;
    int out_binlen=outbuf->len;
    int out_ch=outbuf->ch;

    int (*in)[CRN_DECONV2_OUT_LEN]=inbuf->buf;
    int in_binlen=inbuf->len;
    int in_ch=inbuf->ch;

    (void)Wlen;

    for (int out_ch_idx=0;out_ch_idx<out_ch;out_ch_idx++)
    {
        // Preload the accumulators with the bias raised to the accumulator
        // format (Q_DATA+Q_WEIGHT). A multiplication is used instead of '<<'
        // because left-shifting a negative value is undefined behaviour in C.
        CRN_memset32(scratch64,bias[out_ch_idx]*(1<<Q_WEIGHT),out_binlen);  // !!! Note : combine with first slice

        for (int slice_cnt=0;slice_cnt<in_ch;slice_cnt+=CONV2_SLICE_LEN)
        {
            // stride 2 transposed convolution over slice CONV2_SLICE_LEN
            fx_CRN_deconv2_relu_strided_conv(scratch64,in[slice_cnt],W[out_ch_idx]+slice_cnt,CONV2_SLICE_LEN,in_binlen);
        }

        // Back to Q_DATA and clamp negatives to zero.
        CRN_conv_relu32(out[out_ch_idx],scratch64,out_binlen);  // !!! Note : combine with the last slice
    }
}

#ifdef USEOPT
// Optimized implementation provided by another translation unit.
// 'in' is a pointer to rows of CRN_DECONV1_OUT_BUFLEN ints, exactly as in the
// C version below and as passed by fx_CRN_deconv1_relu. (The previous
// spelling "int(*in[N])" declared an array of pointers, i.e. int**.)
extern void fx_CRN_deconv1_relu_strided_conv(int* inout, int (*in)[CRN_DECONV1_OUT_BUFLEN], short(*W)[CRN_DECONV1_KERNEL_LEN],
   int slice_num, int inputlen);
#else
// Accumulate a 5-tap, stride-2 transposed convolution of two input channels
// into inout (decoder stage 1). With x = in[c] and h = W[c] (c = 0,1),
// summed over both channels and treating x[] outside 0..inputlen-1 as zero:
//   inout[2k]   += x[k-2]*h[0] + x[k-1]*h[2] + x[k]*h[4]   (even outputs)
//   inout[2k+1] += x[k-1]*h[1] + x[k]*h[3]                 (odd outputs)
// In total 2*inputlen+3 accumulators are updated.
//   inout     - accumulator, updated in place
//   in        - two consecutive input rows
//   W         - two consecutive 5-tap kernels, one per input row
//   slice_num - unused by this implementation (two channels are hard-wired)
//   inputlen  - samples per input channel; at least 2 are always read
static void fx_CRN_deconv1_relu_strided_conv(int* inout, int (*in)[CRN_DECONV1_OUT_BUFLEN], short (*W)[CRN_DECONV1_KERNEL_LEN],
                                                        int slice_num, int inputlen)
{
    // pointers
    int *pin0=in[0], *pin1=in[1];                         // pointers in registers

    // registers                                                                // data in registers
    // sN_d0..sN_d2: three most recent samples of channel N; sN_h0..sN_h4: its taps
    int s0_d0,s0_d1,s0_d2;
    int s1_d0,s1_d1,s1_d2;
    short s0_h0=W[0][0],s0_h1=W[0][1],s0_h2=W[0][2],s0_h3=W[0][3],s0_h4=W[0][4];
    short s1_h0=W[1][0],s1_h1=W[1][1],s1_h2=W[1][2],s1_h3=W[1][3],s1_h4=W[1][4];

    // load data
    s0_d2=*pin0++;    s1_d2=*pin1++;

    // 0th sample: only the first input sample overlaps (tap 4)
    int acc=*inout;
    acc+=(int)s0_d2*s0_h4+(int)s1_d2*s1_h4;
    *inout++ = acc;

    // 1st sample
    acc=*inout;
    acc+=(int)s0_d2*s0_h3+(int)s1_d2*s1_h3;
    *inout++ = acc;

    // load data
    s0_d1=s0_d2; s0_d2=*pin0++;    s1_d1=s1_d2; s1_d2=*pin1++;

    // 2nd sample: two input samples overlap (taps 2 and 4)
    acc=*inout;
    acc+=(int)s0_d1*s0_h2+(int)s0_d2*s0_h4+(int)s1_d1*s1_h2+(int)s1_d2*s1_h4;
    *inout++ = acc;

    // 3rd sample
    acc=*inout;
    acc+=(int)s0_d1*s0_h1+(int)s0_d2*s0_h3+(int)s1_d1*s1_h1+(int)s1_d2*s1_h3;
    *inout++ = acc;

    // steady state: one new input sample -> one even and one odd output
    for (int m=2;m<inputlen;m++)
    {
        // data shifting and load new data
        s0_d0=s0_d1; s0_d1=s0_d2; s0_d2=*pin0++;
        s1_d0=s1_d1; s1_d1=s1_d2; s1_d2=*pin1++;

        // load accumulated data
        int acc0=*inout;
        int acc1=*(inout+1);

        // even smpls
        acc0+=(int)s0_d0*s0_h0+(int)s0_d1*s0_h2+(int)s0_d2*s0_h4+(int)s1_d0*s1_h0+(int)s1_d1*s1_h2+(int)s1_d2*s1_h4;
        // odd smpls
        acc1+=(int)s0_d1*s0_h1+(int)s0_d2*s0_h3+(int)s1_d1*s1_h1+(int)s1_d2*s1_h3;

        *inout++=acc0;
        *inout++=acc1;
    }

    // tail: no new input, the window slides out over the last two samples
    // even smpls
    acc=*inout;
    acc+=(int)s0_d1*s0_h0+(int)s0_d2*s0_h2+(int)s1_d1*s1_h0+(int)s1_d2*s1_h2;
    *inout++ = acc;
    // odd smpls
    acc=*inout;
    acc+=(int)s0_d2*s0_h1+(int)s1_d2*s1_h1;
    *inout++ = acc;
    // even smpls
    acc=*inout;
    acc+=(int)s0_d2*s0_h0+(int)s1_d2*s1_h0;
    *inout++=acc;
}
#endif

// Decoder stage 1: stride-2 transposed convolution + bias + ReLU over all
// output channels.
//   outbuf    - destination; buf is viewed as rows of CRN_INPUT_LEN ints, one
//               row per output channel; outbuf->len entries of each row are
//               written
//   inbuf     - source; buf is viewed as rows of CRN_DECONV1_OUT_BUFLEN ints,
//               one row per input channel (inbuf->ch rows, expected to be a
//               multiple of CONV1_SLICE_LEN); inbuf->len samples per row are used
//   W         - weights [output channel][input channel][tap], format Q_WEIGHT
//   bias      - one bias per output channel, format Q_DATA
//   Wlen      - unused here (the kernel length is fixed by the strided kernel)
//   scratch64 - accumulator workspace; the strided kernel updates
//               2*inbuf->len+3 entries, so it must hold at least
//               max(outbuf->len, 2*inbuf->len+3) ints
static void fx_CRN_deconv1_relu(CRN_BUF_T* outbuf, CRN_BUF_T* inbuf, short (*W)[CRN_DECONV1_OUT_CH][CRN_DECONV1_KERNEL_LEN],
                                        short* bias, int Wlen, int* scratch64)
{
    int (*out)[CRN_INPUT_LEN]=outbuf->buf;
    int out_binlen=outbuf->len;
    int out_ch=outbuf->ch;

    int (*in)[CRN_DECONV1_OUT_BUFLEN]=inbuf->buf;
    int in_binlen=inbuf->len;
    int in_ch=inbuf->ch;

    (void)Wlen;

    for (int out_ch_idx=0;out_ch_idx<out_ch;out_ch_idx++)
    {
        // Preload the accumulators with the bias raised to the accumulator
        // format (Q_DATA+Q_WEIGHT). A multiplication is used instead of '<<'
        // because left-shifting a negative value is undefined behaviour in C.
        CRN_memset32(scratch64,bias[out_ch_idx]*(1<<Q_WEIGHT),out_binlen);  // !!! Note : combine with first slice

        for (int slice_cnt=0;slice_cnt<in_ch;slice_cnt+=CONV1_SLICE_LEN)
        {
            // stride 2 transposed convolution over slice CONV1_SLICE_LEN
            fx_CRN_deconv1_relu_strided_conv(scratch64,in+slice_cnt,W[out_ch_idx]+slice_cnt,CONV1_SLICE_LEN,in_binlen);
        }

        // Back to Q_DATA and clamp negatives to zero.
        CRN_conv_relu32(out[out_ch_idx],scratch64,out_binlen);  // !!! Note : combine with the last slice
    }
}


/*
static void fx_CRN_deconv_relu(CRN_BUF_T* outbuf, CRN_BUF_T* inbuf, short* W, short* bias, int Wlen, bool valid)
{
    int* out=outbuf->buf;
    int out_binlen=outbuf->len;
    int out_ch=outbuf->ch;

    int* in=inbuf->buf;
    int in_binlen=inbuf->len;
    int in_ch=inbuf->ch;

//    int start_idx=((!valid) && ((in_binlen & 1)==1)) ? 1:0;
   int start_idx = ((!valid)) ? -2 : -4;
   int start_intrp_idx = ((!valid)) ? 0 : 1;
   //    int start_idx=-2;

    for (int out_ch_idx=0;out_ch_idx<out_ch;out_ch_idx++)
    {
        for (int out_bin_idx=0;out_bin_idx<out_binlen;out_bin_idx++)
        {
            int Wstart = MAX(out_bin_idx & 1, -out_bin_idx-start_idx);         // array read underrun protection (only first pass)
         //int Wend = MIN(Wlen, in_binlen - out_bin_idx / 2);                  // array read overflow protection (only last pass)
         int Wend = MIN(Wlen, in_binlen*2- start_intrp_idx - (out_bin_idx+start_idx));                  // array read overflow protection (only last pass)
         INT64 d1=bias[out_ch_idx]<<(Q_WEIGHT);
            for (int in_ch_idx=0;in_ch_idx<in_ch;in_ch_idx++)
            {
                for (int k=Wstart;k<Wend;k+=2)
                {
                    int idx = (out_bin_idx + k+start_idx) / 2;
                    d1+=(INT64)in[in_binlen*in_ch_idx + idx]*W[Wlen*in_ch*out_ch_idx+Wlen*in_ch_idx+k];
                }
            }
            int ds1=(int)(d1>>Q_WEIGHT);            if (ds1<0) ds1=0;
            out[out_binlen*out_ch_idx+out_bin_idx]=ds1;
        }
    }
}
*/

static void CRN_CircullantD_CRN_GRU_mul(int* out, int* in, short* Dg, int n)
{
    do{
        *out++ = ((INT64)(*in++) * (*Dg++))>>Q_GRU_WEIGHT;
    }while(--n>0);
}

static void CRN_CircullantD_CRN_GRU_mul_permute(int* out, int* in, short* W, short* pidx, int n)
{
    do{
        //int data = ((INT64)(*in++) * (*W++))>>Q_GRU_WEIGHT;
        //out[*pidx++]=data;
		int data = ((INT64)(in[*pidx++]) * (*W++)) >> Q_GRU_WEIGHT;
		*out++ = data;
    }while(--n>0);
}

/*
 * Two chained fixed-point products per element:
 *     d      = (in[i] * r[i]) >> Q_DATA
 *     out[i] = (d * W[i])     >> Q_GRU_WEIGHT
 * Both products are formed in 64 bits and narrowed to int only after the
 * shift, in the same way as CRN_CircullantD_CRN_GRU_mul().
 * At least one element is always processed, even if n < 1.
 */
static void CRN_CircullantD_CRN_GRU_mul_r_mul(int* out, int* in, short* W, int* r,int n)
{
   do {
      int d = (int)(((INT64)(*in++) * (*r++)) >> Q_DATA);
      // The shift must sit inside the cast: a cast binds tighter than ">>", so
      // "(int)(product) >> Q" would truncate the 64-bit product to 32 bits
      // before shifting and lose the high bits for large operands.
      *out++ = (int)(((INT64)d * (*W++)) >> Q_GRU_WEIGHT);
   } while (--n > 0);
}

static void CRN_CircullantD_CRN_GRU_output(int* out, int* in1, int* in2, int* alpha, int n)
{
    do{
        int d1 = *in1++;
        int d2 = *in2++;
        *out++ = (((INT64)(d1-d2)*(*alpha++))>>Q_DATA)+d2;
    }while(--n>0);
}

// Hard-sigmoid constants, all expressed in Q_DATA fixed point:
// slope 0.2 and offset 0.5 (rounded to nearest), and the upper clamp 1.0.
#define Q_HARDSIGMOID Q_DATA
#define HARDSIGMOID_SCALE_0_2 ((int)(0.2f*(1<<Q_DATA)+0.5f))
#define HARDSIGMOID_OFFSET_0_5 ((int)(0.5f*(1<<Q_DATA)+0.5f))
#define HARDSIGMOID_1_0 (1<<Q_DATA)
/*
 * Adds a per-element bias and applies the hard sigmoid
 *     y = clamp(0.2 * x + 0.5, 0, 1.0)
 * in Q_DATA fixed point. The multiplication is done in plain int.
 * out may be the same buffer as in. At least one element is always
 * processed, even if n < 1.
 */
static void CRN_CircullantD_CRN_GRU_bias_and_hardsigmoid(int* out, int* in, short* pbias, int n)
{
    int i = 0;
    do {
        int v = in[i] + pbias[i];
        v = ((v * HARDSIGMOID_SCALE_0_2) >> Q_HARDSIGMOID) + HARDSIGMOID_OFFSET_0_5;
        if (v < 0) {
            v = 0;
        } else if (v > HARDSIGMOID_1_0) {
            v = HARDSIGMOID_1_0;
        }
        out[i] = v;
        i++;
    } while (i < n);
}

/*
 * Adds a per-element bias and applies ReLU:
 *     out[i] = max(in[i] + pbias[i], 0)
 * out may be the same buffer as in. At least one element is always
 * processed, even if n < 1.
 */
static void CRN_CircullantD_CRN_GRU_bias_and_relu(int* out, int* in, short* pbias, int n)
{
    int i = 0;
    do {
        const int sum = in[i] + pbias[i];
        out[i] = (sum < 0) ? 0 : sum;
        i++;
    } while (i < n);
}

/*
 * Shifts every element of inout right by sh bits, in place.
 * At least one element is always processed, even if n < 1.
 */
static void CRN_CircullantD_CRN_GRU_scaledown(int* inout, int sh, int n)
{
    int i = 0;
    do {
        inout[i] >>= sh;
        i++;
    } while (i < n);
}

/*
 * One GRU step in fixed point; updates state in place.
 *
 * Both branches run the same pipeline on the concatenation [inbuf | state]:
 *   scale by *_B -> SEC_AudioRD_FHT -> permute and scale by *_G ->
 *   SEC_AudioRD_FHT -> shift right by 9 -> scale by *_S -> bias + activation
 *
 * Input branch: hard-sigmoid activation. Its output is split into
 *   z = elements from 0                 (blend weight used in the final merge)
 *   r = elements from CRN_GRU_INPUT_LEN (multiplies the state in the other branch)
 * Recursive branch: the state half is additionally multiplied by r before the
 *   first scaling; only the first CRN_GRU_STATE_LEN results are post-processed,
 *   with ReLU activation.
 * Merge: state = ((state - rec) * z >> Q_DATA) + rec.
 *
 * Net supplies the four GRUscratch_* work buffers.
 */
static void fx_CRN_GRU( CRN_network_T* Net, int* state, int* inbuf,
                              short* W_krn_B, short* W_krn_G, short* W_krn_S, short* Didx_krn_P,
                              short* W_reckrn_B, short* W_reckrn_G, short* W_reckrn_S, short* Didx_reckrn_P,
                              short* Binp, short* Brec)
{
    const int concat_len = CRN_GRU_INPUT_LEN + CRN_GRU_STATE_LEN;
    const int log2_len = 9;     // 9=log2(CRN_GRU_INPUT_LEN+CRN_GRU_STATE_LEN)

    /////////////////////// input branch (lower branch on flowchart) /////////////////////
    int* gate_a = (int*)(Net->GRUscratch_inp1);
    int* gate_b = (int*)(Net->GRUscratch_inp2);

    CRN_CircullantD_CRN_GRU_mul(gate_a, inbuf, W_krn_B, CRN_GRU_INPUT_LEN);
    CRN_CircullantD_CRN_GRU_mul(gate_a + CRN_GRU_INPUT_LEN, state, W_krn_B + CRN_GRU_INPUT_LEN, CRN_GRU_STATE_LEN);
    SEC_AudioRD_FHT(gate_a, gate_a, log2_len);
    CRN_CircullantD_CRN_GRU_mul_permute(gate_b, gate_a, W_krn_G, Didx_krn_P, concat_len);
    SEC_AudioRD_FHT(gate_b, gate_b, log2_len);
    CRN_CircullantD_CRN_GRU_scaledown(gate_b, log2_len, concat_len);
    CRN_CircullantD_CRN_GRU_mul(gate_b, gate_b, W_krn_S, concat_len);

    // output of input branch (activation is applied in place)
    CRN_CircullantD_CRN_GRU_bias_and_hardsigmoid(gate_b, gate_b, Binp, concat_len);
    int* z_gate = gate_b;
    int* r_gate = gate_b + CRN_GRU_INPUT_LEN;

    /////////////////////// recursive branch (upper branch on flowchart) /////////////////////
    int* rec_a = (int*)(Net->GRUscratch_rec1);
    int* rec_b = (int*)(Net->GRUscratch_rec2);

    CRN_CircullantD_CRN_GRU_mul(rec_a, inbuf, W_reckrn_B, CRN_GRU_INPUT_LEN);
    CRN_CircullantD_CRN_GRU_mul_r_mul(rec_a + CRN_GRU_INPUT_LEN, state, W_reckrn_B + CRN_GRU_INPUT_LEN, r_gate, CRN_GRU_STATE_LEN);
    SEC_AudioRD_FHT(rec_a, rec_a, log2_len);
    CRN_CircullantD_CRN_GRU_mul_permute(rec_b, rec_a, W_reckrn_G, Didx_reckrn_P, concat_len);
    SEC_AudioRD_FHT(rec_b, rec_b, log2_len);
    // from here on only the first CRN_GRU_STATE_LEN elements are needed
    CRN_CircullantD_CRN_GRU_scaledown(rec_b, log2_len, CRN_GRU_STATE_LEN);
    CRN_CircullantD_CRN_GRU_mul(rec_b, rec_b, W_reckrn_S, CRN_GRU_STATE_LEN);

    // output of recursive branch (activation is applied in place)
    CRN_CircullantD_CRN_GRU_bias_and_relu(rec_b, rec_b, Brec, CRN_GRU_STATE_LEN);

    /////////////////////// merging /////////////////////
    CRN_CircullantD_CRN_GRU_output(state, state, rec_b, z_gate, CRN_GRU_STATE_LEN);
}

/*
 * Converts n values from Q_DATA to Q_OUTPUT fixed point and clamps them to
 * [0, (1 << Q_OUTPUT) - 1] before narrowing to short.
 *
 * The clamp is decided on the unshifted input: negative inputs give 0 and
 * inputs of at least 1.0 in Q_DATA give the maximum. Only values strictly
 * inside the range are shifted, so the left shift can neither act on a
 * negative operand nor overflow int; both are undefined in C and a large
 * input could otherwise wrap around instead of saturating.
 * Requires Q_OUTPUT >= Q_DATA. At least one element is always processed,
 * even if n < 1.
 */
static void CRN_CircullantD_saturation(short* out, int* in, int n)
{
    const int upshift = Q_OUTPUT - Q_DATA;
    const int out_max = (1 << Q_OUTPUT) - 1;
    const int in_limit = (1 << Q_OUTPUT) >> upshift;   // smallest input that saturates
    do{
        int d = *in++;
        if (d < 0) d = 0;
        else if (d >= in_limit) d = out_max;
        else d <<= upshift;
        *out++=(short)d;
    }while(--n>0);
}

/*
 * Flattens a [channel][bin] matrix into a vector ordered bin-major:
 *     out[bin * ch + c] = in[c][bin]
 * for bin in [0, binlen) and c in [0, ch).
 */
static void CRN_CircullantD_reshape_before_gru(int* out, int(*in)[CRN_CONV5_OUT_LEN], int ch, int binlen)
{
   int pos = 0;     // next write position in the flat output
   for (int bin = 0; bin < binlen; bin++)
   {
      for (int c = 0; c < ch; c++)
      {
         out[pos] = in[c][bin];
         pos++;
      }
   }
}

/*
 * Inverse of CRN_CircullantD_reshape_before_gru(): scatters a bin-major
 * vector back into a [channel][bin] matrix:
 *     out[c][bin] = in[bin * ch + c]
 * for bin in [0, binlen) and c in [0, ch).
 */
static void CRN_CircullantD_reshape_after_gru(int(*out)[CRN_CONV5_OUT_LEN], int* in, int ch, int binlen)
{
   int pos = 0;     // next read position in the flat input
   for (int bin = 0; bin < binlen; bin++)
   {
      for (int c = 0; c < ch; c++)
      {
         out[c][bin] = in[pos];
         pos++;
      }
   }
}

/*
 * Runs one pass of the network held in CRN_network_mem.
 *
 * Sequence: CRN_CircullantD_BN on both inputs -> conv layers 0..4 ->
 * reshape -> GRU layers 0..1 -> reshape -> deconv layers 0..4 -> saturation.
 * The layer descriptors used here (buffers, weights, lengths) are the ones
 * filled in by CRN_CircullantD_Init(), so that function has to run first.
 *
 * in_speech - CRN_INPUT_LEN input values, stored as BN_buf row 1
 * in_noise  - CRN_INPUT_LEN input values, stored as BN_buf row 0
 * out_IRM   - receives CRN_INPUT_LEN saturated output values
 *
 * Several work buffers are reused as scratch memory for other layers; the
 * exact statement order below matters because of that.
 */
void CRN_CircullantD_Exe(    short *in_speech,    short *in_noise,       short *out_IRM   )
{
    CRN_network_T* CRN_net=&CRN_network_mem;//CRN_net_ptr;

    // The normalised inputs are written into crn_conv2_outbuf, which Init configures as
    // the input buffer of conv layer 0. The second halves of the BN tables belong to in_speech.
    int (*BN_buf)[CRN_INPUT_LEN]=(int(*)[CRN_INPUT_LEN])(CRN_net->crn_conv2_outbuf);   // use currently not used yet buffers as scratch memory
	CRN_CircullantD_BN(BN_buf[0], in_noise, BN1_scale_fx, BN1_offset_fx, CRN_INPUT_LEN);
	CRN_CircullantD_BN(BN_buf[1], in_speech, BN1_scale_fx + CRN_INPUT_LEN, BN1_offset_fx + CRN_INPUT_LEN, CRN_INPUT_LEN);

//#ifdef DBGDUMP
//    for (int i = 0; i < CRN_INPUT_LEN; i++)    fprintf(fdbgdump, "%d,", BN_buf[1][i]);
//    fprintf(fdbgdump, "\n");
//#endif
    // Conv layers 0 and 1 have dedicated routines; both borrow crn_conv5_outbuf as scratch,
    // which is only filled with real data by conv layer 4 further down.
    int i=0;
    fx_CRN_conv1_relu(&(CRN_net->fx_conv_Layer[i].outbuf), &(CRN_net->fx_conv_Layer[i].inbuf),
        CRN_net->fx_conv_Layer[i].W, CRN_net->fx_conv_Layer[i].B, CRN_net->fx_conv_Layer[i].Wlen,
        (int*)(CRN_net->crn_conv5_outbuf)     // Scratch memory for conv1
        );

    i=1;
    fx_CRN_conv2_relu(&(CRN_net->fx_conv_Layer[i].outbuf), &(CRN_net->fx_conv_Layer[i].inbuf),
        CRN_net->fx_conv_Layer[i].W, CRN_net->fx_conv_Layer[i].B, CRN_net->fx_conv_Layer[i].Wlen,
        (int*)(CRN_net->crn_conv5_outbuf)     // Scratch memory for conv2
        );


    // Conv layers 2..4 share one routine. Note that this loop variable shadows the outer i.
    // Scratch is the part of crn_conv1_outbuf starting at row CRN_CONV1_OUT_CH.
    for (int i=2;i<5;i++)
        fx_CRN_conv35_relu(&(CRN_net->fx_conv_Layer[i].outbuf), &(CRN_net->fx_conv_Layer[i].inbuf),
            CRN_net->fx_conv_Layer[i].W, CRN_net->fx_conv_Layer[i].B, CRN_net->fx_conv_Layer[i].Wlen,
            (CRN_net->crn_conv1_outbuf[CRN_CONV1_OUT_CH])     // Scratch memory for conv3-5
            );

#ifdef DBGDUMP
    // Debug only: dump the output of every conv layer, one text line per channel.
    for (int k = 0; k < CRN_CONV1_OUT_CH; k++)
    {
        for (int i = 0; i < CRN_CONV1_OUT_LEN; i++)    fprintf(fdbgdump_enc1, "%d,", CRN_net->crn_conv1_outbuf[k][i]);
        fprintf(fdbgdump_enc1, "\n");
    }
    for (int k = 0; k < CRN_CONV2_OUT_CH; k++)
    {
        for (int i = 0; i < CRN_CONV2_OUT_LEN; i++)    fprintf(fdbgdump_enc2, "%d,", CRN_net->crn_conv2_outbuf[k][i]);
        fprintf(fdbgdump_enc2, "\n");
    }
    for (int k = 0; k < CRN_CONV3_OUT_CH; k++)
    {
        for (int i = 0; i < CRN_CONV3_OUT_LEN; i++)    fprintf(fdbgdump_enc3, "%d,", CRN_net->crn_conv3_outbuf[k][i]);
        fprintf(fdbgdump_enc3, "\n");
    }
    for (int k = 0; k < CRN_CONV4_OUT_CH; k++)
    {
        for (int i = 0; i < CRN_CONV4_OUT_LEN; i++)    fprintf(fdbgdump_enc4, "%d,", CRN_net->crn_conv4_outbuf[k][i]);
        fprintf(fdbgdump_enc4, "\n");
    }
    for (int k = 0; k < CRN_CONV5_OUT_CH; k++)
    {
        for (int i = 0; i < CRN_CONV5_OUT_LEN; i++)    fprintf(fdbgdump_enc5, "%d,", CRN_net->crn_conv5_outbuf[k][i]);
        fprintf(fdbgdump_enc5, "\n");
    }
#endif

    // Flatten the conv layer 4 output ([channel][bin]) into the input vector of GRU layer 0.
    CRN_CircullantD_reshape_before_gru(CRN_net->fx_GRU_Layer[0].inbuf,
                                    CRN_net->crn_conv5_outbuf,
                                    CRN_CONV5_OUT_CH,CRN_CONV5_OUT_LEN);

//#ifdef DBGDUMP
//   for (int i = 0; i < CRN_CONV5_OUT_CH*CRN_CONV5_OUT_LEN; i++)    fprintf(fdbgdump, "%d,", CRN_net->fx_GRU_Layer[0].inbuf[i]);
//   fprintf(fdbgdump, "\n");
//#endif

    // Two GRU layers; Init sets the input of layer 1 to the state buffer of layer 0.
    for (int i=0;i<2;i++)
        fx_CRN_GRU( CRN_net,
                    CRN_net->fx_GRU_Layer[i].state, CRN_net->fx_GRU_Layer[i].inbuf,
                    CRN_net->fx_GRU_Layer[i].W_krn_B,  CRN_net->fx_GRU_Layer[i].W_krn_G, CRN_net->fx_GRU_Layer[i].W_krn_S,  CRN_net->fx_GRU_Layer[i].P_krn_idx,
                    CRN_net->fx_GRU_Layer[i].W_rec_B,  CRN_net->fx_GRU_Layer[i].W_rec_G, CRN_net->fx_GRU_Layer[i].W_rec_S,  CRN_net->fx_GRU_Layer[i].P_rec_idx,
                    CRN_net->fx_GRU_Layer[i].Bkrn,  CRN_net->fx_GRU_Layer[i].Brec);

//#ifdef DBGDUMP
//   for (int i = 0; i < CRN_GRU_STATE_LEN; i++)    fprintf(fdbgdump, "%d,", CRN_net->fx_GRU_Layer[1].state[i]);
//   fprintf(fdbgdump, "\n");
//#endif

   // The state of GRU layer 1 is reshaped back to [channel][bin] and stored in crn_conv5_outbuf
   // starting at row CRN_CONV5_OUT_CH. This is the same memory GRU layer 0 used as its input,
   // and it lies inside the buffer that Init configures as the input of deconv layer 0.
   int(*gru_output)[CRN_DECONV5_OUT_LEN] = (int(*)[CRN_DECONV5_OUT_LEN])CRN_net->crn_conv5_outbuf[CRN_CONV5_OUT_CH];
      //(int(*)[CRN_DECONV5_OUT_LEN])(CRN_net->fx_deconv_Layer[0].inbuf.buf) + CRN_CONV5_OUT_LEN * CRN_DECONV5_OUT_CH / 2;
   CRN_CircullantD_reshape_after_gru(gru_output,
                                 CRN_net->fx_GRU_Layer[1].state,
                                 CRN_CONV5_OUT_CH, CRN_CONV5_OUT_LEN);

//#ifdef DBGDUMP
//   for (int k = 0; k < CRN_CONV5_OUT_CH; k++)
//   {
//       for (int i = 0; i < CRN_CONV5_OUT_LEN; i++)    fprintf(fdbgdump, "%d,", gru_output[k][i]);
//       fprintf(fdbgdump, "\n");
//   }
//#endif

//    for (int i = 0; i < 3; i++)
//        fx_CRN_deconv53_relu( &(CRN_net->fx_deconv_Layer[i].outbuf),&(CRN_net->fx_deconv_Layer[i].inbuf),
//                            CRN_net->fx_deconv_Layer[i].W,CRN_net->fx_deconv_Layer[i].B,CRN_net->fx_deconv_Layer[i].Wlen,
//                            CRN_net->GRUscratch1);      // scratch memory for deconv 1-3
    // Deconv layers 0..2 share one routine. The GRU has finished, so its scratch buffer
    // GRUscratch_inp1 is reused here.
    for (int i = 0; i < 3; i++)
        fx_CRN_DeconvPosNeg543_relu( &(CRN_net->fx_deconv_Layer[i].outbuf),&(CRN_net->fx_deconv_Layer[i].inbuf),
                            CRN_net->fx_deconv_Layer[i].W,CRN_net->fx_deconv_Layer[i].B,CRN_net->fx_deconv_Layer[i].Wlen,
                            CRN_net->GRUscratch_inp1);                                                              // scratch memory for deconv 1-3

    // Deconv layer 3 (outer i is used again from here on).
    i=3;
    fx_CRN_deconv2_relu( &(CRN_net->fx_deconv_Layer[i].outbuf),&(CRN_net->fx_deconv_Layer[i].inbuf),
                        CRN_net->fx_deconv_Layer[i].W,CRN_net->fx_deconv_Layer[i].B,CRN_net->fx_deconv_Layer[i].Wlen,
                        CRN_net->GRUscratch_inp1);                                                                 // scratch memory for deconv 4

#ifdef DBGDUMP
        // Debug only: dump both GRU states and the outputs of deconv layers 0..3. Each deconv
        // output is stored in its conv buffer starting at row CRN_CONVx_OUT_CH.
        for (int i = 0; i < CRN_GRU_STATE_LEN; i++)    fprintf(fdbgdump_gru1, "%d,", CRN_net->fx_GRU_Layer[0].state[i]);
        fprintf(fdbgdump_gru1, "\n");
        for (int i = 0; i < CRN_GRU_STATE_LEN; i++)    fprintf(fdbgdump_gru2, "%d,", CRN_net->fx_GRU_Layer[1].state[i]);
        fprintf(fdbgdump_gru2, "\n");

        for (int k = 0; k < CRN_CONV4_OUT_CH; k++)
        {
            for (int i = 0; i < CRN_CONV4_OUT_LEN; i++)    fprintf(fdbgdump_dec5, "%d,", CRN_net->crn_conv4_outbuf[k+CRN_CONV4_OUT_CH][i]);
            fprintf(fdbgdump_dec5, "\n");
        }
        for (int k = 0; k < CRN_CONV3_OUT_CH; k++)
        {
            for (int i = 0; i < CRN_CONV3_OUT_LEN; i++)    fprintf(fdbgdump_dec4, "%d,", CRN_net->crn_conv3_outbuf[k+CRN_CONV3_OUT_CH][i]);
            fprintf(fdbgdump_dec4, "\n");
        }
        for (int k = 0; k < CRN_CONV2_OUT_CH; k++)
        {
            for (int i = 0; i < CRN_CONV2_OUT_LEN; i++)    fprintf(fdbgdump_dec3, "%d,", CRN_net->crn_conv2_outbuf[k+CRN_CONV2_OUT_CH][i]);
            fprintf(fdbgdump_dec3, "\n");
        }
        for (int k = 0; k < CRN_CONV1_OUT_CH; k++)
        {
            for (int i = 0; i < CRN_CONV1_OUT_LEN; i++)    fprintf(fdbgdump_dec2, "%d,", CRN_net->crn_conv1_outbuf[k+CRN_CONV1_OUT_CH][i]);
            fprintf(fdbgdump_dec2, "\n");
        }
#endif

    // Deconv layer 4 writes into GRUscratch_inp1 (its outbuf, see Init),
    // so a different buffer, crn_conv3_outbuf, serves as scratch here.
    i=4;
    fx_CRN_deconv1_relu( &(CRN_net->fx_deconv_Layer[i].outbuf),&(CRN_net->fx_deconv_Layer[i].inbuf),
                        CRN_net->fx_deconv_Layer[i].W,CRN_net->fx_deconv_Layer[i].B,CRN_net->fx_deconv_Layer[i].Wlen,
                        (int*)CRN_net->crn_conv3_outbuf);                                                             // scratch memory for deconv 5

#ifdef DBGDUMP
   for (int i = 0; i < CRN_INPUT_LEN; i++)    fprintf(fdbgdump_dec1, "%d,", ((int*)(CRN_net->fx_deconv_Layer[4].outbuf.buf))[i]);
   fprintf(fdbgdump_dec1, "\n");
#endif

    // Rescale, clamp and narrow the final layer output into the caller's buffer.
    CRN_CircullantD_saturation(out_IRM,CRN_net->fx_deconv_Layer[4].outbuf.buf,CRN_INPUT_LEN);
}

void CRN_CircullantD_Init(void)
{
    CRN_network_mem.fx_conv_Layer[0].inbuf.buf=CRN_network_mem.crn_conv2_outbuf;
    CRN_network_mem.fx_conv_Layer[0].inbuf.len=CRN_INPUT_LEN;
    CRN_network_mem.fx_conv_Layer[0].inbuf.ch=CRN_INPUT_CH;
    CRN_network_mem.fx_conv_Layer[0].outbuf.buf=CRN_network_mem.crn_conv1_outbuf;
    CRN_network_mem.fx_conv_Layer[0].outbuf.len=CRN_CONV1_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[0].outbuf.ch=CRN_CONV1_OUT_CH;
    CRN_network_mem.fx_conv_Layer[0].W=enc1_W;
    CRN_network_mem.fx_conv_Layer[0].B=enc1_b;
    CRN_network_mem.fx_conv_Layer[0].Wlen=CRN_CONV1_KERNEL_LEN;
    CRN_network_mem.fx_conv_Layer[0].valid=true;

    CRN_network_mem.fx_conv_Layer[1].inbuf.buf=CRN_network_mem.crn_conv1_outbuf;
    CRN_network_mem.fx_conv_Layer[1].inbuf.len=CRN_CONV1_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[1].inbuf.ch=CRN_CONV1_OUT_CH;
    CRN_network_mem.fx_conv_Layer[1].outbuf.buf=CRN_network_mem.crn_conv2_outbuf;
    CRN_network_mem.fx_conv_Layer[1].outbuf.len=CRN_CONV2_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[1].outbuf.ch=CRN_CONV2_OUT_CH;
    CRN_network_mem.fx_conv_Layer[1].W=enc2_W;
    CRN_network_mem.fx_conv_Layer[1].B=enc2_b;
    CRN_network_mem.fx_conv_Layer[1].Wlen=CRN_CONV2_KERNEL_LEN;
    CRN_network_mem.fx_conv_Layer[1].valid=false;

    CRN_network_mem.fx_conv_Layer[2].inbuf.buf=CRN_network_mem.crn_conv2_outbuf;
    CRN_network_mem.fx_conv_Layer[2].inbuf.len=CRN_CONV2_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[2].inbuf.ch=CRN_CONV2_OUT_CH;
    CRN_network_mem.fx_conv_Layer[2].outbuf.buf=CRN_network_mem.crn_conv3_outbuf;
    CRN_network_mem.fx_conv_Layer[2].outbuf.len=CRN_CONV3_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[2].outbuf.ch=CRN_CONV3_OUT_CH;
    CRN_network_mem.fx_conv_Layer[2].W=enc3_W;
    CRN_network_mem.fx_conv_Layer[2].B=enc3_b;
    CRN_network_mem.fx_conv_Layer[2].Wlen=CRN_CONV3_KERNEL_LEN;
    CRN_network_mem.fx_conv_Layer[2].valid=false;

    CRN_network_mem.fx_conv_Layer[3].inbuf.buf=CRN_network_mem.crn_conv3_outbuf;
    CRN_network_mem.fx_conv_Layer[3].inbuf.len=CRN_CONV3_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[3].inbuf.ch=CRN_CONV3_OUT_CH;
    CRN_network_mem.fx_conv_Layer[3].outbuf.buf=CRN_network_mem.crn_conv4_outbuf;
    CRN_network_mem.fx_conv_Layer[3].outbuf.len=CRN_CONV4_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[3].outbuf.ch=CRN_CONV4_OUT_CH;
    CRN_network_mem.fx_conv_Layer[3].W=enc4_W;
    CRN_network_mem.fx_conv_Layer[3].B=enc4_b;
    CRN_network_mem.fx_conv_Layer[3].Wlen=CRN_CONV4_KERNEL_LEN;
    CRN_network_mem.fx_conv_Layer[3].valid=false;

    CRN_network_mem.fx_conv_Layer[4].inbuf.buf=CRN_network_mem.crn_conv4_outbuf;
    CRN_network_mem.fx_conv_Layer[4].inbuf.len=CRN_CONV4_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[4].inbuf.ch=CRN_CONV4_OUT_CH;
    CRN_network_mem.fx_conv_Layer[4].outbuf.buf=CRN_network_mem.crn_conv5_outbuf;
    CRN_network_mem.fx_conv_Layer[4].outbuf.len=CRN_CONV5_OUT_LEN;
    CRN_network_mem.fx_conv_Layer[4].outbuf.ch=CRN_CONV5_OUT_CH;
    CRN_network_mem.fx_conv_Layer[4].W=enc5_W;
    CRN_network_mem.fx_conv_Layer[4].B=enc5_b;
    CRN_network_mem.fx_conv_Layer[4].Wlen=CRN_CONV5_KERNEL_LEN;
    CRN_network_mem.fx_conv_Layer[4].valid=false;

    CRN_network_mem.fx_GRU_Layer[0].state=CRN_network_mem.crn_gru1_state;
    CRN_network_mem.fx_GRU_Layer[0].inbuf=CRN_network_mem.crn_conv5_outbuf[CRN_CONV5_OUT_CH];
    CRN_network_mem.fx_GRU_Layer[0].P_krn_idx=gru1_P_krn_idx;
    CRN_network_mem.fx_GRU_Layer[0].W_krn_B=gru1_W_krn_B;
    CRN_network_mem.fx_GRU_Layer[0].W_krn_G=gru1_W_krn_G;
    CRN_network_mem.fx_GRU_Layer[0].W_krn_S=gru1_W_krn_S;
    CRN_network_mem.fx_GRU_Layer[0].P_rec_idx=gru1_P_rec_idx;
    CRN_network_mem.fx_GRU_Layer[0].W_rec_B=gru1_W_rec_B;
    CRN_network_mem.fx_GRU_Layer[0].W_rec_G=gru1_W_rec_G;
    CRN_network_mem.fx_GRU_Layer[0].W_rec_S=gru1_W_rec_S;
    CRN_network_mem.fx_GRU_Layer[0].Bkrn=gru1_Bkrn;
    CRN_network_mem.fx_GRU_Layer[0].Brec=gru1_Brec;

    CRN_network_mem.fx_GRU_Layer[1].state=CRN_network_mem.crn_gru2_state;
    CRN_network_mem.fx_GRU_Layer[1].inbuf=CRN_network_mem.crn_gru1_state;
    CRN_network_mem.fx_GRU_Layer[1].P_krn_idx=gru2_P_krn_idx;
    CRN_network_mem.fx_GRU_Layer[1].W_krn_B=gru2_W_krn_B;
    CRN_network_mem.fx_GRU_Layer[1].W_krn_G=gru2_W_krn_G;
    CRN_network_mem.fx_GRU_Layer[1].W_krn_S=gru2_W_krn_S;
    CRN_network_mem.fx_GRU_Layer[1].P_rec_idx=gru2_P_rec_idx;
    CRN_network_mem.fx_GRU_Layer[1].W_rec_B=gru2_W_rec_B;
    CRN_network_mem.fx_GRU_Layer[1].W_rec_G=gru2_W_rec_G;
    CRN_network_mem.fx_GRU_Layer[1].W_rec_S=gru2_W_rec_S;
    CRN_network_mem.fx_GRU_Layer[1].Bkrn=gru2_Bkrn;
    CRN_network_mem.fx_GRU_Layer[1].Brec=gru2_Brec;

    CRN_network_mem.fx_deconv_Layer[0].inbuf.buf=CRN_network_mem.crn_conv5_outbuf;
    CRN_network_mem.fx_deconv_Layer[0].inbuf.len=CRN_DECONV5_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[0].inbuf.ch=CRN_DECONV5_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[0].outbuf.buf=CRN_network_mem.crn_conv4_outbuf+CRN_CONV4_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[0].outbuf.len=CRN_DECONV4_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[0].outbuf.ch=CRN_CONV4_OUT_CH;          // Yes, CRN_CONV4_OUT_CH - putput is only half of it (see outbuf.buf above)
    CRN_network_mem.fx_deconv_Layer[0].W=dec5_W;
    CRN_network_mem.fx_deconv_Layer[0].B=dec5_b;
    CRN_network_mem.fx_deconv_Layer[0].Wlen=CRN_CONV5_KERNEL_LEN;
    CRN_network_mem.fx_deconv_Layer[0].valid=false;

    CRN_network_mem.fx_deconv_Layer[1].inbuf.buf=CRN_network_mem.crn_conv4_outbuf;
    CRN_network_mem.fx_deconv_Layer[1].inbuf.len=CRN_DECONV4_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[1].inbuf.ch=CRN_DECONV4_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[1].outbuf.buf=CRN_network_mem.crn_conv3_outbuf+CRN_CONV3_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[1].outbuf.len=CRN_DECONV3_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[1].outbuf.ch=CRN_CONV3_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[1].W=dec4_W;
    CRN_network_mem.fx_deconv_Layer[1].B=dec4_b;
    CRN_network_mem.fx_deconv_Layer[1].Wlen=CRN_CONV4_KERNEL_LEN;
    CRN_network_mem.fx_deconv_Layer[1].valid=false;

    CRN_network_mem.fx_deconv_Layer[2].inbuf.buf=CRN_network_mem.crn_conv3_outbuf;
    CRN_network_mem.fx_deconv_Layer[2].inbuf.len=CRN_DECONV3_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[2].inbuf.ch=CRN_DECONV3_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[2].outbuf.buf=CRN_network_mem.crn_conv2_outbuf+CRN_CONV2_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[2].outbuf.len=CRN_DECONV2_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[2].outbuf.ch=CRN_CONV2_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[2].W=dec3_W;
    CRN_network_mem.fx_deconv_Layer[2].B=dec3_b;
    CRN_network_mem.fx_deconv_Layer[2].Wlen=CRN_CONV3_KERNEL_LEN;
    CRN_network_mem.fx_deconv_Layer[2].valid=false;

    CRN_network_mem.fx_deconv_Layer[3].inbuf.buf=CRN_network_mem.crn_conv2_outbuf;
    CRN_network_mem.fx_deconv_Layer[3].inbuf.len=CRN_DECONV2_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[3].inbuf.ch=CRN_DECONV2_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[3].outbuf.buf=CRN_network_mem.crn_conv1_outbuf+CRN_CONV1_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[3].outbuf.len=CRN_DECONV1_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[3].outbuf.ch=CRN_CONV1_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[3].W=dec2_W;
    CRN_network_mem.fx_deconv_Layer[3].B=dec2_b;
    CRN_network_mem.fx_deconv_Layer[3].Wlen=CRN_CONV2_KERNEL_LEN;
    CRN_network_mem.fx_deconv_Layer[3].valid=false;

    CRN_network_mem.fx_deconv_Layer[4].inbuf.buf=CRN_network_mem.crn_conv1_outbuf;
    CRN_network_mem.fx_deconv_Layer[4].inbuf.len=CRN_DECONV1_OUT_LEN;
    CRN_network_mem.fx_deconv_Layer[4].inbuf.ch=CRN_DECONV1_OUT_CH;
    CRN_network_mem.fx_deconv_Layer[4].outbuf.buf=CRN_network_mem.GRUscratch_inp1;
    CRN_network_mem.fx_deconv_Layer[4].outbuf.len=CRN_INPUT_LEN;
    CRN_network_mem.fx_deconv_Layer[4].outbuf.ch=1;
    CRN_network_mem.fx_deconv_Layer[4].W=dec1_W;
    CRN_network_mem.fx_deconv_Layer[4].B=dec1_b;
    CRN_network_mem.fx_deconv_Layer[4].Wlen=CRN_CONV1_KERNEL_LEN;
    CRN_network_mem.fx_deconv_Layer[4].valid=true;

    //SEC_AudioRD_FFT_NEON_Init(8);

    // debug
#ifdef DBGDUMP
    fdbgdump_enc1= fopen(filedbgdump_enc1, "wb");
    fdbgdump_enc2= fopen(filedbgdump_enc2, "wb");
    fdbgdump_enc3= fopen(filedbgdump_enc3, "wb");
    fdbgdump_enc4= fopen(filedbgdump_enc4, "wb");
    fdbgdump_enc5= fopen(filedbgdump_enc5, "wb");

    fdbgdump_gru1 = fopen(filedbgdump_gru1, "wb");
    fdbgdump_gru2 = fopen(filedbgdump_gru2, "wb");

    fdbgdump_dec1 = fopen(filedbgdump_dec1, "wb");
	fdbgdump_dec2 = fopen(filedbgdump_dec2, "wb");
	fdbgdump_dec3 = fopen(filedbgdump_dec3, "wb");
	fdbgdump_dec4 = fopen(filedbgdump_dec4, "wb");
	fdbgdump_dec5= fopen(filedbgdump_dec5, "wb");
#endif

//    return (void*)(&CRN_network_mem);
}

