#ifndef _WINO_CELL_HPP_
#define _WINO_CELL_HPP_

#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>

#include "wino_buffer.cpp"



#define IN_WIDTH 8
#define W_WIDTH 8
#define OUT_WIDTH 18

#define B_WIDTH_IN 8
#define B_WIDTH_OUT 12
#define BT_WIDTH_IN 12
#define BT_WIDTH_OUT 16

#define GTGG_WIDTH_IN 16
#define GTGG_WIDTH_W 16
#define GTGG_WIDTH_OUT 24

#define A_WIDTH_IN 24
#define A_WIDTH_OUT 28

#define AT_WIDTH_IN 28
#define AT_WIDTH_OUT 32








// One processing element of the 6x6-tile Winograd streaming array.
//
// Data movement:
//   * every word read from top_stream_in is forwarded unchanged to
//     bottom_stream_out;
//   * every word read from left_stream_in is forwarded unchanged to
//     right_stream_out.
//
// Computation, once per ITER4 iteration:
//   1. unpack the two buffered input words into 36 positions x 2 lanes each;
//   2. unpack one weight word into G1[36][2];
//   3. UV = G1[.][0]*input_word0 + G1[.][1]*input_word1, element-wise over the
//      36 positions, separately for each lane;
//   4. reduce the 6x6 UV tile to 4x4 (16 values) per lane with the two
//      add/shift passes below (UVA, then ATUVA);
//   5. add the result to (or, on the first pass, overwrite) out_buffer.
//
// Parameters:
//   top_stream_in / bottom_stream_out
//       Input words of GTGG_WIDTH_IN*2*36 bits. Slot i occupies bits
//       [i*2*GTGG_WIDTH_IN, (i+1)*2*GTGG_WIDTH_IN): the low half is lane 0,
//       the high half is lane 1.
//   left_stream_in / right_stream_out
//       Weight words of W_WIDTH*2*36 bits, one consumed per ITER4 iteration.
//   out_buffer
//       16 banks x 1024 entries; each entry packs two 18-bit accumulators
//       (bits 17..0 = lane 0, bits 35..18 = lane 1).
//   weight_indepth_load_number   trip count of ITER0
//   weight_outdepth_load_number  trip count of ITER1
//   row_tile_number              trip count of ITER2
//   weight_indepth_feed_size     ITER3 runs weight_indepth_feed_size/2 times
//   weight_outdepth_feed_size    trip count of ITER4
//   ROW_IDX, COL_IDX             (DEBUG_FILE_PRINT only) used to build the
//                                names of the debug dump files
//
// Stream consumption: two input words are read before the loop nest, then up
// to two more per ITER3 iteration (only while iter4 < 2). One weight word is
// read per ITER4 iteration.
// NOTE(review): the words prefetched during the very last ITER3 iteration are
// forwarded downstream but never used for computation here.
void wino6x6_stream_cell(
		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in,
		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out,
		hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
		hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
		ap_uint<OUT_WIDTH*2> out_buffer[16][1024],
		ap_int<16> weight_indepth_load_number,
		ap_int<16> weight_outdepth_load_number,
		ap_int<16> weight_indepth_feed_size,
		ap_int<16> weight_outdepth_feed_size,
		ap_int<16> row_tile_number
		#if DEBUG_FILE_PRINT
		,int ROW_IDX, int COL_IDX
		#endif

		)
{

#pragma HLS array_partition variable = out_buffer dim=1 complete
	// Two register sets of two input words each. One set feeds the
	// computation while the other is being refilled from top_stream_in.
	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg0[2];
	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg1[2];
#pragma HLS array_partition variable = stream_temp_reg0 complete
#pragma HLS array_partition variable = stream_temp_reg1 complete
	#if DEBUG_FILE_PRINT
	// Debug builds only: fill both register sets with a 0xAA byte pattern.
	memset(stream_temp_reg0,0xAA,2*2*36*2);
	memset(stream_temp_reg1,0xAA,2*2*36*2);
	#endif

	// Weight values of the current iteration: 36 positions x 2 values.
	ap_int<W_WIDTH> G1[36][2];
	#pragma HLS array_partition variable=G1
	
	// 0: compute from stream_temp_reg0 and refill stream_temp_reg1.
	// 1: compute from stream_temp_reg1 and refill stream_temp_reg0.
	ap_uint<1> stream_pingpong_flag=0;

	// Counters handed to the debug dump helpers (used only under
	// DEBUG_FILE_PRINT in this function).
	int write_idx=0;
	int weight_stream_idx=0;
	int input_stream_idx =0;

	#if DEBUG_FILE_PRINT
	char filename[100];
	sprintf(filename,"wino_cell_output_%d_%d.txt",ROW_IDX, COL_IDX);
	char input_stream_filename[100];
	sprintf(input_stream_filename,"wino_cell_input_%d_%d.txt",ROW_IDX, COL_IDX);
	char weight_stream_filename[100];
	sprintf(weight_stream_filename,"wino_cell_weight_%d_%d.txt",ROW_IDX, COL_IDX);
	#endif

	// Prologue: load the first pair of input words into register set 0 and
	// forward them downstream before entering the loop nest.
	top_stream_in>>stream_temp_reg0[0];
	bottom_stream_out<<stream_temp_reg0[0];
	#if DEBUG_FILE_PRINT
	wino_cell_stream_input<0>(stream_temp_reg0[0],input_stream_idx,input_stream_filename);
	#endif
	top_stream_in>>stream_temp_reg0[1];
	bottom_stream_out<<stream_temp_reg0[1];
	#if DEBUG_FILE_PRINT
	wino_cell_stream_input<0>(stream_temp_reg0[1],input_stream_idx,input_stream_filename);
	#endif




	ITER0:for(int iter0=0;iter0<weight_indepth_load_number;iter0++)
	ITER1:for(int iter1=0;iter1<weight_outdepth_load_number;iter1++)
	ITER2:for(int iter2=0;iter2<row_tile_number;iter2++)
	ITER3:for(int iter3=0;iter3<weight_indepth_feed_size/2;iter3++)
	{
	ITER4:for(int iter4=0;iter4<weight_outdepth_feed_size;iter4++)
	{

		#pragma HLS pipeline
		// The two pragmas below tell the tool to ignore read/write
		// dependences on out_buffer.
		// NOTE(review): this relies on the same address not being revisited
		// while an earlier update is still in flight - confirm for small
		// weight_outdepth_feed_size.
#pragma HLS dependence variable=out_buffer inter false
#pragma HLS dependence variable=out_buffer intra false
			// Unpacked input: [word 0/1][position 0..35][lane 0/1].
			ap_int<GTGG_WIDTH_IN> stream_temp_array[2][36][2];
			#pragma HLS array_partition variable = stream_temp_array complete
			for(int i=0;i<36;i++)
			{
			#pragma HLS unroll
				// Select the register set that is currently the "compute" set.
				if(stream_pingpong_flag)
				{
					stream_temp_array[0][i][0]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[0][i][1]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
					stream_temp_array[1][i][0]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[1][i][1]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
				}
				else
				{
					stream_temp_array[0][i][0]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[0][i][1]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
					stream_temp_array[1][i][0]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[1][i][1]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
				}
			}

			// During the first two ITER4 iterations, refill the idle register
			// set (slot iter4) with the next input word and forward it.
			if(iter4<2 )
			{
				if(stream_pingpong_flag)
				{
					top_stream_in>>stream_temp_reg0[iter4];
					bottom_stream_out<<stream_temp_reg0[iter4];
					#if DEBUG_FILE_PRINT
					wino_cell_stream_input<0>(stream_temp_reg0[iter4],input_stream_idx,input_stream_filename);
					#endif
					
				}
				else
				{
					top_stream_in>>stream_temp_reg1[iter4];
					bottom_stream_out<<stream_temp_reg1[iter4];
					#if DEBUG_FILE_PRINT
					wino_cell_stream_input<0>(stream_temp_reg1[iter4],input_stream_idx,input_stream_filename);
					#endif
				}			
			}


			// Fetch this iteration's weight word and pass it to the right.
			ap_uint<W_WIDTH*2*36> weight_value_temp;
			left_stream_in>>weight_value_temp;
			right_stream_out<<weight_value_temp;


			// Weight word layout: bits [0,288) hold the 36 values multiplied
			// with input word 0, bits [288,576) those for input word 1
			// (288 = 36 * W_WIDTH with W_WIDTH == 8).
			for(int i=0;i<36;i++)
			{
			#pragma HLS unroll
				G1[i][0]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1,i*W_WIDTH);
				G1[i][1]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1+288,i*W_WIDTH +288 );
			}

			// Element-wise products, summed over the two input words,
			// computed independently for lane 0 and lane 1.
			ap_int<GTGG_WIDTH_OUT> UV[36][2];
			#pragma HLS array_partition variable=UV complete
			
			for(int i=0;i<36;i++)
			{
			#pragma HLS unroll
				UV[i][0] = G1[i][0]*stream_temp_array[0][i][0] + G1[i][1]*stream_temp_array[1][i][0];
				UV[i][1] = G1[i][0]*stream_temp_array[0][i][1] + G1[i][1]*stream_temp_array[1][i][1];
			}

			// First reduction pass: each row of the 6x6 tile (UV[i*6 .. i*6+5])
			// is reduced to 4 values using the coefficient rows
			//   [1 1 1 1 1 0], [0 1 -1 2 -2 0], [0 1 1 4 4 0], [0 1 -1 8 -8 1].
			ap_int<A_WIDTH_OUT>  UVA[6][4][2];
			#pragma HLS array_partition variable=UVA complete

			for(int b_idx=0;b_idx<2;b_idx++)
			{
			#pragma HLS unroll
				for(int i=0;i<6;i++)
				{
			#pragma HLS unroll
					UVA[i][0][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+0][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx];
					UVA[i][1][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<1);
					UVA[i][2][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<2);
					UVA[i][3][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<3)+(ap_int<A_WIDTH_OUT>) UV[i*6+5][b_idx];
				}
			}


			// Accumulator address within each out_buffer bank, built from
			// iter1 (major), iter4 and iter2 (minor).
			int address = iter1*row_tile_number* weight_outdepth_feed_size + iter4*row_tile_number + iter2;
			

			// Read the current accumulator contents of all 16 banks.
			ap_int<36> outbuffer_value[16];
			#pragma HLS array_partition variable= outbuffer_value complete
			for(int i=0;i<16;i++)
			{
				#pragma HLS unroll
				outbuffer_value[i]=out_buffer[i][address];
			}



			// Second reduction pass: the same four coefficient rows applied
			// across the 6 rows of UVA, giving 4x4 = 16 outputs per lane.
			// ATUVA[r*4+i] combines UVA[0..5][i] with coefficient row r.
			ap_int<AT_WIDTH_OUT> ATUVA[16][2];
			#pragma HLS array_partition variable=ATUVA complete


			for(int b_idx=0;b_idx<2;b_idx++)
			{
			#pragma HLS unroll
				for(int i=0;i<4;i++)
				{
				#pragma HLS unroll
					ATUVA[0+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[0][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx];
					ATUVA[4+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<1);
					ATUVA[8+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<2);
					ATUVA[12+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<3) + (ap_int<AT_WIDTH_OUT>) UVA[5][i][b_idx];
				}
			}
			#if DEBUG_FILE_PRINT
			attach_output_vector<0>(ATUVA,write_idx,filename);
			#endif

			// Previous partial sums. On the first pass over an address
			// (iter0 == 0 and iter3 == 0) the stored value is ignored, so the
			// buffer does not need to be cleared beforehand.
			ap_int<18> out_residual[16][2];

			if(iter0 ==0 && iter3 ==0)
			{
				for(int i=0;i<16;i++)
				{
				#pragma HLS unroll
					out_residual[i][0]=0;
					out_residual[i][1]=0;
				}
			}
			else
			{
				for(int i=0;i<16;i++)
				{
				#pragma HLS unroll
					out_residual[i][0]=outbuffer_value[i].range(17,0);
					out_residual[i][1]=outbuffer_value[i].range(35,18);
				}
			}


			// New partial sums; the sum is stored into 18 bits, so any higher
			// bits of the result are dropped.
			ap_int<18> outbuffer_writeback_value[16][2];
			#pragma HLS array_partition variable = outbuffer_writeback_value complete dim=1
			#pragma HLS array_partition variable = outbuffer_writeback_value complete dim=2
			for(int i=0;i<16;i++)
			{
			#pragma HLS unroll
				outbuffer_writeback_value[i][0]=ATUVA[i][0]+out_residual[i][0];
				outbuffer_writeback_value[i][1]=ATUVA[i][1]+out_residual[i][1];
			}


			// Re-pack both lanes into one 36-bit word: lane 1 in the upper
			// 18 bits, lane 0 in the lower 18 bits (the comma between ap_int
			// values is the concatenation operator).
			ap_int<18*2> outbuffer_writeback_value_batch[16];
			#pragma HLS array_partition variable = outbuffer_writeback_value_batch complete
			for(int i=0;i<16;i++)
			{
				#pragma HLS unroll
				outbuffer_writeback_value_batch[i]=(outbuffer_writeback_value[i][1],outbuffer_writeback_value[i][0]  );
			}

			// Write the updated accumulators back to all 16 banks.
			for(int i=0;i<16;i++)
			{
#pragma HLS unroll
				out_buffer[i][address]=outbuffer_writeback_value_batch[i];
			}
		}
		// Swap the roles of the two register sets for the next ITER3 iteration.
		stream_pingpong_flag=~stream_pingpong_flag;
	}

}


//template<int dummy>
// Sink that terminates a column of wino6x6_stream_cell instances: it reads and
// discards every word the last cell writes to its bottom_stream_out, so that
// cell never blocks on a full stream.
//
// Parameters mirror the loop bounds of wino6x6_stream_cell:
//   top_stream_in                stream to drain
//   weight_indepth_load_number   ITER0 trip count of the cell
//   weight_outdepth_load_number  ITER1 trip count of the cell
//   weight_indepth_feed_size     the cell runs ITER3 weight_indepth_feed_size/2 times
//   row_tile_number              ITER2 trip count of the cell
//
// Word count: the cell forwards 2 words before its loop nest and 2 more per
// ITER3 iteration. The per-iteration term is therefore written as
// (weight_indepth_feed_size/2)*2 rather than weight_indepth_feed_size, which
// would over-count by one word per tile for an odd feed size and leave this
// sink blocked on a read that never completes. For even feed sizes the count
// is unchanged.
// NOTE(review): this assumes the cell's weight_outdepth_feed_size is >= 2
// (the cell only prefetches while iter4 < 2); that value is not passed here.
void wino6x6_stream_bottomend(
	hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in,
	ap_int<16> weight_indepth_load_number,
	ap_int<16> weight_outdepth_load_number,
	ap_int<16> weight_indepth_feed_size,
	ap_int<16> row_tile_number
)
{
	ap_uint<GTGG_WIDTH_IN*2*36> dummy_temp;
	for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*(weight_indepth_feed_size/2)*2*row_tile_number+2;i++)
	{
		// Blocking read; the value is intentionally dropped.
		top_stream_in.read(dummy_temp);
	}
}

//template<int dummy>
// Sink that terminates a row of wino6x6_stream_cell instances: it reads and
// discards every weight word the last cell writes to its right_stream_out.
//
// Parameters mirror the loop bounds of wino6x6_stream_cell:
//   left_stream_in               stream to drain
//   weight_indepth_load_number   ITER0 trip count of the cell
//   weight_outdepth_load_number  ITER1 trip count of the cell
//   weight_indepth_feed_size     the cell runs ITER3 weight_indepth_feed_size/2 times
//   weight_outdepth_feed_size    ITER4 trip count of the cell
//   row_tile_number              ITER2 trip count of the cell
//
// The cell forwards exactly one weight word per ITER4 iteration, i.e.
//   load_in * load_out * row_tiles * (feed_in/2) * feed_out
// words in total. The halving is applied to weight_indepth_feed_size alone
// (parenthesised) so the count matches the cell's ITER3 bound; dividing the
// running product by 2 instead gives a larger count when the feed size is odd
// and the preceding factors are even, which would leave this sink blocked.
void wino6x6_stream_rightend(
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in,
	ap_int<16> weight_indepth_load_number,
	ap_int<16> weight_outdepth_load_number,
	ap_int<16> weight_indepth_feed_size,
	ap_int<16> weight_outdepth_feed_size,
	ap_int<16> row_tile_number
)
{
	ap_uint<W_WIDTH*2*36>dummy_temp;
	for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*(weight_indepth_feed_size/2)*weight_outdepth_feed_size*row_tile_number;i++)
	{
		// Blocking read; the value is intentionally dropped.
		left_stream_in.read(dummy_temp);
	}
}





//template<int dummy>
// One processing element operating on 4x4 tiles (16 values per word).
//
// Per loop iteration the cell
//   * reads one input word from top_stream_in and forwards it to bottom_stream_out,
//   * reads one weight word from left_stream_in and forwards it to right_stream_out,
//   * multiplies the (optionally transformed) 16 inputs element-wise with the
//     16 weights, and
//   * accumulates the result into out_buffer at index `counter`.
//
// Parameters:
//   top_stream_in / bottom_stream_out  input words, 16 x IN_WIDTH bits
//   left_stream_in / right_stream_out  weight words, 16 x W_WIDTH bits
//   out_buffer   16 banks x 1024 entries of accumulators
//   wino_flag    1: apply the input transform before and the output reduction
//                   after the element-wise product; only banks 0..3 are updated.
//                0: plain element-wise product; all 16 banks are updated.
//   number       number of words to process; also the number of out_buffer
//                entries touched per bank.
//                NOTE(review): nothing here checks number <= 1024.
void wino_stream_ceil_4x4(
		hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in,
		hls::stream< ap_uint<IN_WIDTH*16> > & bottom_stream_out,
		hls::stream< ap_uint<W_WIDTH*16> > &left_stream_in,
		hls::stream< ap_uint<W_WIDTH*16> > &right_stream_out,
		ap_uint<OUT_WIDTH> out_buffer[16][1024],
		ap_uint<1> wino_flag,
		int number)
{

	#pragma HLS array_partition variable=out_buffer dim=1 complete

	// Unpacked weights and inputs of the current word, element k = row*4 + column.
	ap_int<W_WIDTH> G1[16];
	#pragma HLS array_partition variable=G1
	ap_int<IN_WIDTH> in[16];
	#pragma HLS array_partition variable=in

	for(int counter=0;counter<number;counter++)
	{
		#pragma HLS pipeline
		ap_uint<IN_WIDTH*16> stream_in_temp;
		ap_uint<W_WIDTH*16> stream_weight_temp;
		// Pass both words on to the neighbouring cells before using them.
		top_stream_in>>stream_in_temp;
		bottom_stream_out<<stream_in_temp;
		left_stream_in>>stream_weight_temp;
		right_stream_out<<stream_weight_temp;

		// Split the packed words into 16 signed elements each.
		// NOTE(review): the unroll factor (36) exceeds the trip count (16).
		for(int k=0;k<16;k++)
		{
		#pragma HLS unroll factor=36
			G1[k].range(W_WIDTH-1,0)=stream_weight_temp.range(W_WIDTH-1+k*W_WIDTH,k*W_WIDTH);
			in[k].range(IN_WIDTH-1,0)=stream_in_temp.range(IN_WIDTH-1+k*IN_WIDTH,k*IN_WIDTH);
		}

		// Row-wise input transform: each row of the 4x4 input tile is combined
		// with the coefficients [1 0 -1 0], [0 1 1 0], [0 -1 1 0], [0 1 0 -1].
		ap_int<B_WIDTH_OUT>  dB[4][4];
		#pragma HLS array_partition variable=dB complete
		for(int i=0;i<4;i++)
		{
		#pragma HLS unroll
			dB[i][0]=in[i*4]-in[i*4+2];
			dB[i][1]=in[i*4+1]+in[i*4+2];
			dB[i][2]=-in[i*4+1]+in[i*4+2];
			dB[i][3]=in[i*4+1]-in[i*4+3];
		}

		ap_int<BT_WIDTH_OUT>  BTdB[4][4];
		#pragma HLS array_partition variable=BTdB complete

		if(wino_flag)
		{
			// Column-wise pass with the same coefficients; BTdB[i][k] is built
			// from column i of dB.
			for(int i=0;i<4;i++)
			{
			#pragma HLS unroll
				BTdB[i][0]=dB[0][i]-dB[2][i];
				BTdB[i][1]=dB[1][i]+dB[2][i];
				BTdB[i][2]=-dB[1][i]+dB[2][i];
				BTdB[i][3]=dB[1][i]-dB[3][i];
			}
		}
		else
		{
			// Pass-through mode: no transform, copy the raw inputs.
			for(int i=0;i<4;i++)
			{
			#pragma HLS unroll
				BTdB[i][0]=in[i*4+0];
				BTdB[i][1]=in[i*4+1];
				BTdB[i][2]=in[i*4+2];
				BTdB[i][3]=in[i*4+3];
			}
		}

		// Element-wise product with the weights.
		ap_int<GTGG_WIDTH_OUT> UV[4][4];
		#pragma HLS array_partition variable=UV complete

		for(int i=0;i<4;i++){
		#pragma HLS unroll
			for(int j=0;j<4;j++){
			#pragma HLS unroll
				if(wino_flag)
					UV[i][j]=BTdB[i][j]*G1[i*4+j];
				else
					// `in` is a flat 16-element array, so element (i,j) is
					// in[i*4+j]. Writing in[i][j] would select bit j of in[i].
					UV[i][j]=in[i*4+j]*G1[i*4+j];
			}
		}

		// First output reduction: 4 columns -> 2, using [1 1 1 0] and [0 1 -1 -1].
		ap_int<A_WIDTH_OUT>  UVA[4][2];
		#pragma HLS array_partition variable=UVA complete

		for(int i=0;i<4;i++)
		{
		#pragma HLS unroll
			UVA[i][0]=UV[i][0]+UV[i][1]+UV[i][2];
			UVA[i][1]=UV[i][1]-UV[i][2]-UV[i][3];
		}

		// Second output reduction: 4 rows -> 2. Only entries 0..3 are produced
		// and they are consumed only when wino_flag is set.
		ap_int<AT_WIDTH_OUT> ATUVA[16];
		#pragma HLS array_partition variable=ATUVA complete

		if(wino_flag)
		{
			for(int i=0;i<2;i++)
			{
			#pragma HLS unroll
				ATUVA[0+i]=UVA[0][i]+UVA[1][i]+UVA[2][i];
				ATUVA[2+i]=UVA[1][i]-UVA[2][i]-UVA[3][i];
			}

			// Accumulate the 2x2 result into banks 0..3.
			for(int i=0;i<4;i++)
			{
			#pragma HLS unroll
				out_buffer[i][counter]=out_buffer[i][counter]+ATUVA[i];
			}
		}
		else
		{
			// Accumulate all 16 raw products, bank index = i*4 + j.
			for(int i=0;i<4;i++)
			for(int j=0;j<4;j++)
			{
			#pragma HLS unroll
				out_buffer[i*4+j][counter]=out_buffer[i*4+j][counter]+UV[i][j];
			}
		}
	}
}





// Placeholder for a combined cell taking two input streams and eight weight
// streams. The body is currently empty: no stream is read or written and
// out_buffer is left untouched.
// NOTE(review): calling this as-is performs no work - confirm whether an
// implementation is still planned.
void wino_stream_cell_combined(
	hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in0,
	hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in1,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_0,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_1,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_2,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_3,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_4,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_5,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_6,
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_7,
	ap_uint<OUT_WIDTH*2> out_buffer[16][16][1024],
	ap_int<16> weight_indepth_load_number,
	ap_int<16> weight_outdepth_load_number,
	ap_int<16> weight_indepth_feed_size,
	ap_int<16> weight_outdepth_feed_size,
	ap_int<16> row_tile_number
)
{

}


//template<int dummy>
/*
void wino_systolic(
	ap_uint<16> input_buffer[8][16][INPUT_BUFFER_DEPTH],
	ap_uint<OUT_WIDTH*2> output_buffer0[4*WEIGHT_FEED_NUMBER_PER_PORT*INPUT_FEED_NUMBER][16][OUTPUT_BUFFER_DEPTH],
	ap_uint<128>* weight_DDR0,
	ap_uint<128>* weight_DDR1,
	ap_uint<128>* weight_DDR2,
	ap_uint<128>* weight_DDR3,
	ap_uint<16> input_height,
	ap_uint<16> input_width,
	ap_uint<16> input_depth,
	ap_uint<16> input_width_ceildiv_16,
	ap_uint<16> input_depth_align8,
	ap_uint<16> output_height,
	ap_uint<16> output_width,
	ap_uint<16> output_depth,
	ap_uint<8> kernel_window_size,
	ap_uint<8> pad_size,
	ap_uint<16> weight_indepth_load_number,
	ap_uint<16> weight_outdepth_load_number,
	ap_uint<16> weight_outdepth_feed_size,
	ap_uint<16> start_row_idx,
	//weight parameters
	ap_uint<16> weight_total_load_number,
	ap_uint<16> weight_total_feed_size,
	ap_uint<16> ddr_load_length,
	ap_uint<16> ddr_load_length_per_feed,
	ap_uint<16> row_repeat_times,
	ap_uint<16> first_flag,
	ap_uint<16> last_flag

)
{
#pragma HLS interface m_axi port = weight_DDR3
#pragma HLS interface m_axi port = weight_DDR2
#pragma HLS interface m_axi port = weight_DDR1
#pragma HLS interface m_axi port = weight_DDR0

#pragma HLS array_partition variable=output_buffer0 dim=1 complete
#pragma HLS array_partition variable=output_buffer0 dim=2 complete

#pragma HLS array_partition variable=input_buffer dim=1 complete
#pragma HLS array_partition variable=input_buffer dim=2 complete

#pragma HLS dataflow



	static hls::stream<ap_uint<32*36> > indata_out_0_0("indata_out_0_0");
	#pragma HLS stream variable=indata_out_0_0 depth=16
	static hls::stream<ap_uint<32*36> > indata_out_1_0("indata_out_1_0");
	#pragma HLS stream variable=indata_out_1_0 depth=16
	static hls::stream<ap_uint<32*36> > indata_out_2_0("indata_out_2_0");
	#pragma HLS stream variable=indata_out_2_0 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_3_0("indata_out_3_0");
	#pragma HLS stream variable=indata_out_3_0 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_4_0("indata_out_4_0");
	#pragma HLS stream variable=indata_out_4_0 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_5_0("indata_out_5_0");
	#pragma HLS stream variable=indata_out_5_0 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_6_0("indata_out_6_0");
	#pragma HLS stream variable=indata_out_6_0 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_7_0("indata_out_7_0");
	#pragma HLS stream variable=indata_out_7_0 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_8_0("indata_out_8_0");
	#pragma HLS stream variable=indata_out_8_0 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_0_1("indata_out_0_1");
	#pragma HLS stream variable=indata_out_0_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_1_1("indata_out_1_1");
	#pragma HLS stream variable=indata_out_1_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_2_1("indata_out_2_1");
	#pragma HLS stream variable=indata_out_2_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_3_1("indata_out_3_1");
	#pragma HLS stream variable=indata_out_3_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_4_1("indata_out_4_1");
	#pragma HLS stream variable=indata_out_4_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_5_1("indata_out_5_1");
	#pragma HLS stream variable=indata_out_5_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_6_1("indata_out_6_1");
	#pragma HLS stream variable=indata_out_6_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_7_1("indata_out_7_1");
	#pragma HLS stream variable=indata_out_7_1 depth=1
	static hls::stream<ap_uint<32*36> > indata_out_8_1("indata_out_8_1");
	#pragma HLS stream variable=indata_out_8_1 depth=1


	static hls::stream<ap_uint<16*36> > weightdata_out_0_0("weightdata_out_0_0");
	#pragma HLS stream variable=weightdata_out_0_0 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_1_0("weightdata_out_1_0");
	#pragma HLS stream variable=weightdata_out_1_0 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_2_0("weightdata_out_2_0");
	#pragma HLS stream variable=weightdata_out_2_0 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_3_0("weightdata_out_3_0");
	#pragma HLS stream variable=weightdata_out_3_0 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_4_0("weightdata_out_4_0");
	#pragma HLS stream variable=weightdata_out_4_0 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_5_0("weightdata_out_5_0");
	#pragma HLS stream variable=weightdata_out_5_0 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_6_0("weightdata_out_6_0");
	#pragma HLS stream variable=weightdata_out_6_0 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_7_0("weightdata_out_7_0");
	#pragma HLS stream variable=weightdata_out_7_0 depth=2 off


	static hls::stream<ap_uint<16*36> > weightdata_out_0_1("weightdata_out_0_1");
	#pragma HLS stream variable=weightdata_out_0_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_1_1("weightdata_out_1_1");
	#pragma HLS stream variable=weightdata_out_1_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_2_1("weightdata_out_2_1");
	#pragma HLS stream variable=weightdata_out_2_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_3_1("weightdata_out_3_1");
	#pragma HLS stream variable=weightdata_out_3_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_4_1("weightdata_out_4_1");
	#pragma HLS stream variable=weightdata_out_4_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_5_1("weightdata_out_5_1");
	#pragma HLS stream variable=weightdata_out_5_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_6_1("weightdata_out_6_1");
	#pragma HLS stream variable=weightdata_out_6_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_7_1("weightdata_out_7_1");
	#pragma HLS stream variable=weightdata_out_7_1 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_0_2("weightdata_out_0_2");
	#pragma HLS stream variable=weightdata_out_0_2 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_1_2("weightdata_out_1_2");
	#pragma HLS stream variable=weightdata_out_1_2 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_2_2("weightdata_out_2_2");
	#pragma HLS stream variable=weightdata_out_2_2 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_3_2("weightdata_out_3_2");
	#pragma HLS stream variable=weightdata_out_3_2 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_4_2("weightdata_out_4_2");
	#pragma HLS stream variable=weightdata_out_4_2 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_5_2("weightdata_out_5_2");
	#pragma HLS stream variable=weightdata_out_5_2 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_6_2("weightdata_out_6_2");
	#pragma HLS stream variable=weightdata_out_6_2 depth=2 off
	static hls::stream<ap_uint<16*36> > weightdata_out_7_2("weightdata_out_7_2");
	#pragma HLS stream variable=weightdata_out_7_2 depth=2 off



		input_feed(
		input_buffer,
		indata_out_0_0,
		indata_out_0_1,
		start_row_idx,
		output_width,
		input_height,
		input_width,
		input_width_ceildiv_16,
		input_depth_align8,
		pad_size,
		weight_outdepth_load_number,
		kernel_window_size,
		weight_outdepth_feed_size
	);
	
	weight_feed(
	weight_DDR0,
	weight_DDR1,
	weight_DDR2,
	weight_DDR3,
	#if WEIGHT_FEED_NUMBER_PER_PORT == 2
	weightdata_out_0_0,
	weightdata_out_1_0,
	weightdata_out_2_0,
	weightdata_out_3_0,
	weightdata_out_4_0,
	weightdata_out_5_0,
	weightdata_out_6_0,
	weightdata_out_7_0,
	#endif
	weight_total_load_number,
	weight_total_feed_size,
	ddr_load_length,
	ddr_load_length_per_feed,
	row_repeat_times,
	first_flag,
	last_flag);



	// wino6x6_stream_cell(
	// 	indata_out_0_0
	// 	hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out,
	// 	hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
	// 	hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
	// 	ap_uint<OUT_WIDTH*2> out_buffer[16][1024],
	// 	ap_int<16> weight_indepth_load_number,
	// 	ap_int<16> weight_outdepth_load_number,
	// 	ap_int<16> weight_indepth_feed_size,
	// 	ap_int<16> weight_outdepth_feed_size,
	// 	ap_int<16> row_tile_number);


//
	wino6x6_stream_cell(indata_out_0_0,indata_out_1_0,weightdata_out_0_0,weightdata_out_0_1,output_buffer0[0],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,0,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_1_0,indata_out_2_0,weightdata_out_1_0,weightdata_out_1_1,output_buffer0[1],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,1,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_2_0,indata_out_3_0,weightdata_out_2_0,weightdata_out_2_1,output_buffer0[2],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,2,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_3_0,indata_out_4_0,weightdata_out_3_0,weightdata_out_3_1,output_buffer0[3],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,3,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_4_0,indata_out_5_0,weightdata_out_4_0,weightdata_out_4_1,output_buffer0[4],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,4,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_5_0,indata_out_6_0,weightdata_out_5_0,weightdata_out_5_1,output_buffer0[5],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,5,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_6_0,indata_out_7_0,weightdata_out_6_0,weightdata_out_6_1,output_buffer0[6],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,6,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_7_0,indata_out_8_0,weightdata_out_7_0,weightdata_out_7_1,output_buffer0[7],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,7,0
	#endif
	 );
	wino6x6_stream_cell(indata_out_0_1,indata_out_1_1,weightdata_out_0_1,weightdata_out_0_2,output_buffer0[8],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,0,1
	#endif
	 );
	wino6x6_stream_cell(indata_out_1_1,indata_out_2_1,weightdata_out_1_1,weightdata_out_1_2,output_buffer0[9],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,1,1
	#endif
	 );
	wino6x6_stream_cell(indata_out_2_1,indata_out_3_1,weightdata_out_2_1,weightdata_out_2_2,output_buffer0[10],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,2,1
	#endif
	 );
	wino6x6_stream_cell(indata_out_3_1,indata_out_4_1,weightdata_out_3_1,weightdata_out_3_2,output_buffer0[11],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,3,1
	#endif
	 );
	wino6x6_stream_cell(indata_out_4_1,indata_out_5_1,weightdata_out_4_1,weightdata_out_4_2,output_buffer0[12],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,4,1
	#endif
	 );
	wino6x6_stream_cell(indata_out_5_1,indata_out_6_1,weightdata_out_5_1,weightdata_out_5_2,output_buffer0[13],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,5,1
	#endif
	 );
	wino6x6_stream_cell(indata_out_6_1,indata_out_7_1,weightdata_out_6_1,weightdata_out_6_2,output_buffer0[14],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,6,1
	#endif
	 );
	wino6x6_stream_cell(indata_out_7_1,indata_out_8_1,weightdata_out_7_1,weightdata_out_7_2,output_buffer0[15],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times
	#if DEBUG_FILE_PRINT
	,7,1
	#endif
	 );

	wino6x6_stream_bottomend(indata_out_8_0,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times);
	wino6x6_stream_bottomend(indata_out_8_1,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times);


	wino6x6_stream_rightend(weightdata_out_0_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend(weightdata_out_1_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend(weightdata_out_2_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend(weightdata_out_3_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend(weightdata_out_4_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend(weightdata_out_5_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend(weightdata_out_6_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend(weightdata_out_7_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);

}
*/
#endif