#include <ap_int.h>
#include <hls_stream.h>
#include "wino_macro.h"
#include "wino_buffer.hpp"



// Bit widths used to size the ap_int/ap_uint values in this file.

// IN_WIDTH : width of one unpacked input element in wino_stream_ceil_4x4.
// W_WIDTH  : width of one unpacked weight element (the G1 arrays).
// OUT_WIDTH: width of one out_buffer element (wino6x6_stream_cell packs two per word).
#define IN_WIDTH 8
#define W_WIDTH 8
#define OUT_WIDTH 18

// B_WIDTH_OUT / BT_WIDTH_OUT size the dB and BTdB intermediates of
// wino_stream_ceil_4x4. The *_IN counterparts are not referenced in this file.
#define B_WIDTH_IN 8
#define B_WIDTH_OUT 12
#define BT_WIDTH_IN 12
#define BT_WIDTH_OUT 16

// GTGG_WIDTH_IN : width of one element unpacked from the input stream in
//                 wino6x6_stream_cell.
// GTGG_WIDTH_OUT: width of the element-wise product arrays (UV).
// GTGG_WIDTH_W is not referenced in this file.
#define GTGG_WIDTH_IN 16
#define GTGG_WIDTH_W 16
#define GTGG_WIDTH_OUT 24

// A_WIDTH_OUT sizes the UVA intermediates; A_WIDTH_IN is not referenced in this file.
#define A_WIDTH_IN 24
#define A_WIDTH_OUT 28

// AT_WIDTH_OUT sizes the ATUVA results; AT_WIDTH_IN is not referenced in this file.
#define AT_WIDTH_IN 28
#define AT_WIDTH_OUT 32




// template<int dummy>
// void wino_stream_ceil(
// 	hls::stream< ap_uint<IN_WIDTH*2*36> > & top_stream_in,
// 	hls::stream< ap_uint<IN_WIDTH*2*36> > & bottom_stream_out,
// 	hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
// 	hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
// 	ap_uint<16> weight_indepth_load_number,
// 	ap_uint<16> weight_outdepth_load_number,
// 	ap_uint<16> weight_outdepth_feed_size,
// 	ap_uint<16> wino_tile_number_per_row
// )
// {


// 	ap_int<W_WIDTH> G1[36];
// 	#pragma HLS array_partition variable=G1


// 	for(int input_depth_tile_idx=0;input_depth_tile_idx<weight_indepth_load_number;input_depth_tile_idx++)
// 	{
// 		for(int output_depth_tile_idx=0;output_depth_tile_idx<weight_load_outdepth_number;output_depth_tile_idx++)
// 		{
// 			for(int start_output_col =0; start_output_col < output_width; start_output_col+=wino_output_tile_size*2)
// 			{
// 				for(int input_depth_idx_in_tile=0; input_depth_idx_in_tile<8;input_depth_idx_in_tile++ )
// 				{
// 					for(int output_depth_idx_in_tile=0; output_depth_idx_in_tile<weight_outdepth_feed_size;output_depth_idx_in_tile++ )
// 					{
// 						#pragma HLS pipeline
// 						ap_uint<IN_WIDTH*2*36> temp;
// 						ap_uint<W_WIDTH*36> stream_weight_temp;

// 						if(output_depth_idx_in_tile ==0)
// 						{
// 							top_stream_in >> temp;
// 							bottom_stream_out << temp;
// 						}

// 						left_stream_in>>stream_weight_temp;
// 						right_stream_out<<stream_weight_temp;
						
// 						ap_int<IN_WIDTH> input_tile[2][36];
						
// 						for(int i=0;i<36;i++)
// 						{
// 							#pragma HLS unroll
// 							input_tile[0][i] = temp.range(i*16+7,i*16);
// 							input_tile[1][i] = temp.range(i*16+15,i*16+8);
// 						}


// 					}

// 				}
// 			}
// 		}
// 	}

// }


/**
 * One cell of the 6x6-tile stream array.
 *
 * The cell forwards every word it reads from top_stream_in to
 * bottom_stream_out and every word it reads from left_stream_in to
 * right_stream_out, and accumulates its own results into out_buffer.
 *
 * Per inner iteration it
 *   1. unpacks the currently active pair of input words (36 lanes, two
 *      GTGG_WIDTH_IN-bit halves per lane),
 *   2. reads one weight word and unpacks it into two sets of 36 weights,
 *   3. multiplies element-wise and sums the two products per lane (UV),
 *   4. reduces the 36 UV values to 16 (UVA, then ATUVA),
 *   5. adds the result to the partial sums stored at out_buffer[*][address].
 *
 * Input words are double-buffered: while one register pair is being used,
 * the first two inner iterations load the other pair.
 *
 * @tparam ROW_IDX,COL_IDX  only used to build the file names handed to the
 *                          helper calls below.
 * @param top_stream_in      input words, 36 lanes x 2 x GTGG_WIDTH_IN bits.
 * @param bottom_stream_out  receives a copy of every input word read.
 * @param left_stream_in     weight words, 2 x 36 x W_WIDTH bits.
 * @param right_stream_out   receives a copy of every weight word read.
 * @param out_buffer         16 banks of packed partial sums (two 18-bit values per word).
 * @param weight_indepth_load_number   trip count of the outermost loop.
 * @param weight_outdepth_load_number  trip count of the second loop.
 * @param weight_indepth_feed_size     the fourth loop runs weight_indepth_feed_size/2 times.
 * @param weight_outdepth_feed_size    trip count of the innermost loop.
 * @param row_tile_number              trip count of the third loop.
 *
 * NOTE(review): wino_cell_stream_input, wino_cell_stream_weight and
 * attach_output_vector are defined outside this file; from their arguments
 * they look like debug dump helpers - confirm.
 */
template<int ROW_IDX, int COL_IDX>
void wino6x6_stream_cell(
		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in,
		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out,
		hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
		hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
		ap_uint<OUT_WIDTH*2> out_buffer[16][1024],
		ap_int<16> weight_indepth_load_number,
		ap_int<16> weight_outdepth_load_number,
		ap_int<16> weight_indepth_feed_size,
		ap_int<16> weight_outdepth_feed_size,
		ap_int<16> row_tile_number)
{

	// Two register pairs for the ping-pong input buffering.
	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg0[2];
	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg1[2];

	// Fill both pairs with the byte pattern 0xAA; 2*2*36*2 = 288 bytes.
	// NOTE(review): this presumably equals sizeof(stream_temp_regX)
	// (2 words x 1152 bits) - confirm against the ap_uint implementation.
	memset(stream_temp_reg0,0xAA,2*2*36*2);
	memset(stream_temp_reg1,0xAA,2*2*36*2);

	// Two weights per lane, refreshed from every weight word.
	ap_int<W_WIDTH> G1[36][2];
	#pragma HLS array_partition variable=G1
	
	// 0: compute from stream_temp_reg0 and load stream_temp_reg1; 1: the reverse.
	ap_uint<1> stream_pingpong_flag=0;

	// Counters handed to the helper calls.
	int write_idx=0;
	int weight_stream_idx=0;
	int input_stream_idx =0;


	// Per-cell file names handed to the helper calls.
	char filename[100];
	sprintf(filename,"wino_cell_output_%d_%d.txt",ROW_IDX, COL_IDX);
	char input_stream_filename[100];
	sprintf(input_stream_filename,"wino_cell_input_%d_%d.txt",ROW_IDX, COL_IDX);
	char weight_stream_filename[100];
	sprintf(weight_stream_filename,"wino_cell_weight_%d_%d.txt",ROW_IDX, COL_IDX);

	// Prime register pair 0 with the first two input words (forwarded downstream as well).
	top_stream_in>>stream_temp_reg0[0];
	bottom_stream_out<<stream_temp_reg0[0];
	wino_cell_stream_input<0>(stream_temp_reg0[0],input_stream_idx,input_stream_filename);
	top_stream_in>>stream_temp_reg0[1];
	bottom_stream_out<<stream_temp_reg0[1];
	wino_cell_stream_input<0>(stream_temp_reg0[1],input_stream_idx,input_stream_filename);




	// The four outer loops share the braced body below; the ping-pong flag
	// flips once per iter3 iteration.
	for(int iter0=0;iter0<weight_indepth_load_number;iter0++)
	for(int iter1=0;iter1<weight_outdepth_load_number;iter1++)
	for(int iter2=0;iter2<row_tile_number;iter2++)
	for(int iter3=0;iter3<weight_indepth_feed_size/2;iter3++)
	{
		for(int iter4=0;iter4<weight_outdepth_feed_size;iter4++)
		{	

		

			// Unpack the active register pair: [word][lane][half], where
			// half 0 is the low GTGG_WIDTH_IN bits of a lane and half 1 the high ones.
			ap_int<GTGG_WIDTH_IN> stream_temp_array[2][36][2];
			for(int i=0;i<36;i++)
			{
			#pragma HLS unroll
				if(stream_pingpong_flag)
				{
					stream_temp_array[0][i][0]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[0][i][1]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
					stream_temp_array[1][i][0]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[1][i][1]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
				}
				else
				{
					stream_temp_array[0][i][0]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[0][i][1]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
					stream_temp_array[1][i][0]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
					stream_temp_array[1][i][1]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
				}
			}

			// During the first two inner iterations, load the inactive register
			// pair (slot iter4) with the next input word and forward it.
			// NOTE(review): with weight_outdepth_feed_size < 2 fewer than two
			// words are loaded per iter3 iteration.
			if(iter4<2 )
			{
				if(stream_pingpong_flag)
				{
					top_stream_in>>stream_temp_reg0[iter4];
					bottom_stream_out<<stream_temp_reg0[iter4];
					wino_cell_stream_input<0>(stream_temp_reg0[iter4],input_stream_idx,input_stream_filename);
					
				}
				else
				{
					top_stream_in>>stream_temp_reg1[iter4];
					bottom_stream_out<<stream_temp_reg1[iter4];
					wino_cell_stream_input<0>(stream_temp_reg1[iter4],input_stream_idx,input_stream_filename);
				}			
			}


			// One weight word per inner iteration, forwarded to the right neighbour.
			ap_uint<W_WIDTH*2*36> weight_value_temp;
			left_stream_in>>weight_value_temp;
			wino_cell_stream_weight<0>(weight_value_temp,weight_stream_idx,weight_stream_filename);
			right_stream_out<<weight_value_temp;


			// Low 36*W_WIDTH bits -> G1[*][0]; the next 36*W_WIDTH bits -> G1[*][1]
			// (the literal 288 equals W_WIDTH*36 for W_WIDTH == 8).
			for(int i=0;i<36;i++)
			{
			#pragma HLS unroll
				G1[i][0]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1,i*W_WIDTH);
				G1[i][1]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1+288,i*W_WIDTH +288 );
			}

			ap_int<GTGG_WIDTH_OUT> UV[36][2];
			#pragma HLS array_partition variable=UV complete
			
			// Per lane and per half: weight 0 times input word 0 plus weight 1 times input word 1.
			for(int i=0;i<36;i++)
			{
			#pragma HLS unroll
				UV[i][0] = G1[i][0]*stream_temp_array[0][i][0] + G1[i][1]*stream_temp_array[1][i][0];
				UV[i][1] = G1[i][0]*stream_temp_array[0][i][1] + G1[i][1]*stream_temp_array[1][i][1];
			}

			ap_int<A_WIDTH_OUT>  UVA[6][4][2];
			#pragma HLS array_partition variable=UVA complete

			// Reduce each group of six consecutive UV values to four, using the
			// coefficient rows (1,1,1,1,1,0), (0,1,-1,2,-2,0), (0,1,1,4,4,0) and
			// (0,1,-1,8,-8,1); the shifts implement the factors 2, 4 and 8.
			for(int b_idx=0;b_idx<2;b_idx++)
			{
			#pragma HLS unroll
				for(int i=0;i<6;i++)
				{
			#pragma HLS unroll
					UVA[i][0][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+0][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx];
					UVA[i][1][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<1);
					UVA[i][2][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<2);
					UVA[i][3][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<3)+(ap_int<A_WIDTH_OUT>) UV[i*6+5][b_idx];
				}
			}


			// Partial-sum slot for this (iter1, iter4, iter2) combination. It does
			// not depend on iter0 or iter3, so those iterations accumulate into
			// the same slot.
			// NOTE(review): nothing here checks address against the 1024-entry depth.
			int address = iter1*row_tile_number* weight_outdepth_feed_size + iter4*row_tile_number + iter2;
			

			// Fetch the previous partial sums of all 16 banks.
			ap_int<36> outbuffer_value[16];
			for(int i=0;i<16;i++)
			{
				outbuffer_value[i]=out_buffer[i][address];
			}



			ap_int<AT_WIDTH_OUT> ATUVA[16][2];
			#pragma HLS array_partition variable=ATUVA complete


			// Second reduction with the same four coefficient rows, now across the
			// first index of UVA: 6x4 values become 4x4 = 16 results per half.
			for(int b_idx=0;b_idx<2;b_idx++)
			{
			#pragma HLS unroll
				for(int i=0;i<4;i++)
				{
				#pragma HLS unroll
					ATUVA[0+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[0][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx];
					ATUVA[4+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<1);
					ATUVA[8+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<2);
					ATUVA[12+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<3) + (ap_int<AT_WIDTH_OUT>) UVA[5][i][b_idx];
				}
			}
			attach_output_vector<0>(ATUVA,write_idx,filename);

			ap_int<18> out_residual[16][2];

			// On the first contribution to a slot (iter0 == 0 and iter3 == 0) start
			// from zero; otherwise continue from the stored partial sums.
			if(iter0 ==0 && iter3 ==0)
			{
				for(int i=0;i<16;i++)
				{
					out_residual[i][0]=0;
					out_residual[i][1]=0;
				}
			}
			else
			{
				for(int i=0;i<16;i++)
				{
					out_residual[i][0]=outbuffer_value[i].range(17,0);
					out_residual[i][1]=outbuffer_value[i].range(35,18);
				}
			}


			// The sum is stored into an 18-bit value, so only its low 18 bits are kept.
			// NOTE(review): the commented-out variant used bits 31..14 of ATUVA
			// instead - confirm which scaling is intended.
			ap_int<18> outbuffer_writeback_value[16][2];
			for(int i=0;i<16;i++)
			{
				// outbuffer_writeback_value[i][0]=ATUVA[i][0].range(31,14)+out_residual[i][0];
				// outbuffer_writeback_value[i][1]=ATUVA[i][1].range(31,14)+out_residual[i][1];

				outbuffer_writeback_value[i][0]=ATUVA[i][0]+out_residual[i][0];
				outbuffer_writeback_value[i][1]=ATUVA[i][1]+out_residual[i][1];
			}


			ap_int<18*2> outbuffer_writeback_value_batch[16];

			// Concatenate: half 1 goes to bits 35..18, half 0 to bits 17..0.
			for(int i=0;i<16;i++)
			{
				outbuffer_writeback_value_batch[i]=(outbuffer_writeback_value[i][1],outbuffer_writeback_value[i][0]  );
			}

			// Store the updated partial sums back to all 16 banks.
			for(int i=0;i<16;i++)
			{
				out_buffer[i][address]=outbuffer_writeback_value_batch[i];
			}
		}
		// Swap the roles of the two register pairs for the next iter3 iteration.
		stream_pingpong_flag=~stream_pingpong_flag;
	}

}


/**
 * Sink for the bottom end of an input-stream chain: reads words from
 * top_stream_in and discards them.
 *
 * The number of words consumed is
 *   weight_indepth_load_number * weight_outdepth_load_number
 *   * weight_indepth_feed_size * row_tile_number + 2.
 *
 * @param top_stream_in                stream to drain.
 * @param weight_indepth_load_number   factor of the word count.
 * @param weight_outdepth_load_number  factor of the word count.
 * @param weight_indepth_feed_size     factor of the word count.
 * @param row_tile_number              factor of the word count.
 */
template<int dummy>
void wino6x6_stream_bottomend(
	hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in,
	ap_int<16> weight_indepth_load_number,
	ap_int<16> weight_outdepth_load_number,
	ap_int<16> weight_indepth_feed_size,
	ap_int<16> row_tile_number
)
{
	// Scratch word; its contents are never used.
	ap_uint<GTGG_WIDTH_IN*2*36> discarded_word;

	for(int word_idx=0;
		word_idx<weight_indepth_load_number*weight_outdepth_load_number*weight_indepth_feed_size*row_tile_number+2;
		word_idx++)
	{
		top_stream_in>>discarded_word;
	}
}

/**
 * Sink for the right end of a weight-stream chain: reads words from
 * left_stream_in and discards them.
 *
 * The number of words consumed is
 *   weight_indepth_load_number * weight_outdepth_load_number * row_tile_number
 *   * (weight_indepth_feed_size / 2) * weight_outdepth_feed_size,
 * i.e. one word per innermost iteration of wino6x6_stream_cell, which reads
 * and forwards exactly one weight word there.
 *
 * @param left_stream_in               stream to drain.
 * @param weight_indepth_load_number   factor of the word count.
 * @param weight_outdepth_load_number  factor of the word count.
 * @param weight_indepth_feed_size     halved (integer division) before use.
 * @param weight_outdepth_feed_size    factor of the word count.
 * @param row_tile_number              factor of the word count.
 */
template<int dummy>
void wino6x6_stream_rightend(
	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in,
	ap_int<16> weight_indepth_load_number,
	ap_int<16> weight_outdepth_load_number,
	ap_int<16> weight_indepth_feed_size,
	ap_int<16> weight_outdepth_feed_size,
	ap_int<16> row_tile_number
)
{
	ap_uint<W_WIDTH*2*36>dummy_temp;
	// The halving is applied to weight_indepth_feed_size alone, matching the
	// cell's "iter3 < weight_indepth_feed_size/2" loop bound. Halving the
	// running product instead gives a larger count when the feed size is odd,
	// and this sink would then wait for words that no cell ever forwards.
	for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*row_tile_number*(weight_indepth_feed_size/2)*weight_outdepth_feed_size;i++)
	{
		left_stream_in.read(dummy_temp);
	}
}





/**
 * Stream cell for 4x4 tiles with a selectable transform path.
 *
 * For each of the `number` iterations the cell reads one input word and one
 * weight word, forwards both unchanged (input to bottom_stream_out, weight to
 * right_stream_out), unpacks them into 16 elements each and accumulates into
 * column `counter` of out_buffer:
 *
 *  - wino_flag == 1: the 16 inputs are transformed (dB, then BTdB), multiplied
 *    element-wise with the 16 weights, reduced to four results (UVA, then
 *    ATUVA) and added to out_buffer[0..3][counter].
 *  - wino_flag == 0: the 16 inputs are multiplied element-wise with the 16
 *    weights and each product is added to out_buffer[0..15][counter].
 *
 * @param top_stream_in      input words, 16 x IN_WIDTH bits.
 * @param bottom_stream_out  receives a copy of every input word.
 * @param left_stream_in     weight words, 16 x W_WIDTH bits.
 * @param right_stream_out   receives a copy of every weight word.
 * @param out_buffer         16 banks of accumulators, indexed [bank][counter].
 * @param wino_flag          selects the transform path (see above).
 * @param number             number of words to process.
 *
 * NOTE(review): `counter` indexes the 1024-deep second dimension of
 * out_buffer without a range check; callers must keep number <= 1024.
 */
template<int dummy>
void wino_stream_ceil_4x4(
		hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in,
		hls::stream< ap_uint<IN_WIDTH*16> > & bottom_stream_out,
		hls::stream< ap_uint<W_WIDTH*16> > &left_stream_in,
		hls::stream< ap_uint<W_WIDTH*16> > &right_stream_out,
		ap_uint<OUT_WIDTH> out_buffer[16][1024],
		ap_uint<1> wino_flag,
		int number)
{

	#pragma HLS array_partition variable=out_buffer dim=1 complete

	ap_int<W_WIDTH> G1[16];
	#pragma HLS array_partition variable=G1
	ap_int<IN_WIDTH> in[16];
	#pragma HLS array_partition variable=in


	for(int counter=0;counter<number;counter++)
	{
		#pragma HLS pipeline
		ap_uint<IN_WIDTH*16> stream_in_temp;
		ap_uint<W_WIDTH*16> stream_weight_temp;

		// Read one word from each direction and pass it on to the neighbours.
		top_stream_in>>stream_in_temp;
		bottom_stream_out<<stream_in_temp;
		left_stream_in>>stream_weight_temp;
		right_stream_out<<stream_weight_temp;

		// Unpack 16 weights and 16 inputs. The loop has 16 iterations, so it
		// is unrolled completely (the former "factor=36" exceeded the trip count).
		for(int k=0;k<16;k++)
		{
		#pragma HLS unroll
			G1[k].range(W_WIDTH-1,0)=stream_weight_temp.range(W_WIDTH-1+k*W_WIDTH,k*W_WIDTH);
			in[k].range(IN_WIDTH-1,0)=stream_in_temp.range(IN_WIDTH-1+k*IN_WIDTH,k*IN_WIDTH);
		}

		// First transform stage: combine the four elements of each group
		// in[4*i .. 4*i+3].
		ap_int<B_WIDTH_OUT>  dB[4][4];
		#pragma HLS array_partition variable=dB complete
		for(int i=0;i<4;i++)
		{
		#pragma HLS unroll
			dB[i][0]=in[i*4]-in[i*4+2];
			dB[i][1]=in[i*4+1]+in[i*4+2];
			dB[i][2]=-in[i*4+1]+in[i*4+2];
			dB[i][3]=in[i*4+1]-in[i*4+3];
		}

		ap_int<BT_WIDTH_OUT>  BTdB[4][4];
		#pragma HLS array_partition variable=BTdB complete

		if(wino_flag)
		{
			// Second transform stage: the same combination along the first
			// index of dB.
			for(int i=0;i<4;i++)
			{
			#pragma HLS unroll
				BTdB[i][0]=dB[0][i]-dB[2][i];
				BTdB[i][1]=dB[1][i]+dB[2][i];
				BTdB[i][2]=-dB[1][i]+dB[2][i];
				BTdB[i][3]=dB[1][i]-dB[3][i];
			}
		}
		else
		{
			// No transform: pass the raw inputs through.
			for(int i=0;i<4;i++)
			{
			#pragma HLS unroll
				BTdB[i][0]=in[i*4+0];
				BTdB[i][1]=in[i*4+1];
				BTdB[i][2]=in[i*4+2];
				BTdB[i][3]=in[i*4+3];
			}
		}

		ap_int<GTGG_WIDTH_OUT> UV[4][4];
		#pragma HLS array_partition variable=UV complete

		// Element-wise product with the weights.
		for(int i=0;i<4;i++){
		#pragma HLS unroll
			for(int j=0;j<4;j++){
			#pragma HLS unroll
				if(wino_flag)
					UV[i][j]=BTdB[i][j]*G1[i*4+j];
				else
					// `in` is a flat 16-element array: element (i,j) lives at
					// i*4+j. Writing in[i][j] would select bit j of in[i].
					UV[i][j]=in[i*4+j]*G1[i*4+j];
			}
		}

		// First reduction stage: four products per row become two.
		ap_int<A_WIDTH_OUT>  UVA[4][2];
		#pragma HLS array_partition variable=UVA complete

		for(int i=0;i<4;i++)
		{
		#pragma HLS unroll
			UVA[i][0]=UV[i][0]+UV[i][1]+UV[i][2];
			UVA[i][1]=UV[i][1]-UV[i][2]-UV[i][3];
		}

		ap_int<AT_WIDTH_OUT> ATUVA[16];
		#pragma HLS array_partition variable=ATUVA complete

		if(wino_flag)
		{
			// Second reduction stage: only ATUVA[0..3] are produced and used.
			for(int i=0;i<2;i++)
			{
			#pragma HLS unroll
				ATUVA[0+i]=UVA[0][i]+UVA[1][i]+UVA[2][i];
				ATUVA[2+i]=UVA[1][i]-UVA[2][i]-UVA[3][i];
			}
		}
		else
		{
			for(int i=0;i<16;i++)
			{
				ATUVA[i]=UV[i/4][i%4];
			}
		}

		// Accumulate into column `counter` of the output buffer.
		if(wino_flag)
		{
			for(int i=0;i<4;i++)
			{
			#pragma HLS unroll
				out_buffer[i][counter]=out_buffer[i][counter]+ATUVA[i];
			}
		}
		else
		{
			for(int i=0;i<4;i++)
			for(int j=0;j<4;j++)
			{
			#pragma HLS unroll
				out_buffer[i*4+j][counter]=out_buffer[i*4+j][counter]+UV[i][j];
			}
		}
	}
}





/**
 * Top-level wiring of the stream array: one input feeder, one weight feeder,
 * 16 wino6x6_stream_cell instances (8 rows x 2 columns) and the sinks that
 * drain the ends of every chain.
 *
 * Wiring as written below:
 *  - Input chain of column c: indata_out_0_c -> cell<0,c> -> indata_out_1_c
 *    -> ... -> cell<7,c> -> indata_out_8_c -> wino6x6_stream_bottomend.
 *  - Weight chain of row r: weightdata_out_r_0 -> cell<r,0> ->
 *    weightdata_out_r_1 -> cell<r,1> -> weightdata_out_r_2 ->
 *    wino6x6_stream_rightend.
 *  - cell<r,c> accumulates into output_buffer0[c*8 + r].
 *
 * Every cell and sink receives the literal 8 as weight_indepth_feed_size and
 * row_repeat_times as row_tile_number.
 *
 * input_depth, output_height and output_depth are not used in this function.
 *
 * NOTE(review): input_feed and weight_feed are defined outside this file;
 * their behaviour is not visible here.
 * NOTE(review): the cells declare out_buffer as [16][1024] while
 * output_buffer0[k] is [16][OUTPUT_BUFFER_DEPTH] - presumably
 * OUTPUT_BUFFER_DEPTH is 1024; confirm.
 */
template<int dummy>
void wino_systolic(
	ap_uint<16> input_buffer[8][16][INPUT_BUFFER_DEPTH],
	ap_uint<OUT_WIDTH*2> output_buffer0[4*WEIGHT_FEED_NUMBER_PER_PORT*INPUT_FEED_NUMBER][16][OUTPUT_BUFFER_DEPTH],
	ap_uint<128>* weight_DDR0,
	ap_uint<128>* weight_DDR1,
	ap_uint<128>* weight_DDR2,
	ap_uint<128>* weight_DDR3,
	ap_uint<16> input_height,
	ap_uint<16> input_width,
	ap_uint<16> input_depth,
	ap_uint<16> input_width_ceildiv_16,
	ap_uint<16> input_depth_align8,
	ap_uint<16> output_height,
	ap_uint<16> output_width,
	ap_uint<16> output_depth,
	ap_uint<8> kernel_window_size,
	ap_uint<8> pad_size,
	ap_uint<16> weight_indepth_load_number,
	ap_uint<16> weight_outdepth_load_number,
	ap_uint<16> weight_outdepth_feed_size,
	ap_uint<16> start_row_idx,

	//weight parameters
	ap_uint<16> weight_total_load_number,
	ap_uint<16> weight_total_feed_size,
	ap_uint<16> ddr_load_length,
	ap_uint<16> ddr_load_length_per_feed,
	ap_uint<16> row_repeat_times,
	ap_uint<16> first_flag,
	ap_uint<16> last_flag

)
{
	// NOTE(review): these two streams are declared but never used in this function.
	hls::stream<ap_uint<32*36> > indata_tile_stream0("indata1");
	hls::stream<ap_uint<32*36> > indata_tile_stream1("indata2");


	// Input chain links, named indata_out_<stage>_<column>. Stage 0 is written
	// by input_feed, stage 8 is drained by wino6x6_stream_bottomend.
	hls::stream<ap_uint<32*36> > indata_out_0_0("indata_out_0_0");
	hls::stream<ap_uint<32*36> > indata_out_1_0("indata_out_1_0");
	hls::stream<ap_uint<32*36> > indata_out_2_0("indata_out_2_0");
	hls::stream<ap_uint<32*36> > indata_out_3_0("indata_out_3_0");
	hls::stream<ap_uint<32*36> > indata_out_4_0("indata_out_4_0");
	hls::stream<ap_uint<32*36> > indata_out_5_0("indata_out_5_0");
	hls::stream<ap_uint<32*36> > indata_out_6_0("indata_out_6_0");
	hls::stream<ap_uint<32*36> > indata_out_7_0("indata_out_7_0");
	hls::stream<ap_uint<32*36> > indata_out_8_0("indata_out_8_0");

	hls::stream<ap_uint<32*36> > indata_out_0_1("indata_out_0_1");
	hls::stream<ap_uint<32*36> > indata_out_1_1("indata_out_1_1");
	hls::stream<ap_uint<32*36> > indata_out_2_1("indata_out_2_1");
	hls::stream<ap_uint<32*36> > indata_out_3_1("indata_out_3_1");
	hls::stream<ap_uint<32*36> > indata_out_4_1("indata_out_4_1");
	hls::stream<ap_uint<32*36> > indata_out_5_1("indata_out_5_1");
	hls::stream<ap_uint<32*36> > indata_out_6_1("indata_out_6_1");
	hls::stream<ap_uint<32*36> > indata_out_7_1("indata_out_7_1");
	hls::stream<ap_uint<32*36> > indata_out_8_1("indata_out_8_1");







	// Weight chain links, named weightdata_out_<row>_<stage>. Stage 0 is passed
	// to weight_feed, stage 2 is drained by wino6x6_stream_rightend.
	hls::stream<ap_uint<16*36> > weightdata_out_0_0("weightdata_out_0_0");
	hls::stream<ap_uint<16*36> > weightdata_out_1_0("weightdata_out_1_0");
	hls::stream<ap_uint<16*36> > weightdata_out_2_0("weightdata_out_2_0");
	hls::stream<ap_uint<16*36> > weightdata_out_3_0("weightdata_out_3_0");
	hls::stream<ap_uint<16*36> > weightdata_out_4_0("weightdata_out_4_0");
	hls::stream<ap_uint<16*36> > weightdata_out_5_0("weightdata_out_5_0");
	hls::stream<ap_uint<16*36> > weightdata_out_6_0("weightdata_out_6_0");
	hls::stream<ap_uint<16*36> > weightdata_out_7_0("weightdata_out_7_0");
	hls::stream<ap_uint<16*36> > weightdata_out_0_1("weightdata_out_0_1");
	hls::stream<ap_uint<16*36> > weightdata_out_1_1("weightdata_out_1_1");
	hls::stream<ap_uint<16*36> > weightdata_out_2_1("weightdata_out_2_1");
	hls::stream<ap_uint<16*36> > weightdata_out_3_1("weightdata_out_3_1");
	hls::stream<ap_uint<16*36> > weightdata_out_4_1("weightdata_out_4_1");
	hls::stream<ap_uint<16*36> > weightdata_out_5_1("weightdata_out_5_1");
	hls::stream<ap_uint<16*36> > weightdata_out_6_1("weightdata_out_6_1");
	hls::stream<ap_uint<16*36> > weightdata_out_7_1("weightdata_out_7_1");
	hls::stream<ap_uint<16*36> > weightdata_out_0_2("weightdata_out_0_2");
	hls::stream<ap_uint<16*36> > weightdata_out_1_2("weightdata_out_1_2");
	hls::stream<ap_uint<16*36> > weightdata_out_2_2("weightdata_out_2_2");
	hls::stream<ap_uint<16*36> > weightdata_out_3_2("weightdata_out_3_2");
	hls::stream<ap_uint<16*36> > weightdata_out_4_2("weightdata_out_4_2");
	hls::stream<ap_uint<16*36> > weightdata_out_5_2("weightdata_out_5_2");
	hls::stream<ap_uint<16*36> > weightdata_out_6_2("weightdata_out_6_2");
	hls::stream<ap_uint<16*36> > weightdata_out_7_2("weightdata_out_7_2");



		// Feeder for the head of both input chains.
		input_feed<0>(
		input_buffer,
		indata_out_0_0, 
		indata_out_0_1,
		start_row_idx,
		output_width,
		input_height,
		input_width,
		input_width_ceildiv_16,
		input_depth_align8,
		pad_size,
		weight_outdepth_load_number,
		kernel_window_size,
		weight_outdepth_feed_size
	);
	
	// Feeder for the head of the eight weight chains.
	// NOTE(review): the stream arguments are only passed when
	// WEIGHT_FEED_NUMBER_PER_PORT == 2; for any other value no stream is handed
	// to weight_feed from here - confirm the intended configurations.
	weight_feed<0>(
	weight_DDR0,
	weight_DDR1,
	weight_DDR2,
	weight_DDR3,
	#if WEIGHT_FEED_NUMBER_PER_PORT == 2
	weightdata_out_0_0,
	weightdata_out_1_0,
	weightdata_out_2_0,
	weightdata_out_3_0,
	weightdata_out_4_0,
	weightdata_out_5_0,
	weightdata_out_6_0,
	weightdata_out_7_0,
	#endif
	weight_total_load_number,
	weight_total_feed_size,
	ddr_load_length,
	ddr_load_length_per_feed,
	row_repeat_times,
	first_flag,
	last_flag);



	// wino6x6_stream_cell<0>(
	// 	indata_out_0_0
	// 	hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out,
	// 	hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
	// 	hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
	// 	ap_uint<OUT_WIDTH*2> out_buffer[16][1024],
	// 	ap_int<16> weight_indepth_load_number,
	// 	ap_int<16> weight_outdepth_load_number,
	// 	ap_int<16> weight_indepth_feed_size,
	// 	ap_int<16> weight_outdepth_feed_size,
	// 	ap_int<16> row_tile_number);



	// Column 0: cells <0..7,0>, output banks 0..7.
	wino6x6_stream_cell<0,0>(indata_out_0_0,indata_out_1_0,weightdata_out_0_0,weightdata_out_0_1,output_buffer0[0],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<1,0>(indata_out_1_0,indata_out_2_0,weightdata_out_1_0,weightdata_out_1_1,output_buffer0[1],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<2,0>(indata_out_2_0,indata_out_3_0,weightdata_out_2_0,weightdata_out_2_1,output_buffer0[2],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<3,0>(indata_out_3_0,indata_out_4_0,weightdata_out_3_0,weightdata_out_3_1,output_buffer0[3],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<4,0>(indata_out_4_0,indata_out_5_0,weightdata_out_4_0,weightdata_out_4_1,output_buffer0[4],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<5,0>(indata_out_5_0,indata_out_6_0,weightdata_out_5_0,weightdata_out_5_1,output_buffer0[5],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<6,0>(indata_out_6_0,indata_out_7_0,weightdata_out_6_0,weightdata_out_6_1,output_buffer0[6],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<7,0>(indata_out_7_0,indata_out_8_0,weightdata_out_7_0,weightdata_out_7_1,output_buffer0[7],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	// Column 1: cells <0..7,1>, output banks 8..15; weights arrive from column 0.
	wino6x6_stream_cell<0,1>(indata_out_0_1,indata_out_1_1,weightdata_out_0_1,weightdata_out_0_2,output_buffer0[8],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<1,1>(indata_out_1_1,indata_out_2_1,weightdata_out_1_1,weightdata_out_1_2,output_buffer0[9],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<2,1>(indata_out_2_1,indata_out_3_1,weightdata_out_2_1,weightdata_out_2_2,output_buffer0[10],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<3,1>(indata_out_3_1,indata_out_4_1,weightdata_out_3_1,weightdata_out_3_2,output_buffer0[11],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<4,1>(indata_out_4_1,indata_out_5_1,weightdata_out_4_1,weightdata_out_4_2,output_buffer0[12],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<5,1>(indata_out_5_1,indata_out_6_1,weightdata_out_5_1,weightdata_out_5_2,output_buffer0[13],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<6,1>(indata_out_6_1,indata_out_7_1,weightdata_out_6_1,weightdata_out_6_2,output_buffer0[14],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_cell<7,1>(indata_out_7_1,indata_out_8_1,weightdata_out_7_1,weightdata_out_7_2,output_buffer0[15],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);

	// Drain the tail of both input chains.
	wino6x6_stream_bottomend<0>(indata_out_8_0,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times);
	wino6x6_stream_bottomend<0>(indata_out_8_1,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times);


	// Drain the tail of the eight weight chains.
	wino6x6_stream_rightend<0>(weightdata_out_0_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend<0>(weightdata_out_1_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend<0>(weightdata_out_2_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend<0>(weightdata_out_3_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend<0>(weightdata_out_4_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend<0>(weightdata_out_5_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend<0>(weightdata_out_6_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
	wino6x6_stream_rightend<0>(weightdata_out_7_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);

}