Skip to content
Snippets Groups Projects
wino_cell.hpp 23.5 KiB
Newer Older
  • Learn to ignore specific revisions
  • xliu79's avatar
    xliu79 committed
    #include <ap_int.h>
    #include <hls_stream.h>
    
    #include "wino_macro.h"
    #include "wino_buffer.hpp"
    
    xliu79's avatar
    xliu79 committed
    
    
    
    // Bit width of one input element as unpacked from the input stream in wino_stream_ceil_4x4.
    #define IN_WIDTH 8
    // Bit width of one weight element as unpacked from the weight streams of the cells.
    #define W_WIDTH 8
    
    // Bit width of one accumulated output value; the 6x6 cell packs two of them per out_buffer word.
    #define OUT_WIDTH 18
    
    xliu79's avatar
    xliu79 committed
    
    // Intermediate bit widths of the transform stages. Each *_OUT width below is the
    // element width of the like-named intermediate array in the cells of this file.
    // NOTE(review): B_WIDTH_IN, BT_WIDTH_IN, GTGG_WIDTH_W, A_WIDTH_IN and AT_WIDTH_IN are
    // not referenced in the visible part of this file -- confirm whether they are still needed.
    #define B_WIDTH_IN 8
    // Element width of dB in wino_stream_ceil_4x4.
    #define B_WIDTH_OUT 12
    #define BT_WIDTH_IN 12
    // Element width of BTdB in wino_stream_ceil_4x4.
    #define BT_WIDTH_OUT 16
    
    // Element width of one value inside the input-stream word of wino6x6_stream_cell.
    #define GTGG_WIDTH_IN 16
    #define GTGG_WIDTH_W 16
    // Element width of the element-wise products (UV).
    #define GTGG_WIDTH_OUT 24
    
    #define A_WIDTH_IN 24
    // Element width of the first output-transform stage (UVA).
    #define A_WIDTH_OUT 28
    
    #define AT_WIDTH_IN 28
    // Element width of the second output-transform stage (ATUVA).
    #define AT_WIDTH_OUT 32
    
    
    
    
    
    // template<int dummy>
    // void wino_stream_ceil(
    // 	hls::stream< ap_uint<IN_WIDTH*2*36> > & top_stream_in,
    // 	hls::stream< ap_uint<IN_WIDTH*2*36> > & bottom_stream_out,
    // 	hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
    // 	hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
    // 	ap_uint<16> weight_indepth_load_number,
    // 	ap_uint<16> weight_outdepth_load_number,
    // 	ap_uint<16> weight_outdepth_feed_size,
    // 	ap_uint<16> wino_tile_number_per_row
    // )
    // {
    
    
    // 	ap_int<W_WIDTH> G1[36];
    // 	#pragma HLS array_partition variable=G1
    
    
    // 	for(int input_depth_tile_idx=0;input_depth_tile_idx<weight_indepth_load_number;input_depth_tile_idx++)
    // 	{
    // 		for(int output_depth_tile_idx=0;output_depth_tile_idx<weight_load_outdepth_number;output_depth_tile_idx++)
    // 		{
    // 			for(int start_output_col =0; start_output_col < output_width; start_output_col+=wino_output_tile_size*2)
    // 			{
    // 				for(int input_depth_idx_in_tile=0; input_depth_idx_in_tile<8;input_depth_idx_in_tile++ )
    // 				{
    // 					for(int output_depth_idx_in_tile=0; output_depth_idx_in_tile<weight_outdepth_feed_size;output_depth_idx_in_tile++ )
    // 					{
    // 						#pragma HLS pipeline
    // 						ap_uint<IN_WIDTH*2*36> temp;
    // 						ap_uint<W_WIDTH*36> stream_weight_temp;
    
    // 						if(output_depth_idx_in_tile ==0)
    // 						{
    // 							top_stream_in >> temp;
    // 							bottom_stream_out << temp;
    // 						}
    
    // 						left_stream_in>>stream_weight_temp;
    // 						right_stream_out<<stream_weight_temp;
    						
    // 						ap_int<IN_WIDTH> input_tile[2][36];
    						
    // 						for(int i=0;i<36;i++)
    // 						{
    // 							#pragma HLS unroll
    // 							input_tile[0][i] = temp.range(i*16+7,i*16);
    // 							input_tile[1][i] = temp.range(i*16+15,i*16+8);
    // 						}
    
    
    // 					}
    
    // 				}
    // 			}
    // 		}
    // 	}
    
    // }
    
    
    template<int ROW_IDX, int COL_IDX>
    void wino6x6_stream_cell(
    		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in,
    		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out,
    		hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
    		hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
    		ap_uint<OUT_WIDTH*2> out_buffer[16][1024],
    		ap_int<16> weight_indepth_load_number,
    		ap_int<16> weight_outdepth_load_number,
    		ap_int<16> weight_indepth_feed_size,
    		ap_int<16> weight_outdepth_feed_size,
    		ap_int<16> row_tile_number)
    
    	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg0[2];
    	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg1[2];
    
    
    xliu79's avatar
    xliu79 committed
    	memset(stream_temp_reg0,0xAA,2*2*36*2);
    	memset(stream_temp_reg1,0xAA,2*2*36*2);
    
    xliu79's avatar
    xliu79 committed
    
    
    	ap_int<W_WIDTH> G1[36][2];
    
    xliu79's avatar
    xliu79 committed
    	#pragma HLS array_partition variable=G1
    	
    
    	ap_uint<1> stream_pingpong_flag=0;
    
    xliu79's avatar
    xliu79 committed
    
    
    	int write_idx=0;
    	int weight_stream_idx=0;
    	int input_stream_idx =0;
    
    
    	char filename[100];
    	sprintf(filename,"wino_cell_output_%d_%d.txt",ROW_IDX, COL_IDX);
    	char input_stream_filename[100];
    	sprintf(input_stream_filename,"wino_cell_input_%d_%d.txt",ROW_IDX, COL_IDX);
    	char weight_stream_filename[100];
    	sprintf(weight_stream_filename,"wino_cell_weight_%d_%d.txt",ROW_IDX, COL_IDX);
    
    xliu79's avatar
    xliu79 committed
    
    
    	top_stream_in>>stream_temp_reg0[0];
    	bottom_stream_out<<stream_temp_reg0[0];
    
    	wino_cell_stream_input<0>(stream_temp_reg0[0],input_stream_idx,input_stream_filename);
    
    	top_stream_in>>stream_temp_reg0[1];
    	bottom_stream_out<<stream_temp_reg0[1];
    
    	wino_cell_stream_input<0>(stream_temp_reg0[1],input_stream_idx,input_stream_filename);
    
    
    
    
    
    	for(int iter0=0;iter0<weight_indepth_load_number;iter0++)
    	for(int iter1=0;iter1<weight_outdepth_load_number;iter1++)
    	for(int iter2=0;iter2<row_tile_number;iter2++)
    	for(int iter3=0;iter3<weight_indepth_feed_size/2;iter3++)
    
    xliu79's avatar
    xliu79 committed
    	{
    		for(int iter4=0;iter4<weight_outdepth_feed_size;iter4++)
    		{	
    
    xliu79's avatar
    xliu79 committed
    		
    
    xliu79's avatar
    xliu79 committed
    			ap_int<GTGG_WIDTH_IN> stream_temp_array[2][36][2];
    			for(int i=0;i<36;i++)
    
    xliu79's avatar
    xliu79 committed
    			#pragma HLS unroll
    				if(stream_pingpong_flag)
    				{
    					stream_temp_array[0][i][0]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					stream_temp_array[0][i][1]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    					stream_temp_array[1][i][0]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					stream_temp_array[1][i][1]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    				}
    				else
    				{
    					stream_temp_array[0][i][0]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					stream_temp_array[0][i][1]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    					stream_temp_array[1][i][0]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					stream_temp_array[1][i][1]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    				}
    
    xliu79's avatar
    xliu79 committed
    
    			if(iter4<2 )
    
    xliu79's avatar
    xliu79 committed
    				if(stream_pingpong_flag)
    				{
    					top_stream_in>>stream_temp_reg0[iter4];
    					bottom_stream_out<<stream_temp_reg0[iter4];
    
    					wino_cell_stream_input<0>(stream_temp_reg0[iter4],input_stream_idx,input_stream_filename);
    					
    
    xliu79's avatar
    xliu79 committed
    				}
    				else
    				{
    					top_stream_in>>stream_temp_reg1[iter4];
    					bottom_stream_out<<stream_temp_reg1[iter4];
    
    					wino_cell_stream_input<0>(stream_temp_reg1[iter4],input_stream_idx,input_stream_filename);
    
    xliu79's avatar
    xliu79 committed
    				}			
    
    xliu79's avatar
    xliu79 committed
    			ap_uint<W_WIDTH*2*36> weight_value_temp;
    			left_stream_in>>weight_value_temp;
    
    			wino_cell_stream_weight<0>(weight_value_temp,weight_stream_idx,weight_stream_filename);
    
    xliu79's avatar
    xliu79 committed
    			right_stream_out<<weight_value_temp;
    
    xliu79's avatar
    xliu79 committed
    
    
    xliu79's avatar
    xliu79 committed
    			for(int i=0;i<36;i++)
    			{
    			#pragma HLS unroll
    				G1[i][0]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1,i*W_WIDTH);
    				G1[i][1]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1+288,i*W_WIDTH +288 );
    			}
    
    xliu79's avatar
    xliu79 committed
    			ap_int<GTGG_WIDTH_OUT> UV[36][2];
    			#pragma HLS array_partition variable=UV complete
    			
    			for(int i=0;i<36;i++)
    			{
    			#pragma HLS unroll
    				UV[i][0] = G1[i][0]*stream_temp_array[0][i][0] + G1[i][1]*stream_temp_array[1][i][0];
    				UV[i][1] = G1[i][0]*stream_temp_array[0][i][1] + G1[i][1]*stream_temp_array[1][i][1];
    			}
    
    xliu79's avatar
    xliu79 committed
    			ap_int<A_WIDTH_OUT>  UVA[6][4][2];
    			#pragma HLS array_partition variable=UVA complete
    
    xliu79's avatar
    xliu79 committed
    			for(int b_idx=0;b_idx<2;b_idx++)
    
    xliu79's avatar
    xliu79 committed
    			#pragma HLS unroll
    				for(int i=0;i<6;i++)
    				{
    			#pragma HLS unroll
    					UVA[i][0][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+0][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx];
    					UVA[i][1][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<1);
    					UVA[i][2][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<2);
    					UVA[i][3][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<3)+(ap_int<A_WIDTH_OUT>) UV[i*6+5][b_idx];
    				}
    
    xliu79's avatar
    xliu79 committed
    
    
    			int address = iter1*row_tile_number* weight_outdepth_feed_size + iter4*row_tile_number + iter2;
    			
    
    
    			ap_int<36> outbuffer_value[16];
    
    			for(int i=0;i<16;i++)
    			{
    
    xliu79's avatar
    xliu79 committed
    				outbuffer_value[i]=out_buffer[i][address];
    
    
    
    			ap_int<AT_WIDTH_OUT> ATUVA[16][2];
    			#pragma HLS array_partition variable=ATUVA complete
    
    
    			for(int b_idx=0;b_idx<2;b_idx++)
    			{
    			#pragma HLS unroll
    				for(int i=0;i<4;i++)
    				{
    				#pragma HLS unroll
    					ATUVA[0+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[0][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx];
    					ATUVA[4+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<1);
    					ATUVA[8+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<2);
    					ATUVA[12+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<3) + (ap_int<AT_WIDTH_OUT>) UVA[5][i][b_idx];
    				}
    			}
    			attach_output_vector<0>(ATUVA,write_idx,filename);
    
    			ap_int<18> out_residual[16][2];
    
    xliu79's avatar
    xliu79 committed
    			if(iter0 ==0 && iter3 ==0)
    			{
    				for(int i=0;i<16;i++)
    				{
    					out_residual[i][0]=0;
    					out_residual[i][1]=0;
    				}
    			}
    			else
    			{
    				for(int i=0;i<16;i++)
    				{
    
    					out_residual[i][0]=outbuffer_value[i].range(17,0);
    					out_residual[i][1]=outbuffer_value[i].range(35,18);
    
    xliu79's avatar
    xliu79 committed
    				}
    			}
    
    xliu79's avatar
    xliu79 committed
    
    
    			ap_int<18> outbuffer_writeback_value[16][2];
    			for(int i=0;i<16;i++)
    
    				// outbuffer_writeback_value[i][0]=ATUVA[i][0].range(31,14)+out_residual[i][0];
    				// outbuffer_writeback_value[i][1]=ATUVA[i][1].range(31,14)+out_residual[i][1];
    
    				outbuffer_writeback_value[i][0]=ATUVA[i][0]+out_residual[i][0];
    				outbuffer_writeback_value[i][1]=ATUVA[i][1]+out_residual[i][1];
    
    xliu79's avatar
    xliu79 committed
    
    
    
    			ap_int<18*2> outbuffer_writeback_value_batch[16];
    
    
    xliu79's avatar
    xliu79 committed
    			for(int i=0;i<16;i++)
    			{
    
    				outbuffer_writeback_value_batch[i]=(outbuffer_writeback_value[i][1],outbuffer_writeback_value[i][0]  );
    
    xliu79's avatar
    xliu79 committed
    			}
    
    xliu79's avatar
    xliu79 committed
    			for(int i=0;i<16;i++)
    			{
    
    				out_buffer[i][address]=outbuffer_writeback_value_batch[i];
    
    xliu79's avatar
    xliu79 committed
    			}
    
    		}
    		stream_pingpong_flag=~stream_pingpong_flag;
    
    xliu79's avatar
    xliu79 committed
    	}
    
    // Sink for the input-word stream leaving the last cell of a column.
    //
    // Reads and discards as many words as a wino6x6_stream_cell forwards on its
    // bottom_stream_out: 2 priming words plus 2 words for each of the
    // load_in * load_out * row_tile_number * (weight_indepth_feed_size/2) passes.
    // The count is written as (feed_size/2)*2 so it matches the cell's integer
    // division when weight_indepth_feed_size is odd.
    //
    // NOTE(review): this assumes the cells run with weight_outdepth_feed_size >= 2
    // (they forward one word in each of the first two inner iterations); that value
    // is not available here -- confirm against the caller.
    //
    // Parameters:
    //   top_stream_in                 stream to drain.
    //   weight_indepth_load_number,
    //   weight_outdepth_load_number,
    //   weight_indepth_feed_size,
    //   row_tile_number               same values as passed to the cells above.
    template<int dummy>
    void wino6x6_stream_bottomend(
    	hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in,
    	ap_int<16> weight_indepth_load_number,
    	ap_int<16> weight_outdepth_load_number,
    	ap_int<16> weight_indepth_feed_size,
    	ap_int<16> row_tile_number
    )
    {
    	ap_uint<GTGG_WIDTH_IN*2*36> dummy_temp;
    	for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*row_tile_number*(weight_indepth_feed_size/2)*2+2;i++)
    	{
    		top_stream_in.read(dummy_temp);
    	}
    }
    
    // Sink for the weight-word stream leaving the last cell of a row.
    //
    // Reads and discards one word for every innermost iteration of
    // wino6x6_stream_cell, i.e.
    //   load_in * load_out * row_tile_number * (weight_indepth_feed_size/2) * weight_outdepth_feed_size
    // words. The halving is parenthesised so it is applied to the feed size alone,
    // exactly as in the cell's loop bound; dividing the running product instead
    // gives a different count when weight_indepth_feed_size is odd.
    //
    // Parameters:
    //   left_stream_in                stream to drain.
    //   weight_indepth_load_number,
    //   weight_outdepth_load_number,
    //   weight_indepth_feed_size,
    //   weight_outdepth_feed_size,
    //   row_tile_number               same values as passed to the cells to the left.
    template<int dummy>
    void wino6x6_stream_rightend(
    	hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in,
    	ap_int<16> weight_indepth_load_number,
    	ap_int<16> weight_outdepth_load_number,
    	ap_int<16> weight_indepth_feed_size,
    	ap_int<16> weight_outdepth_feed_size,
    	ap_int<16> row_tile_number
    )
    {
    	ap_uint<W_WIDTH*2*36>dummy_temp;
    	for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*row_tile_number*(weight_indepth_feed_size/2)*weight_outdepth_feed_size;i++)
    	{
    		left_stream_in.read(dummy_temp);
    	}
    }
    
    
    
    
    
    
    xliu79's avatar
    xliu79 committed
    // Processing element operating on 4x4 tiles (16 inputs, 16 weights per word).
    //
    // For each of `number` iterations the cell
    //   * forwards one input word from top_stream_in to bottom_stream_out and one
    //     weight word from left_stream_in to right_stream_out,
    //   * unpacks both words into 16 signed elements,
    //   * wino_flag == 1: transforms the input tile (dB, then BTdB), multiplies it
    //     element-wise with the weights, reduces the 4x4 product to 2x2 (UVA, ATUVA)
    //     and adds the 4 results to out_buffer[0..3][counter];
    //   * wino_flag == 0: multiplies the raw inputs element-wise with the weights and
    //     adds the 16 products to out_buffer[0..15][counter].
    //
    // Parameters:
    //   top_stream_in / bottom_stream_out   input words, received and passed on.
    //   left_stream_in / right_stream_out   weight words, received and passed on.
    //   out_buffer                          accumulator, indexed [lane][counter].
    //   wino_flag                           selects the transformed or the direct path.
    //   number                              number of words to process.
    //
    // NOTE(review): counter is not checked against the 1024-entry out_buffer depth.
    template<int dummy>
    void wino_stream_ceil_4x4(
    		hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in,
    		hls::stream< ap_uint<IN_WIDTH*16> > & bottom_stream_out,
    		hls::stream< ap_uint<W_WIDTH*16> > &left_stream_in,
    		hls::stream< ap_uint<W_WIDTH*16> > &right_stream_out,
    		ap_uint<OUT_WIDTH> out_buffer[16][1024],
    		ap_uint<1> wino_flag,
    		int number)
    {

    	#pragma HLS array_partition variable=out_buffer dim=1 complete

    	ap_int<W_WIDTH> G1[16];
    	#pragma HLS array_partition variable=G1
    	ap_int<IN_WIDTH> in[16];
    	#pragma HLS array_partition variable=in

    	for(int counter=0;counter<number;counter++)
    	{
    		#pragma HLS pipeline
    		ap_uint<IN_WIDTH*16> stream_in_temp;
    		ap_uint<W_WIDTH*16> stream_weight_temp;
    		top_stream_in>>stream_in_temp;
    		bottom_stream_out<<stream_in_temp;
    		left_stream_in>>stream_weight_temp;
    		right_stream_out<<stream_weight_temp;

    		// Unpack the 16 weights and 16 inputs; the loop has 16 iterations, so it
    		// is unrolled completely rather than by a factor larger than its trip count.
    		for(int k=0;k<16;k++)
    		{
    		#pragma HLS unroll
    			G1[k].range(W_WIDTH-1,0)=stream_weight_temp.range(W_WIDTH-1+k*W_WIDTH,k*W_WIDTH);
    			in[k].range(IN_WIDTH-1,0)=stream_in_temp.range(IN_WIDTH-1+k*IN_WIDTH,k*IN_WIDTH);
    		}

    		// First input-transform stage, applied to each group of 4 consecutive inputs.
    		ap_int<B_WIDTH_OUT>  dB[4][4];
    		#pragma HLS array_partition variable=dB complete
    		for(int i=0;i<4;i++)
    		{
    		#pragma HLS unroll
    			dB[i][0]=in[i*4]-in[i*4+2];
    			dB[i][1]=in[i*4+1]+in[i*4+2];
    			dB[i][2]=-in[i*4+1]+in[i*4+2];
    			dB[i][3]=in[i*4+1]-in[i*4+3];
    		}

    		// Second input-transform stage (wino_flag == 1) or a plain copy of the
    		// inputs (wino_flag == 0), so BTdB is the multiplicand in both modes.
    		ap_int<BT_WIDTH_OUT>  BTdB[4][4];
    		#pragma HLS array_partition variable=BTdB complete

    		if(wino_flag)
    		{
    			for(int i=0;i<4;i++)
    			{
    			#pragma HLS unroll
    				BTdB[i][0]=dB[0][i]-dB[2][i];
    				BTdB[i][1]=dB[1][i]+dB[2][i];
    				BTdB[i][2]=-dB[1][i]+dB[2][i];
    				BTdB[i][3]=dB[1][i]-dB[3][i];
    			}
    		}
    		else
    		{
    			for(int i=0;i<4;i++)
    			{
    			#pragma HLS unroll
    				BTdB[i][0]=in[i*4+0];
    				BTdB[i][1]=in[i*4+1];
    				BTdB[i][2]=in[i*4+2];
    				BTdB[i][3]=in[i*4+3];
    			}
    		}

    		// Element-wise product with the weights. BTdB[i][j] equals in[i*4+j] in
    		// direct mode; indexing `in` with two subscripts would select a single
    		// bit of in[i] because `in` is a one-dimensional array of ap_int.
    		ap_int<GTGG_WIDTH_OUT> UV[4][4];
    		#pragma HLS array_partition variable=UV complete

    		for(int i=0;i<4;i++){
    		#pragma HLS unroll
    			for(int j=0;j<4;j++){
    			#pragma HLS unroll
    				UV[i][j]=BTdB[i][j]*G1[i*4+j];
    			}
    		}

    		// First output-reduction stage: 4 columns -> 2 per row.
    		ap_int<A_WIDTH_OUT>  UVA[4][2];
    		#pragma HLS array_partition variable=UVA complete

    		for(int i=0;i<4;i++)
    		{
    		#pragma HLS unroll
    			UVA[i][0]=UV[i][0]+UV[i][1]+UV[i][2];
    			UVA[i][1]=UV[i][1]-UV[i][2]-UV[i][3];
    		}

    		// Second output-reduction stage (4 rows -> 2) or a flat copy of UV.
    		ap_int<AT_WIDTH_OUT> ATUVA[16];
    		#pragma HLS array_partition variable=ATUVA complete

    		if(wino_flag)
    		{
    			for(int i=0;i<2;i++)
    			{
    			#pragma HLS unroll
    				ATUVA[0+i]=UVA[0][i]+UVA[1][i]+UVA[2][i];
    				ATUVA[2+i]=UVA[1][i]-UVA[2][i]-UVA[3][i];
    			}
    		}
    		else
    		{
    			for(int i=0;i<16;i++)
    			{
    				ATUVA[i]=UV[i/4][i%4];
    			}
    		}

    		// Accumulate into the output buffer: 4 lanes in transformed mode,
    		// all 16 lanes in direct mode.
    		if(wino_flag)
    		{
    			for(int i=0;i<4;i++)
    			{
    			#pragma HLS unroll
    				out_buffer[i][counter]=out_buffer[i][counter]+ATUVA[i];
    			}
    		}
    		else
    		{
    			for(int i=0;i<4;i++)
    			for(int j=0;j<4;j++)
    			{
    			#pragma HLS unroll
    				out_buffer[i*4+j][counter]=out_buffer[i*4+j][counter]+UV[i][j];
    			}
    		}
    	}
    }
    
    
    
    
    
    
    // Top-level wiring of the 6x6-tile systolic array.
    //
    // The body declares the inter-cell streams, starts the two feeders and then
    // instantiates 16 wino6x6_stream_cell instances arranged as 8 rows x 2 columns:
    //   * input words travel down a column:  indata_out_<r>_<c> -> indata_out_<r+1>_<c>;
    //   * weight words travel along a row:   weightdata_out_<r>_<c> -> weightdata_out_<r>_<c+1>.
    // The streams leaving the last row / last column are drained by
    // wino6x6_stream_bottomend / wino6x6_stream_rightend.
    //
    // Every cell is called with weight_indepth_feed_size = 8 and with row_repeat_times
    // as its row_tile_number; cell <r,c> accumulates into output_buffer0[c*8 + r].
    //
    // NOTE(review): input_depth, output_height and output_depth are not used in this body.
    // NOTE(review): input_feed and weight_feed are declared elsewhere; presumably they
    // fill the streams they are given -- confirm against their definitions.
    // NOTE(review): output_buffer0 is indexed 0..15 here, so its first dimension
    // (4*WEIGHT_FEED_NUMBER_PER_PORT*INPUT_FEED_NUMBER) must be at least 16, and the cells
    // declare their buffer as [16][1024], so OUTPUT_BUFFER_DEPTH is expected to be 1024.
    template<int dummy>
    void wino_systolic(
    	ap_uint<16> input_buffer[8][16][INPUT_BUFFER_DEPTH],

    	ap_uint<OUT_WIDTH*2> output_buffer0[4*WEIGHT_FEED_NUMBER_PER_PORT*INPUT_FEED_NUMBER][16][OUTPUT_BUFFER_DEPTH],

    	ap_uint<128>* weight_DDR0,
    	ap_uint<128>* weight_DDR1,
    	ap_uint<128>* weight_DDR2,
    	ap_uint<128>* weight_DDR3,
    	ap_uint<16> input_height,
    	ap_uint<16> input_width,
    	ap_uint<16> input_depth,
    	ap_uint<16> input_width_ceildiv_16,
    	ap_uint<16> input_depth_align8,
    	ap_uint<16> output_height,
    	ap_uint<16> output_width,
    	ap_uint<16> output_depth,
    	ap_uint<8> kernel_window_size,
    	ap_uint<8> pad_size,
    	ap_uint<16> weight_indepth_load_number,
    	ap_uint<16> weight_outdepth_load_number,
    	ap_uint<16> weight_outdepth_feed_size,
    	ap_uint<16> start_row_idx,

    	//weight parameters
    	ap_uint<16> weight_total_load_number,
    	ap_uint<16> weight_total_feed_size,
    	ap_uint<16> ddr_load_length,
    	ap_uint<16> ddr_load_length_per_feed,
    	ap_uint<16> row_repeat_times,
    	ap_uint<16> first_flag,
    	ap_uint<16> last_flag

    )
    {
    	// NOTE(review): these two streams are declared but never used in this body.
    	hls::stream<ap_uint<32*36> > indata_tile_stream0("indata1");
    	hls::stream<ap_uint<32*36> > indata_tile_stream1("indata2");


    	// Input-word streams of column 0; index <r> is the stream entering row r
    	// (index 8 leaves the last row). 32*36 bits equals GTGG_WIDTH_IN*2*36.
    	hls::stream<ap_uint<32*36> > indata_out_0_0("indata_out_0_0");
    	hls::stream<ap_uint<32*36> > indata_out_1_0("indata_out_1_0");
    	hls::stream<ap_uint<32*36> > indata_out_2_0("indata_out_2_0");
    	hls::stream<ap_uint<32*36> > indata_out_3_0("indata_out_3_0");
    	hls::stream<ap_uint<32*36> > indata_out_4_0("indata_out_4_0");
    	hls::stream<ap_uint<32*36> > indata_out_5_0("indata_out_5_0");
    	hls::stream<ap_uint<32*36> > indata_out_6_0("indata_out_6_0");
    	hls::stream<ap_uint<32*36> > indata_out_7_0("indata_out_7_0");
    	hls::stream<ap_uint<32*36> > indata_out_8_0("indata_out_8_0");

    	// Input-word streams of column 1.
    	hls::stream<ap_uint<32*36> > indata_out_0_1("indata_out_0_1");
    	hls::stream<ap_uint<32*36> > indata_out_1_1("indata_out_1_1");
    	hls::stream<ap_uint<32*36> > indata_out_2_1("indata_out_2_1");
    	hls::stream<ap_uint<32*36> > indata_out_3_1("indata_out_3_1");
    	hls::stream<ap_uint<32*36> > indata_out_4_1("indata_out_4_1");
    	hls::stream<ap_uint<32*36> > indata_out_5_1("indata_out_5_1");
    	hls::stream<ap_uint<32*36> > indata_out_6_1("indata_out_6_1");
    	hls::stream<ap_uint<32*36> > indata_out_7_1("indata_out_7_1");
    	hls::stream<ap_uint<32*36> > indata_out_8_1("indata_out_8_1");







    	// Weight-word streams; weightdata_out_<r>_<c> enters column c of row r
    	// (column index 2 leaves the last column). 16*36 bits equals W_WIDTH*2*36.
    	hls::stream<ap_uint<16*36> > weightdata_out_0_0("weightdata_out_0_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_1_0("weightdata_out_1_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_2_0("weightdata_out_2_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_3_0("weightdata_out_3_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_4_0("weightdata_out_4_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_5_0("weightdata_out_5_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_6_0("weightdata_out_6_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_7_0("weightdata_out_7_0");
    	hls::stream<ap_uint<16*36> > weightdata_out_0_1("weightdata_out_0_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_1_1("weightdata_out_1_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_2_1("weightdata_out_2_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_3_1("weightdata_out_3_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_4_1("weightdata_out_4_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_5_1("weightdata_out_5_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_6_1("weightdata_out_6_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_7_1("weightdata_out_7_1");
    	hls::stream<ap_uint<16*36> > weightdata_out_0_2("weightdata_out_0_2");
    	hls::stream<ap_uint<16*36> > weightdata_out_1_2("weightdata_out_1_2");
    	hls::stream<ap_uint<16*36> > weightdata_out_2_2("weightdata_out_2_2");
    	hls::stream<ap_uint<16*36> > weightdata_out_3_2("weightdata_out_3_2");
    	hls::stream<ap_uint<16*36> > weightdata_out_4_2("weightdata_out_4_2");
    	hls::stream<ap_uint<16*36> > weightdata_out_5_2("weightdata_out_5_2");
    	hls::stream<ap_uint<16*36> > weightdata_out_6_2("weightdata_out_6_2");
    	hls::stream<ap_uint<16*36> > weightdata_out_7_2("weightdata_out_7_2");



    		// Input feeder: receives the input buffer and the head stream of each column.
    		input_feed<0>(
    		input_buffer,
    		indata_out_0_0, 
    		indata_out_0_1,
    		start_row_idx,
    		output_width,
    		input_height,
    		input_width,
    		input_width_ceildiv_16,
    		input_depth_align8,
    		pad_size,
    		weight_outdepth_load_number,
    		kernel_window_size,
    		weight_outdepth_feed_size
    	);
    	
    	// Weight feeder: receives the four DDR ports and, when
    	// WEIGHT_FEED_NUMBER_PER_PORT == 2, the head stream of each of the 8 rows.
    	weight_feed<0>(
    	weight_DDR0,
    	weight_DDR1,
    	weight_DDR2,
    	weight_DDR3,
    	#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    	weightdata_out_0_0,
    	weightdata_out_1_0,
    	weightdata_out_2_0,
    	weightdata_out_3_0,
    	weightdata_out_4_0,
    	weightdata_out_5_0,
    	weightdata_out_6_0,
    	weightdata_out_7_0,
    	#endif
    	weight_total_load_number,
    	weight_total_feed_size,
    	ddr_load_length,
    	ddr_load_length_per_feed,
    	row_repeat_times,
    	first_flag,
    	last_flag);



    	// wino6x6_stream_cell<0>(
    	// 	indata_out_0_0
    	// 	hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out,
    	// 	hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
    	// 	hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
    	// 	ap_uint<OUT_WIDTH*2> out_buffer[16][1024],
    	// 	ap_int<16> weight_indepth_load_number,
    	// 	ap_int<16> weight_outdepth_load_number,
    	// 	ap_int<16> weight_indepth_feed_size,
    	// 	ap_int<16> weight_outdepth_feed_size,
    	// 	ap_int<16> row_tile_number);




    	// Column 0: rows 0..7, accumulating into output_buffer0[0..7].
    	wino6x6_stream_cell<0,0>(indata_out_0_0,indata_out_1_0,weightdata_out_0_0,weightdata_out_0_1,output_buffer0[0],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<1,0>(indata_out_1_0,indata_out_2_0,weightdata_out_1_0,weightdata_out_1_1,output_buffer0[1],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<2,0>(indata_out_2_0,indata_out_3_0,weightdata_out_2_0,weightdata_out_2_1,output_buffer0[2],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<3,0>(indata_out_3_0,indata_out_4_0,weightdata_out_3_0,weightdata_out_3_1,output_buffer0[3],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<4,0>(indata_out_4_0,indata_out_5_0,weightdata_out_4_0,weightdata_out_4_1,output_buffer0[4],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<5,0>(indata_out_5_0,indata_out_6_0,weightdata_out_5_0,weightdata_out_5_1,output_buffer0[5],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<6,0>(indata_out_6_0,indata_out_7_0,weightdata_out_6_0,weightdata_out_6_1,output_buffer0[6],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<7,0>(indata_out_7_0,indata_out_8_0,weightdata_out_7_0,weightdata_out_7_1,output_buffer0[7],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	// Column 1: rows 0..7, accumulating into output_buffer0[8..15]; its weight
    	// inputs are the weight outputs of column 0.
    	wino6x6_stream_cell<0,1>(indata_out_0_1,indata_out_1_1,weightdata_out_0_1,weightdata_out_0_2,output_buffer0[8],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<1,1>(indata_out_1_1,indata_out_2_1,weightdata_out_1_1,weightdata_out_1_2,output_buffer0[9],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<2,1>(indata_out_2_1,indata_out_3_1,weightdata_out_2_1,weightdata_out_2_2,output_buffer0[10],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<3,1>(indata_out_3_1,indata_out_4_1,weightdata_out_3_1,weightdata_out_3_2,output_buffer0[11],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<4,1>(indata_out_4_1,indata_out_5_1,weightdata_out_4_1,weightdata_out_4_2,output_buffer0[12],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<5,1>(indata_out_5_1,indata_out_6_1,weightdata_out_5_1,weightdata_out_5_2,output_buffer0[13],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<6,1>(indata_out_6_1,indata_out_7_1,weightdata_out_6_1,weightdata_out_6_2,output_buffer0[14],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_cell<7,1>(indata_out_7_1,indata_out_8_1,weightdata_out_7_1,weightdata_out_7_2,output_buffer0[15],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);


    	// Drain the input streams leaving the bottom row of each column.
    	wino6x6_stream_bottomend<0>(indata_out_8_0,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times);
    	wino6x6_stream_bottomend<0>(indata_out_8_1,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times);


    	// Drain the weight streams leaving the last column of each row.
    	wino6x6_stream_rightend<0>(weightdata_out_0_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_rightend<0>(weightdata_out_1_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_rightend<0>(weightdata_out_2_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_rightend<0>(weightdata_out_3_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_rightend<0>(weightdata_out_4_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_rightend<0>(weightdata_out_5_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_rightend<0>(weightdata_out_6_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);
    	wino6x6_stream_rightend<0>(weightdata_out_7_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times);

    }