Skip to content
Snippets Groups Projects
wino_cell.cpp 36.2 KiB
Newer Older
  • Learn to ignore specific revisions
  • #ifndef _WINO_CELL_HPP_
    #define _WINO_CELL_HPP_
    
    #include "wino_macro.h"
    
    xliu79's avatar
    xliu79 committed
    #include <ap_int.h>
    #include <hls_stream.h>
    
    xliu79's avatar
    xliu79 committed
    #include "wino_buffer.cpp"
    
    #include "wino_transform.cpp"
    
    xliu79's avatar
    xliu79 committed
    
    
    #include <dsp_builtins.h>
    
    /**
     * Winograd input-transform stage.
     *
     * For each of input_transform_feeding_loop_bound iterations this function
     * reads one packed tile from input_tile_stream, unpacks it into 8-bit
     * elements, applies the DB6x6_1 macro and then the BTB6x6_1 macro (both
     * defined outside this section) for every (row index, batch index) pair,
     * repacks the BTB_WIDTH-bit results and writes them to
     * input_tile_transformed_stream.
     *
     * Packing layout (input and output): element (i, j, k) starts at bit
     * ((i*WINO_DOMAIN_SIZE + j)*BATCH_SIZE + k) * element_width.
     *
     * Only WINO_DOMAIN_SIZE == 6 is supported; any other value stops the build
     * with #error.
     *
     * @param input_tile_stream                   packed 8-bit input tiles
     * @param input_tile_transformed_stream       packed BTB_WIDTH-bit transformed tiles
     * @param input_transform_feeding_loop_bound  number of tiles to process
     * @param wino_array_col                      (DEBUG_FILE_PRINT builds only) number
     *                                            embedded in the dump file name
     */
    void input_transform(
    	hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_stream,
    	hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_transformed_stream,
    	int input_transform_feeding_loop_bound
    	#if DEBUG_FILE_PRINT
    	,int wino_array_col
    	#endif
    )
    {
    	#if DEBUG_FILE_PRINT
    	int write_idx=0;
    	// The dump file name depends only on wino_array_col, so it is formatted
    	// once here instead of on every loop iteration.
    	char infilename[100];
    	sprintf(infilename,"intile_transform_%d.txt",wino_array_col);
    	#endif

    	for(int cycle=0;cycle<input_transform_feeding_loop_bound;cycle++)
    	{
    		#pragma HLS pipeline

    		// Same width expression as the stream element type, so the local word
    		// can never drift away from the port declaration.
    		ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> input_tile_stream_data;
    		input_tile_stream>>input_tile_stream_data;

    		ap_int<8> in[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
    		#pragma HLS array_partition variable=in complete

    		// attach_output_vector<8,WINO_DOMAIN_SIZE,BATCH_SIZE>(in,write_idx,infilename);

    		// Unpack the 8-bit elements of the incoming word.
    		for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    		{
    			#pragma HLS unroll
    			for(int j=0;j<WINO_DOMAIN_SIZE;j++)
    			{
    				for(int k=0;k<BATCH_SIZE;k++)
    				{
    					const int lsb=((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*8;
    					in[i][j][k]=input_tile_stream_data.range(lsb+7,lsb);
    				}
    			}
    		}

    		// First transform pass: in -> DB.
    		ap_int<DB_WIDTH> DB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];

    		for(int k=0;k<BATCH_SIZE;k++)
    		{
    		#pragma HLS unroll
    			for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    			{
    			#pragma HLS unroll
    				#if WINO_DOMAIN_SIZE ==6

    					DB6x6_1(in,DB,i,k)

    				#else
    					#error "WINO_DOMAIN_SIZE!=6 not implemented "
    				#endif
    			}
    		}

    		// Second transform pass: DB -> BtDB.
    		ap_int<BTB_WIDTH> BtDB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];

    		for(int k=0;k<BATCH_SIZE;k++)
    		{
    		#pragma HLS unroll
    			for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    			{
    			#pragma HLS unroll
    				#if WINO_DOMAIN_SIZE ==6

    				BTB6x6_1(DB,BtDB,i,k)

    				#else
    					#error "WINO_DOMAIN_SIZE!=6 not implemented "
    				#endif
    			}
    		}

    		// Repack the BTB_WIDTH-bit results using the same element ordering.
    		ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> input_tile_transformed_data;

    		for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    		{
    			#pragma HLS unroll
    			for(int j=0;j<WINO_DOMAIN_SIZE;j++)
    			{
    				for(int k=0;k<BATCH_SIZE;k++)
    				{
    					const int lsb=((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH;
    					input_tile_transformed_data.range(lsb+BTB_WIDTH-1,lsb)=BtDB[i][j][k];
    				}
    			}
    		}

    		#if DEBUG_FILE_PRINT
    			attach_output_vector<BTB_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(BtDB,write_idx,infilename);
    			write_idx++;
    		#endif

    		input_tile_transformed_stream<<input_tile_transformed_data;
    	}
    }
    
    
    
    
    /**
     * Unpacks packed stream registers into individual input-tile elements.
     *
     * For every (w, iid) pair the packed word stream_temp_reg[w][iid] is split
     * into BTB_WIDTH-bit fields; element (i, j, k) is taken from the field that
     * starts at bit ((i*WINO_DOMAIN_SIZE + j)*BATCH_SIZE + k) * BTB_WIDTH and is
     * stored in input_tile[w][iid][i][j][k].
     *
     * @param input_tile       destination, written for every index
     * @param stream_temp_reg  packed source words, one per (w, iid)
     */
    void load_input_tile(
    	ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE],
    	ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg[WINO_WIDTH][INDEPTH_MINITILE_SIZE]
    )
    {
    	#pragma HLS pipeline
    	#pragma HLS array_partition variable = input_tile complete dim=5
    	#pragma HLS array_partition variable = input_tile complete dim=4
    	#pragma HLS array_partition variable = input_tile complete dim=3
    	#pragma HLS array_partition variable = input_tile complete dim=2
    	#pragma HLS array_partition variable = input_tile complete dim=1

    	// The directive must name this function's own parameter. A pragma that
    	// names a variable not visible here (such as the caller's
    	// stream_temp_reg0) does not partition the argument.
    	#pragma HLS array_partition variable = stream_temp_reg complete dim=1
    	#pragma HLS array_partition variable = stream_temp_reg complete dim=2

    	for(int w=0;w<WINO_WIDTH;w++){
    	#pragma HLS unroll
    		for(int iid=0;iid<INDEPTH_MINITILE_SIZE;iid++){
    			#pragma HLS unroll
    			for(int i=0;i<WINO_DOMAIN_SIZE;i++){
    				#pragma HLS unroll
    				for(int j=0;j<WINO_DOMAIN_SIZE;j++){
    					#pragma HLS unroll
    					for(int k=0;k<BATCH_SIZE;k++){
    					#pragma HLS unroll
    						// Least-significant bit of element (i, j, k) inside the packed word.
    						const int lsb=((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH;
    						input_tile[w][iid][i][j][k]=stream_temp_reg[w][iid].range(lsb+BTB_WIDTH-1,lsb);
    					}
    				}
    			}
    		}
    	}
    }
    
    /**
     * Splits each packed weight word into individual W_WIDTH-bit weights.
     *
     * weight_value_temp[row] holds the weights of one array row; the weight for
     * (depth, r, c) starts at bit
     * (depth*WINO_DOMAIN_SIZE_SQUARE + r*WINO_DOMAIN_SIZE + c) * W_WIDTH
     * and is stored in weight_tile[row][depth][r][c].
     *
     * @param weight_tile        destination, written for every index
     * @param weight_value_temp  packed source words, one per array row
     */
    void load_weight_tile(
    		ap_int<W_WIDTH> weight_tile[WINO_HEIGHT][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE],
    		ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> weight_value_temp[WINO_HEIGHT]
    )
    {
    	#pragma HLS array_partition variable = weight_tile complete dim=4
    	#pragma HLS array_partition variable = weight_tile complete dim=3
    	#pragma HLS array_partition variable = weight_tile complete dim=2
    	#pragma HLS array_partition variable = weight_tile complete dim=1

    	#pragma HLS array_partition variable=weight_value_temp complete

    	for(int row=0; row<WINO_HEIGHT; ++row)
    	{
    		#pragma HLS unroll
    		for(int depth=0; depth<INDEPTH_MINITILE_SIZE; ++depth)
    		{
    			#pragma HLS unroll
    			for(int r=0; r<WINO_DOMAIN_SIZE; ++r)
    			{
    				#pragma HLS unroll
    				for(int c=0; c<WINO_DOMAIN_SIZE; ++c)
    				{
    					#pragma HLS unroll
    					// Bit position where this weight begins in the packed word.
    					const int lsb=(depth*WINO_DOMAIN_SIZE_SQUARE+r*WINO_DOMAIN_SIZE+c)*W_WIDTH;
    					const int msb=lsb+W_WIDTH-1;
    					weight_tile[row][depth][r][c]=weight_value_temp[row].range(msb,lsb);
    				}
    			}
    		}
    	}
    }
    
    /**
     * Element-wise multiply of one input tile with one weight tile, two input
     * depths at a time.
     *
     * For every tile position (wr, wc), batch entry b and depth pair
     * (id, id+1) with id = 2*id2, the result of
     *   __builtin_mac16x2(input[id], input[id+1], weight[id], weight[id+1], 0, 1, ap_clk_div2)
     * is stored in UV_MUL_TILE[id2][wr][wc][b].
     *
     * The template parameter is not used in the body.
     * NOTE(review): presumably it exists so callers can request distinct
     * instantiations - confirm.
     *
     * @param UV_MUL_TILE  output, one value per depth pair / position / batch entry
     * @param input_tile   transformed input values
     * @param weight_tile  transformed weights
     * @param ap_clk_div2  forwarded unchanged to __builtin_mac16x2
     */
    template<int dummy>
    void element_wise_mult_6x6(
    		ap_int<UV_MUL_WIDTH> UV_MUL_TILE[INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE],
    		ap_int<BTB_WIDTH> input_tile[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE],
    		ap_int<W_WIDTH> weight_tile[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE],
    		ap_int<1> ap_clk_div2
    )
    {
    	#pragma HLS pipeline

    	#pragma HLS array_partition variable=UV_MUL_TILE complete dim=1
    	#pragma HLS array_partition variable=UV_MUL_TILE complete dim=2
    	#pragma HLS array_partition variable=UV_MUL_TILE complete dim=3
    	#pragma HLS array_partition variable=UV_MUL_TILE complete dim=4

    	#pragma HLS array_partition variable = input_tile complete dim=4
    	#pragma HLS array_partition variable = input_tile complete dim=3
    	#pragma HLS array_partition variable = input_tile complete dim=2
    	#pragma HLS array_partition variable = input_tile complete dim=1

    	#pragma HLS array_partition variable = weight_tile complete dim=3
    	#pragma HLS array_partition variable = weight_tile complete dim=2
    	#pragma HLS array_partition variable = weight_tile complete dim=1

    	for(int wr=0;wr<WINO_DOMAIN_SIZE;wr++)
    	{
    	#pragma HLS unroll
    		for(int wc=0; wc<WINO_DOMAIN_SIZE;wc++)
    		{
    		#pragma HLS unroll
    			for(int id2=0,id=0;id2<INDEPTH_MINITILE_SIZE/2;id2++,id+=2)
    			{
    			#pragma HLS unroll
    				// The two weights are shared by every batch entry of this depth pair.
    				ap_int<W_WIDTH> weight_val0=weight_tile[id][wr][wc];
    				ap_int<W_WIDTH> weight_val1=weight_tile[id+1][wr][wc];
    				for(int b=0;b<BATCH_SIZE;b++)
    				{
    				#pragma HLS unroll
    					ap_int<BTB_WIDTH> input_val0=input_tile[id][wr][wc][b];
    					ap_int<BTB_WIDTH> input_val1=input_tile[id+1][wr][wc][b];
    					UV_MUL_TILE[id2][wr][wc][b]=__builtin_mac16x2(
    							input_val0,
    							input_val1,
    							weight_val0,
    							weight_val1,
    							0,1,ap_clk_div2);
    				}
    			}
    		}
    	}
    }
    
    /**
     * Element-wise multiply for the whole array: every output-depth row against
     * every input column, two input depths per __builtin_mac16x2 call.
     *
     * For each tile position (wr, wc), output depth od, depth pair
     * (id, id+1) with id = 2*id2, column ww and batch entry b:
     *   UV_MUL[od][ww][id2][wr][wc][b] =
     *     __builtin_mac16x2(input[ww][id], input[ww][id+1],
     *                       weight[od][id], weight[od][id+1], 0, 1, ap_clk_div2)
     *
     * @param UV_MUL       output products
     * @param input_tile   transformed input values per column
     * @param weight_tile  transformed weights per output depth
     * @param ap_clk_div2  forwarded unchanged to __builtin_mac16x2
     */
    void element_wise_mult(
    		ap_int<UV_MUL_WIDTH> UV_MUL[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE],
    		ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE],
    		ap_int<W_WIDTH> weight_tile[OUTDEPTH_MINITILE_SIZE][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE],
    		ap_int<1> ap_clk_div2
    )
    {
    	#pragma HLS pipeline

    	#pragma HLS array_partition variable=UV_MUL complete dim=1
    	#pragma HLS array_partition variable=UV_MUL complete dim=2
    	#pragma HLS array_partition variable=UV_MUL complete dim=3
    	#pragma HLS array_partition variable=UV_MUL complete dim=4
    	#pragma HLS array_partition variable=UV_MUL complete dim=5
    	#pragma HLS array_partition variable=UV_MUL complete dim=6

    	#pragma HLS array_partition variable = input_tile complete dim=5
    	#pragma HLS array_partition variable = input_tile complete dim=4
    	#pragma HLS array_partition variable = input_tile complete dim=3
    	#pragma HLS array_partition variable = input_tile complete dim=2
    	#pragma HLS array_partition variable = input_tile complete dim=1

    	#pragma HLS array_partition variable = weight_tile complete dim=4
    	#pragma HLS array_partition variable = weight_tile complete dim=3
    	#pragma HLS array_partition variable = weight_tile complete dim=2
    	#pragma HLS array_partition variable = weight_tile complete dim=1

    	for(int row=0; row<WINO_DOMAIN_SIZE; ++row)
    	{
    	#pragma HLS unroll
    		for(int col=0; col<WINO_DOMAIN_SIZE; ++col)
    		{
    			for(int out_d=0; out_d<OUTDEPTH_MINITILE_SIZE; ++out_d)
    			{
    				for(int pair=0; pair<INDEPTH_MINITILE_SIZE/2; ++pair)
    				{
    					// Each pair covers input depths 2*pair and 2*pair+1.
    					const int depth_lo=2*pair;
    					const int depth_hi=depth_lo+1;

    					// Weights depend only on (out_d, depth, row, col); read them once.
    					ap_int<W_WIDTH> w_lo = weight_tile[out_d][depth_lo][row][col];
    					ap_int<W_WIDTH> w_hi = weight_tile[out_d][depth_hi][row][col];

    					for(int array_col=0; array_col<WINO_WIDTH; ++array_col)
    					{
    						for(int batch=0; batch<BATCH_SIZE; ++batch)
    						{
    							ap_int<BTB_WIDTH> x_lo=input_tile[array_col][depth_lo][row][col][batch];
    							ap_int<BTB_WIDTH> x_hi=input_tile[array_col][depth_hi][row][col][batch];

    							UV_MUL[out_d][array_col][pair][row][col][batch]=
    								__builtin_mac16x2(x_lo,x_hi,w_lo,w_hi,0,1,ap_clk_div2);
    						}
    					}
    				}
    			}
    		}
    	}
    }
    
    
    /**
     * Copies val into reg.
     *
     * NOTE(review): presumably wrapped in its own function so the HLS tool can
     * treat the copy as a separate unit - confirm.
     *
     * @param reg  destination, overwritten
     * @param val  value to store
     */
    template<class T>
    void load_reg(T &reg, T val)
    {
    	reg = val;
    }
    
    
    
    /**
     * Element-by-element copy of a 4-dimensional tile: reg[a][b][c][d] = val[a][b][c][d]
     * for every index. Both arrays are requested to be completely partitioned in
     * all four dimensions and every loop is marked for unrolling.
     *
     * @tparam T     element type
     * @tparam dim1  extent of the first (outermost) array dimension
     * @tparam dim2  extent of the second array dimension
     * @tparam dim3  extent of the third array dimension
     * @tparam dim4  extent of the fourth (innermost) array dimension
     * @param reg    destination tile, fully overwritten
     * @param val    source tile
     */
    template<class T, int dim1, int dim2, int dim3, int dim4>
    void load_reg_tile4(	T reg[dim1][dim2][dim3][dim4],
    				T val[dim1][dim2][dim3][dim4])
    {
    	#pragma HLS pipeline
    	#pragma HLS array_partition variable = reg complete dim=4
    	#pragma HLS array_partition variable = reg complete dim=3
    	#pragma HLS array_partition variable = reg complete dim=2
    	#pragma HLS array_partition variable = reg complete dim=1

    	#pragma HLS array_partition variable = val complete dim=4
    	#pragma HLS array_partition variable = val complete dim=3
    	#pragma HLS array_partition variable = val complete dim=2
    	#pragma HLS array_partition variable = val complete dim=1

    	// Loop nest walks the last array dimension outermost.
    	for(int i4=0; i4<dim4; ++i4)
    	{
    		#pragma HLS unroll
    		for(int i3=0; i3<dim3; ++i3)
    		{
    			#pragma HLS unroll
    			for(int i2=0; i2<dim2; ++i2)
    			{
    				#pragma HLS unroll
    				for(int i1=0; i1<dim1; ++i1)
    				{
    					#pragma HLS unroll
    					reg[i1][i2][i3][i4] = val[i1][i2][i3][i4];
    				}
    			}
    		}
    	}
    }
    
    
    /**
     * Element-by-element copy of a 3-dimensional tile: reg[a][b][c] = val[a][b][c]
     * for every index. Both arrays are requested to be completely partitioned in
     * all three dimensions and every loop is marked for unrolling.
     *
     * @tparam T     element type
     * @tparam dim1  extent of the first (outermost) array dimension
     * @tparam dim2  extent of the second array dimension
     * @tparam dim3  extent of the third (innermost) array dimension
     * @param reg    destination tile, fully overwritten
     * @param val    source tile
     */
    template<class T, int dim1, int dim2, int dim3>
    void load_reg_tile3(	T reg[dim1][dim2][dim3],
    				T val[dim1][dim2][dim3])
    {
    	#pragma HLS pipeline
    	#pragma HLS array_partition variable = reg complete dim=3
    	#pragma HLS array_partition variable = reg complete dim=2
    	#pragma HLS array_partition variable = reg complete dim=1

    	#pragma HLS array_partition variable = val complete dim=3
    	#pragma HLS array_partition variable = val complete dim=2
    	#pragma HLS array_partition variable = val complete dim=1

    	// Loop nest walks the last array dimension outermost.
    	for(int i3=0; i3<dim3; ++i3)
    	{
    		#pragma HLS unroll
    		for(int i2=0; i2<dim2; ++i2)
    		{
    			#pragma HLS unroll
    			for(int i1=0; i1<dim1; ++i1)
    			{
    				#pragma HLS unroll
    				reg[i1][i2][i3] = val[i1][i2][i3];
    			}
    		}
    	}
    }
    
    /**
     * Thin wrapper around __builtin_mac16x2: forwards all operands unchanged and
     * stores the builtin's result in rst.
     *
     * The template parameter is not used in the body.
     *
     * @param rst          receives the builtin's result
     * @param a0,a1        first operand pair, forwarded in this order
     * @param b0,b1        second operand pair, forwarded in this order
     * @param accum        forwarded as the fifth argument
     * @param clear        forwarded as the sixth argument
     * @param ap_clk_div2  forwarded as the seventh argument
     */
    template<int dummy>
    void DSP_LOADER(ap_int<32> &rst, ap_int<16> a0, ap_int<16> a1, ap_int<16> b0, ap_int<16> b1, ap_int<32> accum, ap_int<1> clear, ap_int<1> ap_clk_div2)
    {
    	rst = __builtin_mac16x2(
    			a0, a1,
    			b0, b1,
    			accum, clear, ap_clk_div2);
    }
    
    
    
    /**
     * Streaming Winograd compute block (setup part: ports, local storage,
     * initial input fill and loop state).
     *
     * The function reads packed transformed input tiles from top_stream_in and
     * packed weights from left_stream_in, multiplies them with
     * element_wise_mult_6x6, reduces and post-processes the products with
     * UVA_row / ATA_col, and accumulates the result into out_buffer with
     * saturation.
     *
     * NOTE(review): in this listing the function body appears to be missing
     * lines (for example the main loop has no opening brace, write_idx is used
     * but never declared here, and the function's closing braces are absent).
     * Confirm against the repository before editing the code.
     *
     * @param top_stream_in    one packed input-tile stream per array column
     * @param left_stream_in   packed weight streams, indexed [port][feed]
     * @param out_buffer       accumulation buffer, read-modify-written per iteration
     * @param weightbuffer_outdepth_minitile_number
     *                         weights are read and out_buffer is written only while
     *                         loop_omini_base_cnt <= this value
     * @param total_input_stream_tile
     *                         input loading is disabled once
     *                         loaded_input_stream_tile_number equals this value
     * @param loop_omini_base_reset_cycle
     *                         value of loop_omini_base_cnt at which it wraps to 1
     * @param loop_wino_tile_rowcol_self_reset_cycle_min1
     *                         value of loop_wino_tile_rowcol_cnt at which it resets to 0
     * @param loop_iload_reset_cycle
     *                         value of loop_iload_cnt at which it wraps to 1
     * @param loop_wino_cell_bound
     *                         trip count of the main loop
     * @param outbuffer_oload_increment_step
     *                         added to outbuffer_oload_offset when loop_iload_cnt wraps
     * @param outbuffer_omini_increment_step
     *                         added to outbuffer_omini_offset when loop_omini_base_cnt wraps
     * @param wino5x5_flag     forwarded to UVA_row and ATA_col
     * @param conv_desc        (DEBUG_FILE_PRINT builds only) used for debug index checks
     * @param ap_clk_div2      forwarded to element_wise_mult_6x6
     */
    void wino_stream_block(
    		hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > top_stream_in[WINO_WIDTH],
    		hls::stream< ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > left_stream_in[4][WEIGHT_FEED_NUMBER_PER_PORT],

    		ap_uint<OUT_WIDTH*BATCH_SIZE> out_buffer[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH],
    		ap_uint<16> weightbuffer_outdepth_minitile_number,

    		ap_uint<24> total_input_stream_tile,
    		ap_uint<16> loop_omini_base_reset_cycle,

    		ap_uint<10> loop_wino_tile_rowcol_self_reset_cycle_min1,
    		ap_uint<32> loop_iload_reset_cycle,

    		ap_uint<32> loop_wino_cell_bound,

    		ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_oload_increment_step,
    		ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_omini_increment_step,

    		ap_uint<1> wino5x5_flag
    		#if DEBUG_FILE_PRINT
    		,ConvDesc_t conv_desc
    		#endif
    		,ap_uint<1> ap_clk_div2
    		)
    {
    	// All scalar configuration ports are declared ap_stable.
    	#pragma HLS ap_stable port=ap_clk_div2

    	#pragma HLS ap_stable port=weightbuffer_outdepth_minitile_number
    	#pragma HLS ap_stable port=total_input_stream_tile
    	#pragma HLS ap_stable port=loop_omini_base_reset_cycle
    	#pragma HLS ap_stable port=loop_wino_tile_rowcol_self_reset_cycle_min1
    	#pragma HLS ap_stable port=loop_iload_reset_cycle
    	#pragma HLS ap_stable port=loop_wino_cell_bound
    	#pragma HLS ap_stable port=outbuffer_oload_increment_step
    	#pragma HLS ap_stable port=outbuffer_omini_increment_step
    	#pragma HLS ap_stable port=wino5x5_flag

    	#pragma HLS array_partition variable = out_buffer dim=1 complete

    	// Two banks of packed input words. The main loop unpacks from one bank
    	// (selected by stream_pingpong_flag) while it refills the other.
    	ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg0[WINO_WIDTH][INDEPTH_MINITILE_SIZE];
    	#pragma HLS array_partition variable = stream_temp_reg0 complete dim=1
    	#pragma HLS array_partition variable = stream_temp_reg0 complete dim=2 

    	ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg1[WINO_WIDTH][INDEPTH_MINITILE_SIZE];
    	#pragma HLS array_partition variable = stream_temp_reg1 complete dim=1
    	#pragma HLS array_partition variable = stream_temp_reg1 complete dim=2 



    	// Debug builds only: fill both banks with the byte value 0xAB.
    	#if DEBUG_FILE_PRINT
    	for(int i=0;i<WINO_WIDTH;i++)
    	{
    		memset(stream_temp_reg0[i],0xAB,INDEPTH_MINITILE_SIZE*sizeof(ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE>));
    		memset(stream_temp_reg1[i],0xAB,INDEPTH_MINITILE_SIZE*sizeof(ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE>));
    	}
    	#endif


    	// Unpacked input elements for every array column; filled by load_input_tile.
    	ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
    	#pragma HLS array_partition variable = input_tile complete dim=5
    	#pragma HLS array_partition variable = input_tile complete dim=4
    	#pragma HLS array_partition variable = input_tile complete dim=3
    	#pragma HLS array_partition variable = input_tile complete dim=2
    	#pragma HLS array_partition variable = input_tile complete dim=1


    	// NOTE(review): the four pragmas below name input_tile again, not the
    	// input_tile_reg declared on the next line. A variable with the same name
    	// is declared again inside the main loop, so this outer declaration looks
    	// like a leftover - confirm and either fix the pragma target or remove it.
    	ap_int<BTB_WIDTH> input_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
    	#pragma HLS array_partition variable = input_tile complete dim=4
    	#pragma HLS array_partition variable = input_tile complete dim=3
    	#pragma HLS array_partition variable = input_tile complete dim=2
    	#pragma HLS array_partition variable = input_tile complete dim=1



    	#if 0
    	memset(stream_temp_reg0,0xAA,2*2*36*2);
    	memset(stream_temp_reg1,0xAA,2*2*36*2);
    	#endif



    	// Initial fill of bank 0: on each of the INDEPTH_MINITILE_SIZE iterations
    	// every column shifts its words one slot towards index 0 and reads one new
    	// word from its stream into the last slot.
    	for(int i=0;i<INDEPTH_MINITILE_SIZE;i++)
    	{
    		#pragma hls pipeline
    		for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    		{
    			#pragma HLS unroll

    			for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++)
    			{
    				#pragma HLS unroll
    				stream_temp_reg0[wino_array_col][imini_idx]=stream_temp_reg0[wino_array_col][imini_idx+1];
    			}

    			top_stream_in[wino_array_col]>>stream_temp_reg0[wino_array_col][INDEPTH_MINITILE_SIZE-1];

    		}
    	}






    	// Main-loop state.
    	// load_input_flag:       1 while refilling of the idle bank is allowed.
    	// stream_pingpong_flag:  1 -> unpack from bank 0 and refill bank 1; 0 -> the reverse.
    	// loaded_input_stream_tile_number starts at 1.
    	// NOTE(review): presumably this counts the tile loaded by the fill loop above - confirm.
    	ap_uint<1> load_input_flag=1;
    	ap_uint<1> stream_pingpong_flag=1;
    	ap_uint<24> loaded_input_stream_tile_number=1;

    	// Counters that emulate the loop nest listed in the comments at the top of
    	// the main loop; two of them count from 1, the row/column counter from 0.
    	ap_uint<16> loop_omini_base_cnt=1;
    	ap_uint<10> loop_wino_tile_rowcol_cnt=0;
    	ap_uint<32>	loop_iload_cnt=1;

    	// Components of the out_buffer address used in the main loop.
    	ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_oload_offset=0;
    	ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_omini_offset=0;

    	// Updated in the main loop but not read anywhere in the visible code.
    	ap_uint<10> idepth_minitile_idx=0;
    
    	// Main loop of wino_stream_block: one flattened, pipelined loop that stands
    	// in for the loop nest listed in the comments just below.
    	// NOTE(review): no opening brace follows this `for` in this listing even
    	// though the statements below clearly belong to the loop body; lines appear
    	// to be missing - confirm against the repository.
    	for(int cycle=0;cycle < loop_wino_cell_bound; cycle++)

    		// for(int oload_idx=0;oload_idx<conv_desc.weightbuffer_load_outdepth_number;oload_idx++)
    		// for(int iload_idx=0;iload_idx<conv_desc.weightbuffer_load_indepth_number;iload_idx++)
    		// for(int imini_base_idx=0;imini_base_idx<conv_desc.weightbuffer_indepth_minitile_number;imini_base_idx++)
    		// for(int wino_tile_row_idx=0;wino_tile_row_idx<conv_desc.wino_tile_number_in_out_rowstep;wino_tile_row_idx++)
    		// for(int wino_tile_col_idx=0;wino_tile_col_idx<conv_desc.wino_tile_number_in_outwidth;wino_tile_col_idx++)
    		// for(int omini_base_idx=0;omini_base_idx<loop_omini_base_reset_cycle ;omini_base_idx++)


    		// The tool is told to ignore both inter- and intra-iteration
    		// dependences on out_buffer (it is read and written at the same
    		// address further down).
    		#pragma HLS pipeline
    		#pragma HLS dependence variable=out_buffer inter false
    		#pragma HLS dependence variable=out_buffer intra false

    		// Refill is enabled this iteration only if load_input_flag is set and
    		// not all total_input_stream_tile tiles have been loaded yet.
    		ap_uint<1> load_input_flag_reg = (load_input_flag  && loaded_input_stream_tile_number !=  total_input_stream_tile);

    		// Unpack the bank currently selected for consumption.
    		if(stream_pingpong_flag)
    			load_input_tile(input_tile,stream_temp_reg0);
    		else
    			load_input_tile(input_tile,stream_temp_reg1);



    		#if 0
    			printf("pingpong %d load %d, write idx %d, [%d/%d]\n", (int) stream_pingpong_flag, (int) load_input_flag_reg,write_idx, (int) loaded_input_stream_tile_number, (int) total_input_stream_tile);

    			puts("input tile reg 0");
    			for(int id=0;id<4;id++)
    			{
    				for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    				{
    					for(int j=0;j<WINO_DOMAIN_SIZE;j++)
    					{

    						ap_uint<BTB_WIDTH> data0,data1;
    						(data1,data0)=stream_temp_reg0[0][id].range( (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH+BTB_WIDTH*BATCH_SIZE-1, (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH);
    						printf("[%04x %04x]", (int) data0, (int) data1);
    					}
    					printf("            ");
    					for(int j=0;j<WINO_DOMAIN_SIZE;j++)
    					{

    						ap_uint<BTB_WIDTH> data0,data1;
    						(data1,data0)=stream_temp_reg1[0][id].range( (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH+BTB_WIDTH*BATCH_SIZE-1, (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH);
    						printf("[%04x %04x]", (int) data0, (int) data1);
    					}	
    					printf("\n");
    				}
    				printf("\n");
    			}
    		#endif

    		// out_buffer address for this iteration: sum of the two running offsets
    		// and the row/column tile counter.
    		ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_addr = outbuffer_oload_offset + loop_wino_tile_rowcol_cnt +  outbuffer_omini_offset;

    		// Debug builds: derive tile indices from the counters and abort if the
    		// combined offset is not a multiple of the per-minitile tile count.
    		#if DEBUG_FILE_PRINT
    		int rowtile_idx=loop_wino_tile_rowcol_cnt/conv_desc.wino_tile_number_in_outwidth;
    		int coltile_idx=loop_wino_tile_rowcol_cnt%conv_desc.wino_tile_number_in_outwidth;
    		int outdepth_minitile_idx= (outbuffer_oload_offset+outbuffer_omini_offset)/(conv_desc.wino_tile_number_in_out_rowstep*conv_desc.wino_tile_number_in_outwidth);
    		if((outbuffer_oload_offset+outbuffer_omini_offset)%(conv_desc.wino_tile_number_in_out_rowstep*conv_desc.wino_tile_number_in_outwidth))
    		{
    			printf("outdepth_minitile_idx not valid\n");
    			exit(-3);
    		}
    		#endif

    		// Refill the bank that is NOT being consumed: shift its words one slot
    		// towards index 0 and read one new word per column into the last slot.
    		for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    		{
    			#pragma HLS unroll
    			if(stream_pingpong_flag && load_input_flag_reg)
    			{

    				for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++)
    				{
    					#pragma HLS unroll
    					stream_temp_reg1[wino_array_col][imini_idx]=stream_temp_reg1[wino_array_col][imini_idx+1];
    				}

    				top_stream_in[wino_array_col]>>stream_temp_reg1[wino_array_col][INDEPTH_MINITILE_SIZE-1];

    			}
    			else if(load_input_flag_reg)
    			{

    				for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++)
    				{
    					#pragma HLS unroll
    					stream_temp_reg0[wino_array_col][imini_idx]=stream_temp_reg0[wino_array_col][imini_idx+1];
    				}

    				top_stream_in[wino_array_col]>>stream_temp_reg0[wino_array_col][INDEPTH_MINITILE_SIZE-1];

    			}
    		}


    		// One packed weight word per array row.
    		ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> weight_value_temp[WINO_HEIGHT];
    		#pragma HLS array_partition variable=weight_value_temp complete 


    		// Weights are read only while loop_omini_base_cnt is within
    		// weightbuffer_outdepth_minitile_number; otherwise weight_value_temp is
    		// left unassigned for this iteration.
    		// NOTE(review): the port count is the literal 4 and the target index is
    		// i*WEIGHT_FEED_NUMBER_PER_PORT+j, so this assumes
    		// 4*WEIGHT_FEED_NUMBER_PER_PORT <= WINO_HEIGHT - confirm.
    		if(loop_omini_base_cnt <= weightbuffer_outdepth_minitile_number)

    		{
    			for(int i=0;i<4;i++)
    			{
    				#pragma HLS unroll
    				for(int j=0;j<WEIGHT_FEED_NUMBER_PER_PORT;j++)
    				{
    					#pragma HLS unroll
    					left_stream_in[i][j]>>weight_value_temp[i*WEIGHT_FEED_NUMBER_PER_PORT+j];

    					#if 0
    						printf("wino_row_idx: %2d --", i*WEIGHT_FEED_NUMBER_PER_PORT+j);
    						for(int k=0;k<WINO_DOMAIN_SIZE_SQUARE;k++)
    						{
    							printf("[%08x]", (unsigned int) weight_value_temp[i*WEIGHT_FEED_NUMBER_PER_PORT+j].range(k*32+31,k*32) );
    						}
    						printf("\n");
    					#endif

    				}
    			}
    		}

    		ap_int<W_WIDTH> weight_tile[WINO_HEIGHT][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE];
    		#pragma HLS array_partition variable = weight_tile complete dim=4
    		#pragma HLS array_partition variable = weight_tile complete dim=3
    		#pragma HLS array_partition variable = weight_tile complete dim=2
    		#pragma HLS array_partition variable = weight_tile complete dim=1

    		// Unpack the weight words into individual weights.
    		load_weight_tile(weight_tile,weight_value_temp);


    		// One compute cell per (array row, array column) pair, fully unrolled.
    		for(int wino_array_row=0;wino_array_row<WINO_HEIGHT;wino_array_row++)

    		{
    			#pragma HLS unroll
    			for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    			{
    				#pragma HLS unroll
    				ap_int<BTB_WIDTH> input_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
    				#pragma HLS array_partition variable = input_tile_reg complete dim=4
    				#pragma HLS array_partition variable = input_tile_reg complete dim=3
    				#pragma HLS array_partition variable = input_tile_reg complete dim=2
    				#pragma HLS array_partition variable = input_tile_reg complete dim=1

    				ap_int<W_WIDTH> weight_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE];
    				#pragma HLS array_partition variable = weight_tile_reg complete dim=3
    				#pragma HLS array_partition variable = weight_tile_reg complete dim=2
    				#pragma HLS array_partition variable = weight_tile_reg complete dim=1

    				ap_int<UV_MUL_WIDTH> UV_MUL_TILE[INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
    				#pragma HLS array_partition variable=UV_MUL_TILE complete dim=1
    				#pragma HLS array_partition variable=UV_MUL_TILE complete dim=2
    				#pragma HLS array_partition variable=UV_MUL_TILE complete dim=3
    				#pragma HLS array_partition variable=UV_MUL_TILE complete dim=4

    				// Per-cell copies of this column's inputs and this row's weights.
    				load_reg_tile4<ap_int<BTB_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE,BATCH_SIZE>(input_tile_reg, input_tile[wino_array_col] );

    				load_reg_tile3<ap_int<W_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg, weight_tile[wino_array_row]);

    					// NOTE(review): this dump uses sprintf, attach_input_vector,
    					// attach_weight_vector and write_idx but, unlike the other dumps
    					// in this function, is not wrapped in #if DEBUG_FILE_PRINT in
    					// this listing; the guard lines appear to be missing - confirm.
    					if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number)
    					{
    						char infilename[100];
    						sprintf(infilename,"invector_%d_%d.txt",wino_array_row,wino_array_col);
    						attach_input_vector<BTB_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,BATCH_SIZE>(input_tile_reg,write_idx,infilename);
    						char wfilename[100];
    						sprintf(wfilename,"wvector_%d_%d.txt",wino_array_row,wino_array_col);
    						attach_weight_vector<W_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg,write_idx,wfilename);
    					}

    				// Products for every depth pair / tile position / batch entry.
    				element_wise_mult_6x6<0>(UV_MUL_TILE,input_tile_reg,weight_tile_reg, ap_clk_div2 );

    				ap_int<UV_WIDTH> UV[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
    				#pragma HLS array_partition variable=UV complete dim=1
    				#pragma HLS array_partition variable=UV complete dim=2
    				#pragma HLS array_partition variable=UV complete dim=3

    				// Sum the products over all depth pairs, then shift right by
    				// UV_QUANT_BIT before storing into UV.
    				// NOTE(review): the accumulator has the same width
    				// (UV_MUL_WIDTH) as each addend, so the sum can wrap if the
    				// addends use the full width - confirm the value range.
    				for(int wino_row=0;wino_row<WINO_DOMAIN_SIZE;wino_row++)
    				{
    					#pragma HLS unroll
    					for(int wino_col=0;wino_col<WINO_DOMAIN_SIZE;wino_col++)
    					{
    						#pragma HLS unroll
    						for(int b=0;b<BATCH_SIZE;b++)
    						{
    							ap_int<UV_MUL_WIDTH> temp=0;
    							for(int id2=0;id2<INDEPTH_MINITILE_SIZE/2;id2++)
    							{
    								#pragma HLS unroll

    								temp+=UV_MUL_TILE[id2][wino_row][wino_col][b];

    							}
    							UV[wino_row][wino_col][b]=temp>>UV_QUANT_BIT;
    						}
    					}
    				}


    				#if DEBUG_FILE_PRINT
    					char uvfilename[100];
    					sprintf(uvfilename,"uvvector_%d_%d.txt",wino_array_row,wino_array_col);

    					if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number)

    					attach_output_vector<UV_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(UV,write_idx,uvfilename);


    				#endif



    				// First output-transform pass: UVA_row (defined elsewhere) is
    				// invoked once per (tile row, batch entry).
    				ap_int<UVA_WIDTH> UVA[WINO_DOMAIN_SIZE][WINO_OUT_SIZE][BATCH_SIZE];
    				#pragma HLS array_partition variable=UVA complete dim=1
    				#pragma HLS array_partition variable=UVA complete dim=2
    				#pragma HLS array_partition variable=UVA complete dim=3

    		
    				for(int ridx=0;ridx<WINO_DOMAIN_SIZE;ridx++)
    				{
    					#pragma HLS unroll
    					for(int bidx=0;bidx<BATCH_SIZE;bidx++)
    					{
    						#pragma HLS unroll
    						UVA_row(UVA,UV,ridx,bidx,wino5x5_flag);
    					}
    				}

    				// Second output-transform pass: ATA_col (defined elsewhere) is
    				// invoked once per (output column, batch entry).
    				ap_int<ATA_WIDTH> ATA[WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE];
    				#pragma HLS array_partition variable=ATA complete dim=1
    				#pragma HLS array_partition variable=ATA complete dim=2
    				#pragma HLS array_partition variable=ATA complete dim=3
    				for(int cidx=0;cidx<WINO_OUT_SIZE;cidx++)
    				{
    					#pragma HLS unroll
    					for(int bidx=0;bidx<BATCH_SIZE;bidx++)
    					{
    						#pragma HLS unroll
    						ATA_col(ATA,UVA,cidx,bidx,wino5x5_flag);
    					}
    				}

    				#if DEBUG_FILE_PRINT
    					char filename[100];
    					sprintf(filename,"outvector_%d_%d.txt",wino_array_row,wino_array_col);

    					if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number)

    					attach_output_vector<ATA_WIDTH,WINO_OUT_SIZE,BATCH_SIZE>(ATA,write_idx,filename);
    				#endif


    				ap_int<OUT_WIDTH> out_value[WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE];
    				#pragma HLS array_partition variable=out_value complete dim=1
    				#pragma HLS array_partition variable=out_value complete dim=2
    				#pragma HLS array_partition variable=out_value complete dim=3


    				// Per-cell copy of the buffer address.
    				ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_addr_reg;
    				load_reg< ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> >(outbuffer_addr_reg,outbuffer_addr);


    				// Read the current accumulated values and split each packed word
    				// into its batch entries.
    				// NOTE(review): the split names exactly entries [1] and [0], so
    				// this assumes BATCH_SIZE == 2 - confirm.
    				for(int r=0;r<WINO_OUT_SIZE;r++)
    				{
    					#pragma HLS unroll
    					for(int c=0;c<WINO_OUT_SIZE;c++)
    					{
    						#pragma HLS unroll
    						ap_uint<OUT_WIDTH*BATCH_SIZE> data;

    							data=out_buffer[r][c][wino_array_row][wino_array_col][outbuffer_addr_reg];

    						(out_value[r][c][1],out_value[r][c][0])=data;
    					}
    				}

    				ap_int<OUT_WIDTH> out_value_back[WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE];
    				#pragma HLS array_partition variable=out_value_back complete dim=1
    				#pragma HLS array_partition variable=out_value_back complete dim=2
    				#pragma HLS array_partition variable=out_value_back complete dim=3

    				// Accumulate with saturation to OUT_WIDTH bits.
    				for(int r=0;r<WINO_OUT_SIZE;r++)
    				{
    					#pragma HLS unroll
    					for(int c=0;c<WINO_OUT_SIZE;c++)
    					{
    						#pragma HLS unroll
    						for(int b=0;b<BATCH_SIZE;b++)
    						{
    							// One extra bit so the sum itself cannot overflow.
    							ap_int<ATA_WIDTH+1> sum_sat;
    							
    							sum_sat=ATA[r][c][b]+out_value[r][c][b];

    							// Bits from OUT_WIDTH-1 (the would-be sign bit) up to the
    							// top bit. All zeros or all ones means the sum fits in a
    							// signed OUT_WIDTH-bit value.
    							ap_int<ATA_WIDTH+2-OUT_WIDTH> judgebit=sum_sat.range(ATA_WIDTH,OUT_WIDTH-1);

    							if(judgebit ==0 ||  judgebit == -1)

    								out_value_back[r][c][b]=sum_sat;
    							// Otherwise clamp; the top bit of the sum selects the limit.
    							else if (sum_sat[ATA_WIDTH]==1 )
    								out_value_back[r][c][b]=OUT_SAT_MIN;
    							else
    								out_value_back[r][c][b]=OUT_SAT_MAX;


    							// #if DEBUG_FILE_PRINT
    							// int outdepth_idx = outdepth_minitile_idx*OUTDEPTH_MINITILE_SIZE+wino_array_row;
    							// int col_idx = (coltile_idx*WINO_WIDTH+wino_array_col)*conv_desc.wino_output_tile_size+c;
    							// int row_idx = rowtile_idx*conv_desc.wino_output_tile_size+r;
    							// out_value_back[r][c][0]=row_idx*conv_desc.outwidth+col_idx;
    							// out_value_back[r][c][1]=outdepth_idx;
    							// #endif


    				// NOTE(review): the closing braces of the three saturation loops
    				// are absent in this listing, as are the braces of the write-back
    				// loops below and of the two cell loops - lines appear to be
    				// missing; confirm.
    				// Write the accumulated values back, only while
    				// loop_omini_base_cnt is within weightbuffer_outdepth_minitile_number.
    				if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number)

    					for(int r=0;r<WINO_OUT_SIZE;r++)

    						#pragma HLS unroll
    						for(int c=0;c<WINO_OUT_SIZE;c++)
    						{
    							out_buffer[r][c][wino_array_row][wino_array_col][outbuffer_addr_reg]=(out_value_back[r][c][1],out_value_back[r][c][0]);
    						}

    			// NOTE(review): write_idx is not declared in the visible part of this
    			// function and this increment is not under #if DEBUG_FILE_PRINT here.
    			if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number)
    				write_idx++;


    		
    		// element_wise_mult(UV_MUL,input_tile,weight_tile,ap_clk_div2);

    		// ---- Counter updates; each test uses the value from before this iteration's update ----

    		// idepth_minitile_idx: incremented when both the omini counter and the
    		// row/column counter are at their reset values, otherwise cleared when
    		// the iload counter is at its reset value.
    		if(loop_omini_base_cnt==loop_omini_base_reset_cycle && loop_wino_tile_rowcol_cnt==loop_wino_tile_rowcol_self_reset_cycle_min1)
    		{
    			idepth_minitile_idx++;
    		}
    		else if(loop_iload_cnt==loop_iload_reset_cycle)
    		{
    			idepth_minitile_idx=0;
    		}

    		// Row/column tile counter: advances each time the omini counter wraps and
    		// resets to 0 after reaching loop_wino_tile_rowcol_self_reset_cycle_min1.
    		if(loop_omini_base_cnt==loop_omini_base_reset_cycle && loop_wino_tile_rowcol_cnt==loop_wino_tile_rowcol_self_reset_cycle_min1)
    		{
    			loop_wino_tile_rowcol_cnt=0;
    			
    		}
    		else if(loop_omini_base_cnt==loop_omini_base_reset_cycle)
    		{
    			loop_wino_tile_rowcol_cnt++;
    		}



    		// iload counter: counts 1..loop_iload_reset_cycle; on wrap the oload
    		// offset of the buffer address advances.
    		if(loop_iload_cnt==loop_iload_reset_cycle)
    		{
    			loop_iload_cnt=1;
    			outbuffer_oload_offset+=outbuffer_oload_increment_step;
    		}
    		else
    		{
    			loop_iload_cnt++;
    		}

    		

    		// Refill is enabled when the omini counter wraps and disabled when the
    		// counter equals INDEPTH_MINITILE_SIZE (the wrap test takes priority).
    		if(loop_omini_base_cnt==loop_omini_base_reset_cycle ) 
    		{
    			load_input_flag = 1;
    		}
    		else if(loop_omini_base_cnt==INDEPTH_MINITILE_SIZE)
    		{
    			load_input_flag = 0;
    		}

    		// On omini-counter wrap: restart the counter, count one more loaded
    		// tile, swap the banks and advance the omini offset of the buffer address.
    		// NOTE(review): the listing ends inside this `if`; the else branch (if
    		// any) and all closing braces are not visible - confirm.
    		if(loop_omini_base_cnt==loop_omini_base_reset_cycle)
    		{
    			loop_omini_base_cnt=1;
    			loaded_input_stream_tile_number++;

    			stream_pingpong_flag=~stream_pingpong_flag;

    			outbuffer_omini_offset+=outbuffer_omini_increment_step;
    
    
    void wino6x6_stream_cell(
    		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in,
    		hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out,
    		hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in,
    		hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out,
    		ap_uint<OUT_WIDTH*2> out_buffer[16][1024],
    		ap_int<16> weight_indepth_load_number,
    		ap_int<16> weight_outdepth_load_number,
    		ap_int<16> weight_indepth_feed_size,
    		ap_int<16> weight_outdepth_feed_size,
    
    		ap_int<16> row_tile_number
    		#if DEBUG_FILE_PRINT
    		,int ROW_IDX, int COL_IDX
    		#endif
    
    		)
    
    #pragma HLS array_partition variable = out_buffer dim=1 complete
    
    	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg0[2];
    	ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg1[2];
    
    #pragma HLS array_partition variable = stream_temp_reg0 complete
    #pragma HLS array_partition variable = stream_temp_reg1 complete
    	#if DEBUG_FILE_PRINT
    
    xliu79's avatar
    xliu79 committed
    	memset(stream_temp_reg0,0xAA,2*2*36*2);
    	memset(stream_temp_reg1,0xAA,2*2*36*2);
    
    xliu79's avatar
    xliu79 committed
    
    
    	ap_int<W_WIDTH> G1[36][2];
    
    xliu79's avatar
    xliu79 committed
    	#pragma HLS array_partition variable=G1
    	
    
    	ap_uint<1> stream_pingpong_flag=0;
    
    xliu79's avatar
    xliu79 committed
    
    
    	int write_idx=0;
    	int weight_stream_idx=0;
    	int input_stream_idx =0;
    
    
    	char filename[100];
    	sprintf(filename,"wino_cell_output_%d_%d.txt",ROW_IDX, COL_IDX);
    	char input_stream_filename[100];
    	sprintf(input_stream_filename,"wino_cell_input_%d_%d.txt",ROW_IDX, COL_IDX);
    	char weight_stream_filename[100];
    	sprintf(weight_stream_filename,"wino_cell_weight_%d_%d.txt",ROW_IDX, COL_IDX);
    
    xliu79's avatar
    xliu79 committed
    
    
    	top_stream_in>>stream_temp_reg0[0];
    	bottom_stream_out<<stream_temp_reg0[0];
    
    xliu79's avatar
    xliu79 committed
    	#if DEBUG_FILE_PRINT
    
    	wino_cell_stream_input<0>(stream_temp_reg0[0],input_stream_idx,input_stream_filename);
    
    xliu79's avatar
    xliu79 committed
    	#endif
    
    	top_stream_in>>stream_temp_reg0[1];
    	bottom_stream_out<<stream_temp_reg0[1];
    
    xliu79's avatar
    xliu79 committed
    	#if DEBUG_FILE_PRINT
    
    	wino_cell_stream_input<0>(stream_temp_reg0[1],input_stream_idx,input_stream_filename);
    
    xliu79's avatar
    xliu79 committed
    	#endif
    
    	ITER0:for(int iter0=0;iter0<weight_indepth_load_number;iter0++)
    	ITER1:for(int iter1=0;iter1<weight_outdepth_load_number;iter1++)
    	ITER2:for(int iter2=0;iter2<row_tile_number;iter2++)
    	ITER3:for(int iter3=0;iter3<weight_indepth_feed_size/2;iter3++)
    	{
    	ITER4:for(int iter4=0;iter4<weight_outdepth_feed_size;iter4++)
    
    xliu79's avatar
    xliu79 committed
    	{
    
    		#pragma HLS pipeline
    #pragma HLS dependence variable=out_buffer inter false
    #pragma HLS dependence variable=out_buffer intra false
    
    			ap_int<GTGG_WIDTH_IN> input_tile[2][36][2];
    			#pragma HLS array_partition variable = input_tile complete
    
    xliu79's avatar
    xliu79 committed
    			for(int i=0;i<36;i++)
    
    xliu79's avatar
    xliu79 committed
    			#pragma HLS unroll
    				if(stream_pingpong_flag)
    				{
    
    					input_tile[0][i][0]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					input_tile[0][i][1]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    					input_tile[1][i][0]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					input_tile[1][i][1]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    
    xliu79's avatar
    xliu79 committed
    				}
    				else
    				{
    
    					input_tile[0][i][0]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					input_tile[0][i][1]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    					input_tile[1][i][0]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2);
    					input_tile[1][i][1]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN);
    
    xliu79's avatar
    xliu79 committed
    				}
    
    xliu79's avatar
    xliu79 committed
    
    			if(iter4<2 )
    
    xliu79's avatar
    xliu79 committed
    				if(stream_pingpong_flag)
    				{
    					top_stream_in>>stream_temp_reg0[iter4];
    					bottom_stream_out<<stream_temp_reg0[iter4];
    
    xliu79's avatar
    xliu79 committed
    					#if DEBUG_FILE_PRINT
    
    					wino_cell_stream_input<0>(stream_temp_reg0[iter4],input_stream_idx,input_stream_filename);
    
    xliu79's avatar
    xliu79 committed
    					#endif
    
    xliu79's avatar
    xliu79 committed
    				}
    				else
    				{
    					top_stream_in>>stream_temp_reg1[iter4];
    					bottom_stream_out<<stream_temp_reg1[iter4];
    
    xliu79's avatar
    xliu79 committed
    					#if DEBUG_FILE_PRINT
    
    					wino_cell_stream_input<0>(stream_temp_reg1[iter4],input_stream_idx,input_stream_filename);
    
    xliu79's avatar
    xliu79 committed
    					#endif
    
    xliu79's avatar
    xliu79 committed
    				}			
    
    xliu79's avatar
    xliu79 committed
    			ap_uint<W_WIDTH*2*36> weight_value_temp;
    			left_stream_in>>weight_value_temp;
    			right_stream_out<<weight_value_temp;
    
    xliu79's avatar
    xliu79 committed
    
    
    xliu79's avatar
    xliu79 committed
    			for(int i=0;i<36;i++)
    			{
    			#pragma HLS unroll
    				G1[i][0]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1,i*W_WIDTH);
    				G1[i][1]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1+288,i*W_WIDTH +288 );
    			}
    
    xliu79's avatar
    xliu79 committed
    			ap_int<GTGG_WIDTH_OUT> UV[36][2];
    			#pragma HLS array_partition variable=UV complete
    			
    			for(int i=0;i<36;i++)
    			{
    			#pragma HLS unroll
    
    				UV[i][0] = G1[i][0]*input_tile[0][i][0] + G1[i][1]*input_tile[1][i][0];
    				UV[i][1] = G1[i][0]*input_tile[0][i][1] + G1[i][1]*input_tile[1][i][1];
    
    xliu79's avatar
    xliu79 committed
    			}
    
    xliu79's avatar
    xliu79 committed
    			ap_int<A_WIDTH_OUT>  UVA[6][4][2];
    			#pragma HLS array_partition variable=UVA complete
    
    xliu79's avatar
    xliu79 committed
    			for(int b_idx=0;b_idx<2;b_idx++)
    
    xliu79's avatar
    xliu79 committed
    				for(int i=0;i<6;i++)