Skip to content
Snippets Groups Projects
wino_buffer.cpp 18.4 KiB
Newer Older
  • Learn to ignore specific revisions
  • 
    
    #ifndef _WINO_BUFFER_HPP_
    #define _WINO_BUFFER_HPP_
    
    xliu79's avatar
    xliu79 committed
    #include "wino_macro.h"
    
    #include "../software/param.h"
    
    #include <ap_int.h>
    #include <hls_stream.h>
    
    
    xliu79's avatar
    xliu79 committed
    
    
    // input_feed_underconstruction
    //
    // Reads 16-bit words from input_buffer, assembles one packed tile per Winograd array
    // column and writes it to input_tile_stream[i] on every iteration of the pipelined
    // inner loop.
    //
    // NOTE(review): this listing is missing lines. The closing parenthesis of the parameter
    // list, the opening brace of the body, several inner loop headers and braces, and the
    // #endif for DEBUG_FILE_PRINT are not visible, so the text does not compile as shown.
    // The comments below describe only statements that are visible.
    //
    // Parameters, as used by the visible statements:
    //   input_buffer      - source array, read as input_buffer[i][j][buffer_address[i][j]].
    //   input_tile_stream - output streams, one per index in [0, WINO_WIDTH).
    //   inheight          - not referenced by any visible statement.
    //   inwidth           - upper bound in both the row and the column legality checks.
    //   weightbuffer_load_indepth_number  - appears only in a comment about the flattened loop.
    //   weightbuffer_load_outdepth_number - trip count of the outer loop.
    //   wino_output_tile_size - multiplied by the array column index to form the column
    //                           offsets; also the row index increment.
    //   input_buffer_feeding_loop_bound   - trip count of the pipelined inner loop.
    //   loop_wino_tile_row_reset_cycle    - value compared against loop_wino_tile_row_cnt.
    //   loop_wino_tile_col_reset_cycle    - value compared against loop_wino_tile_col_cnt.
    //   loop_outdepth_minitile_baseidx_reset_cycle - not referenced by any visible statement.
    //   buffer_address_mid_minitile_depth_step - increment of the mid address depth offset.
    //   wino_out_size_by_wino_width       - column index increment.
    //   row_address_bitnumber_flag        - selects how the two head bits of the buffer
    //                                       address are formed.
    //   start_row_idx                     - used only in the DEBUG_FILE_PRINT call.
    //   start_row_idx_minus_pad_size      - initial value of the row indices.
    void input_feed_underconstruction(
    	ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
    
    	hls::stream< ap_uint<8*BATCH_SIZE*36> > input_tile_stream[WINO_WIDTH], 
    
    	ap_uint<16> inheight,
    	ap_uint<16> inwidth,
    
    	ap_uint<16> weightbuffer_load_indepth_number,
    	ap_uint<16> weightbuffer_load_outdepth_number,
    	ap_uint<16> wino_output_tile_size,
    	ap_uint<32> input_buffer_feeding_loop_bound,
    
    	ap_uint<16> loop_wino_tile_row_reset_cycle,
    
    	ap_uint<16> loop_wino_tile_col_reset_cycle,
    
    	ap_uint<16> loop_outdepth_minitile_baseidx_reset_cycle,
    
    	ap_uint<10> buffer_address_mid_minitile_depth_step,
    
    	ap_uint<16> wino_out_size_by_wino_width,
    
    	ap_uint<1> row_address_bitnumber_flag,
    
    	ap_uint<16> start_row_idx,
    	ap_int<16> start_row_idx_minus_pad_size
    
    	#pragma HLS array_partition variable=input_tile_stream complete
    
    	ap_int<10> input_row_idx[WINO_DOMAIN_SIZE];
    	#pragma HLS array_partition variable=input_row_idx complete
    
    	
    	ap_int<10> input_col_idx[WINO_WIDTH][WINO_DOMAIN_SIZE];
    	#pragma HLS array_partition variable=input_col_idx dim=1 complete
    	#pragma HLS array_partition variable=input_col_idx dim=2 complete
    
    	ap_uint<16> wino_col_offset_constant[WINO_WIDTH];
    	#pragma HLS array_partition variable=wino_col_offset_constant complete
    
    	// Column offset of array column i is i * wino_output_tile_size.
    	for(int i=0;i<WINO_WIDTH;i++)
    	{
    		#pragma HLS unroll
    		wino_col_offset_constant[i]=wino_output_tile_size*i;
    	}
    
    
    	ap_uint<16>  first_col_idx=0;
    
    	// NOTE(review): loop_indepth_minitile_idx is read below but no visible statement
    	// increments it; the update is presumably among the missing lines.
    	ap_uint<INDEPTH_MINITILE_SIZE_BITWIDTH> loop_indepth_minitile_idx=0;
    
    
    	// Both counters start at 1 and are compared against the *_reset_cycle arguments.
    	ap_uint<16> loop_wino_tile_row_cnt=1;
    
    	ap_uint<16> loop_wino_tile_col_cnt=1;
    
    
    
    
    	// loop_wino_tile_col_reset_cycle =conv_desc.wino_tile_number_in_outwidth*conv_desc.weightbuffer_outdepth_minitile_number*INDEPTH_MINITILE_SIZE;
    	// loop_outdepth_minitile_baseidx_reset_cycle =conv_desc.weightbuffer_outdepth_minitile_number*INDEPTH_MINITILE_SIZE;
    
    
    
    
    
    	ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_minitile_depth_offset=0;
    	ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_buffertile_depth_offset=0;
    
    
    
    
    
    	// NOTE(review): a 16-bit signed argument is narrowed to 10 bits here - confirm the
    	// row range always fits.
    	ap_int<10> input_head_row_idx=start_row_idx_minus_pad_size;
    
    	// Row i of the tile starts at start_row_idx_minus_pad_size + i.
    	// NOTE(review): unlike the matching reset loop further down, this loop carries no
    	// unroll pragma.
    	for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    	{
    		input_row_idx[i]=start_row_idx_minus_pad_size+i;
    	}
    
    
    	// NOTE(review): pad_size is not declared in the visible parameter list; it is
    	// presumably a dropped parameter or a macro - confirm.
    	ap_int<16> input_head_col_idx=wino_col_offset_constant[0]-pad_size;
    
    	// Column i of array column c starts at i - pad_size + wino_col_offset_constant[c].
    	for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    	{
    		#pragma HLS unroll
    		for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    		{
    			#pragma HLS unroll
    			input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col];
    		}
    	}
    
    
    
    	for(ap_uint<16> outdepth_buffertile_idx=0;outdepth_buffertile_idx<weightbuffer_load_outdepth_number;outdepth_buffertile_idx++)
    	{
    
    
    		// The depth offset restarts from zero on every outer iteration.
    		buffer_address_mid_minitile_depth_offset = 0;
    
    		// NOTE(review): counter is a signed int compared against an ap_uint<32> bound.
    		for(int counter=0;counter<input_buffer_feeding_loop_bound;counter++ )
    		{
    			#pragma HLS pipeline II =1
    			// it is a flattened loop which does following
    			// for(ap_uint<16> indepth_buffertile_baseidx=0;indepth_buffertile_baseidx<weightbuffer_load_indepth_number;indepth_buffertile_baseidx++)
    			// for( int indepth_minitile_baseidx=0;indepth_minitile_baseidx<weightbuffer_indepth_minitile_number; indepth_minitile_baseidx ++)
    
    			// for( int wino_row_cnt;wino_row_cnt <conv_desc.wino_tile_number_in_outwidth;wino_row_cnt++)
    
    			// for(int wino_tile_col_idx =1; wino_tile_col_idx < wino_tile_number_in_outwidth+1 ; wino_tile_col_idx++)
    			// for(ap_uint<3> indepth_minitile_idx=0; indepth_minitile_idx< INDEPTH_MINITILE_SIZE; indepth_minitile_idx++)
    
    
    
    			ap_uint<1> row_legal_flag[WINO_DOMAIN_SIZE];
    			#pragma HLS array_partition variable=row_legal_flag complete
    
    			// NOTE(review): the row index is bounded by inwidth, and inheight is never
    			// read in the visible code. This looks like it should be inheight - confirm.
    			for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    			{
    			#pragma HLS unroll
    				row_legal_flag[i] = ( input_row_idx[i] >=0 && input_row_idx[i] < inwidth);
    			}
    
    
    			ap_uint<1> row_address_offset_bit1[INBUFFER_HEIGHT];
    			#pragma HLS array_partition variable=row_address_offset_bit1 complete
    			
    			ap_uint<2> row_address_offset_bit2[INBUFFER_HEIGHT];
    			#pragma HLS array_partition variable=row_address_offset_bit2 complete
    
    			// Split the head row index: the low INBUFFER_HEIGHT_BITWIDTH bits become the
    			// breakpoint, the next two bits become the address offset.
    			ap_uint<INBUFFER_HEIGHT_BITWIDTH> row_breakpoint=input_head_row_idx.range(INBUFFER_HEIGHT_BITWIDTH-1,0);
    			ap_uint<2> input_head_row_address_offset=input_head_row_idx.range(INBUFFER_HEIGHT_BITWIDTH+1,INBUFFER_HEIGHT_BITWIDTH);
    
    
    
    			// Entries at or above the breakpoint use the head offset; entries below it use
    			// the head offset plus one (presumably circular row addressing - confirm).
    			// The 1-bit array keeps only the low bit of the 2-bit value.
    			for(int i=0;i<INBUFFER_HEIGHT;i++)
    			{
    				if(i>=row_breakpoint)
    				{
    					row_address_offset_bit1[i]=input_head_row_address_offset;
    					row_address_offset_bit2[i]=input_head_row_address_offset;
    				}
    				else
    				{
    					row_address_offset_bit1[i]=input_head_row_address_offset+1;
    					row_address_offset_bit2[i]=input_head_row_address_offset+1;
    				}
    				
    			}
    
    
    
    
    			ap_uint<1> col_legal_flag[WINO_WIDTH][WINO_DOMAIN_SIZE];
    			#pragma HLS array_partition variable=col_legal_flag complete
    
    			// NOTE(review): the inner loop that declares i, and the closing braces, are
    			// missing from this listing.
    			for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    			{
    
    					#pragma HLS unroll
    					col_legal_flag[wino_array_col][i]= ( input_col_idx[wino_array_col][i] >=0 && input_col_idx[wino_array_col][i] < inwidth);
    
    			ap_uint<INBUFFER_MID_ADDR_BITWIDTH> col_pix_address_offset[INBUFFER_WIDTH];
    
    			// Same split as for rows: low bits are the breakpoint, the upper bits plus the
    			// current depth offset form the column address offset.
    			ap_uint<INBUFFER_WIDTH_BITWIDTH> col_breakpoint=input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH-1,0);
    
    			ap_uint<INBUFFER_MID_ADDR_BITWIDTH> input_head_col_address_offset;
    			input_head_col_address_offset= input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH+INBUFFER_MID_ADDR_BITWIDTH-1,INBUFFER_WIDTH_BITWIDTH) 
    			+ buffer_address_mid_minitile_depth_offset;
    
    			for(int i=0;i<INBUFFER_WIDTH;i++)
    			{
    				if(i>=col_breakpoint)
    					col_pix_address_offset[i] = input_head_col_address_offset;
    				else
    					col_pix_address_offset[i] = input_head_col_address_offset+1;
    			}
    
    			ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH> buffer_address[INBUFFER_HEIGHT][INBUFFER_WIDTH];
    
    			// Address = (headbits, column offset without its top bit, loop_indepth_minitile_idx).
    			// headbits is either (1-bit row offset, top bit of the column offset) or the
    			// 2-bit row offset, chosen by row_address_bitnumber_flag.
    			// NOTE(review): braces of the inner loop are missing from this listing.
    			for(int i=0;i<INBUFFER_HEIGHT; i++)
    			{
    				#pragma HLS unroll
    				for(int j=0;j<INBUFFER_WIDTH;j++)
    
    					ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH-2> common=((ap_uint<INBUFFER_MID_ADDR_BITWIDTH-1> )col_pix_address_offset[j],loop_indepth_minitile_idx);
    					ap_uint<2> headbits=row_address_bitnumber_flag?
    										(row_address_offset_bit1[i],col_pix_address_offset[j][INBUFFER_MID_ADDR_BITWIDTH-1]):
    										row_address_offset_bit2[i];
    					buffer_address[i][j]=(headbits,common);
    
    			ap_uint<16> input_buffer_val[INBUFFER_HEIGHT][INBUFFER_WIDTH];
    			#pragma HLS array_partition variable=input_buffer_val complete
    
    			// Read one word from every [i][j] entry of input_buffer at its own address.
    			for(int i=0;i<INBUFFER_HEIGHT; i++)
    			{
    				#pragma HLS unroll
    				for(int j=0;j<INBUFFER_WIDTH;j++)
    
    					input_buffer_val[i][j]=input_buffer[i][j][buffer_address[i][j]];
    
    			ap_uint<16> input_plane_tile_row[WINO_DOMAIN_SIZE][INBUFFER_WIDTH];
    			#pragma HLS array_partition variable=input_plane_tile_row dim=1 complete
    			#pragma HLS array_partition variable=input_plane_tile_row dim=2 complete
    
    			// Select the buffer row for tile row i using the low bits of input_row_idx[i].
    			for(int j=0;j<INBUFFER_WIDTH;j++)
    			{
    			#pragma HLS unroll
    				for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    
    						input_plane_tile_row[i][j]=input_buffer_val[ (ap_uint<INBUFFER_HEIGHT_BITWIDTH>) input_row_idx[i].range(INBUFFER_HEIGHT_BITWIDTH-1,0) ][j];
    
    			// NOTE(review): input_plane_tile is one-dimensional, yet the pragmas below
    			// also name dim=2 and dim=3 - confirm how the tool treats them.
    			ap_uint<16*WINO_DOMAIN_SIZE_SQUARE> input_plane_tile[WINO_WIDTH];
    			#pragma HLS array_partition variable=input_plane_tile complete dim=1
    			#pragma HLS array_partition variable=input_plane_tile complete dim=2
    			#pragma HLS array_partition variable=input_plane_tile complete dim=3
    
    
    			// Pack element (j,k) into the 16-bit slice at index j*6+k, selecting the column
    			// with the low bits of input_col_idx[i][k].
    			// NOTE(review): the loop over k and the condition choosing between the data
    			// assignment and the zero assignment are missing from this listing; presumably
    			// the zero is written when the row or column legality flag is clear - confirm.
    			// NOTE(review): the literal 6 assumes WINO_DOMAIN_SIZE == 6 - TODO confirm.
    			for(int i=0;i<WINO_WIDTH;i++)
    			{
    			#pragma HLS unroll
    				for(int j=0;j<WINO_DOMAIN_SIZE;j++)
    
    							input_plane_tile[i].range((j*6+k)*16+15,(j*6+k)*16)=input_plane_tile_row[j][ (ap_uint<INBUFFER_WIDTH_BITWIDTH>) input_col_idx[i][k].range(INBUFFER_WIDTH_BITWIDTH-1,0) ];
    
    							input_plane_tile[i].range((j*6+k)*16+15,(j*6+k)*16)=0;
    
    
    			// Emit one packed tile per array column.
    			// NOTE(review): the stream element is 8*BATCH_SIZE*36 bits wide while the tile
    			// is 16*WINO_DOMAIN_SIZE_SQUARE bits wide - confirm the two widths agree.
    			for(int i=0;i<WINO_WIDTH;i++)
    			{
    				#pragma HLS unroll
    				input_tile_stream[i]<<input_plane_tile[i];
    			}
    
    
    			// NOTE(review): the #endif closing this block is not visible in this listing.
    			#if DEBUG_FILE_PRINT
    				int indepth = buffer_address_mid_minitile_depth_offset/buffer_address_mid_minitile_depth_step*INDEPTH_MINITILE_SIZE
    								+loop_indepth_minitile_idx;
    
    				attach_streaming_content<WINO_WIDTH,WINO_DOMAIN_SIZE>(input_plane_tile, start_row_idx, input_head_col_idx+pad_size, indepth, "instream.txt");
    
    			// Advance the depth offset when the row counter reaches its reset value.
    			if(loop_wino_tile_row_cnt == loop_wino_tile_row_reset_cycle)
    
    			{
    				buffer_address_mid_minitile_depth_offset += buffer_address_mid_minitile_depth_step;
    			}
    
    			// Column indices: return to the initial values when the column counter reaches
    			// its reset value; otherwise advance by wino_out_size_by_wino_width once
    			// loop_indepth_minitile_idx equals INDEPTH_MINITILE_SIZE-1.
    			// NOTE(review): braces around these branches are missing from this listing.
    			if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
    			{
    				input_head_col_idx=wino_col_offset_constant[0]-pad_size;
    				for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    
    					#pragma HLS unroll
    					for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    
    						input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col];
    
    			else if(loop_indepth_minitile_idx==INDEPTH_MINITILE_SIZE-1)
    
    			{
    				input_head_col_idx+=wino_out_size_by_wino_width;
    				for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    
    					#pragma HLS unroll
    					for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    
    						input_col_idx[wino_array_col][i]+=wino_out_size_by_wino_width;
    
    
    			
    			// Row indices: return to the initial values when the row counter reaches its
    			// reset value; otherwise advance by wino_output_tile_size when the column
    			// counter reaches its reset value.
    			// NOTE(review): braces around these branches are missing from this listing.
    			if(loop_wino_tile_row_cnt == loop_wino_tile_row_reset_cycle)
    
    				input_head_row_idx=start_row_idx_minus_pad_size;
    
    				for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    				{
    					#pragma HLS unroll
    					input_row_idx[i]=start_row_idx_minus_pad_size+i;
    				}
    
    
    			else if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
    
    				input_head_row_idx+=wino_output_tile_size;
    
    				for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    				{
    					#pragma HLS unroll
    					input_row_idx[i]+=wino_output_tile_size;
    				}
    			
    
    
    			// Counter updates.
    			// NOTE(review): the reset branch of the row counter and its else are missing
    			// from this listing; as shown, the increment would sit under the equality test.
    			if(loop_wino_tile_row_cnt==loop_wino_tile_row_reset_cycle)
    
    				loop_wino_tile_row_cnt++;
    			}
    
    			// The column counter restarts at 1 after reaching its reset value.
    			if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
    			{
    				loop_wino_tile_col_cnt=1;
    			}
    			else 
    			{
    				loop_wino_tile_col_cnt++;
    
    xliu79's avatar
    xliu79 committed
    //template<int dummy>
    
    // load_weight_ddr_one_port
    //
    // Reads weightDDR_port_burst_length 128-bit words starting at
    // weight_DDR + ddr_address_offset and stores 32-bit words into weight_buff.
    //
    // NOTE(review): this listing is interleaved with non-code residue from a web blame view
    // and is missing lines (end of the parameter list, the opening brace, the loop that
    // fills temp32, several braces and #endif lines, the bodies of some branches). The
    // comments describe only statements that are visible.
    //
    // Parameters, as used by the visible statements:
    //   weight_DDR  - base pointer of the 128-bit source words.
    //   weight_buff - destination, written as weight_buff[buffer][i][buffer_address].
    //   weightDDR_buffer_burst_length - value compared against port_load_cnt.
    //   weightDDR_port_burst_length   - trip count of the copy loop.
    //   ddr_address_offset - added to weight_DDR, in units of 128-bit words.
    //   pingpong    - concatenated in front of buffer_address_offset to form the address.
    //   skip_flag   - not referenced by any visible statement.
    //
    // NOTE(review): the body calls printf and fflush; confirm these are meant to stay in
    // code that is synthesized.
    void load_weight_ddr_one_port(
    
    	ap_uint<128>* weight_DDR,
    	ap_uint<32> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4][WEIGHT_BUFFER_DEPTH],
    	ap_uint<16> weightDDR_buffer_burst_length,
    	ap_uint<16> weightDDR_port_burst_length,
    	ap_uint<32> ddr_address_offset,
    	ap_uint<1> pingpong,
    	ap_uint<1> skip_flag
    
    xliu79's avatar
    xliu79 committed
    	#if DEBUG_CONV_DESC
    
    xliu79's avatar
    xliu79 committed
    
    	#pragma HLS array_partition variable = weight_buff dim=1 complete
    	#pragma HLS array_partition variable = weight_buff dim=2 complete
    
    
    	// printf("DDR_offset %d, i%d o%d\n",(int) ddr_address_offset,(int) conv_desc.weightbuffer_load_indepth_number,(int) conv_desc.weightbuffer_load_outdepth_number);
    
    	// Pointer arithmetic on ap_uint<128>*: the offset counts 128-bit words.
    	ap_uint<128>* offseted_weight_DDR=weight_DDR+ddr_address_offset;
    
    	ap_uint<WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP_BITWIDTH> counter=0;
    
    	// Starts at 1 and is compared against weightDDR_buffer_burst_length.
    	ap_uint<16> port_load_cnt=1;
    
    	ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH-1> buffer_address_offset=0;
    
    	// NOTE(review): in this branch buffer_idx has no initializer, while the other branch
    	// starts it at 0. The visible stores in this configuration use index 0 directly.
    	// The #endif for this conditional is not visible in this listing.
    	#if WEIGHT_FEED_NUMBER_PER_PORT_BITWIDTH == 0
    	ap_uint<1> buffer_idx;
    	#else
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<WEIGHT_FEED_NUMBER_PER_PORT_BITWIDTH> buffer_idx=0;
    
    	printf("into the loop %d %d\n", (int) weightDDR_port_burst_length, (int) weightDDR_buffer_burst_length);
    	fflush(stdout);
    
    	for(int address = 0; address<weightDDR_port_burst_length; address++)
    
    xliu79's avatar
    xliu79 committed
    		#pragma HLS pipeline
    
    		ap_uint<128> temp128 = offseted_weight_DDR[address];
    		ap_uint<32> temp32[4];
    		#pragma HLS array_partition  variable = temp32 complete
    
    		// NOTE(review): the loop header and the statement that fill temp32 are missing
    		// here; presumably temp128 is split into four 32-bit words - confirm.
    		{
    			#pragma HLS unroll
    
    		}
    
    		// Address = (pingpong, buffer_address_offset).
    		// NOTE(review): the width is the literal 10 while buffer_address_offset is
    		// WEIGHT_BUFFER_DEPTH_BITWIDTH-1 bits wide; this assumes
    		// WEIGHT_BUFFER_DEPTH_BITWIDTH == 10 - TODO confirm.
    		ap_uint<10> buffer_address = (pingpong,buffer_address_offset);
    
    
    		// Entry i of the second dimension receives temp32[i%4].
    		// NOTE(review): the opening brace and any write condition are missing from this
    		// listing.
    		for(int i=0;i<WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4;i++)
    
    				#if WEIGHT_FEED_NUMBER_PER_PORT_BITWIDTH == 0
    
    				weight_buff[0][i][buffer_address]=temp32[i%4];
    				#else
    				weight_buff[buffer_idx][i][buffer_address]=temp32[i%4];
    				#endif
    			}
    		}
    	
    		// NOTE(review): the body of the first branch is missing from this listing. The
    		// visible else-branch increments buffer_address_offset when counter equals
    		// WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP-1.
    		if(port_load_cnt==weightDDR_buffer_burst_length)
    
    		else if(counter== WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP-1)
    
    		{
    			buffer_address_offset++;
    		}
    
    
    		// Move to the next buffer once port_load_cnt reaches weightDDR_buffer_burst_length;
    		// otherwise keep counting.
    		// NOTE(review): a reset of port_load_cnt is not visible in the first branch.
    		if(port_load_cnt==weightDDR_buffer_burst_length)
    
    		{
    			buffer_idx++;
    
    		}
    		else
    		{
    			port_load_cnt++;
    		}
    
    
    		// NOTE(review): the body of this test and the rest of the function are missing
    		// from this listing.
    		if(counter==WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP-1)
    
    xliu79's avatar
    xliu79 committed
    
    
    // weight_streamer
    //
    // For loop_weight_feed_bound iterations, reads one 32-bit word from every
    // weight_buff[buffer_idx][j18] entry at a common address, packs the words of each
    // buffer into one wide value and writes that value to weight_stream[buffer_idx].
    //
    // NOTE(review): this listing is interleaved with non-code residue from a web blame view
    // and is missing lines (the opening brace of the main loop, closing braces at the end,
    // and at least one parameter: pingpong is used in the body but is not declared in the
    // visible parameter list). The comments describe only statements that are visible.
    //
    // Parameters, as used by the visible statements:
    //   weight_buff   - source, read as weight_buff[buffer_idx][j18][weight_buffer_address].
    //   weight_stream - output streams, one per buffer_idx.
    //   loop_outdepth_minitile_baseidx_reset_cycle_minus1 - value at which
    //                   outdepth_minitile_addr_offset returns to 0.
    //   loop_start_output_baserowcol_reset_cycle - value compared against
    //                   loop_start_output_baserowcol_cnt.
    //   loop_weight_feed_bound - trip count of the main loop.
    //   weightbuffer_outdepth_minitile_number - increment of indepth_minitile_addr_offset.
    //   conv_desc     - present only when DEBUG_CONV_DESC is set.
    //
    // The read address is (pingpong, indepth_minitile_addr_offset + outdepth_minitile_addr_offset).
    void weight_streamer(
    	ap_uint<32> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4][WEIGHT_BUFFER_DEPTH],
    	hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> >  weight_stream[WEIGHT_FEED_NUMBER_PER_PORT],
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<16> loop_outdepth_minitile_baseidx_reset_cycle_minus1,
    
    	ap_uint<16> loop_start_output_baserowcol_reset_cycle,
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<32> loop_weight_feed_bound,
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<16> weightbuffer_outdepth_minitile_number,
    
    xliu79's avatar
    xliu79 committed
    	#if DEBUG_CONV_DESC
    
    	,ConvDesc_t conv_desc
    	#endif
    )
    {
    	// NOTE(review): pingpong is not declared in the visible parameter list; its
    	// declaration was presumably dropped from this listing - confirm.
    	printf("stream pingpong %d\n",(int) pingpong);
    
    xliu79's avatar
    xliu79 committed
    	#pragma HLS array_partition variable = weight_buff dim=1 complete
    	#pragma HLS array_partition variable = weight_buff dim=2 complete
    
    	// int weight_feed_total_size_by2 = weight_feed_total_size/2;
    
    	ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH -1> outdepth_minitile_addr_offset=0;
    	ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH -1> indepth_minitile_addr_offset=0;
    
    	ap_uint<16> loop_outdepth_minitile_baseidx_cnt=1;
    
    	ap_uint<16> loop_start_output_baserowcol_cnt=1;
    
    xliu79's avatar
    xliu79 committed
    
    
    xliu79's avatar
    xliu79 committed
    	// loop_outdepth_minitile_baseidx_reset_cycle_minus1=conv_desc.weightbuffer_outdepth_minitile_number-1;
    
    	// loop_start_output_baserowcol_reset_cycle=conv_desc.weightbuffer_outdepth_minitile_number * conv_desc.wino_tile_number_in_outwidth;
    
    xliu79's avatar
    xliu79 committed
    
    	// int loop_weight_feed_bound = conv_desc.weightbuffer_indepth_minitile_number * conv_desc.weightbuffer_outdepth_minitile_number * conv_desc.wino_tile_number_in_outwidth;
    
    xliu79's avatar
    xliu79 committed
    	for(ap_uint<32> cycle=0;cycle < loop_weight_feed_bound; cycle++)
    
    xliu79's avatar
    xliu79 committed
    		#pragma HLS pipeline
    		// for(int indepth_minitile_baseidx=0;indepth_minitile_baseidx<conv_desc.weightbuffer_load_indepth_step; indepth_minitile_baseidx += INDEPTH_MINITILE_SIZE)
    
    		// for(int start_output_baserowcol =0; start_output_baserowcol < conv_desc.outwidth *conv_desc.wino_out_tile_in_rowstep; start_output_baserowcol+=conv_desc.wino_output_tile_size*WINO_WIDTH)
    
    xliu79's avatar
    xliu79 committed
    		// for(int outdepth_minitile_baseidx=0;outdepth_minitile_baseidx<conv_desc.weightbuffer_load_outdepth_step; outdepth_minitile_baseidx += OUTDEPTH_MINITILE_SIZE)
    		ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH-1> weight_buffer_address_right=indepth_minitile_addr_offset+outdepth_minitile_addr_offset;
    		
    
    xliu79's avatar
    xliu79 committed
    		ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH> weight_buffer_address = (pingpong,weight_buffer_address_right);
    
    xliu79's avatar
    xliu79 committed
    		ap_uint<32> temp18[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4];
    		#pragma HLS array_partition variable = temp18 complete
    
    xliu79's avatar
    xliu79 committed
    		// Read one 32-bit word from every [buffer_idx][j18] entry at the shared address.
    		for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
    		{
    		#pragma HLS unroll
    			for(int j18=0;j18<WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4;j18++)
    			{
    			#pragma HLS unroll
    				temp18[buffer_idx][j18]=weight_buff[buffer_idx][j18][weight_buffer_address];
    			}
    		}
    
    xliu79's avatar
    xliu79 committed
    		ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> temp16x36[WEIGHT_FEED_NUMBER_PER_PORT];
    		#pragma HLS array_partition variable = temp16x36 complete
    
    xliu79's avatar
    xliu79 committed
    		// Word j18 occupies bits [j18*32+31 : j18*32] of the packed value.
    		for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
    		{
    			#pragma HLS unroll
    			for(int j18=0;j18<WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4;j18++)
    			{
    				#pragma HLS unroll
    				temp16x36[buffer_idx].range(j18*32+31,j18*32)=temp18[buffer_idx][j18];
    			}
    		}
    		// One packed value per stream per iteration.
    		for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
    		{
    			#pragma HLS unroll
    			weight_stream[buffer_idx]<<temp16x36[buffer_idx];
    		}
    
    		if(loop_start_output_baserowcol_cnt==loop_start_output_baserowcol_reset_cycle){
    
    xliu79's avatar
    xliu79 committed
    			indepth_minitile_addr_offset+=weightbuffer_outdepth_minitile_number;
    
    xliu79's avatar
    xliu79 committed
    		}
    		
    		// outdepth_minitile_addr_offset counts 0 .. reset_cycle_minus1 and then wraps to 0.
    		if(outdepth_minitile_addr_offset==loop_outdepth_minitile_baseidx_reset_cycle_minus1){
    			outdepth_minitile_addr_offset=0;
    		}
    		else{
    			outdepth_minitile_addr_offset++;
    		}
    
    		// The counter restarts at 1 after reaching its reset value.
    		if(loop_start_output_baserowcol_cnt==loop_start_output_baserowcol_reset_cycle){
    			loop_start_output_baserowcol_cnt=1;
    
    xliu79's avatar
    xliu79 committed
    		}
    		else{
    
    			loop_start_output_baserowcol_cnt++;
    
    	ap_uint<128>* weight_DDR0,
    
    	hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> >  weight_stream[WEIGHT_FEED_NUMBER_PER_PORT],
    	ap_uint<16> weightDDR_burst_number,
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<16> weightDDR_buffer_burst_length,
    
    	ap_uint<16> weightDDR_port_burst_length,
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<16> loop_outdepth_minitile_baseidx_reset_cycle_minus1,
    
    	ap_uint<16> loop_start_output_baserowcol_reset_cycle,
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<32> loop_weight_feed_bound,
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<16> weightbuffer_outdepth_minitile_number,
    
    xliu79's avatar
    xliu79 committed
    	#if DEBUG_CONV_DESC
    
    	printf("\ninto weight feed %d %d %d\n",(int) first_flag, (int) last_flag, (int) weightDDR_burst_number  );
    	static ap_uint<32> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4][WEIGHT_BUFFER_DEPTH];
    
    	static ap_uint<16> DDR_offset;
    	static ap_uint<16> DDR_load_cnt;
    	static ap_uint<1> pingpong;
    
    	if(first_flag)
    
    		pingpong = 0;
    
    
    	weight_DDR0,
    	weight_buff,
    
    xliu79's avatar
    xliu79 committed
    	weightDDR_buffer_burst_length,
    	weightDDR_port_burst_length,
    
    xliu79's avatar
    xliu79 committed
    	#if DEBUG_CONV_DESC
    
    	for(ap_uint<16> cnt=0;cnt< weightDDR_burst_number ;cnt++)
    
    		
    		if(DDR_load_cnt == weightDDR_burst_number-1)
    		{
    			DDR_load_cnt = 0;
    			DDR_offset = 0;
    		}
    		else
    		{
    			DDR_load_cnt+=1;
    			DDR_offset+=weightDDR_port_burst_length;
    		}
    
    		weight_DDR0,
    		weight_buff,
    
    xliu79's avatar
    xliu79 committed
    		weightDDR_buffer_burst_length,
    		weightDDR_port_burst_length,
    
    		DDR_offset,
    
    xliu79's avatar
    xliu79 committed
    		#if DEBUG_CONV_DESC
    
    		#endif
    
    xliu79's avatar
    xliu79 committed
    			loop_outdepth_minitile_baseidx_reset_cycle_minus1,
    
    			loop_start_output_baserowcol_reset_cycle,
    
    xliu79's avatar
    xliu79 committed
    			loop_weight_feed_bound,
    
    xliu79's avatar
    xliu79 committed
    			weightbuffer_outdepth_minitile_number,
    
    xliu79's avatar
    xliu79 committed
    			#if DEBUG_CONV_DESC
    
    xliu79's avatar
    xliu79 committed
    #endif