    // wino_buffer.cpp
    
    #ifndef _WINO_BUFFER_HPP_
    #define _WINO_BUFFER_HPP_
    
    #include "wino_macro.h"
    
    #include "../software/param.h"
    
    #include <ap_int.h>
    #include <hls_stream.h>
    
    
    
    
    
    /**
     * Streams transformed input tiles out of the banked input buffer.
     *
     * For every (depth tile, weight reload, window row offset, window column
     * offset, output column step, depth-in-tile) combination the function:
     *   1. reads one 16-bit word from each of the 8x16 buffer banks,
     *   2. selects 6 rows and 12 columns (two 6-column windows whose origins are
     *      winograd_output_tile_size columns apart), writing zero wherever the
     *      row/column legality flag is clear,
     *   3. splits every 16-bit word into its low and high byte, which yields four
     *      6x6 8-bit tiles ("lanes"),
     *   4. applies the 6x6 transform B^T * d * B to each lane,
     *   5. pushes lanes 0/1 to input_tile_stream1 and lanes 2/3 to
     *      input_tile_stream2, 32 bits per tile position.
     * After all loops two all-zero words are pushed to each stream.
     *
     * @param input_buffer                 banked input buffer [row bank][column bank][address]
     * @param input_tile_stream1           receives lanes 0 and 1 of every transformed tile
     * @param input_tile_stream2           receives lanes 2 and 3 of every transformed tile
     * @param start_row_idx                first output row handled by this call
     * @param output_width                 bound of the output-column loop
     * @param input_height                 rows at or beyond this index are zero-filled
     * @param input_width                  columns at or beyond this index are zero-filled
     * @param input_width_ceildiv_16       address stride added per depth tile
     * @param input_depth_aligned_8        input depth; the depth-tile loop runs input_depth_aligned_8/8 times
     * @param pad_size                     padding subtracted from row/column indices
     * @param weight_outdepth_load_number  how many times the whole window sweep is repeated per depth tile
     * @param window_size                  bound of both window-offset loops
     * @param winograd_output_tile_size    column distance between the two 6-column windows
     */
    void input_feed(
    	ap_uint<16> input_buffer[8][16][INPUT_BUFFER_DEPTH],
    	hls::stream< ap_uint<32*36> > &input_tile_stream1, 
    	hls::stream< ap_uint<32*36> > &input_tile_stream2,
    	ap_uint<16> start_row_idx,
    	ap_uint<16> output_width,
    	ap_uint<16> input_height,
    	ap_uint<16> input_width,
    	ap_uint<16> input_width_ceildiv_16,
    	ap_uint<16> input_depth_aligned_8,
    	ap_uint<16> pad_size,
    	ap_uint<16> weight_outdepth_load_number,
    	ap_uint<8> window_size,
    	ap_uint<3> winograd_output_tile_size
    )
    {
    	#pragma HLS array_partition variable=input_buffer dim=1 complete
    	#pragma HLS array_partition variable=input_buffer dim=2 complete

    	// Every column step emits two tiles (one per stream pair), hence two tile widths.
    	int tile_step =  winograd_output_tile_size*2;

    	for(ap_uint<16> depth_tile_idx=0,depth_base_address=0;depth_tile_idx<input_depth_aligned_8/8;depth_tile_idx++,depth_base_address+=input_width_ceildiv_16)
    	{
    		for(int weight_load_outdepth_cnt=0;weight_load_outdepth_cnt<weight_outdepth_load_number;weight_load_outdepth_cnt++)
    		{
    			for(int window_row_offset=0; window_row_offset<window_size; window_row_offset++)
    			{
    				for(int window_col_offset=0; window_col_offset<window_size; window_col_offset++)
    				{
    					ap_uint<1> row_legal_flag[6];
    					#pragma HLS array_partition variable=row_legal_flag complete

    					ap_uint<3> row_bank_idx[6];
    					#pragma HLS array_partition variable=row_bank_idx complete

    					ap_uint<1> row_address_offset[8];
    					#pragma HLS array_partition variable=row_address_offset complete

    					// Low 3 bits pick the first row bank, bit 3 picks the address half.
    					ap_uint<4> start_row_mod16 = start_row_idx-pad_size+window_row_offset;
    					ap_uint<3> breakpoint = start_row_mod16.range(2,0);

    					// Banks below the breakpoint have wrapped and use the opposite half.
    					for(int i=0;i<8;i++)
    					{
    						#pragma HLS unroll
    						if(i< breakpoint)
    							row_address_offset[i] = ~start_row_mod16[3];
    						else
    							row_address_offset[i] = start_row_mod16[3];
    					}

    					for(int i=0;i<6;i++)
    					{
    						#pragma HLS unroll
    						row_bank_idx[i] = start_row_idx+i-pad_size+window_row_offset;
    						row_legal_flag[i] = ( start_row_idx+i-pad_size >=0 && start_row_idx+i-pad_size < input_height);
    					}

    					ap_uint<10> input_col_idx[12];
    					#pragma HLS array_partition variable=input_col_idx complete
    					ap_int<16> input_head_col_idx=-pad_size;

    					// Entries 0..5 belong to the first window, 6..11 to the second one.
    					for(int i=0;i<6;i++)
    					{
    						#pragma HLS unroll
    						input_col_idx[i]=i-pad_size+window_col_offset;
    						input_col_idx[i+6]=i+winograd_output_tile_size-pad_size+window_col_offset;
    					}

    					for(ap_uint<10> out_col_idx=0;out_col_idx < output_width; out_col_idx+=tile_step)
    					{
    						ap_uint<6> input_buffer_address_by8[16];
    						#pragma HLS array_partition variable=input_buffer_address_by8 complete
    						ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH-4>  input_head_col_by16 = input_head_col_idx.range(10,4)+depth_base_address;
    						ap_uint<4>  input_buffer_address_break_point = input_head_col_idx.range(3,0);

    						ap_uint<1> col_legal_flag[12];
    						#pragma HLS array_partition variable=col_legal_flag complete

    						for(int i=0;i<12; i++)
    						{
    							#pragma HLS unroll
    							col_legal_flag[i]= ( input_col_idx[i] >=0 && input_col_idx[i] < input_width);
    						}

    						// Column banks below the break point have wrapped to the next address.
    						for(int i=0;i<16;i++)
    						{
    							if(i>=input_buffer_address_break_point)
    								input_buffer_address_by8[i] = input_head_col_by16;
    							else
    								input_buffer_address_by8[i] = input_head_col_by16+1;
    						}

    						DEPTH:for(ap_uint<4> depth_idx_in_tile=0;depth_idx_in_tile<8;depth_idx_in_tile++)
    						{
    							#pragma HLS pipeline
    							ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH> input_buffer_address[8][16];
    							#pragma HLS array_partition variable=input_buffer_address complete

    							ap_uint<16> input_buffer_val[8][16];
    							#pragma HLS array_partition variable=input_buffer_val complete

    							// Address = { row half, column word address, depth index in tile }.
    							for(int j=0;j<8;j++)
    							{
    								#pragma HLS unroll
    								for(int i=0;i<16;i++)
    								{
    									#pragma HLS unroll
    									input_buffer_address[j][i].range(INPUT_BUFFER_DEPTH_BITWIDTH-1,0) = (row_address_offset[j],( input_buffer_address_by8[i], (ap_uint<3>) depth_idx_in_tile.range(2,0) ));
    								}
    							}

    							// One read per bank.
    							for(int i=0;i<8;i++)
    							{
    								#pragma HLS unroll
    								for(int j=0;j<16;j++)
    								{
    									#pragma HLS unroll
    									input_buffer_val[i][j]=input_buffer[i][j][ (ap_uint<10>) input_buffer_address[i][j] ];   
    								}
    							}

    							// Row selection: 6 of the 8 row banks, zero where the row is illegal.
    							ap_uint<16> input_plane_tile_row[6][16];
    							#pragma HLS array_partition variable=input_plane_tile_row dim=1 complete
    							#pragma HLS array_partition variable=input_plane_tile_row dim=2 complete
    							for(int j=0;j<16;j++)
    							{
    								#pragma HLS unroll
    								for(int i=0;i<6;i++)
    								{
    									#pragma HLS unroll
    									if(row_legal_flag[i])
    									{
    										input_plane_tile_row[i][j]=input_buffer_val[row_bank_idx[i]][j];
    									}
    									else
    									{
    										input_plane_tile_row[i][j]=0;
    									}
    								}
    							}

    							// Column selection: 12 of the 16 column banks, zero where illegal.
    							ap_uint<16> input_plane_tile[6][12];
    							#pragma HLS array_partition variable=input_plane_tile complete
    							for(int i=0;i<6;i++)
    							{
    								#pragma HLS unroll
    								for(int j=0;j<12;j++)
    								{
    									#pragma HLS unroll
    									if(col_legal_flag[j])
    										input_plane_tile[i][j]=input_plane_tile_row[i][  (ap_uint<4>) input_col_idx[j].range(3,0) ];
    									else
    										input_plane_tile[i][j]=0;
    								}
    							}

    							// #if DEBUG_FILE_PRINT
    							// attach_streaming_content<0>(input_plane_tile,start_row_idx-pad_size,out_col_idx-pad_size,depth_tile_idx*8+depth_idx_in_tile,"input_stream_content.txt");
    							// #endif

    							// Lane j: j/2 selects the window (columns 0..5 or 6..11),
    							// j%2 selects the low or high byte of the 16-bit word.
    							ap_int<8> in[36][4];

    							for(int k=0;k<6;k++)
    							{
    								#pragma HLS unroll
    								for(int l=0;l<6;l++)
    								{
    									#pragma HLS unroll
    									for(ap_uint<3> j=0;j<4;j++)
    									{
    										#pragma HLS unroll
    										in[k*6+l][j].range(7,0)=  input_plane_tile[k][ j/2*6+l].range( j%2*8+7 ,j%2*8);
    									}
    								}
    							}

    							// First transform pass: combine the 6 columns of every row.
    							ap_int<16>  dB[6][6][4];
    							#pragma HLS array_partition variable=dB complete

    							for(int i=0;i<6;i++)
    							{
    								#pragma HLS unroll
    								for(int j=0;j<4;j++)
    								{
    									dB[i][0][j]=(((ap_int<16>)in[i*6+0][j]-(ap_int<16>)in[i*6+2][j])<<2) - (ap_int<16>)in[i*6+2][j] + (ap_int<16>)in[i*6+4][j];
    									dB[i][1][j]=(ap_int<16>)in[i*6+3][j]+(ap_int<16>)in[i*6+4][j]-(((ap_int<16>)in[i*6+1][j]+(ap_int<16>)in[i*6+2][j])<<2);
    									dB[i][2][j]=(((ap_int<16>)in[i*6+1][j]-(ap_int<16>)in[i*6+2][j])<<2)+(ap_int<16>)in[i*6+4][j]-(ap_int<16>)in[i*6+3][j];
    									dB[i][3][j]=(((ap_int<16>)in[i*6+3][j]-(ap_int<16>)in[i*6+1][j])<<1)+(ap_int<16>)in[i*6+4][j]-(ap_int<16>)in[i*6+2][j];
    									dB[i][4][j]=(((ap_int<16>)in[i*6+1][j]-(ap_int<16>)in[i*6+3][j])<<1)+(ap_int<16>)in[i*6+4][j]-(ap_int<16>)in[i*6+2][j];
    									dB[i][5][j]=(((ap_int<16>)in[i*6+1][j]-(ap_int<16>)in[i*6+3][j])<<2)+(ap_int<16>)in[i*6+5][j]-(ap_int<16>)in[i*6+3][j];	
    								}
    							}

    							// Second transform pass: same coefficients applied across the rows.
    							ap_int<16>  BTDB[6][6][4];
    							#pragma HLS array_partition variable=BTDB complete
    							for(int i=0;i<6;i++)
    							{
    								#pragma HLS unroll
    								for(int j=0;j<4;j++)
    								{
    									BTDB[0][i][j]=((dB[0][i][j]-dB[2][i][j])<<2) - dB[2][i][j] + dB[4][i][j];
    									BTDB[1][i][j]=dB[3][i][j]+dB[4][i][j]-((dB[1][i][j]+dB[2][i][j])<<2);
    									BTDB[2][i][j]=((dB[1][i][j]-dB[2][i][j])<<2)+dB[4][i][j]-dB[3][i][j];
    									BTDB[3][i][j]=((dB[3][i][j]-dB[1][i][j])<<1)+dB[4][i][j]-dB[2][i][j];
    									BTDB[4][i][j]=((dB[1][i][j]-dB[3][i][j])<<1)+dB[4][i][j]-dB[2][i][j];
    									BTDB[5][i][j]=((dB[1][i][j]-dB[3][i][j])<<2)+dB[5][i][j]-dB[3][i][j];
    								}
    							}

    							// Pack two 16-bit lanes per tile position into each stream word.
    							ap_uint<32*36> stream_out1;
    							ap_uint<32*36> stream_out2;
    							for(int i=0;i<6;i++)
    							{
    								#pragma HLS unroll
    								for(int j=0;j<6;j++)
    								{
    									#pragma HLS unroll
    									stream_out1.range( (i*6+j)*32+31,(i*6+j)*32) =(BTDB[i][j][1],BTDB[i][j][0]);
    									stream_out2.range( (i*6+j)*32+31,(i*6+j)*32) =(BTDB[i][j][3],BTDB[i][j][2]);
    								}
    							}

    							#if DEBUG_FILE_PRINT
    							attach_streaming_wino<0>(stream_out1,stream_out2,start_row_idx-pad_size,input_head_col_idx,depth_tile_idx*8+depth_idx_in_tile,"input_stream_wino.txt");
    							#endif

    							input_tile_stream1<<stream_out1;
    							input_tile_stream2<<stream_out2;
    						}// depth_idx_in_tile

    						// Advance both windows to the next pair of tiles.
    						for(int i=0;i<12; i++)
    						{
    							#pragma HLS unroll
    							input_col_idx[i]+=tile_step;
    						}
    						input_head_col_idx+=tile_step;
    					}// out_col_idx
    				}// window_col_offset
    			}// window_row_offset
    		}// weight_load_outdepth_cnt
    	}// depth_tile_idx

    	// Two trailing all-zero words per stream.
    	ap_uint<32*36> stream_out1=0;
    	ap_uint<32*36> stream_out2=0;
    	input_tile_stream1<<stream_out1;
    	input_tile_stream1<<stream_out1;
    	input_tile_stream2<<stream_out2;
    	input_tile_stream2<<stream_out2;
    }
    
    
    
    /**
     * Work-in-progress, parameterised variant of input_feed.
     *
     * Per pipelined iteration it reads one word from every bank of input_buffer,
     * selects WINO_DOMAIN_SIZE rows and, for each of the WINO_WIDTH tile columns,
     * WINO_DOMAIN_SIZE columns (zero-filling illegal positions), and then updates
     * the hand-flattened loop counters. The output streams are still commented
     * out, so the assembled tiles are only consumed by the DEBUG_FILE_PRINT hook.
     *
     * NOTE(review): pad_size is used below but is neither a parameter nor a local
     * of this function in the visible source - confirm where it is meant to come
     * from before enabling this function.
     *
     * @param input_buffer                               banked input buffer [row bank][column bank][address]
     * @param inheight                                   rows at or beyond this index are zero-filled
     * @param inwidth                                    columns at or beyond this index are zero-filled
     * @param weightbuffer_load_indepth_number           bound of the input-depth buffer-tile loop
     * @param weightbuffer_load_outdepth_number          bound of the output-depth buffer-tile loop
     * @param wino_output_tile_size                      column distance between neighbouring tile columns
     * @param input_buffer_feeding_loop_bound            trip count of the flattened inner loop
     * @param loop_wino_tile_col_reset_cycle             period after which the column indices restart
     * @param loop_indepth_minitile_baseidx_reset_cycle  period after which the mini-tile depth offset restarts
     * @param buffer_address_mid_buffertile_depth_step   address step per input-depth buffer tile
     * @param buffer_address_mid_minitile_depth_step     address step per input-depth mini tile
     * @param start_row_idx                              only forwarded to the debug hook
     * @param start_row_idx_minus_pad_size               first (possibly negative) input row of the tile
     */
    void input_feed_underconstruction(
    	ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],

    	// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1, 
    	// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
    	ap_uint<16> inheight,
    	ap_uint<16> inwidth,

    	ap_uint<16> weightbuffer_load_indepth_number,
    	ap_uint<16> weightbuffer_load_outdepth_number,
    	ap_uint<16> wino_output_tile_size,
    	ap_uint<32> input_buffer_feeding_loop_bound,
    	ap_uint<16> loop_wino_tile_col_reset_cycle,
    	ap_uint<16> loop_indepth_minitile_baseidx_reset_cycle,
    	ap_uint<10> buffer_address_mid_buffertile_depth_step,
    	ap_uint<10> buffer_address_mid_minitile_depth_step,
    	ap_uint<16> start_row_idx,
    	ap_int<16> start_row_idx_minus_pad_size
    	#if DEBUG_FILE_PRINT
    	,ConvDesc_t conv_desc
    	#endif
    )
    {
    	// row_selection preparation 
    	ap_uint<1> row_legal_flag[WINO_DOMAIN_SIZE];
    	#pragma HLS array_partition variable=row_legal_flag complete
    	ap_uint<1> row_address_offset[INBUFFER_HEIGHT];
    	#pragma HLS array_partition variable=row_address_offset complete
    	ap_uint<INBUFFER_HEIGHT_BITWIDTH> row_bank_idx[WINO_DOMAIN_SIZE];
    	#pragma HLS array_partition variable=row_bank_idx complete

    	ap_uint<16> wino_out_size_by_wino_width = wino_output_tile_size<<WINO_WIDTH_BITWIDTH;

    	ap_uint<INBUFFER_HEIGHT_BITWIDTH> row_breakpoint = start_row_idx_minus_pad_size.range(INBUFFER_HEIGHT_BITWIDTH-1,0);

    	// Loop bounds follow the array extents so every entry read later is written here.
    	for(int i=0;i<INBUFFER_HEIGHT;i++)
    	{
    	#pragma HLS unroll
    		if(i< row_breakpoint)
    			row_address_offset[i] = ~start_row_idx_minus_pad_size[INBUFFER_HEIGHT_BITWIDTH];
    		else
    			row_address_offset[i] = start_row_idx_minus_pad_size[INBUFFER_HEIGHT_BITWIDTH];
    	}
    	for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    	{
    	#pragma HLS unroll
    		row_bank_idx[i] = start_row_idx_minus_pad_size+i;
    		row_legal_flag[i] = ( start_row_idx_minus_pad_size+i >=0 && start_row_idx_minus_pad_size+i < inheight);
    	}

    	ap_int<10> input_col_idx[WINO_WIDTH][WINO_DOMAIN_SIZE];
    	#pragma HLS array_partition variable=input_col_idx dim=1 complete
    	#pragma HLS array_partition variable=input_col_idx dim=2 complete

    	// Column origin of every tile column within one group of WINO_WIDTH tiles.
    	ap_uint<16> wino_col_offset_constant[WINO_WIDTH];
    	#pragma HLS array_partition variable=wino_col_offset_constant complete

    	for(int i=0;i<WINO_WIDTH;i++)
    	{
    		#pragma HLS unroll
    		wino_col_offset_constant[i]=wino_output_tile_size*i;
    	}

    	ap_uint<INDEPTH_MINITILE_SIZE_BITWIDTH> loop_indepth_minitile_idx=0;

    	// Counters of the hand-flattened loop nest; they start at 1 and are compared
    	// against their reset cycle at the end of each iteration.
    	ap_uint<16> loop_wino_tile_col_cnt=1;
    	ap_uint<16> loop_indepth_minitile_baseidx_cnt =1;

    	ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_minitile_depth_offset=0;
    	ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_buffertile_depth_offset=0;

    	// NOTE(review): pad_size is not declared in this function (see header comment).
    	ap_int<16> input_head_col_idx=wino_col_offset_constant[0]-pad_size;

    	for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    	{
    		#pragma HLS unroll
    		for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    		{
    			#pragma HLS unroll
    			input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col];
    		}
    	}

    	for(ap_uint<16> outdepth_buffertile_idx=0;outdepth_buffertile_idx<weightbuffer_load_outdepth_number;outdepth_buffertile_idx++)
    	{
    		for(ap_uint<16> indepth_buffertile_baseidx=0;indepth_buffertile_baseidx<weightbuffer_load_indepth_number;indepth_buffertile_baseidx++)
    		{
    			buffer_address_mid_buffertile_depth_offset = indepth_buffertile_baseidx*buffer_address_mid_buffertile_depth_step;
    			buffer_address_mid_minitile_depth_offset = buffer_address_mid_buffertile_depth_offset;

    			for(int counter=0;counter<input_buffer_feeding_loop_bound;counter++ )
    			{
    				#pragma HLS pipeline II =1
    				// it is a semi flattened loop which does following
    				// for(int outdepth_minitile_baseidx=0;outdepth_minitile_baseidx<weightbuffer_outdepth_minitile_number; outdepth_minitile_baseidx ++)
    				// for( int indepth_minitile_baseidx=0;indepth_minitile_baseidx<weightbuffer_indepth_minitile_number; indepth_minitile_baseidx ++)
    				// for(int wino_tile_col_idx =1; wino_tile_col_idx < wino_tile_number_in_outwidth+1 ; wino_tile_col_idx++)
    				// for(ap_uint<3> indepth_minitile_idx=0; indepth_minitile_idx< INDEPTH_MINITILE_SIZE; indepth_minitile_idx++)
    				ap_uint<1> col_legal_flag[WINO_WIDTH][WINO_DOMAIN_SIZE];
    				#pragma HLS array_partition variable=col_legal_flag complete

    				for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    				{
    				#pragma HLS unroll
    					for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    					{
    						#pragma HLS unroll
    						col_legal_flag[wino_array_col][i]= ( input_col_idx[wino_array_col][i] >=0 && input_col_idx[wino_array_col][i] < inwidth);

    					}
    				}
    				ap_uint<INBUFFER_MID_ADDR_BITWIDTH> col_pix_address_offset[INBUFFER_WIDTH];

    				ap_uint<INBUFFER_WIDTH_BITWIDTH> col_breakpoint=input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH-1,0);

    				ap_uint<INBUFFER_MID_ADDR_BITWIDTH> input_head_col_address_offset;
    				input_head_col_address_offset= input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH+INBUFFER_MID_ADDR_BITWIDTH-1,INBUFFER_WIDTH_BITWIDTH) 
    				+ buffer_address_mid_minitile_depth_offset;

    				// Column banks below the breakpoint have wrapped to the next address.
    				for(int i=0;i<INBUFFER_WIDTH;i++)
    				{
    					if(i>=col_breakpoint)
    						col_pix_address_offset[i] = input_head_col_address_offset;
    					else
    						col_pix_address_offset[i] = input_head_col_address_offset+1;
    				}

    				// Address = { row half, column/depth word address, depth index in mini tile }.
    				ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH> buffer_address[INBUFFER_HEIGHT][INBUFFER_WIDTH];

    				for(int i=0;i<INBUFFER_HEIGHT; i++)
    				{
    					#pragma HLS unroll
    					for(int j=0;j<INBUFFER_WIDTH;j++)
    					{
    						#pragma HLS unroll
    						buffer_address[i][j]=(row_address_offset[i],col_pix_address_offset[j],loop_indepth_minitile_idx);

    					}
    				}

    				ap_uint<16> input_buffer_val[INBUFFER_HEIGHT][INBUFFER_WIDTH];
    				#pragma HLS array_partition variable=input_buffer_val complete

    				for(int i=0;i<INBUFFER_HEIGHT; i++)
    				{
    					#pragma HLS unroll
    					for(int j=0;j<INBUFFER_WIDTH;j++)
    					{
    						#pragma HLS unroll
    						input_buffer_val[i][j]=input_buffer[i][j][buffer_address[i][j]];
    					}
    				}

    				// Row selection, zero where the row is illegal.
    				ap_uint<16> input_plane_tile_row[WINO_DOMAIN_SIZE][INBUFFER_WIDTH];
    				#pragma HLS array_partition variable=input_plane_tile_row dim=1 complete
    				#pragma HLS array_partition variable=input_plane_tile_row dim=2 complete

    				for(int j=0;j<INBUFFER_WIDTH;j++)
    				{
    				#pragma HLS unroll
    					for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    					{
    					#pragma HLS unroll
    						if(row_legal_flag[i])
    						{
    							input_plane_tile_row[i][j]=input_buffer_val[row_bank_idx[i]][j];
    						}
    						else
    						{
    							input_plane_tile_row[i][j]=0;
    						}
    					}
    				}

    				// Column selection per tile column, zero where the column is illegal.
    				ap_uint<16> input_plane_tile[WINO_WIDTH][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE];
    				#pragma HLS array_partition variable=input_plane_tile complete
    				for(int i=0;i<WINO_WIDTH;i++)
    				{
    				#pragma HLS unroll
    					for(int j=0;j<WINO_DOMAIN_SIZE;j++)
    					{
    					#pragma HLS unroll
    						for(int k=0;k<WINO_DOMAIN_SIZE;k++)
    						{
    						#pragma HLS unroll
    							if(col_legal_flag[i][k])
    								input_plane_tile[i][j][k]=input_plane_tile_row[j][ (ap_uint<INBUFFER_WIDTH_BITWIDTH>) input_col_idx[i][k].range(INBUFFER_WIDTH_BITWIDTH-1,0) ];
    							else
    								input_plane_tile[i][j][k]=0;
    						}
    					}
    				}

    				#if DEBUG_FILE_PRINT
    					int indepth = buffer_address_mid_minitile_depth_offset/buffer_address_mid_minitile_depth_step*INDEPTH_MINITILE_SIZE
    									+loop_indepth_minitile_idx;
    					attach_streaming_content<WINO_WIDTH>(input_plane_tile, start_row_idx, input_head_col_idx+pad_size, indepth, "instream.txt");

    				#endif

    				// Depth offset: restart for a new pass, otherwise step once per column sweep.
    				if(loop_indepth_minitile_baseidx_cnt == loop_indepth_minitile_baseidx_reset_cycle)
    				{
    					buffer_address_mid_minitile_depth_offset = buffer_address_mid_buffertile_depth_offset;
    				}
    				else if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
    				{
    					buffer_address_mid_minitile_depth_offset += buffer_address_mid_minitile_depth_step;
    				}

    				// Column indices: restart at the end of a sweep, otherwise advance by one
    				// group of WINO_WIDTH tiles after the last depth index of a mini tile.
    				if(loop_wino_tile_col_cnt==loop_wino_tile_col_reset_cycle)
    				{
    					input_head_col_idx=wino_col_offset_constant[0]-pad_size;
    					for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    					{
    						#pragma HLS unroll
    						for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    						{
    							#pragma HLS unroll
    							input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col];
    						}
    					}
    				}
    				else if(loop_indepth_minitile_idx==INDEPTH_MINITILE_SIZE-1)
    				{

    					input_head_col_idx+=wino_out_size_by_wino_width;
    					for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
    					{
    						#pragma HLS unroll
    						for(int i=0;i<WINO_DOMAIN_SIZE;i++)
    						{
    							#pragma HLS unroll
    							input_col_idx[wino_array_col][i]+=wino_out_size_by_wino_width;
    						}
    					}
    				}

    				if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
    				{
    					loop_wino_tile_col_cnt=1;
    				}
    				else
    				{
    					loop_wino_tile_col_cnt++;
    				}

    				if(loop_indepth_minitile_baseidx_cnt == loop_indepth_minitile_baseidx_reset_cycle)
    				{
    					loop_indepth_minitile_baseidx_cnt=1;
    				}
    				else
    				{
    					loop_indepth_minitile_baseidx_cnt++;
    				}
    				// Wraps on its own bit width.
    				loop_indepth_minitile_idx++;

    			}
    		}
    	}
    }
    
    //template<int dummy>
    
    /**
     * Copies load_number 128-bit words from one DDR port into the weight buffers
     * fed by that port.
     *
     * Every 128-bit word is split into two 64-bit halves. Five consecutive words
     * fill the nine slots of the second buffer dimension at one address (the upper
     * half of the fifth word is not stored); the address then advances. After
     * port_load_number words the address restarts at 0 and the next buffer of the
     * first dimension is selected.
     *
     * @param DDR_interface     source of 128-bit words, read at indices 0..load_number-1
     * @param weight_buff       destination buffers [buffer][slot 0..8][address]
     * @param load_number       total number of 128-bit words to read
     * @param port_load_number  number of words written into one buffer before switching
     * @param pingpong          becomes the top bit of the 10-bit write address
     */
    void load_weight_ddr_one_port(
    	ap_uint<128>* DDR_interface,
    	ap_uint<64> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH],
    	int load_number,
    	int port_load_number,
    	ap_uint<1> pingpong)
    {
    	#pragma HLS array_partition variable = weight_buff dim=1 complete
    	#pragma HLS array_partition variable = weight_buff dim=2 complete

    	// Position (0..4) within the current group of five words.
    	ap_uint<4> counter=0;

    	// Words written so far into the current buffer.
    	ap_uint<9> port_load_cnt=0;

    	ap_uint<9> buffer_address_offset=0;
    	// Must start at 0: it is used as an index before it is ever incremented.
    	ap_uint<2> buffer_idx=0;

    	for(int address = 0; address<load_number; address++)
    	{
    		#pragma HLS pipeline

    		ap_uint<128> temp128 = DDR_interface[address];
    		ap_uint<64> temp64[2];
    		#pragma HLS array_partition  variable = temp64 complete

    		for(int i=0;i<2;i++)
    		{
    			#pragma HLS unroll
    			temp64[i]=temp128.range(i*64+63,i*64);
    		}

    		// Write address = { pingpong, offset }.
    		ap_uint<10> buffer_address = (pingpong,buffer_address_offset);

    		// Slot indices are kept as literals so each write targets a fixed partition.
    		if(counter==0)
    		{
    			weight_buff[buffer_idx][0][buffer_address]=temp64[0];
    			weight_buff[buffer_idx][1][buffer_address]=temp64[1];
    		}
    		else if(counter == 1)
    		{
    			weight_buff[buffer_idx][2][buffer_address]=temp64[0];
    			weight_buff[buffer_idx][3][buffer_address]=temp64[1];
    		}
    		else if(counter == 2)
    		{
    			weight_buff[buffer_idx][4][buffer_address]=temp64[0];
    			weight_buff[buffer_idx][5][buffer_address]=temp64[1];
    		}
    		else if(counter == 3)
    		{
    			weight_buff[buffer_idx][6][buffer_address]=temp64[0];
    			weight_buff[buffer_idx][7][buffer_address]=temp64[1];
    		}
    		else if(counter == 4)
    		{
    			weight_buff[buffer_idx][8][buffer_address]=temp64[0];
    		}

    		// Address: restart when the buffer is full, else step after the fifth word.
    		if(port_load_cnt==port_load_number-1)
    		{
    			buffer_address_offset =0;
    		}
    		else if(counter==4 )
    		{
    			buffer_address_offset++;
    		}

    		// Buffer selection.
    		if(port_load_cnt==port_load_number-1)
    		{
    			buffer_idx++;
    			port_load_cnt=0;
    		}
    		else
    		{
    			port_load_cnt++;
    		}

    		// Position within the group of five words.
    		if(counter==4)
    		{
    			counter=0;
    		}
    		else
    		{
    			counter++;
    		}
    	}
    }
    
    
    //template<int dummy>
    
    /**
     * Loads one block of weights from each of the four DDR ports into the matching
     * slice of weight_buff by calling load_weight_ddr_one_port once per port.
     * Returns without touching anything when skip_flag is set.
     *
     * @param weight_DDR0..3    base pointers of the four DDR ports
     * @param weight_buff       destination; slice [p] is filled from port p
     * @param DDR_offset        word offset added to every port pointer
     * @param load_number       forwarded to load_weight_ddr_one_port
     * @param port_load_number  forwarded to load_weight_ddr_one_port
     * @param skip_flag         when set, the whole load is skipped
     * @param pingpong          forwarded to load_weight_ddr_one_port
     */
    void load_weight_ddr(
    	ap_uint<128>* weight_DDR0,
    	ap_uint<128>* weight_DDR1,
    	ap_uint<128>* weight_DDR2,
    	ap_uint<128>* weight_DDR3,
    	ap_uint<64> weight_buff[4][WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH],
    	int DDR_offset,
    	int load_number,
    	int port_load_number,
    	ap_uint<1> skip_flag,
    	ap_uint<1> pingpong)
    {
    	#pragma HLS array_partition variable = weight_buff dim=1 complete
    	#pragma HLS array_partition variable = weight_buff dim=2 complete

    	if(skip_flag)
    		return;

    	// Port 0
    	load_weight_ddr_one_port(
    		weight_DDR0+DDR_offset,
    		weight_buff[0],
    		load_number,
    		port_load_number,
    		pingpong);

    	// Port 1
    	load_weight_ddr_one_port(
    		weight_DDR1+DDR_offset,
    		weight_buff[1],
    		load_number,
    		port_load_number,
    		pingpong);

    	// Port 2
    	load_weight_ddr_one_port(
    		weight_DDR2+DDR_offset,
    		weight_buff[2],
    		load_number,
    		port_load_number,
    		pingpong);

    	// Port 3
    	load_weight_ddr_one_port(
    		weight_DDR3+DDR_offset,
    		weight_buff[3],
    		load_number,
    		port_load_number,
    		pingpong);
    }
    
    
    //template<int dummy>
    
    /**
     * Replays the contents of one port's weight buffers onto its output streams.
     *
     * For each of row_repeat_time passes, addresses 0 .. weight_feed_total_size/2-1
     * of the half selected by pingpong are read. At every address the nine 64-bit
     * slots of a buffer are concatenated into one 576-bit word (slot 0 in the
     * lowest bits) and written to that buffer's stream. Streams exist only when
     * WEIGHT_FEED_NUMBER_PER_PORT == 2.
     *
     * @param weight_buff             source buffers [buffer][slot 0..8][address]
     * @param weight_stream0          receives the words built from buffer 0
     * @param weight_stream1          receives the words built from buffer 1
     * @param row_repeat_time         number of times the address range is replayed
     * @param weight_feed_total_size  twice the number of addresses read per pass
     * @param pingpong                becomes the top bit of the 10-bit read address
     */
    void weight_stream(
    	ap_uint<64> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH],
    	#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    	hls::stream<ap_uint<16*36> > & weight_stream0,
    	hls::stream<ap_uint<16*36> > & weight_stream1,
    	#endif
    	int row_repeat_time, // output_row ceil_div out tiles
    	int weight_feed_total_size,
    	ap_int<1> pingpong
    )
    {
    #pragma HLS array_partition variable = weight_buff dim=1 complete
    #pragma HLS array_partition variable = weight_buff dim=2 complete

    	int weight_feed_total_size_by2 = weight_feed_total_size/2;

    	OUTSIDE:for(int i=0;i<row_repeat_time;i++)
    	{
    		INSIDE:for(ap_uint<9> buffer_addr_offset=0; buffer_addr_offset<weight_feed_total_size_by2; buffer_addr_offset++)
    		{
    			#pragma HLS pipeline

    			ap_uint<64> temp18[WEIGHT_FEED_NUMBER_PER_PORT][9];
    			#pragma HLS array_partition variable = temp18 complete

    			// Read address = { pingpong, offset }.
    			ap_uint<10> buffer_addr=(pingpong,buffer_addr_offset.range(8,0));

    			for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
    			{
    			#pragma HLS unroll
    				for(int j18=0;j18<9;j18++)
    				{
    				#pragma HLS unroll
    					temp18[buffer_idx][j18]=weight_buff[buffer_idx][j18][buffer_addr];
    				}
    			}

    			// 9 slots x 64 bits = 576 bits = 16*36.
    			ap_uint<16*36> temp16x36[WEIGHT_FEED_NUMBER_PER_PORT];
    			#pragma HLS array_partition variable = temp16x36 complete

    			for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
    			{
    				#pragma HLS unroll
    				for(int j18=0;j18<9;j18++)
    				{
    				#pragma HLS unroll
    					temp16x36[buffer_idx].range(j18*64+63,j18*64)=temp18[buffer_idx][j18];
    				}
    			}
    			#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    			weight_stream0<<temp16x36[0];
    			weight_stream1<<temp16x36[1];
    			#endif
    		}
    	}
    }
    
    
    
    //template<int dummy>
    
    void weight_feed(
    	ap_uint<128>* weight_DDR0,
    	ap_uint<128>* weight_DDR1,
    	ap_uint<128>* weight_DDR2,
    	ap_uint<128>* weight_DDR3,
    	#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    	hls::stream<ap_uint<16*36> > & weight_stream0_0,
    	hls::stream<ap_uint<16*36> > & weight_stream0_1,
    
    	hls::stream<ap_uint<16*36> > & weight_stream1_0,
    
    	hls::stream<ap_uint<16*36> > & weight_stream1_1,
    
    	hls::stream<ap_uint<16*36> > & weight_stream2_0,
    
    	hls::stream<ap_uint<16*36> > & weight_stream2_1,
    
    	hls::stream<ap_uint<16*36> > & weight_stream3_0,
    
    	hls::stream<ap_uint<16*36> > & weight_stream3_1,
    	#endif
    
    xliu79's avatar
    xliu79 committed
    	ap_uint<32> weight_total_load_number,
    
    	ap_uint<16> weight_total_feed_size,
    	ap_uint<16> ddr_load_length,
    	ap_uint<16> ddr_load_length_per_feed,
    	ap_uint<16> row_repeat_times,
    	ap_uint<16> first_flag,
    	ap_uint<16> last_flag
    )
    {
    	static ap_uint<16> DDR_offset;
    	static ap_uint<16> DDR_load_cnt;
    	static ap_uint<1> pingpong;
    
    	static ap_uint<64> weight_buff[4][WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH];
    
    	if(first_flag)
    		DDR_offset=ddr_load_length;
    		DDR_load_cnt=1;
    		pingpong = 0;
    
    
    xliu79's avatar
    xliu79 committed
    	load_weight_ddr(
    
    	weight_DDR0,
    	weight_DDR1,
    	weight_DDR2,
    	weight_DDR3,
    	weight_buff,
    	0,
    	ddr_load_length,
    	ddr_load_length_per_feed,
    	~first_flag,
    	pingpong);
    
    	for(int cnt=0;cnt<weight_total_load_number;cnt++)
    	{
    
    		pingpong = ~pingpong;
    
    
    xliu79's avatar
    xliu79 committed
    		load_weight_ddr(
    
    		weight_DDR0,
    		weight_DDR1,
    		weight_DDR2,
    		weight_DDR3,
    		weight_buff,
    		DDR_offset,
    		ddr_load_length,
    		ddr_load_length_per_feed,
    		last_flag & (DDR_load_cnt==0) ,
    		pingpong);
    
    
    xliu79's avatar
    xliu79 committed
    		weight_stream(
    
    		weight_buff[0],
    		#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    		weight_stream0_0,
    		weight_stream0_1,
    		#endif
    		row_repeat_times, // output_row ceil_div out tiles
    		weight_total_feed_size,
    		~pingpong);
    
    
    xliu79's avatar
    xliu79 committed
    		weight_stream(
    
    		weight_buff[1],
    		#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    		weight_stream1_0,
    		weight_stream1_1,
    		#endif
    		row_repeat_times, // output_row ceil_div out tiles
    		weight_total_feed_size,
    		~pingpong);
    
    
    xliu79's avatar
    xliu79 committed
    		weight_stream(
    
    		weight_buff[2],
    		#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    		weight_stream2_0,
    		weight_stream2_1,
    		#endif
    		row_repeat_times, // output_row ceil_div out tiles
    		weight_total_feed_size,
    		~pingpong);
    
    
    xliu79's avatar
    xliu79 committed
    		weight_stream(
    
    		weight_buff[3],
    		#if WEIGHT_FEED_NUMBER_PER_PORT == 2
    		weight_stream3_0,
    		weight_stream3_1,
    		#endif
    		row_repeat_times, // output_row ceil_div out tiles
    		weight_total_feed_size,
    		~pingpong);
    
    		if(DDR_load_cnt == weight_total_load_number-1)
    		{
    			DDR_load_cnt = 0;
    			DDR_offset = 0;
    		}
    		else
    		{
    			DDR_load_cnt+=1;
    			DDR_offset+=ddr_load_length;
    		}
    	}	
    }
    
    
    #endif