#ifndef _WINO_BUFFER_HPP_ #define _WINO_BUFFER_HPP_ #include "wino_macro.h" #include "../software/param.h" #include <ap_int.h> #include <hls_stream.h> void input_feed( ap_uint<16> input_buffer[8][16][INPUT_BUFFER_DEPTH], hls::stream< ap_uint<32*36> > &input_tile_stream1, hls::stream< ap_uint<32*36> > &input_tile_stream2, ap_uint<16> start_row_idx, ap_uint<16> output_width, ap_uint<16> input_height, ap_uint<16> input_width, ap_uint<16> input_width_ceildiv_16, ap_uint<16> input_depth_aligned_8, ap_uint<16> pad_size, ap_uint<16> weight_outdepth_load_number, ap_uint<8> window_size, ap_uint<3> winograd_output_tile_size ) { #pragma HLS array_partition variable=input_buffer dim=1 complete #pragma HLS array_partition variable=input_buffer dim=2 complete int tile_step = winograd_output_tile_size*2; for(ap_uint<16> depth_tile_idx=0,depth_base_address=0;depth_tile_idx<input_depth_aligned_8/8;depth_tile_idx++,depth_base_address+=input_width_ceildiv_16) { for(int weight_load_outdepth_cnt=0;weight_load_outdepth_cnt<weight_outdepth_load_number;weight_load_outdepth_cnt++) { int start_col_idx =0; int depth_address =0; int row_tile_offset =0; for(int window_row_offset=0; window_row_offset<window_size; window_row_offset++) { for(int window_col_offset=0; window_col_offset<window_size; window_col_offset++) { ap_uint<1> row_legal_flag[6]; #pragma HLS array_partition variable=row_legal_flag complete ap_uint<3> row_bank_idx[6]; #pragma HLS array_partition variable=row_bank_idx complete ap_uint<1> row_addres_offset[8]; #pragma HLS array_partition variable=row_addres_offset complete ap_uint<4> start_row_mod16 = start_row_idx-pad_size+window_row_offset; ap_uint<3> breakpoint = start_row_mod16.range(2,0); for(int i=0;i<8;i++) { #pragma HLS unroll if(i< breakpoint) row_addres_offset[i] = ~start_row_mod16[3]; else row_addres_offset[i] = start_row_mod16[3]; } for(int i=0;i<6;i++) { #pragma HLS unroll row_bank_idx[i] = start_row_idx+i-pad_size+window_row_offset; row_legal_flag[i] = ( start_row_idx+i-pad_size >=0 && start_row_idx+i-pad_size < input_height); } ap_uint<10> input_col_idx[12]; #pragma HLS array_partition variable=input_col_idx complete ap_int<16> input_head_col_idx=-pad_size; for(int i=0;i<6;i++) { #pragma HLS unroll input_col_idx[i]=i-pad_size+window_col_offset; input_col_idx[i+6]=i+winograd_output_tile_size-pad_size+window_col_offset; } for(ap_uint<10> out_col_idx=0;out_col_idx < output_width; out_col_idx+=tile_step) { ap_uint<6> input_buffer_address_by8[16]; #pragma HLS array_partition variable=input_buffer_address_by8 complete ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH-4> input_head_col_by16 = input_head_col_idx.range(10,4)+depth_base_address; ap_uint<4> input_buffer_address_break_point = input_head_col_idx.range(3,0); ap_uint<1> col_legal_flag[12]; #pragma HLS array_partition variable=col_legal_flag complete for(int i=0;i<12; i++) { #pragma HLS unroll col_legal_flag[i]= ( input_col_idx[i] >=0 && input_col_idx[i] < input_width); } for(int i=0;i<16;i++) { if(i>=input_buffer_address_break_point) input_buffer_address_by8[i] = input_head_col_by16; else input_buffer_address_by8[i] = input_head_col_by16+1; } DEPTH:for(ap_uint<4> depth_idx_in_tile=0;depth_idx_in_tile<8;depth_idx_in_tile++) { #pragma HLS pipeline ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH> input_buffer_address[8][16]; #pragma HLS array_partition variable=input_buffer_address complete #pragma HLS array_partition variable=col_legal_flag complete ap_uint<16> input_buffer_val[8][16]; #pragma HLS array_partition variable=input_buffer_val complete for(int j=0;j<8;j++) { #pragma HLS unroll for(int i=0;i<16;i++) { #pragma HLS unroll input_buffer_address[j][i].range(INPUT_BUFFER_DEPTH_BITWIDTH-1,0) = (row_addres_offset[j],( input_buffer_address_by8[i], (ap_uint<3>) depth_idx_in_tile.range(2,0) )); } } for(int i=0;i<8;i++) { #pragma HLS unroll for(int j=0;j<16;j++) { #pragma HLS unroll input_buffer_val[i][j]=input_buffer[i][j][ (ap_uint<10>) input_buffer_address[i][j] ]; } } // for(int i=0;i<8;i++) // { // for(int j=0;j<16;j++) // { // } // } ap_uint<16> input_plane_tile_row[6][16]; #pragma HLS array_partition variable=input_plane_tile_row dim=1 complete #pragma HLS array_partition variable=input_plane_tile_row dim=2 complete for(int j=0;j<16;j++) { #pragma HLS unroll for(int i=0;i<6;i++) { #pragma HLS unroll if(row_legal_flag[i]) { input_plane_tile_row[i][j]=input_buffer_val[row_bank_idx[i]][j]; } else { input_plane_tile_row[i][j]=0; } } } ap_uint<16> input_plane_tile[6][12]; #pragma HLS array_partition variable=input_plane_tile complete for(int i=0;i<6;i++) { #pragma HLS unroll for(int j=0;j<12;j++) { #pragma HLS unroll if(col_legal_flag[j]) input_plane_tile[i][j]=input_plane_tile_row[i][ (ap_uint<4>) input_col_idx[j].range(3,0) ]; else input_plane_tile[i][j]=0; } } // #if DEBUG_FILE_PRINT // attach_streaming_content<0>(input_plane_tile,start_row_idx-pad_size,out_col_idx-pad_size,depth_tile_idx*8+depth_idx_in_tile,"input_stream_content.txt"); // #endif ap_int<8> in[36][4]; for(int k=0;k<6;k++) { #pragma HLS unroll for(int l=0;l<6;l++) { #pragma HLS unroll for(ap_uint<3> j=0;j<4;j++) { #pragma HLS unroll in[k*6+l][j].range(7,0)= input_plane_tile[k][ j/2*6+l].range( j%2*8+7 ,j%2*8); } } } ap_int<16> dB[6][6][4]; #pragma HLS array_partition variable=dB complete for(int i=0;i<6;i++) { #pragma HLS unroll for(int j=0;j<4;j++) { dB[i][0][j]=(((ap_int<16>)in[i*6+0][j]-(ap_int<16>)in[i*6+2][j])<<2) - (ap_int<16>)in[i*6+2][j] + (ap_int<16>)in[i*6+4][j]; dB[i][1][j]=(ap_int<16>)in[i*6+3][j]+(ap_int<16>)in[i*6+4][j]-(((ap_int<16>)in[i*6+1][j]+(ap_int<16>)in[i*6+2][j])<<2); dB[i][2][j]=(((ap_int<16>)in[i*6+1][j]-(ap_int<16>)in[i*6+2][j])<<2)+(ap_int<16>)in[i*6+4][j]-(ap_int<16>)in[i*6+3][j]; dB[i][3][j]=(((ap_int<16>)in[i*6+3][j]-(ap_int<16>)in[i*6+1][j])<<1)+(ap_int<16>)in[i*6+4][j]-(ap_int<16>)in[i*6+2][j]; dB[i][4][j]=(((ap_int<16>)in[i*6+1][j]-(ap_int<16>)in[i*6+3][j])<<1)+(ap_int<16>)in[i*6+4][j]-(ap_int<16>)in[i*6+2][j]; dB[i][5][j]=(((ap_int<16>)in[i*6+1][j]-(ap_int<16>)in[i*6+3][j])<<2)+(ap_int<16>)in[i*6+5][j]-(ap_int<16>)in[i*6+3][j]; } } ap_int<16> BTDB[6][6][4]; #pragma HLS array_partition variable=BTDB complete for(int i=0;i<6;i++) { #pragma HLS unroll for(int j=0;j<4;j++) { BTDB[0][i][j]=((dB[0][i][j]-dB[2][i][j])<<2) - dB[2][i][j] + dB[4][i][j]; BTDB[1][i][j]=dB[3][i][j]+dB[4][i][j]-((dB[1][i][j]+dB[2][i][j])<<2); BTDB[2][i][j]=((dB[1][i][j]-dB[2][i][j])<<2)+dB[4][i][j]-dB[3][i][j]; BTDB[3][i][j]=((dB[3][i][j]-dB[1][i][j])<<1)+dB[4][i][j]-dB[2][i][j]; BTDB[4][i][j]=((dB[1][i][j]-dB[3][i][j])<<1)+dB[4][i][j]-dB[2][i][j]; BTDB[5][i][j]=((dB[1][i][j]-dB[3][i][j])<<2)+dB[5][i][j]-dB[3][i][j]; } } ap_uint<32*36> stream_out1; ap_uint<32*36> stream_out2; for(int i=0;i<6;i++) { #pragma HLS unroll for(int j=0;j<6;j++) { #pragma HLS unroll stream_out1.range( (i*6+j)*32+31,(i*6+j)*32) =(BTDB[i][j][1],BTDB[i][j][0]); stream_out2.range( (i*6+j)*32+31,(i*6+j)*32) =(BTDB[i][j][3],BTDB[i][j][2]); } } #if DEBUG_FILE_PRINT attach_streaming_wino<0>(stream_out1,stream_out2,start_row_idx-pad_size,input_head_col_idx,depth_tile_idx*8+depth_idx_in_tile,"input_stream_wino.txt"); #endif input_tile_stream1<<stream_out1; input_tile_stream2<<stream_out2; } for(int i=0;i<12; i++) { #pragma HLS unroll input_col_idx[i]+=tile_step; } input_head_col_idx+=tile_step; } }//depth_tile1 }// window_col } }//window_row ap_uint<32*36> stream_out1=0; ap_uint<32*36> stream_out2=0; input_tile_stream1<<stream_out1; input_tile_stream1<<stream_out1; input_tile_stream2<<stream_out2; input_tile_stream2<<stream_out2; } void input_feed_underconstruction( ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH], // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1, // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2, ap_uint<16> inheight, ap_uint<16> inwidth, ap_uint<16> pad_size, ap_uint<16> weightbuffer_load_indepth_number, ap_uint<16> weightbuffer_load_outdepth_number, ap_uint<16> wino_output_tile_size, ap_uint<32> input_buffer_feeding_loop_bound, ap_uint<16> loop_wino_tile_col_reset_cycle, ap_uint<16> loop_indepth_minitile_baseidx_reset_cycle, ap_uint<10> buffer_address_mid_buffertile_depth_step, ap_uint<10> buffer_address_mid_minitile_depth_step, ap_uint<16> start_row_idx, ap_int<16> start_row_idx_minus_pad_size #if DEBUG_FILE_PRINT ,ConvDesc_t conv_desc #endif ) { // row_selection preparation ap_uint<1> row_legal_flag[WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable=row_legal_flag complete ap_uint<1> row_address_offset[INBUFFER_HEIGHT]; #pragma HLS array_partition variable=row_address_offset complete ap_uint<INBUFFER_HEIGHT_BITWIDTH> row_bank_idx[WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable=row_bank_idx complete ap_uint<16> wino_out_size_by_wino_width = wino_output_tile_size<<WINO_WIDTH_BITWIDTH; ap_uint<INBUFFER_HEIGHT_BITWIDTH> row_breakpoint = start_row_idx_minus_pad_size.range(INBUFFER_HEIGHT_BITWIDTH-1,0); for(int i=0;i<8;i++) { #pragma HLS unroll if(i< row_breakpoint) row_address_offset[i] = ~start_row_idx_minus_pad_size[INBUFFER_HEIGHT_BITWIDTH]; else row_address_offset[i] = start_row_idx_minus_pad_size[INBUFFER_HEIGHT_BITWIDTH]; } for(int i=0;i<6;i++) { #pragma HLS unroll row_bank_idx[i] = start_row_idx_minus_pad_size+i; row_legal_flag[i] = ( start_row_idx_minus_pad_size+i >=0 && start_row_idx_minus_pad_size+i < inheight); } ap_int<10> input_col_idx[WINO_WIDTH][WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable=input_col_idx dim=1 complete #pragma HLS array_partition variable=input_col_idx dim=2 complete ap_uint<16> wino_col_offset_constant[WINO_WIDTH]; #pragma HLS array_partition variable=wino_col_offset_constant complete for(int i=0;i<WINO_WIDTH;i++) { #pragma HLS unroll wino_col_offset_constant[i]=wino_output_tile_size*i; } ap_uint<16> first_col_idx=0; ap_uint<INDEPTH_MINITILE_SIZE_BITWIDTH> loop_indepth_minitile_idx=0; ap_uint<16> loop_wino_tile_col_cnt=1; ap_uint<16> loop_indepth_minitile_baseidx_cnt =1; ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_minitile_depth_offset=0; ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_buffertile_depth_offset=0; ap_int<16> input_head_col_idx=wino_col_offset_constant[0]-pad_size; for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col]; } } for(ap_uint<16> outdepth_buffertile_idx=0;outdepth_buffertile_idx<weightbuffer_load_outdepth_number;outdepth_buffertile_idx++) { for(ap_uint<16> indepth_buffertile_baseidx=0;indepth_buffertile_baseidx<weightbuffer_load_indepth_number;indepth_buffertile_baseidx++) { buffer_address_mid_buffertile_depth_offset = indepth_buffertile_baseidx*buffer_address_mid_buffertile_depth_step; buffer_address_mid_minitile_depth_offset = buffer_address_mid_buffertile_depth_offset; for(int counter=0;counter<input_buffer_feeding_loop_bound;counter++ ) { #pragma HLS pipeline II =1 // it is a semi flattened loop which does following // for(int outdepth_minitile_baseidx=0;outdepth_minitile_baseidx<weightbuffer_outdepth_minitile_number; outdepth_minitile_baseidx ++) // for( int indepth_minitile_baseidx=0;indepth_minitile_baseidx<weightbuffer_indepth_minitile_number; indepth_minitile_baseidx ++) // for(int wino_tile_col_idx =1; wino_tile_col_idx < wino_tile_number_in_outwidth+1 ; wino_tile_col_idx++) // for(ap_uint<3> indepth_minitile_idx=0; indepth_minitile_idx< INDEPTH_MINITILE_SIZE; indepth_minitile_idx++) ap_uint<1> col_legal_flag[WINO_WIDTH][WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable=col_legal_flag complete for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll col_legal_flag[wino_array_col][i]= ( input_col_idx[wino_array_col][i] >=0 && input_col_idx[wino_array_col][i] < inwidth); } } ap_uint<INBUFFER_MID_ADDR_BITWIDTH> col_pix_address_offset[INBUFFER_WIDTH]; ap_uint<INBUFFER_WIDTH_BITWIDTH> col_breakpoint=input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH-1,0); ap_uint<INBUFFER_MID_ADDR_BITWIDTH> input_head_col_address_offset; input_head_col_address_offset= input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH+INBUFFER_MID_ADDR_BITWIDTH-1,INBUFFER_WIDTH_BITWIDTH) + buffer_address_mid_minitile_depth_offset; for(int i=0;i<INBUFFER_WIDTH;i++) { if(i>=col_breakpoint) col_pix_address_offset[i] = input_head_col_address_offset; else col_pix_address_offset[i] = input_head_col_address_offset+1; } ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH> buffer_address[INBUFFER_HEIGHT][INBUFFER_WIDTH]; for(int i=0;i<INBUFFER_HEIGHT; i++) { #pragma HLS unroll for(int j=0;j<INBUFFER_WIDTH;j++) { #pragma HLS unroll buffer_address[i][j]=(row_address_offset[i],col_pix_address_offset[j],loop_indepth_minitile_idx); } } ap_uint<16> input_buffer_val[INBUFFER_HEIGHT][INBUFFER_WIDTH]; #pragma HLS array_partition variable=input_buffer_val complete for(int i=0;i<INBUFFER_HEIGHT; i++) { #pragma HLS unroll for(int j=0;j<INBUFFER_WIDTH;j++) { #pragma HLS unroll input_buffer_val[i][j]=input_buffer[i][j][buffer_address[i][j]]; } } ap_uint<16> input_plane_tile_row[WINO_DOMAIN_SIZE][INBUFFER_WIDTH]; #pragma HLS array_partition variable=input_plane_tile_row dim=1 complete #pragma HLS array_partition variable=input_plane_tile_row dim=2 complete for(int j=0;j<INBUFFER_WIDTH;j++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll if(row_legal_flag[i]) { input_plane_tile_row[i][j]=input_buffer_val[row_bank_idx[i]][j]; } else { input_plane_tile_row[i][j]=0; } } } ap_uint<16> input_plane_tile[WINO_WIDTH][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable=input_plane_tile complete for(int i=0;i<WINO_WIDTH;i++) { #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++) { #pragma HLS unroll for(int k=0;k<WINO_DOMAIN_SIZE;k++) { #pragma HLS unroll if(col_legal_flag[i][k]) input_plane_tile[i][j][k]=input_plane_tile_row[j][ (ap_uint<INBUFFER_WIDTH_BITWIDTH>) input_col_idx[i][k].range(INBUFFER_WIDTH_BITWIDTH-1,0) ]; else input_plane_tile[i][j][k]=0; } } } #if DEBUG_FILE_PRINT int indepth = buffer_address_mid_minitile_depth_offset/buffer_address_mid_minitile_depth_step*INDEPTH_MINITILE_SIZE +loop_indepth_minitile_idx; attach_streaming_content<WINO_WIDTH>(input_plane_tile, start_row_idx, input_head_col_idx+pad_size, indepth, "instream.txt"); #endif if(loop_indepth_minitile_baseidx_cnt == loop_indepth_minitile_baseidx_reset_cycle) { buffer_address_mid_minitile_depth_offset = buffer_address_mid_buffertile_depth_offset; } else if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle) { buffer_address_mid_minitile_depth_offset += buffer_address_mid_minitile_depth_step; } if(loop_wino_tile_col_cnt==loop_wino_tile_col_reset_cycle) { input_head_col_idx=wino_col_offset_constant[0]-pad_size; for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col]; } } } else if(loop_indepth_minitile_idx==INDEPTH_MINITILE_SIZE-1) { input_head_col_idx+=wino_out_size_by_wino_width; for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll input_col_idx[wino_array_col][i]+=wino_out_size_by_wino_width; } } } if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle) { loop_wino_tile_col_cnt=1; } else { loop_wino_tile_col_cnt++; } if(loop_indepth_minitile_baseidx_cnt == loop_indepth_minitile_baseidx_reset_cycle) { loop_indepth_minitile_baseidx_cnt=1; } else { loop_indepth_minitile_baseidx_cnt++; } loop_indepth_minitile_idx++; } } } } //template<int dummy> void load_weight_ddr_one_port( ap_uint<128>* DDR_interface, ap_uint<64> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH], int load_number, int port_load_number, ap_uint<1> pingpong) { #pragma HLS array_partition variable = weight_buff dim=1 complete #pragma HLS array_partition variable = weight_buff dim=2 complete ap_uint<4> counter=0; ap_uint<9> port_load_cnt=0; ap_uint<9> buffer_address_offset=0; ap_uint<2> buffer_idx; for(int address = 0; address<load_number; address++) { #pragma HLS pipeline ap_uint<128> temp128 = DDR_interface[address]; ap_uint<64> temp64[2]; #pragma HLS array_partition variable = temp64 complete for(int i=0;i<2;i++) { #pragma HLS unroll temp64[i]=temp128.range(i*64+63,i*64); } ap_uint<10> buffer_address = (pingpong,buffer_address_offset); if(counter==0) { weight_buff[buffer_idx][0][buffer_address]=temp64[0]; weight_buff[buffer_idx][1][buffer_address]=temp64[1]; } else if(counter == 1) { weight_buff[buffer_idx][2][buffer_address]=temp64[0]; weight_buff[buffer_idx][3][buffer_address]=temp64[1]; } else if(counter == 2) { weight_buff[buffer_idx][4][buffer_address]=temp64[0]; weight_buff[buffer_idx][5][buffer_address]=temp64[1]; } else if(counter == 3) { weight_buff[buffer_idx][6][buffer_address]=temp64[0]; weight_buff[buffer_idx][7][buffer_address]=temp64[1]; } else if(counter == 4) { weight_buff[buffer_idx][8][buffer_address]=temp64[0]; } if(port_load_cnt==port_load_number-1) { buffer_address_offset =0; } else if(counter==4 ) { buffer_address_offset++; } if(port_load_cnt==port_load_number-1) { buffer_idx++; port_load_cnt=0; } else { port_load_cnt++; } if(counter==4) { counter=0; } else { counter++; } } } //template<int dummy> void load_weight_ddr( ap_uint<128>* weight_DDR0, ap_uint<128>* weight_DDR1, ap_uint<128>* weight_DDR2, ap_uint<128>* weight_DDR3, ap_uint<64> weight_buff[4][WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH], int DDR_offset, int load_number, int port_load_number, ap_uint<1> skip_flag, ap_uint<1> pingpong) { #pragma HLS array_partition variable = weight_buff dim=1 complete #pragma HLS array_partition variable = weight_buff dim=2 complete if(skip_flag) return; // fflush(stdout); load_weight_ddr_one_port( weight_DDR0+DDR_offset, weight_buff[0], load_number, port_load_number, pingpong); load_weight_ddr_one_port( weight_DDR1+DDR_offset, weight_buff[1], load_number, port_load_number, pingpong); load_weight_ddr_one_port( weight_DDR2+DDR_offset, weight_buff[2], load_number, port_load_number, pingpong); load_weight_ddr_one_port( weight_DDR3+DDR_offset, weight_buff[3], load_number, port_load_number, pingpong); } //template<int dummy> void weight_stream( ap_uint<64> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH], #if WEIGHT_FEED_NUMBER_PER_PORT == 2 hls::stream<ap_uint<16*36> > & weight_stream0, hls::stream<ap_uint<16*36> > & weight_stream1, #endif int row_repeat_time, // output_row ceil_div out tiles int weight_feed_total_size, ap_int<1> pingpong ) { #pragma HLS array_partition variable = weight_buff dim=1 complete #pragma HLS array_partition variable = weight_buff dim=2 complete int weight_feed_total_size_by2 = weight_feed_total_size/2; OUTSIDE:for(int i=0;i<row_repeat_time;i++) { INSIDE:for(ap_uint<9> buffer_addr_offset=0; buffer_addr_offset<weight_feed_total_size_by2; buffer_addr_offset++) { #pragma HLS pipeline ap_uint<64> temp18[WEIGHT_FEED_NUMBER_PER_PORT][9]; #pragma HLS array_partition variable = temp18 complete ap_uint<10> buffer_addr=(pingpong,buffer_addr_offset.range(8,0)); for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++) { #pragma HLS unroll for(int j18=0;j18<9;j18++) { #pragma HLS unroll temp18[buffer_idx][j18]=weight_buff[buffer_idx][j18][buffer_addr]; } } ap_uint<16*36> temp16x36[WEIGHT_FEED_NUMBER_PER_PORT]; #pragma HLS array_partition variable = temp16x36 complete for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++) { #pragma HLS unroll for(int j18=0;j18<9;j18++) { #pragma HLS unroll temp16x36[buffer_idx].range(j18*64+63,j18*64)=temp18[buffer_idx][j18]; } } #if WEIGHT_FEED_NUMBER_PER_PORT == 2 weight_stream0<<temp16x36[0]; weight_stream1<<temp16x36[1]; #endif } } } //template<int dummy> void weight_feed( ap_uint<128>* weight_DDR0, ap_uint<128>* weight_DDR1, ap_uint<128>* weight_DDR2, ap_uint<128>* weight_DDR3, #if WEIGHT_FEED_NUMBER_PER_PORT == 2 hls::stream<ap_uint<16*36> > & weight_stream0_0, hls::stream<ap_uint<16*36> > & weight_stream0_1, hls::stream<ap_uint<16*36> > & weight_stream1_0, hls::stream<ap_uint<16*36> > & weight_stream1_1, hls::stream<ap_uint<16*36> > & weight_stream2_0, hls::stream<ap_uint<16*36> > & weight_stream2_1, hls::stream<ap_uint<16*36> > & weight_stream3_0, hls::stream<ap_uint<16*36> > & weight_stream3_1, #endif ap_uint<32> weight_total_load_number, ap_uint<16> weight_total_feed_size, ap_uint<16> ddr_load_length, ap_uint<16> ddr_load_length_per_feed, ap_uint<16> row_repeat_times, ap_uint<16> first_flag, ap_uint<16> last_flag ) { static ap_uint<16> DDR_offset; static ap_uint<16> DDR_load_cnt; static ap_uint<1> pingpong; static ap_uint<64> weight_buff[4][WEIGHT_FEED_NUMBER_PER_PORT][9][WEIGHT_BUFFER_DEPTH]; if(first_flag) DDR_offset=ddr_load_length; DDR_load_cnt=1; pingpong = 0; load_weight_ddr( weight_DDR0, weight_DDR1, weight_DDR2, weight_DDR3, weight_buff, 0, ddr_load_length, ddr_load_length_per_feed, ~first_flag, pingpong); for(int cnt=0;cnt<weight_total_load_number;cnt++) { pingpong = ~pingpong; load_weight_ddr( weight_DDR0, weight_DDR1, weight_DDR2, weight_DDR3, weight_buff, DDR_offset, ddr_load_length, ddr_load_length_per_feed, last_flag & (DDR_load_cnt==0) , pingpong); weight_stream( weight_buff[0], #if WEIGHT_FEED_NUMBER_PER_PORT == 2 weight_stream0_0, weight_stream0_1, #endif row_repeat_times, // output_row ceil_div out tiles weight_total_feed_size, ~pingpong); weight_stream( weight_buff[1], #if WEIGHT_FEED_NUMBER_PER_PORT == 2 weight_stream1_0, weight_stream1_1, #endif row_repeat_times, // output_row ceil_div out tiles weight_total_feed_size, ~pingpong); weight_stream( weight_buff[2], #if WEIGHT_FEED_NUMBER_PER_PORT == 2 weight_stream2_0, weight_stream2_1, #endif row_repeat_times, // output_row ceil_div out tiles weight_total_feed_size, ~pingpong); weight_stream( weight_buff[3], #if WEIGHT_FEED_NUMBER_PER_PORT == 2 weight_stream3_0, weight_stream3_1, #endif row_repeat_times, // output_row ceil_div out tiles weight_total_feed_size, ~pingpong); if(DDR_load_cnt == weight_total_load_number-1) { DDR_load_cnt = 0; DDR_offset = 0; } else { DDR_load_cnt+=1; DDR_offset+=ddr_load_length; } } } #endif