#ifndef _WINO_CELL_HPP_ #define _WINO_CELL_HPP_ #include "wino_macro.h" #include <ap_int.h> #include <hls_stream.h> #include "wino_buffer.cpp" #define IN_WIDTH 8 #define W_WIDTH 8 #define OUT_WIDTH 18 #define B_WIDTH_IN 8 #define B_WIDTH_OUT 12 #define BT_WIDTH_IN 12 #define BT_WIDTH_OUT 16 #define GTGG_WIDTH_IN 16 #define GTGG_WIDTH_W 16 #define GTGG_WIDTH_OUT 24 #define A_WIDTH_IN 24 #define A_WIDTH_OUT 28 #define AT_WIDTH_IN 28 #define AT_WIDTH_OUT 32 void wino6x6_stream_cell( hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in, hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out, hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in, hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out, ap_uint<OUT_WIDTH*2> out_buffer[16][1024], ap_int<16> weight_indepth_load_number, ap_int<16> weight_outdepth_load_number, ap_int<16> weight_indepth_feed_size, ap_int<16> weight_outdepth_feed_size, ap_int<16> row_tile_number #if DEBUG_FILE_PRINT ,int ROW_IDX, int COL_IDX #endif ) { #pragma HLS array_partition variable = out_buffer dim=1 complete ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg0[2]; ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg1[2]; #pragma HLS array_partition variable = stream_temp_reg0 complete #pragma HLS array_partition variable = stream_temp_reg1 complete #if DEBUG_FILE_PRINT memset(stream_temp_reg0,0xAA,2*2*36*2); memset(stream_temp_reg1,0xAA,2*2*36*2); #endif ap_int<W_WIDTH> G1[36][2]; #pragma HLS array_partition variable=G1 ap_uint<1> stream_pingpong_flag=0; int write_idx=0; int weight_stream_idx=0; int input_stream_idx =0; #if DEBUG_FILE_PRINT char filename[100]; sprintf(filename,"wino_cell_output_%d_%d.txt",ROW_IDX, COL_IDX); char input_stream_filename[100]; sprintf(input_stream_filename,"wino_cell_input_%d_%d.txt",ROW_IDX, COL_IDX); char weight_stream_filename[100]; sprintf(weight_stream_filename,"wino_cell_weight_%d_%d.txt",ROW_IDX, COL_IDX); #endif top_stream_in>>stream_temp_reg0[0]; bottom_stream_out<<stream_temp_reg0[0]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg0[0],input_stream_idx,input_stream_filename); #endif top_stream_in>>stream_temp_reg0[1]; bottom_stream_out<<stream_temp_reg0[1]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg0[1],input_stream_idx,input_stream_filename); #endif ITER0:for(int iter0=0;iter0<weight_indepth_load_number;iter0++) ITER1:for(int iter1=0;iter1<weight_outdepth_load_number;iter1++) ITER2:for(int iter2=0;iter2<row_tile_number;iter2++) ITER3:for(int iter3=0;iter3<weight_indepth_feed_size/2;iter3++) { ITER4:for(int iter4=0;iter4<weight_outdepth_feed_size;iter4++) { #pragma HLS pipeline #pragma HLS dependence variable=out_buffer inter false #pragma HLS dependence variable=out_buffer intra false ap_int<GTGG_WIDTH_IN> stream_temp_array[2][36][2]; #pragma HLS array_partition variable = stream_temp_array complete for(int i=0;i<36;i++) { #pragma HLS unroll if(stream_pingpong_flag) { stream_temp_array[0][i][0]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); stream_temp_array[0][i][1]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); stream_temp_array[1][i][0]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); stream_temp_array[1][i][1]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); } else { stream_temp_array[0][i][0]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); stream_temp_array[0][i][1]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); stream_temp_array[1][i][0]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); stream_temp_array[1][i][1]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); } } if(iter4<2 ) { if(stream_pingpong_flag) { top_stream_in>>stream_temp_reg0[iter4]; bottom_stream_out<<stream_temp_reg0[iter4]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg0[iter4],input_stream_idx,input_stream_filename); #endif } else { top_stream_in>>stream_temp_reg1[iter4]; bottom_stream_out<<stream_temp_reg1[iter4]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg1[iter4],input_stream_idx,input_stream_filename); #endif } } ap_uint<W_WIDTH*2*36> weight_value_temp; left_stream_in>>weight_value_temp; right_stream_out<<weight_value_temp; for(int i=0;i<36;i++) { #pragma HLS unroll G1[i][0]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1,i*W_WIDTH); G1[i][1]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1+288,i*W_WIDTH +288 ); } ap_int<GTGG_WIDTH_OUT> UV[36][2]; #pragma HLS array_partition variable=UV complete for(int i=0;i<36;i++) { #pragma HLS unroll UV[i][0] = G1[i][0]*stream_temp_array[0][i][0] + G1[i][1]*stream_temp_array[1][i][0]; UV[i][1] = G1[i][0]*stream_temp_array[0][i][1] + G1[i][1]*stream_temp_array[1][i][1]; } ap_int<A_WIDTH_OUT> UVA[6][4][2]; #pragma HLS array_partition variable=UVA complete for(int b_idx=0;b_idx<2;b_idx++) { #pragma HLS unroll for(int i=0;i<6;i++) { #pragma HLS unroll UVA[i][0][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+0][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx]; UVA[i][1][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<1); UVA[i][2][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<2); UVA[i][3][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<3)+(ap_int<A_WIDTH_OUT>) UV[i*6+5][b_idx]; } } int address = iter1*row_tile_number* weight_outdepth_feed_size + iter4*row_tile_number + iter2; ap_int<36> outbuffer_value[16]; #pragma HLS array_partition variable= outbuffer_value complete for(int i=0;i<16;i++) { #pragma HLS unroll outbuffer_value[i]=out_buffer[i][address]; } ap_int<AT_WIDTH_OUT> ATUVA[16][2]; #pragma HLS array_partition variable=ATUVA complete for(int b_idx=0;b_idx<2;b_idx++) { #pragma HLS unroll for(int i=0;i<4;i++) { #pragma HLS unroll ATUVA[0+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[0][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx]; ATUVA[4+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<1); ATUVA[8+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<2); ATUVA[12+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<3) + (ap_int<AT_WIDTH_OUT>) UVA[5][i][b_idx]; } } #if DEBUG_FILE_PRINT attach_output_vector<0>(ATUVA,write_idx,filename); #endif ap_int<18> out_residual[16][2]; if(iter0 ==0 && iter3 ==0) { for(int i=0;i<16;i++) { #pragma HLS unroll out_residual[i][0]=0; out_residual[i][1]=0; } } else { for(int i=0;i<16;i++) { #pragma HLS unroll out_residual[i][0]=outbuffer_value[i].range(17,0); out_residual[i][1]=outbuffer_value[i].range(35,18); } } ap_int<18> outbuffer_writeback_value[16][2]; #pragma HLS array_partition variable = outbuffer_writeback_value complete dim=1 #pragma HLS array_partition variable = outbuffer_writeback_value complete dim=2 for(int i=0;i<16;i++) { #pragma HLS unroll outbuffer_writeback_value[i][0]=ATUVA[i][0]+out_residual[i][0]; outbuffer_writeback_value[i][1]=ATUVA[i][1]+out_residual[i][1]; } ap_int<18*2> outbuffer_writeback_value_batch[16]; #pragma HLS array_partition variable = outbuffer_writeback_value_batch complete for(int i=0;i<16;i++) { #pragma HLS unroll outbuffer_writeback_value_batch[i]=(outbuffer_writeback_value[i][1],outbuffer_writeback_value[i][0] ); } for(int i=0;i<16;i++) { #pragma HLS unroll out_buffer[i][address]=outbuffer_writeback_value_batch[i]; } } stream_pingpong_flag=~stream_pingpong_flag; } } //template<int dummy> void wino6x6_stream_bottomend( hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in, ap_int<16> weight_indepth_load_number, ap_int<16> weight_outdepth_load_number, ap_int<16> weight_indepth_feed_size, ap_int<16> row_tile_number ) { ap_uint<GTGG_WIDTH_IN*2*36> dummy_temp; for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*weight_indepth_feed_size*row_tile_number+2;i++) { top_stream_in.read(dummy_temp); } } //template<int dummy> void wino6x6_stream_rightend( hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in, ap_int<16> weight_indepth_load_number, ap_int<16> weight_outdepth_load_number, ap_int<16> weight_indepth_feed_size, ap_int<16> weight_outdepth_feed_size, ap_int<16> row_tile_number ) { ap_uint<W_WIDTH*2*36>dummy_temp; for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*weight_indepth_feed_size/2*weight_outdepth_feed_size*row_tile_number;i++) { left_stream_in.read(dummy_temp); } } //template<int dummy> void wino_stream_ceil_4x4( hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in, hls::stream< ap_uint<IN_WIDTH*16> > & bottom_stream_out, hls::stream< ap_uint<W_WIDTH*16> > &left_stream_in, hls::stream< ap_uint<W_WIDTH*16> > &right_stream_out, ap_uint<OUT_WIDTH> out_buffer[16][1024], ap_uint<1> wino_flag, int number) { #pragma HLS array_partition variable=out_buffer dim=1 complete ap_int<W_WIDTH> G1[16]; #pragma HLS array_partition variable=G1 ap_int<IN_WIDTH> in[16]; #pragma HLS array_partition variable=in for(int counter=0;counter<number;counter++) { #pragma HLS pipeline ap_uint<IN_WIDTH*16> stream_in_temp; ap_uint<W_WIDTH*16> stream_weight_temp; top_stream_in>>stream_in_temp; bottom_stream_out<<stream_in_temp; left_stream_in>>stream_weight_temp; right_stream_out<<stream_weight_temp; for(int k=0;k<16;k++) { #pragma HLS unroll factor=36 G1[k].range(W_WIDTH-1,0)=stream_weight_temp.range(W_WIDTH-1+k*W_WIDTH,k*W_WIDTH); in[k].range(IN_WIDTH-1,0)=stream_in_temp.range(IN_WIDTH-1+k*IN_WIDTH,k*IN_WIDTH); } ap_int<B_WIDTH_OUT> dB[4][4]; #pragma HLS array_partition variable=dB complete for(int i=0;i<4;i++) { #pragma HLS unroll dB[i][0]=in[i*4]-in[i*4+2]; dB[i][1]=in[i*4+1]+in[i*4+2]; dB[i][2]=-in[i*4+1]+in[i*4+2]; dB[i][3]=in[i*4+1]-in[i*4+3]; } ap_int<BT_WIDTH_OUT> BTdB[4][4]; #pragma HLS array_partition variable=BTdB complete if(wino_flag) { for(int i=0;i<4;i++) { #pragma HLS unroll BTdB[i][0]=dB[0][i]-dB[2][i]; BTdB[i][1]=dB[1][i]+dB[2][i]; BTdB[i][2]=-dB[1][i]+dB[2][i]; BTdB[i][3]=dB[1][i]-dB[3][i]; } } else { for(int i=0;i<4;i++) { #pragma HLS unroll BTdB[i][0]=in[i*4+0]; BTdB[i][1]=in[i*4+1]; BTdB[i][2]=in[i*4+2]; BTdB[i][3]=in[i*4+3]; } } ap_int<GTGG_WIDTH_OUT> UV[4][4]; #pragma HLS array_partition variable=UV complete for(int i=0;i<4;i++){ #pragma HLS unroll for(int j=0;j<4;j++){ #pragma HLS unroll if(wino_flag) UV[i][j]=BTdB[i][j]*G1[i*4+j]; else UV[i][j]=in[i][j]*G1[i*4+j]; } } ap_int<A_WIDTH_OUT> UVA[4][2]; #pragma HLS array_partition variable=UVA complete for(int i=0;i<4;i++) { #pragma HLS unroll UVA[i][0]=UV[i][0]+UV[i][1]+UV[i][2]; UVA[i][1]=UV[i][1]-UV[i][2]-UV[i][3]; } ap_int<AT_WIDTH_OUT> ATUVA[16]; #pragma HLS array_partition variable=ATUVA complete if(wino_flag) { for(int i=0;i<2;i++) { #pragma HLS unroll ATUVA[0+i]=UVA[0][i]+UVA[1][i]+UVA[2][i]; ATUVA[2+i]=UVA[1][i]-UVA[2][i]-UVA[3][i]; } } else { for(int i=0;i<16;i++) { ATUVA[i]=UV[i/4][i%4]; } } if(wino_flag) { for(int i=0;i<4;i++) { #pragma HLS unroll out_buffer[i][counter]=out_buffer[i][counter]+ATUVA[i]; } } else { for(int i=0;i<4;i++) for(int j=0;j<4;j++) { #pragma HLS unroll out_buffer[i*4+j][counter]=out_buffer[i*4+j][counter]+UV[i][j]; } } } } void wino_stream_cell_combined( hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in0, hls::stream< ap_uint<IN_WIDTH*16> > & top_stream_in1, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_0, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_1, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_2, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_3, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_4, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_5, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_6, hls::stream< ap_uint<W_WIDTH*2*36> > & left_stream_in_7, ap_uint<OUT_WIDTH*2> out_buffer[16][16][1024], ap_int<16> weight_indepth_load_number, ap_int<16> weight_outdepth_load_number, ap_int<16> weight_indepth_feed_size, ap_int<16> weight_outdepth_feed_size, ap_int<16> row_tile_number ) { } //template<int dummy> /* void wino_systolic( ap_uint<16> input_buffer[8][16][INPUT_BUFFER_DEPTH], ap_uint<OUT_WIDTH*2> output_buffer0[4*WEIGHT_FEED_NUMBER_PER_PORT*INPUT_FEED_NUMBER][16][OUTPUT_BUFFER_DEPTH], ap_uint<128>* weight_DDR0, ap_uint<128>* weight_DDR1, ap_uint<128>* weight_DDR2, ap_uint<128>* weight_DDR3, ap_uint<16> input_height, ap_uint<16> input_width, ap_uint<16> input_depth, ap_uint<16> input_width_ceildiv_16, ap_uint<16> input_depth_align8, ap_uint<16> output_height, ap_uint<16> output_width, ap_uint<16> output_depth, ap_uint<8> kernel_window_size, ap_uint<8> pad_size, ap_uint<16> weight_indepth_load_number, ap_uint<16> weight_outdepth_load_number, ap_uint<16> weight_outdepth_feed_size, ap_uint<16> start_row_idx, //weight parameters ap_uint<16> weight_total_load_number, ap_uint<16> weight_total_feed_size, ap_uint<16> ddr_load_length, ap_uint<16> ddr_load_length_per_feed, ap_uint<16> row_repeat_times, ap_uint<16> first_flag, ap_uint<16> last_flag ) { #pragma HLS interface m_axi port = weight_DDR3 #pragma HLS interface m_axi port = weight_DDR2 #pragma HLS interface m_axi port = weight_DDR1 #pragma HLS interface m_axi port = weight_DDR0 #pragma HLS array_partition variable=output_buffer0 dim=1 complete #pragma HLS array_partition variable=output_buffer0 dim=2 complete #pragma HLS array_partition variable=input_buffer dim=1 complete #pragma HLS array_partition variable=input_buffer dim=2 complete #pragma HLS dataflow static hls::stream<ap_uint<32*36> > indata_out_0_0("indata_out_0_0"); #pragma HLS stream variable=indata_out_0_0 depth=16 static hls::stream<ap_uint<32*36> > indata_out_1_0("indata_out_1_0"); #pragma HLS stream variable=indata_out_1_0 depth=16 static hls::stream<ap_uint<32*36> > indata_out_2_0("indata_out_2_0"); #pragma HLS stream variable=indata_out_2_0 depth=1 static hls::stream<ap_uint<32*36> > indata_out_3_0("indata_out_3_0"); #pragma HLS stream variable=indata_out_3_0 depth=1 static hls::stream<ap_uint<32*36> > indata_out_4_0("indata_out_4_0"); #pragma HLS stream variable=indata_out_4_0 depth=1 static hls::stream<ap_uint<32*36> > indata_out_5_0("indata_out_5_0"); #pragma HLS stream variable=indata_out_5_0 depth=1 static hls::stream<ap_uint<32*36> > indata_out_6_0("indata_out_6_0"); #pragma HLS stream variable=indata_out_6_0 depth=1 static hls::stream<ap_uint<32*36> > indata_out_7_0("indata_out_7_0"); #pragma HLS stream variable=indata_out_7_0 depth=1 static hls::stream<ap_uint<32*36> > indata_out_8_0("indata_out_8_0"); #pragma HLS stream variable=indata_out_8_0 depth=1 static hls::stream<ap_uint<32*36> > indata_out_0_1("indata_out_0_1"); #pragma HLS stream variable=indata_out_0_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_1_1("indata_out_1_1"); #pragma HLS stream variable=indata_out_1_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_2_1("indata_out_2_1"); #pragma HLS stream variable=indata_out_2_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_3_1("indata_out_3_1"); #pragma HLS stream variable=indata_out_3_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_4_1("indata_out_4_1"); #pragma HLS stream variable=indata_out_4_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_5_1("indata_out_5_1"); #pragma HLS stream variable=indata_out_5_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_6_1("indata_out_6_1"); #pragma HLS stream variable=indata_out_6_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_7_1("indata_out_7_1"); #pragma HLS stream variable=indata_out_7_1 depth=1 static hls::stream<ap_uint<32*36> > indata_out_8_1("indata_out_8_1"); #pragma HLS stream variable=indata_out_8_1 depth=1 static hls::stream<ap_uint<16*36> > weightdata_out_0_0("weightdata_out_0_0"); #pragma HLS stream variable=weightdata_out_0_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_1_0("weightdata_out_1_0"); #pragma HLS stream variable=weightdata_out_1_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_2_0("weightdata_out_2_0"); #pragma HLS stream variable=weightdata_out_2_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_3_0("weightdata_out_3_0"); #pragma HLS stream variable=weightdata_out_3_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_4_0("weightdata_out_4_0"); #pragma HLS stream variable=weightdata_out_4_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_5_0("weightdata_out_5_0"); #pragma HLS stream variable=weightdata_out_5_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_6_0("weightdata_out_6_0"); #pragma HLS stream variable=weightdata_out_6_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_7_0("weightdata_out_7_0"); #pragma HLS stream variable=weightdata_out_7_0 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_0_1("weightdata_out_0_1"); #pragma HLS stream variable=weightdata_out_0_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_1_1("weightdata_out_1_1"); #pragma HLS stream variable=weightdata_out_1_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_2_1("weightdata_out_2_1"); #pragma HLS stream variable=weightdata_out_2_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_3_1("weightdata_out_3_1"); #pragma HLS stream variable=weightdata_out_3_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_4_1("weightdata_out_4_1"); #pragma HLS stream variable=weightdata_out_4_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_5_1("weightdata_out_5_1"); #pragma HLS stream variable=weightdata_out_5_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_6_1("weightdata_out_6_1"); #pragma HLS stream variable=weightdata_out_6_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_7_1("weightdata_out_7_1"); #pragma HLS stream variable=weightdata_out_7_1 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_0_2("weightdata_out_0_2"); #pragma HLS stream variable=weightdata_out_0_2 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_1_2("weightdata_out_1_2"); #pragma HLS stream variable=weightdata_out_1_2 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_2_2("weightdata_out_2_2"); #pragma HLS stream variable=weightdata_out_2_2 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_3_2("weightdata_out_3_2"); #pragma HLS stream variable=weightdata_out_3_2 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_4_2("weightdata_out_4_2"); #pragma HLS stream variable=weightdata_out_4_2 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_5_2("weightdata_out_5_2"); #pragma HLS stream variable=weightdata_out_5_2 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_6_2("weightdata_out_6_2"); #pragma HLS stream variable=weightdata_out_6_2 depth=2 off static hls::stream<ap_uint<16*36> > weightdata_out_7_2("weightdata_out_7_2"); #pragma HLS stream variable=weightdata_out_7_2 depth=2 off input_feed( input_buffer, indata_out_0_0, indata_out_0_1, start_row_idx, output_width, input_height, input_width, input_width_ceildiv_16, input_depth_align8, pad_size, weight_outdepth_load_number, kernel_window_size, weight_outdepth_feed_size ); weight_feed( weight_DDR0, weight_DDR1, weight_DDR2, weight_DDR3, #if WEIGHT_FEED_NUMBER_PER_PORT == 2 weightdata_out_0_0, weightdata_out_1_0, weightdata_out_2_0, weightdata_out_3_0, weightdata_out_4_0, weightdata_out_5_0, weightdata_out_6_0, weightdata_out_7_0, #endif weight_total_load_number, weight_total_feed_size, ddr_load_length, ddr_load_length_per_feed, row_repeat_times, first_flag, last_flag); // wino6x6_stream_cell( // indata_out_0_0 // hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out, // hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in, // hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out, // ap_uint<OUT_WIDTH*2> out_buffer[16][1024], // ap_int<16> weight_indepth_load_number, // ap_int<16> weight_outdepth_load_number, // ap_int<16> weight_indepth_feed_size, // ap_int<16> weight_outdepth_feed_size, // ap_int<16> row_tile_number); // wino6x6_stream_cell(indata_out_0_0,indata_out_1_0,weightdata_out_0_0,weightdata_out_0_1,output_buffer0[0],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,0,0 #endif ); wino6x6_stream_cell(indata_out_1_0,indata_out_2_0,weightdata_out_1_0,weightdata_out_1_1,output_buffer0[1],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,1,0 #endif ); wino6x6_stream_cell(indata_out_2_0,indata_out_3_0,weightdata_out_2_0,weightdata_out_2_1,output_buffer0[2],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,2,0 #endif ); wino6x6_stream_cell(indata_out_3_0,indata_out_4_0,weightdata_out_3_0,weightdata_out_3_1,output_buffer0[3],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,3,0 #endif ); wino6x6_stream_cell(indata_out_4_0,indata_out_5_0,weightdata_out_4_0,weightdata_out_4_1,output_buffer0[4],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,4,0 #endif ); wino6x6_stream_cell(indata_out_5_0,indata_out_6_0,weightdata_out_5_0,weightdata_out_5_1,output_buffer0[5],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,5,0 #endif ); wino6x6_stream_cell(indata_out_6_0,indata_out_7_0,weightdata_out_6_0,weightdata_out_6_1,output_buffer0[6],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,6,0 #endif ); wino6x6_stream_cell(indata_out_7_0,indata_out_8_0,weightdata_out_7_0,weightdata_out_7_1,output_buffer0[7],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,7,0 #endif ); wino6x6_stream_cell(indata_out_0_1,indata_out_1_1,weightdata_out_0_1,weightdata_out_0_2,output_buffer0[8],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,0,1 #endif ); wino6x6_stream_cell(indata_out_1_1,indata_out_2_1,weightdata_out_1_1,weightdata_out_1_2,output_buffer0[9],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,1,1 #endif ); wino6x6_stream_cell(indata_out_2_1,indata_out_3_1,weightdata_out_2_1,weightdata_out_2_2,output_buffer0[10],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,2,1 #endif ); wino6x6_stream_cell(indata_out_3_1,indata_out_4_1,weightdata_out_3_1,weightdata_out_3_2,output_buffer0[11],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,3,1 #endif ); wino6x6_stream_cell(indata_out_4_1,indata_out_5_1,weightdata_out_4_1,weightdata_out_4_2,output_buffer0[12],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,4,1 #endif ); wino6x6_stream_cell(indata_out_5_1,indata_out_6_1,weightdata_out_5_1,weightdata_out_5_2,output_buffer0[13],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,5,1 #endif ); wino6x6_stream_cell(indata_out_6_1,indata_out_7_1,weightdata_out_6_1,weightdata_out_6_2,output_buffer0[14],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,6,1 #endif ); wino6x6_stream_cell(indata_out_7_1,indata_out_8_1,weightdata_out_7_1,weightdata_out_7_2,output_buffer0[15],weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times #if DEBUG_FILE_PRINT ,7,1 #endif ); wino6x6_stream_bottomend(indata_out_8_0,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times); wino6x6_stream_bottomend(indata_out_8_1,weight_indepth_load_number,weight_outdepth_load_number,8,row_repeat_times); wino6x6_stream_rightend(weightdata_out_0_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); wino6x6_stream_rightend(weightdata_out_1_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); wino6x6_stream_rightend(weightdata_out_2_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); wino6x6_stream_rightend(weightdata_out_3_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); wino6x6_stream_rightend(weightdata_out_4_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); wino6x6_stream_rightend(weightdata_out_5_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); wino6x6_stream_rightend(weightdata_out_6_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); wino6x6_stream_rightend(weightdata_out_7_2,weight_indepth_load_number,weight_outdepth_load_number,8,weight_outdepth_feed_size,row_repeat_times); } */ #endif