#ifndef _WINO_CELL_HPP_ #define _WINO_CELL_HPP_ #include "wino_macro.h" #include <dsp_builtins.h> #include <hls_stream.h> #include <ap_int.h> #include "wino_buffer.cpp" #include "wino_transform.cpp" void input_transform( hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_stream, hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_transformed_stream, int input_transform_feeding_loop_bound, ap_uint<3> wino_array_col ) { #if DEBUG_FILE_PRINT printf("---input_transform---\n");fflush(stdout); int write_idx=0; #endif for(int cycle=0;cycle<input_transform_feeding_loop_bound;cycle++) { #pragma hls pipeline ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> input_tile_stream_data; input_tile_stream>>input_tile_stream_data; ap_int<IN_WIDTH> in[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=in complete dim=1 #pragma HLS array_partition variable=in complete dim=2 #pragma HLS array_partition variable=in complete dim=3 for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++) { #pragma HLS unroll for(int k=0;k<BATCH_SIZE;k++) { #pragma HLS unroll in[i][j][k]=input_tile_stream_data.range( ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*8+IN_WIDTH-1, ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*8); } } } ap_int<DB_WIDTH> DB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=DB complete dim=1 #pragma HLS array_partition variable=DB complete dim=2 #pragma HLS array_partition variable=DB complete dim=3 for(int k=0;k<BATCH_SIZE;k++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll #if WINO_DOMAIN_SIZE ==6 DB6x6_1(in,DB,i,k) #else DB4x4_1(in,DB,i,k) #endif } } ap_int<BTB_WIDTH> BtDB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=BtDB complete dim=1 #pragma HLS array_partition variable=BtDB complete dim=2 #pragma HLS array_partition variable=BtDB complete dim=3 for(int k=0;k<BATCH_SIZE;k++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll #if WINO_DOMAIN_SIZE ==6 BTB6x6_1(DB,BtDB,i,k) #else BTB4x4_1(DB,BtDB,i,k) #endif } } ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> input_tile_transformed_data; for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++) { #pragma HLS unroll for(int k=0;k<BATCH_SIZE;k++) { #pragma HLS unroll input_tile_transformed_data.range( ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH+BTB_WIDTH-1, ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH)=BtDB[i][j][k]; } } } #if DEBUG_FILE_PRINT char infilename[100]; sprintf(infilename,"intile_transform_%d.txt",(int) wino_array_col); attach_output_vector<BTB_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(BtDB,write_idx,infilename); write_idx++; #endif input_tile_transformed_stream<<input_tile_transformed_data; } } void load_input_tile( ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg[WINO_WIDTH][INDEPTH_MINITILE_SIZE] ) { #pragma HLS pipeline #pragma HLS array_partition variable = input_tile complete dim=5 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #pragma HLS array_partition variable = stream_temp_reg0 complete dim=1 #pragma HLS array_partition variable = stream_temp_reg0 complete dim=2 for(int w=0;w<WINO_WIDTH;w++){ #pragma HLS unroll for(int iid=0;iid<INDEPTH_MINITILE_SIZE;iid++){ #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++){ #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++){ #pragma HLS unroll for(int k=0;k<BATCH_SIZE;k++){ #pragma HLS unroll //if(stream_pingpong_flag) input_tile[w][iid][i][j][k]= stream_temp_reg[w][iid].range( ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH+BTB_WIDTH-1, ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH); } } } } } } void load_weight_tile( ap_int<W_WIDTH> weight_tile[WINO_HEIGHT][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE], ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> weight_value_temp[WINO_HEIGHT] ) { #pragma HLS array_partition variable = weight_tile complete dim=4 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 #pragma HLS array_partition variable=weight_value_temp complete for(int wh=0;wh<WINO_HEIGHT;wh++) { #pragma HLS unroll for(int id=0;id<INDEPTH_MINITILE_SIZE;id++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++) { #pragma HLS unroll weight_tile[wh][id][i][j]=weight_value_temp[wh].range( (id*WINO_DOMAIN_SIZE_SQUARE+i*WINO_DOMAIN_SIZE+j)*W_WIDTH+W_WIDTH-1, (id*WINO_DOMAIN_SIZE_SQUARE+i*WINO_DOMAIN_SIZE+j)*W_WIDTH); } } } } } template<int dummy> void element_wise_mult_6x6( ap_int<UV_MUL_WIDTH> UV_MUL_TILE[INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<BTB_WIDTH> input_tile[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<W_WIDTH> weight_tile[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE], ap_int<1> ap_clk_div2 ) { #pragma HLS pipeline #pragma HLS array_partition variable=UV_MUL_TILE complete dim=1 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=2 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=3 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 for(int wr=0;wr<WINO_DOMAIN_SIZE;wr++) { #pragma HLS unroll for(int wc=0; wc<WINO_DOMAIN_SIZE;wc++) { #pragma HLS unroll for(int id2=0,id=0;id2<INDEPTH_MINITILE_SIZE/2;id2++,id+=2) { #pragma HLS unroll for(int b=0;b<BATCH_SIZE;b++) { #pragma HLS unroll ap_int<BTB_WIDTH> input_val0=input_tile[id][wr][wc][b]; ap_int<BTB_WIDTH> input_val1=input_tile[id+1][wr][wc][b]; UV_MUL_TILE[id2][wr][wc][b]=__builtin_mac16x2( input_tile[id][wr][wc][b], input_tile[id+1][wr][wc][b], weight_tile[id][wr][wc], weight_tile[id+1][wr][wc], 0,1,ap_clk_div2); } } } } } template<int dummy> void element_wise_mult_4x4( ap_int<UV_MUL_WIDTH> UV_MUL_TILE[2][INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<BTB_WIDTH> input_tile[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<W_WIDTH> weight_tile[2][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE], ap_int<1> ap_clk_div2 ) { #pragma HLS pipeline #pragma HLS array_partition variable=UV_MUL_TILE complete dim=1 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=2 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=3 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=4 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=5 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 #pragma HLS array_partition variable = weight_tile complete dim=4 for(int wr=0;wr<WINO_DOMAIN_SIZE;wr++) { #pragma HLS unroll for(int wc=0; wc<WINO_DOMAIN_SIZE;wc++) { #pragma HLS unroll for(int id2=0,id=0;id2<INDEPTH_MINITILE_SIZE/2;id2++,id+=2) { #pragma HLS unroll for(int b=0;b<BATCH_SIZE;b++) { #pragma HLS unroll ap_int<48> result=__builtin_mac6x2_mac8x1( weight_tile[0][id][wr][wc],weight_tile[0][id+1][wr][wc], weight_tile[1][id][wr][wc],weight_tile[1][id+1][wr][wc], input_tile[id][wr][wc][b], input_tile[id+1][wr][wc][b], 0,1,ap_clk_div2); UV_MUL_TILE[1][id2][wr][wc][b]=result.range(UV_MUL_WIDTH-1,0); #if __HLS_SYN__ UV_MUL_TILE[0][id2][wr][wc][b]=result.range(UV_MUL_WIDTH+19,20)+result[19]; #else UV_MUL_TILE[0][id2][wr][wc][b]=result.range(UV_MUL_WIDTH+20,21)+result[20]; #endif } } } } } template<class T> void load_reg(T ®, T val) { reg=val; } template<class T, int dim1, int dim2, int dim3, int dim4> void load_reg_tile4( T reg[dim1][dim2][dim3][dim4], T val[dim1][dim2][dim3][dim4]) { #pragma HLS pipeline #pragma HLS array_partition variable = reg complete dim=4 #pragma HLS array_partition variable = reg complete dim=3 #pragma HLS array_partition variable = reg complete dim=2 #pragma HLS array_partition variable = reg complete dim=1 #pragma HLS array_partition variable = val complete dim=4 #pragma HLS array_partition variable = val complete dim=3 #pragma HLS array_partition variable = val complete dim=2 #pragma HLS array_partition variable = val complete dim=1 for(int d4=0;d4<dim4;d4++){ #pragma HLS unroll for(int d3=0;d3<dim3;d3++){ #pragma HLS unroll for(int d2=0;d2<dim2;d2++){ #pragma HLS unroll for(int d1=0;d1<dim1;d1++){ #pragma HLS unroll reg[d1][d2][d3][d4]=val[d1][d2][d3][d4]; }}}} } template<class T, int dim1, int dim2, int dim3> void load_reg_tile3( T reg[dim1][dim2][dim3], T val[dim1][dim2][dim3]) { #pragma HLS pipeline #pragma HLS array_partition variable = reg complete dim=3 #pragma HLS array_partition variable = reg complete dim=2 #pragma HLS array_partition variable = reg complete dim=1 #pragma HLS array_partition variable = val complete dim=3 #pragma HLS array_partition variable = val complete dim=2 #pragma HLS array_partition variable = val complete dim=1 for(int d3=0;d3<dim3;d3++){ #pragma HLS unroll for(int d2=0;d2<dim2;d2++){ #pragma HLS unroll for(int d1=0;d1<dim1;d1++){ #pragma HLS unroll reg[d1][d2][d3]=val[d1][d2][d3]; }}} } template<int dummy> void DSP_LOADER(ap_int<32> &rst, ap_int<16> a0, ap_int<16> a1, ap_int<16> b0, ap_int<16> b1, ap_int<32> accum, ap_int<1> clear, ap_int<1> ap_clk_div2) { rst=__builtin_mac16x2(a0,a1,b0,b1,accum,clear,ap_clk_div2); } void wino_stream_block( hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > top_stream_in[WINO_WIDTH], hls::stream< ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > left_stream_in[4][WEIGHT_FEED_NUMBER_PER_PORT], ap_uint<OUT_WIDTH*BATCH_SIZE> out_buffer[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH], ap_uint<16> weightbuffer_outdepth_minitile_number, ap_uint<24> total_input_stream_tile, ap_uint<16> loop_omini_base_reset_cycle, ap_uint<10> loop_wino_tile_rowcol_self_reset_cycle_min1, ap_uint<32> loop_iload_reset_cycle, ap_uint<32> loop_wino_cell_bound, ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_oload_increment_step, ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_omini_increment_step, ap_uint<1> wino5x5_flag #if DEBUG_CONV_DESC ,ConvDesc_t conv_desc #endif ,ap_uint<1> ap_clk_div2 ) { #if DEBUG_FILE_PRINT printf("---wino_stream_block---\n");fflush(stdout); #endif #pragma HLS interface ap_stable port=ap_clk_div2 #pragma HLS interface ap_stable port=weightbuffer_outdepth_minitile_number #pragma HLS interface ap_stable port=total_input_stream_tile #pragma HLS interface ap_stable port=loop_omini_base_reset_cycle #pragma HLS interface ap_stable port=loop_wino_tile_rowcol_self_reset_cycle_min1 #pragma HLS interface ap_stable port=loop_iload_reset_cycle #pragma HLS interface ap_stable port=loop_wino_cell_bound #pragma HLS interface ap_stable port=outbuffer_oload_increment_step #pragma HLS interface ap_stable port=outbuffer_omini_increment_step #pragma HLS interface ap_stable port=wino5x5_flag #pragma HLS array_partition variable = out_buffer dim=1 complete #pragma HLS array_partition variable = out_buffer dim=2 complete #pragma HLS array_partition variable = out_buffer dim=3 complete #pragma HLS array_partition variable = out_buffer dim=4 complete ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg0[WINO_WIDTH][INDEPTH_MINITILE_SIZE]; #pragma HLS array_partition variable = stream_temp_reg0 complete dim=1 #pragma HLS array_partition variable = stream_temp_reg0 complete dim=2 ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg1[WINO_WIDTH][INDEPTH_MINITILE_SIZE]; #pragma HLS array_partition variable = stream_temp_reg1 complete dim=1 #pragma HLS array_partition variable = stream_temp_reg1 complete dim=2 #if DEBUG_FILE_PRINT for(int i=0;i<WINO_WIDTH;i++) { memset(stream_temp_reg0[i],0xAB,INDEPTH_MINITILE_SIZE*sizeof(ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE>)); memset(stream_temp_reg1[i],0xAB,INDEPTH_MINITILE_SIZE*sizeof(ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE>)); } #endif ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable = input_tile complete dim=5 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 ap_int<BTB_WIDTH> input_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #if 0 memset(stream_temp_reg0,0xAA,2*2*36*2); memset(stream_temp_reg1,0xAA,2*2*36*2); #endif for(int i=0;i<INDEPTH_MINITILE_SIZE;i++) { for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++) { #pragma HLS unroll stream_temp_reg0[wino_array_col][imini_idx]=stream_temp_reg0[wino_array_col][imini_idx+1]; } top_stream_in[wino_array_col]>>stream_temp_reg0[wino_array_col][INDEPTH_MINITILE_SIZE-1]; } } ap_uint<1> load_input_flag=1; ap_uint<1> stream_pingpong_flag=1; ap_uint<24> loaded_input_stream_tile_number=1; ap_uint<16> loop_omini_base_cnt=1; ap_uint<10> loop_wino_tile_rowcol_cnt=0; ap_uint<32> loop_iload_cnt=1; ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_oload_offset=0; ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_omini_offset=0; ap_uint<10> idepth_minitile_idx=0; #if DEBUG_FILE_PRINT int write_idx=0; #endif for(int cycle=0;cycle < loop_wino_cell_bound; cycle++) { // for(int oload_idx=0;oload_idx<conv_desc.weightbuffer_load_outdepth_number;oload_idx++) // for(int iload_idx=0;iload_idx<conv_desc.weightbuffer_load_indepth_number;iload_idx++) // for(int imini_base_idx=0;imini_base_idx<conv_desc.weightbuffer_indepth_minitile_number;imini_base_idx++) // for(int wino_tile_row_idx=0;wino_tile_row_idx<conv_desc.wino_tile_number_in_out_rowstep;wino_tile_row_idx++) // for(int wino_tile_col_idx=0;wino_tile_col_idx<conv_desc.wino_tile_number_in_outwidth;wino_tile_col_idx++) // for(int omini_base_idx=0;omini_base_idx<loop_omini_base_reset_cycle ;omini_base_idx++) #pragma HLS pipeline #pragma HLS dependence variable=out_buffer inter false #pragma HLS dependence variable=out_buffer intra false ap_uint<1> load_input_flag_reg = (load_input_flag && loaded_input_stream_tile_number != total_input_stream_tile); if(stream_pingpong_flag) load_input_tile(input_tile,stream_temp_reg0); else load_input_tile(input_tile,stream_temp_reg1); ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_addr = outbuffer_oload_offset + loop_wino_tile_rowcol_cnt + outbuffer_omini_offset; // #if DEBUG_FILE_PRINT // int rowtile_idx=loop_wino_tile_rowcol_cnt/conv_desc.wino_tile_number_in_outwidth; // int coltile_idx=loop_wino_tile_rowcol_cnt%conv_desc.wino_tile_number_in_outwidth; // int outdepth_minitile_idx= (outbuffer_oload_offset+outbuffer_omini_offset)/(conv_desc.wino_tile_number_in_out_rowstep*conv_desc.wino_tile_number_in_outwidth); // if((outbuffer_oload_offset+outbuffer_omini_offset)%(conv_desc.wino_tile_number_in_out_rowstep*conv_desc.wino_tile_number_in_outwidth)) // { // printf("outdepth_minitile_idx not valid\n"); // exit(-3); // } // #endif for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll if(stream_pingpong_flag && load_input_flag_reg) { for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++) { #pragma HLS unroll stream_temp_reg1[wino_array_col][imini_idx]=stream_temp_reg1[wino_array_col][imini_idx+1]; } top_stream_in[wino_array_col]>>stream_temp_reg1[wino_array_col][INDEPTH_MINITILE_SIZE-1]; } else if(load_input_flag_reg) { for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++) { #pragma HLS unroll stream_temp_reg0[wino_array_col][imini_idx]=stream_temp_reg0[wino_array_col][imini_idx+1]; } top_stream_in[wino_array_col]>>stream_temp_reg0[wino_array_col][INDEPTH_MINITILE_SIZE-1]; } } ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> weight_value_temp[WINO_HEIGHT]; #pragma HLS array_partition variable=weight_value_temp complete if(loop_omini_base_cnt <= weightbuffer_outdepth_minitile_number) { for(int i=0;i<4;i++) { #pragma HLS unroll for(int j=0;j<WEIGHT_FEED_NUMBER_PER_PORT;j++) { #pragma HLS unroll left_stream_in[i][j]>>weight_value_temp[i*WEIGHT_FEED_NUMBER_PER_PORT+j]; #if 0 printf("wino_row_idx: %2d --", i*WEIGHT_FEED_NUMBER_PER_PORT+j); for(int k=0;k<WINO_DOMAIN_SIZE_SQUARE;k++) { printf("[%08x]", (unsigned int) weight_value_temp[i*WEIGHT_FEED_NUMBER_PER_PORT+j].range(k*32+31,k*32) ); } printf("\n"); #endif } } } ap_int<W_WIDTH> weight_tile[WINO_HEIGHT][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable = weight_tile complete dim=4 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 load_weight_tile(weight_tile,weight_value_temp); #if WINO_DOMAIN_SIZE==6 for(int wino_array_idx=0;wino_array_idx<WINO_HEIGHT*WINO_WIDTH;wino_array_idx++) #else for(int wino_array_idx=0;wino_array_idx<WINO_HEIGHT*WINO_WIDTH;wino_array_idx+=2) #endif { #pragma HLS unroll // for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) // { #pragma HLS unroll factor=2 ap_int<BTB_WIDTH> input_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable = input_tile_reg complete dim=4 #pragma HLS array_partition variable = input_tile_reg complete dim=3 #pragma HLS array_partition variable = input_tile_reg complete dim=2 #pragma HLS array_partition variable = input_tile_reg complete dim=1 #if WINO_DOMAIN_SIZE==6 ap_int<W_WIDTH> weight_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE]; #else ap_int<W_WIDTH> weight_tile_reg[2][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable = weight_tile_reg complete dim=4 #endif #pragma HLS array_partition variable = weight_tile_reg complete dim=3 #pragma HLS array_partition variable = weight_tile_reg complete dim=2 #pragma HLS array_partition variable = weight_tile_reg complete dim=1 #if WINO_DOMAIN_SIZE==6 ap_int<UV_MUL_WIDTH> UV_MUL_TILE[INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #else ap_int<UV_MUL_WIDTH> UV_MUL_TILE[2][INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=UV_MUL_TILE complete dim=5 #endif #pragma HLS array_partition variable=UV_MUL_TILE complete dim=1 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=2 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=3 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=4 load_reg_tile4<ap_int<BTB_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE,BATCH_SIZE>(input_tile_reg, input_tile[wino_array_idx/WINO_HEIGHT] ); #if WINO_DOMAIN_SIZE==6 load_reg_tile3<ap_int<W_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg, weight_tile[wino_array_idx%WINO_HEIGHT]); #else load_reg_tile3<ap_int<W_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg[0], weight_tile[wino_array_idx%WINO_HEIGHT]); load_reg_tile3<ap_int<W_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg[1], weight_tile[wino_array_idx%WINO_HEIGHT+1]); #endif #if DEBUG_FILE_PRINT if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number) { char infilename[100]; sprintf(infilename,"invector_%d_%d.txt",wino_array_idx%WINO_HEIGHT,wino_array_idx/WINO_HEIGHT); attach_input_vector<BTB_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,BATCH_SIZE>(input_tile_reg,write_idx,infilename); #if WINO_DOMAIN_SIZE==4 sprintf(infilename,"invector_%d_%d.txt",wino_array_idx%WINO_HEIGHT+1,wino_array_idx/WINO_HEIGHT); attach_input_vector<BTB_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,BATCH_SIZE>(input_tile_reg,write_idx,infilename); #endif #if WINO_DOMAIN_SIZE==6 char wfilename[100]; sprintf(wfilename,"wvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT,wino_array_idx/WINO_HEIGHT); attach_weight_vector<W_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg,write_idx,wfilename); #else char wfilename[100]; sprintf(wfilename,"wvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT,wino_array_idx/WINO_HEIGHT); attach_weight_vector<W_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg[0],write_idx,wfilename); sprintf(wfilename,"wvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT+1,wino_array_idx/WINO_HEIGHT); attach_weight_vector<W_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg[1],write_idx,wfilename); #endif } #endif #if WINO_DOMAIN_SIZE==6 element_wise_mult_6x6<0>(UV_MUL_TILE,input_tile_reg,weight_tile_reg, ap_clk_div2 ); #else // element_wise_mult_6x6<0>(UV_MUL_TILE[0],input_tile_reg,weight_tile_reg[0], ap_clk_div2 ); // element_wise_mult_6x6<0>(UV_MUL_TILE[1],input_tile_reg,weight_tile_reg[1], ap_clk_div2 ); element_wise_mult_4x4<0>(UV_MUL_TILE,input_tile_reg,weight_tile_reg, ap_clk_div2 ); #endif #if WINO_DOMAIN_SIZE==6 ap_int<UV_WIDTH> UV[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #else ap_int<UV_WIDTH> UV[2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=UV complete dim=4 #endif #pragma HLS array_partition variable=UV complete dim=1 #pragma HLS array_partition variable=UV complete dim=2 #pragma HLS array_partition variable=UV complete dim=3 for(int wino_row=0;wino_row<WINO_DOMAIN_SIZE;wino_row++) { #pragma HLS unroll for(int wino_col=0;wino_col<WINO_DOMAIN_SIZE;wino_col++) { #pragma HLS unroll for(int b=0;b<BATCH_SIZE;b++) { #if WINO_DOMAIN_SIZE==6 ap_int<UV_MUL_WIDTH> temp=0; for(int id2=0;id2<INDEPTH_MINITILE_SIZE/2;id2++) { #pragma HLS unroll temp+=UV_MUL_TILE[id2][wino_row][wino_col][b]; } UV[wino_row][wino_col][b]=temp>>UV_QUANT_BIT; #else ap_int<UV_MUL_WIDTH> temp0=0; ap_int<UV_MUL_WIDTH> temp1=0; for(int id2=0;id2<INDEPTH_MINITILE_SIZE/2;id2++) { #pragma HLS unroll temp0+=UV_MUL_TILE[0][id2][wino_row][wino_col][b]; temp1+=UV_MUL_TILE[1][id2][wino_row][wino_col][b]; } UV[0][wino_row][wino_col][b]=temp0>>UV_QUANT_BIT; UV[1][wino_row][wino_col][b]=temp1>>UV_QUANT_BIT; #endif } } } #if DEBUG_FILE_PRINT char uvfilename[100]; #if WINO_DOMAIN_SIZE==6 if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number) { sprintf(uvfilename,"uvvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT,wino_array_idx/WINO_HEIGHT); attach_output_vector<UV_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(UV,write_idx,uvfilename); } #else if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number) { sprintf(uvfilename,"uvvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT,wino_array_idx/WINO_HEIGHT); attach_output_vector<UV_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(UV[0],write_idx,uvfilename); sprintf(uvfilename,"uvvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT+1,wino_array_idx/WINO_HEIGHT); attach_output_vector<UV_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(UV[1],write_idx,uvfilename); } #endif #endif #if WINO_DOMAIN_SIZE==6 ap_int<UVA_WIDTH> UVA[WINO_DOMAIN_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #else ap_int<UVA_WIDTH> UVA[2][WINO_DOMAIN_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=UVA complete dim=4 #endif #pragma HLS array_partition variable=UVA complete dim=1 #pragma HLS array_partition variable=UVA complete dim=2 #pragma HLS array_partition variable=UVA complete dim=3 for(int ridx=0;ridx<WINO_DOMAIN_SIZE;ridx++) { #pragma HLS unroll for(int bidx=0;bidx<BATCH_SIZE;bidx++) { #pragma HLS unroll #if WINO_DOMAIN_SIZE==6 UVA_row(UVA,UV,ridx,bidx,wino5x5_flag); #else UVA_row(UVA[0],UV[0],ridx,bidx,wino5x5_flag); UVA_row(UVA[1],UV[1],ridx,bidx,wino5x5_flag); #endif } } #if WINO_DOMAIN_SIZE==6 ap_int<ATA_WIDTH> ATA[WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #else ap_int<ATA_WIDTH> ATA[2][WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=ATA complete dim=4 #endif #pragma HLS array_partition variable=ATA complete dim=1 #pragma HLS array_partition variable=ATA complete dim=2 #pragma HLS array_partition variable=ATA complete dim=3 for(int cidx=0;cidx<WINO_OUT_SIZE;cidx++) { #pragma HLS unroll for(int bidx=0;bidx<BATCH_SIZE;bidx++) { #pragma HLS unroll #if WINO_DOMAIN_SIZE==6 ATA_col(ATA,UVA,cidx,bidx,wino5x5_flag); #else ATA_col(ATA[0],UVA[0],cidx,bidx,wino5x5_flag); ATA_col(ATA[1],UVA[1],cidx,bidx,wino5x5_flag); #endif } } #if DEBUG_FILE_PRINT char filename[100]; if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number) { #if WINO_DOMAIN_SIZE==6 sprintf(filename,"outvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT,wino_array_idx/WINO_HEIGHT); attach_output_vector<ATA_WIDTH,WINO_OUT_SIZE,BATCH_SIZE>(ATA,write_idx,filename); #else sprintf(filename,"outvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT,wino_array_idx/WINO_HEIGHT); attach_output_vector<ATA_WIDTH,WINO_OUT_SIZE,BATCH_SIZE>(ATA[0],write_idx,filename); sprintf(filename,"outvector_%d_%d.txt",wino_array_idx%WINO_HEIGHT+1,wino_array_idx/WINO_HEIGHT); attach_output_vector<ATA_WIDTH,WINO_OUT_SIZE,BATCH_SIZE>(ATA[1],write_idx,filename); #endif } #endif #if WINO_DOMAIN_SIZE==6 ap_int<OUT_WIDTH> out_value[WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #else ap_int<OUT_WIDTH> out_value[2][WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=out_value complete dim=4 #endif #pragma HLS array_partition variable=out_value complete dim=1 #pragma HLS array_partition variable=out_value complete dim=2 #pragma HLS array_partition variable=out_value complete dim=3 ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> outbuffer_addr_reg; load_reg< ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> >(outbuffer_addr_reg,outbuffer_addr); for(int r=0;r<WINO_OUT_SIZE;r++) { #pragma HLS unroll for(int c=0;c<WINO_OUT_SIZE;c++) { #pragma HLS unroll #if WINO_DOMAIN_SIZE==6 ap_uint<OUT_WIDTH*BATCH_SIZE> data; if(idepth_minitile_idx==0) data=0; else data=out_buffer[r][c][wino_array_idx%WINO_HEIGHT][wino_array_idx/WINO_HEIGHT][outbuffer_addr_reg]; (out_value[r][c][1],out_value[r][c][0])=data; #else ap_uint<OUT_WIDTH*BATCH_SIZE> data0; ap_uint<OUT_WIDTH*BATCH_SIZE> data1; if(idepth_minitile_idx==0) { data0=0; data1=0; } else { data0=out_buffer[r][c][wino_array_idx%WINO_HEIGHT][wino_array_idx/WINO_HEIGHT][outbuffer_addr_reg]; data1=out_buffer[r][c][wino_array_idx%WINO_HEIGHT+1][wino_array_idx/WINO_HEIGHT][outbuffer_addr_reg]; } (out_value[0][r][c][1],out_value[0][r][c][0])=data0; (out_value[1][r][c][1],out_value[1][r][c][0])=data1; #endif } } #if WINO_DOMAIN_SIZE==6 ap_int<OUT_WIDTH> out_value_back[WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #else ap_int<OUT_WIDTH> out_value_back[2][WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=out_value_back complete dim=4 #endif #pragma HLS array_partition variable=out_value_back complete dim=1 #pragma HLS array_partition variable=out_value_back complete dim=2 #pragma HLS array_partition variable=out_value_back complete dim=3 for(int r=0;r<WINO_OUT_SIZE;r++) { #pragma HLS unroll for(int c=0;c<WINO_OUT_SIZE;c++) { #pragma HLS unroll for(int b=0;b<BATCH_SIZE;b++) { #if WINO_DOMAIN_SIZE==6 ap_int<ATA_WIDTH+1> sum_sat; sum_sat=ATA[r][c][b]+out_value[r][c][b]; #if ATA_WIDTH+1 > OUT_WIDTH ap_int<ATA_WIDTH+2-OUT_WIDTH> judgebit=sum_sat.range(ATA_WIDTH,OUT_WIDTH-1); if(judgebit ==0 || judgebit == -1) out_value_back[r][c][b]=sum_sat; else if (sum_sat[ATA_WIDTH]==1 ) out_value_back[r][c][b]=OUT_SAT_MIN; else out_value_back[r][c][b]=OUT_SAT_MAX; #else out_value_back[r][c][b]=sum_sat; #endif #else ap_int<ATA_WIDTH+1> sum_sat0; ap_int<ATA_WIDTH+1> sum_sat1; sum_sat0=ATA[0][r][c][b]+out_value[0][r][c][b]; sum_sat1=ATA[1][r][c][b]+out_value[1][r][c][b]; #if ATA_WIDTH+1 > OUT_WIDTH ap_int<ATA_WIDTH+2-OUT_WIDTH> judgebit0=sum_sat0.range(ATA_WIDTH,OUT_WIDTH-1); ap_int<ATA_WIDTH+2-OUT_WIDTH> judgebit1=sum_sat1.range(ATA_WIDTH,OUT_WIDTH-1); if(judgebit0 ==0 || judgebit0 == -1) out_value_back[0][r][c][b]=sum_sat0; else if (sum_sat0[ATA_WIDTH]==1 ) out_value_back[0][r][c][b]=OUT_SAT_MIN; else out_value_back[0][r][c][b]=OUT_SAT_MAX; if(judgebit1 ==0 || judgebit1 == -1) out_value_back[1][r][c][b]=sum_sat1; else if (sum_sat1[ATA_WIDTH]==1 ) out_value_back[1][r][c][b]=OUT_SAT_MIN; else out_value_back[1][r][c][b]=OUT_SAT_MAX; #else out_value_back[0][r][c][b]=sum_sat0; out_value_back[1][r][c][b]=sum_sat1; #endif #endif // #if DEBUG_FILE_PRINT // int outdepth_idx = outdepth_minitile_idx*OUTDEPTH_MINITILE_SIZE+wino_array_row; // int col_idx = (coltile_idx*WINO_WIDTH+wino_array_col)*conv_desc.wino_output_tile_size+c; // int row_idx = rowtile_idx*conv_desc.wino_output_tile_size+r; // out_value_back[r][c][0]=row_idx*conv_desc.outwidth+col_idx; // out_value_back[r][c][1]=outdepth_idx; // #endif } } } if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number) { for(int r=0;r<WINO_OUT_SIZE;r++) { #pragma HLS unroll for(int c=0;c<WINO_OUT_SIZE;c++) { #if WINO_DOMAIN_SIZE == 6 out_buffer[r][c][wino_array_idx%WINO_HEIGHT][wino_array_idx/WINO_HEIGHT][outbuffer_addr_reg]=(out_value_back[r][c][1],out_value_back[r][c][0]); #else out_buffer[r][c][wino_array_idx%WINO_HEIGHT][wino_array_idx/WINO_HEIGHT][outbuffer_addr_reg]=(out_value_back[0][r][c][1],out_value_back[0][r][c][0]); out_buffer[r][c][wino_array_idx%WINO_HEIGHT+1][wino_array_idx/WINO_HEIGHT][outbuffer_addr_reg]=(out_value_back[1][r][c][1],out_value_back[1][r][c][0]); #endif } } } // } } #if DEBUG_FILE_PRINT if(loop_omini_base_cnt<=weightbuffer_outdepth_minitile_number) write_idx++; #endif // element_wise_mult(UV_MUL,input_tile,weight_tile,ap_clk_div2); if(loop_omini_base_cnt==loop_omini_base_reset_cycle && loop_wino_tile_rowcol_cnt==loop_wino_tile_rowcol_self_reset_cycle_min1) { idepth_minitile_idx++; } else if(loop_iload_cnt==loop_iload_reset_cycle) { idepth_minitile_idx=0; } if(loop_omini_base_cnt==loop_omini_base_reset_cycle && loop_wino_tile_rowcol_cnt==loop_wino_tile_rowcol_self_reset_cycle_min1) { loop_wino_tile_rowcol_cnt=0; } else if(loop_omini_base_cnt==loop_omini_base_reset_cycle) { loop_wino_tile_rowcol_cnt++; } if(loop_iload_cnt==loop_iload_reset_cycle) { loop_iload_cnt=1; outbuffer_oload_offset+=outbuffer_oload_increment_step; } else { loop_iload_cnt++; } if(loop_omini_base_cnt==loop_omini_base_reset_cycle ) { load_input_flag = 1; } else if(loop_omini_base_cnt==INDEPTH_MINITILE_SIZE) { load_input_flag = 0; } if(loop_omini_base_cnt==loop_omini_base_reset_cycle) { loop_omini_base_cnt=1; loaded_input_stream_tile_number++; stream_pingpong_flag=~stream_pingpong_flag; outbuffer_omini_offset=0; } else { loop_omini_base_cnt++; outbuffer_omini_offset+=outbuffer_omini_increment_step; } } } #endif