#ifndef _WINO_CELL_HPP_ #define _WINO_CELL_HPP_ #include "wino_macro.h" #include <ap_int.h> #include <hls_stream.h> #include "wino_buffer.cpp" #include "wino_transform.cpp" #include <dsp_builtins.h> #define OUT_WIDTH 18 #define B_WIDTH_IN 8 #define B_WIDTH_OUT 12 #define BTB_WIDTH_IN 12 #define BT_WIDTH_OUT 16 #define GTGG_WIDTH_IN 16 #define GTGG_WIDTH_W 16 #define GTGG_WIDTH_OUT 24 #define A_WIDTH_IN 24 #define A_WIDTH_OUT 28 #define AT_WIDTH_IN 28 #define AT_WIDTH_OUT 32 void input_transform( hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_stream, hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_transformed_stream, int input_transform_feeding_loop_bound #if DEBUG_FILE_PRINT ,int wino_array_col #endif ) { #if DEBUG_FILE_PRINT int write_idx=0; #endif for(int cycle=0;cycle<input_transform_feeding_loop_bound;cycle++) { #pragma hls pipeline ap_uint<8*BATCH_SIZE*36> input_tile_stream_data; input_tile_stream>>input_tile_stream_data; ap_int<8> in[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=in complete #if DEBUG_FILE_PRINT char infilename[100]; sprintf(infilename,"intile_transform_%d.txt",wino_array_col); // attach_output_vector<8,WINO_DOMAIN_SIZE,BATCH_SIZE>(in,write_idx,infilename); #endif for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++) { for(int k=0;k<BATCH_SIZE;k++) { in[i][j][k]=input_tile_stream_data.range( ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*8+7, ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*8); } } } ap_int<DB_WIDTH> DB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; for(int k=0;k<BATCH_SIZE;k++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll #if WINO_DOMAIN_SIZE ==6 // ap_int<IN_WIDTH+1> a=in[i][0][k]-in[i][2][k]; // ap_int<IN_WIDTH+1> b=in[i][2][k]-in[i][4][k]; // ap_int<IN_WIDTH+3> c=in[i][1][k]*4-in[i][3][k]; // ap_int<IN_WIDTH+3> d=in[i][2][k]*4-in[i][5][k]; // ap_int<IN_WIDTH+1> e=in[i][1][k]-in[i][3][k]; // ap_int<IN_WIDTH+1> f=in[i][3][k]-in[i][5][k]; // DB[i][0][k]=(a*4-b)>>DB_QUANT_BIT; // DB[i][1][k]=(-c-d)>>DB_QUANT_BIT; // DB[i][2][k]=(c-d)>>DB_QUANT_BIT; // DB[i][3][k]=(-e*2-f)>>DB_QUANT_BIT; // DB[i][4][k]=(e*2-f)>>DB_QUANT_BIT; // DB[i][5][k]=(e*4-f)>>DB_QUANT_BIT; DB6x6_1(in,DB,i,k) #else #error "WINO_DOMAIN_SIZE!=6 not implemented " #endif } } ap_int<BTB_WIDTH> BtDB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; for(int k=0;k<BATCH_SIZE;k++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll #if WINO_DOMAIN_SIZE ==6 BTB6x6_1(DB,BtDB,i,k) #else #error "WINO_DOMAIN_SIZE!=6 not implemented " #endif } } ap_uint<BTB_WIDTH*BATCH_SIZE*36> input_tile_transformed_data; for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++) { for(int k=0;k<BATCH_SIZE;k++) { input_tile_transformed_data.range( ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH+BTB_WIDTH-1, ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH)=BtDB[i][j][k]; } } } #if DEBUG_FILE_PRINT attach_output_vector<BTB_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(BtDB,write_idx,infilename); write_idx++; #endif input_tile_transformed_stream<<input_tile_transformed_data; } } void load_input_tile( ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg[WINO_WIDTH][INDEPTH_MINITILE_SIZE] ) { #pragma HLS pipeline #pragma HLS array_partition variable = input_tile complete dim=5 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #pragma HLS array_partition variable = stream_temp_reg0 complete dim=1 #pragma HLS array_partition variable = stream_temp_reg0 complete dim=2 for(int w=0;w<WINO_WIDTH;w++){ #pragma HLS unroll for(int iid=0;iid<INDEPTH_MINITILE_SIZE;iid++){ #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++){ #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++){ #pragma HLS unroll for(int k=0;k<BATCH_SIZE;k++){ #pragma HLS unroll //if(stream_pingpong_flag) input_tile[w][iid][i][j][k]= stream_temp_reg[w][iid].range( ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH+BTB_WIDTH-1, ((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH); } } } } } } void load_weight_tile( ap_int<W_WIDTH> weight_tile[WINO_HEIGHT][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE], ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> weight_value_temp[WINO_HEIGHT] ) { #pragma HLS array_partition variable = weight_tile complete dim=4 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 #pragma HLS array_partition variable=weight_value_temp complete for(int wh=0;wh<WINO_HEIGHT;wh++) { #pragma HLS unroll for(int id=0;id<INDEPTH_MINITILE_SIZE;id++) { #pragma HLS unroll for(int i=0;i<WINO_DOMAIN_SIZE;i++) { #pragma HLS unroll for(int j=0;j<WINO_DOMAIN_SIZE;j++) { #pragma HLS unroll weight_tile[wh][id][i][j]=weight_value_temp[wh].range( (id*WINO_DOMAIN_SIZE_SQUARE+i*WINO_DOMAIN_SIZE+j)*W_WIDTH+W_WIDTH-1, (id*WINO_DOMAIN_SIZE_SQUARE+i*WINO_DOMAIN_SIZE+j)*W_WIDTH); } } } } } template<int dummy> void element_wise_mult_6x6( ap_int<UV_MUL_WIDTH> UV_MUL_TILE[INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<BTB_WIDTH> input_tile[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<W_WIDTH> weight_tile[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE], ap_int<1> ap_clk_div2 ) { #pragma HLS pipeline #pragma HLS array_partition variable=UV_MUL_TILE complete dim=1 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=2 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=3 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 for(int wr=0;wr<WINO_DOMAIN_SIZE;wr++) { #pragma HLS unroll for(int wc=0; wc<WINO_DOMAIN_SIZE;wc++) { #pragma HLS unroll for(int id2=0,id=0;id2<INDEPTH_MINITILE_SIZE/2;id2++,id+=2) { #pragma HLS unroll for(int b=0;b<BATCH_SIZE;b++) { #pragma HLS unroll ap_int<BTB_WIDTH> input_val0=input_tile[id][wr][wc][b]; ap_int<BTB_WIDTH> input_val1=input_tile[id+1][wr][wc][b]; UV_MUL_TILE[id2][wr][wc][b]=__builtin_mac16x2( input_tile[id][wr][wc][b], input_tile[id+1][wr][wc][b], weight_tile[id][wr][wc], weight_tile[id+1][wr][wc], 0,1,ap_clk_div2); } } } } } void element_wise_mult( ap_int<UV_MUL_WIDTH> UV_MUL[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE], ap_int<W_WIDTH> weight_tile[OUTDEPTH_MINITILE_SIZE][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE], ap_int<1> ap_clk_div2 ) { #pragma HLS pipeline #pragma HLS array_partition variable=UV_MUL complete dim=1 #pragma HLS array_partition variable=UV_MUL complete dim=2 #pragma HLS array_partition variable=UV_MUL complete dim=3 #pragma HLS array_partition variable=UV_MUL complete dim=4 #pragma HLS array_partition variable=UV_MUL complete dim=5 #pragma HLS array_partition variable=UV_MUL complete dim=6 #pragma HLS array_partition variable = input_tile complete dim=5 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #pragma HLS array_partition variable = weight_tile complete dim=4 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 for(int wr=0;wr<WINO_DOMAIN_SIZE;wr++) { #pragma HLS unroll for(int wc=0; wc<WINO_DOMAIN_SIZE;wc++) { for(int od=0;od<OUTDEPTH_MINITILE_SIZE;od++) { for(int id2=0,id=0;id2<INDEPTH_MINITILE_SIZE/2;id2++,id+=2) { ap_int<W_WIDTH> wino_val_d0 = weight_tile[od][id][wr][wc]; ap_int<W_WIDTH> wino_val_d1 = weight_tile[od][id+1][wr][wc]; for(int ww=0;ww<WINO_WIDTH;ww++) { for(int b=0;b<BATCH_SIZE;b++) { ap_int<BTB_WIDTH_IN> input_val0=input_tile[ww][id][wr][wc][b]; ap_int<BTB_WIDTH_IN> input_val1=input_tile[ww][id+1][wr][wc][b]; UV_MUL[od][ww][id2][wr][wc][b]=__builtin_mac16x2(input_val0,input_val1,wino_val_d0,wino_val_d1,0,1,ap_clk_div2); } } } } } } } template<class T, int dim1, int dim2, int dim3, int dim4> void load_reg_tile4( T reg[dim1][dim2][dim3][dim4], T val[dim1][dim2][dim3][dim4]) { #pragma HLS pipeline #pragma HLS array_partition variable = reg complete dim=4 #pragma HLS array_partition variable = reg complete dim=3 #pragma HLS array_partition variable = reg complete dim=2 #pragma HLS array_partition variable = reg complete dim=1 #pragma HLS array_partition variable = val complete dim=4 #pragma HLS array_partition variable = val complete dim=3 #pragma HLS array_partition variable = val complete dim=2 #pragma HLS array_partition variable = val complete dim=1 for(int d4=0;d4<dim4;d4++){ #pragma HLS unroll for(int d3=0;d3<dim3;d3++){ #pragma HLS unroll for(int d2=0;d2<dim2;d2++){ #pragma HLS unroll for(int d1=0;d1<dim1;d1++){ #pragma HLS unroll reg[d1][d2][d3][d4]=val[d1][d2][d3][d4]; }}}} } template<class T, int dim1, int dim2, int dim3> void load_reg_tile3( T reg[dim1][dim2][dim3], T val[dim1][dim2][dim3]) { #pragma HLS pipeline #pragma HLS array_partition variable = reg complete dim=3 #pragma HLS array_partition variable = reg complete dim=2 #pragma HLS array_partition variable = reg complete dim=1 #pragma HLS array_partition variable = val complete dim=3 #pragma HLS array_partition variable = val complete dim=2 #pragma HLS array_partition variable = val complete dim=1 for(int d3=0;d3<dim3;d3++){ #pragma HLS unroll for(int d2=0;d2<dim2;d2++){ #pragma HLS unroll for(int d1=0;d1<dim1;d1++){ #pragma HLS unroll reg[d1][d2][d3]=val[d1][d2][d3]; }}} } template<int dummy> void DSP_LOADER(ap_int<32> &rst, ap_int<16> a0, ap_int<16> a1, ap_int<16> b0, ap_int<16> b1, ap_int<32> accum, ap_int<1> clear, ap_int<1> ap_clk_div2) { rst=__builtin_mac16x2(a0,a1,b0,b1,accum,clear,ap_clk_div2); } void wino_stream_block( hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > top_stream_in[WINO_WIDTH], hls::stream< ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > left_stream_in[4][WEIGHT_FEED_NUMBER_PER_PORT], ap_uint<OUT_WIDTH*2> out_buffer[OUTDEPTH_MINITILE_SIZE][WINO_OUT_SIZE_SQUARE][OUTPUT_BUFFER_DEPTH], ap_uint<24> total_input_stream_tile, ap_uint<16> loop_omini_base_reset_cycle, ap_uint<32> loop_wino_cell_bound, ap_uint<1> wino5x5_flag #if DEBUG_FILE_PRINT ,ConvDesc_t conv_desc #endif ,ap_uint<1> ap_clk_div2 ) { #pragma HLS ap_stable port=ap_clk_div2 #pragma HLS array_partition variable = out_buffer dim=1 complete ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg0[WINO_WIDTH][INDEPTH_MINITILE_SIZE]; #pragma HLS array_partition variable = stream_temp_reg0 complete dim=1 #pragma HLS array_partition variable = stream_temp_reg0 complete dim=2 ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> stream_temp_reg1[WINO_WIDTH][INDEPTH_MINITILE_SIZE]; #pragma HLS array_partition variable = stream_temp_reg1 complete dim=1 #pragma HLS array_partition variable = stream_temp_reg1 complete dim=2 #if DEBUG_FILE_PRINT for(int i=0;i<WINO_WIDTH;i++) { memset(stream_temp_reg0[i],0xAB,INDEPTH_MINITILE_SIZE*sizeof(ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE>)); memset(stream_temp_reg1[i],0xAB,INDEPTH_MINITILE_SIZE*sizeof(ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE>)); } #endif ap_int<BTB_WIDTH> input_tile[WINO_WIDTH][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable = input_tile complete dim=5 #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 ap_int<BTB_WIDTH> input_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable = input_tile complete dim=4 #pragma HLS array_partition variable = input_tile complete dim=3 #pragma HLS array_partition variable = input_tile complete dim=2 #pragma HLS array_partition variable = input_tile complete dim=1 #if 0 memset(stream_temp_reg0,0xAA,2*2*36*2); memset(stream_temp_reg1,0xAA,2*2*36*2); #endif for(int i=0;i<INDEPTH_MINITILE_SIZE;i++) { #pragma hls pipeline for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++) { #pragma HLS unroll stream_temp_reg0[wino_array_col][imini_idx]=stream_temp_reg0[wino_array_col][imini_idx+1]; } top_stream_in[wino_array_col]>>stream_temp_reg0[wino_array_col][INDEPTH_MINITILE_SIZE-1]; } } ap_uint<1> load_input_flag=1; ap_uint<1> stream_pingpong_flag=1; ap_uint<24> loaded_input_stream_tile_number=1; #if DEBUG_FILE_PRINT total_input_stream_tile=conv_desc.weightbuffer_load_outdepth_number* conv_desc.weightbuffer_load_indepth_number* conv_desc.weightbuffer_indepth_minitile_number* conv_desc.wino_tile_number_in_outwidth* conv_desc.wino_tile_number_in_out_rowstep; #endif ap_uint<16> loop_omini_base_cnt=1; #if DEBUG_FILE_PRINT loop_omini_base_reset_cycle = conv_desc.weightbuffer_outdepth_minitile_number>INDEPTH_MINITILE_SIZE? conv_desc.weightbuffer_outdepth_minitile_number:INDEPTH_MINITILE_SIZE; #endif ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> out_buffer_address_outer=0; ap_uint<OUTPUT_BUFFER_DEPTH_BITWIDTH> out_buffer_address_inner=0; #if DEBUG_FILE_PRINT int write_idx=0; for(int oload_idx=0;oload_idx<conv_desc.weightbuffer_load_outdepth_number;oload_idx++) for(int iload_idx=0;iload_idx<conv_desc.weightbuffer_load_indepth_number;iload_idx++) for(int imini_base_idx=0;imini_base_idx<conv_desc.weightbuffer_indepth_minitile_number;imini_base_idx++) for(int wino_tile_row_idx=0;wino_tile_row_idx<conv_desc.wino_tile_number_in_out_rowstep;wino_tile_row_idx++) for(int wino_tile_idx=0;wino_tile_idx<conv_desc.wino_tile_number_in_outwidth;wino_tile_idx++) for(int omini_base_idx=0;omini_base_idx<loop_omini_base_reset_cycle ;omini_base_idx++) #else for(int cycle=0;cycle < loop_wino_cell_bound; cycle++) #endif { #pragma HLS pipeline #pragma HLS dependence variable=out_buffer inter false #pragma HLS dependence variable=out_buffer intra false ap_uint<1> load_input_flag_reg = (load_input_flag && loaded_input_stream_tile_number != total_input_stream_tile); //ap_uint<1> increment_out_inner_address_flag = (loop_omini_base_cnt < conv_desc.weightbuffer_outdepth_minitile_number); if(stream_pingpong_flag) load_input_tile(input_tile,stream_temp_reg0); else load_input_tile(input_tile,stream_temp_reg1); #if 0 printf("pingpong %d load %d, write idx %d, [%d/%d]\n", (int) stream_pingpong_flag, (int) load_input_flag_reg,write_idx, (int) loaded_input_stream_tile_number, (int) total_input_stream_tile); puts("input tile reg 0"); for(int id=0;id<4;id++) { for(int i=0;i<WINO_DOMAIN_SIZE;i++) { for(int j=0;j<WINO_DOMAIN_SIZE;j++) { ap_uint<BTB_WIDTH> data0,data1; (data1,data0)=stream_temp_reg0[0][id].range( (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH+BTB_WIDTH*BATCH_SIZE-1, (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH); printf("[%04x %04x]", (int) data0, (int) data1); } printf(" "); for(int j=0;j<WINO_DOMAIN_SIZE;j++) { ap_uint<BTB_WIDTH> data0,data1; (data1,data0)=stream_temp_reg1[0][id].range( (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH+BTB_WIDTH*BATCH_SIZE-1, (i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE*BTB_WIDTH); printf("[%04x %04x]", (int) data0, (int) data1); } printf("\n"); } printf("\n"); } #endif for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll if(stream_pingpong_flag && load_input_flag_reg) { for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++) { #pragma HLS unroll stream_temp_reg1[wino_array_col][imini_idx]=stream_temp_reg1[wino_array_col][imini_idx+1]; } top_stream_in[wino_array_col]>>stream_temp_reg1[wino_array_col][INDEPTH_MINITILE_SIZE-1]; } else if(load_input_flag_reg) { for(int imini_idx=0;imini_idx<INDEPTH_MINITILE_SIZE-1;imini_idx++) { #pragma HLS unroll stream_temp_reg0[wino_array_col][imini_idx]=stream_temp_reg0[wino_array_col][imini_idx+1]; } top_stream_in[wino_array_col]>>stream_temp_reg0[wino_array_col][INDEPTH_MINITILE_SIZE-1]; } } ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> weight_value_temp[WINO_HEIGHT]; #pragma HLS array_partition variable=weight_value_temp complete if(loop_omini_base_cnt <= conv_desc.weightbuffer_outdepth_minitile_number) { for(int i=0;i<4;i++) { #pragma HLS unroll for(int j=0;j<WEIGHT_FEED_NUMBER_PER_PORT;j++) { #pragma HLS unroll left_stream_in[i][j]>>weight_value_temp[i*WEIGHT_FEED_NUMBER_PER_PORT+j]; #if 0 printf("wino_row_idx: %2d --", i*WEIGHT_FEED_NUMBER_PER_PORT+j); for(int k=0;k<WINO_DOMAIN_SIZE_SQUARE;k++) { printf("[%08x]", (unsigned int) weight_value_temp[i*WEIGHT_FEED_NUMBER_PER_PORT+j].range(k*32+31,k*32) ); } printf("\n"); #endif } } } ap_int<W_WIDTH> weight_tile[WINO_HEIGHT][INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable = weight_tile complete dim=4 #pragma HLS array_partition variable = weight_tile complete dim=3 #pragma HLS array_partition variable = weight_tile complete dim=2 #pragma HLS array_partition variable = weight_tile complete dim=1 load_weight_tile(weight_tile,weight_value_temp); for(int wino_array_row=0;wino_array_row<WINO_HEIGHT;wino_array_row++) { #pragma HLS unroll for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++) { #pragma HLS unroll ap_int<BTB_WIDTH> input_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable = input_tile_reg complete dim=4 #pragma HLS array_partition variable = input_tile_reg complete dim=3 #pragma HLS array_partition variable = input_tile_reg complete dim=2 #pragma HLS array_partition variable = input_tile_reg complete dim=1 ap_int<W_WIDTH> weight_tile_reg[INDEPTH_MINITILE_SIZE][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE]; #pragma HLS array_partition variable = weight_tile_reg complete dim=3 #pragma HLS array_partition variable = weight_tile_reg complete dim=2 #pragma HLS array_partition variable = weight_tile_reg complete dim=1 ap_int<UV_MUL_WIDTH> UV_MUL_TILE[INDEPTH_MINITILE_SIZE/2][WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=UV_MUL_TILE complete dim=1 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=2 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=3 #pragma HLS array_partition variable=UV_MUL_TILE complete dim=4 load_reg_tile4<ap_int<BTB_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE,BATCH_SIZE>(input_tile_reg, input_tile[wino_array_col] ); load_reg_tile3<ap_int<W_WIDTH>,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg, weight_tile[wino_array_row]); #if DEBUG_FILE_PRINT char infilename[100]; sprintf(infilename,"invector_%d_%d.txt",wino_array_row,wino_array_col); attach_input_vector<BTB_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,BATCH_SIZE>(input_tile_reg,write_idx,infilename); char wfilename[100]; sprintf(wfilename,"wvector_%d_%d.txt",wino_array_row,wino_array_col); attach_weight_vector<W_WIDTH,INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE>(weight_tile_reg,write_idx,wfilename); #endif element_wise_mult_6x6<0>(UV_MUL_TILE,input_tile_reg,weight_tile_reg, ap_clk_div2 ); ap_int<UV_WIDTH> UV[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=UV complete dim=1 #pragma HLS array_partition variable=UV complete dim=2 #pragma HLS array_partition variable=UV complete dim=3 for(int wino_row=0;wino_row<WINO_DOMAIN_SIZE;wino_row++) { #pragma HLS unroll for(int wino_col=0;wino_col<WINO_DOMAIN_SIZE;wino_col++) { #pragma HLS unroll for(int b=0;b<BATCH_SIZE;b++) { ap_int<UV_MUL_WIDTH> temp=0; for(int id2=0;id2<INDEPTH_MINITILE_SIZE/2;id2++) { #pragma HLS unroll temp+=UV_MUL_TILE[id2][wino_row][wino_col][b]; } UV[wino_row][wino_col][b]=temp>>UV_QUANT_BIT; } } } #if DEBUG_FILE_PRINT char uvfilename[100]; sprintf(uvfilename,"uvvector_%d_%d.txt",wino_array_row,wino_array_col); attach_output_vector<UV_WIDTH,WINO_DOMAIN_SIZE,BATCH_SIZE>(UV,write_idx,uvfilename); #endif ap_int<UVA_WIDTH> UVA[WINO_DOMAIN_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=UVA complete dim=1 #pragma HLS array_partition variable=UVA complete dim=2 #pragma HLS array_partition variable=UVA complete dim=3 for(int ridx=0;ridx<WINO_DOMAIN_SIZE;ridx++) { #pragma HLS unroll for(int bidx=0;bidx<BATCH_SIZE;bidx++) { #pragma HLS unroll UVA_row(UVA,UV,ridx,bidx,wino5x5_flag); } } ap_int<ATA_WIDTH> ATA[WINO_OUT_SIZE][WINO_OUT_SIZE][BATCH_SIZE]; #pragma HLS array_partition variable=ATA complete dim=1 #pragma HLS array_partition variable=ATA complete dim=2 #pragma HLS array_partition variable=ATA complete dim=3 for(int cidx=0;cidx<WINO_OUT_SIZE;cidx++) { #pragma HLS unroll for(int bidx=0;bidx<BATCH_SIZE;bidx++) { #pragma HLS unroll ATA_col(ATA,UVA,cidx,bidx,wino5x5_flag); } } #if DEBUG_FILE_PRINT char filename[100]; sprintf(filename,"outvector_%d_%d.txt",wino_array_row,wino_array_col); attach_output_vector<ATA_WIDTH,WINO_OUT_SIZE,BATCH_SIZE>(ATA,write_idx,filename); #endif } } #if DEBUG_FILE_PRINT write_idx++; #endif // element_wise_mult(UV_MUL,input_tile,weight_tile,ap_clk_div2); if(loop_omini_base_cnt==loop_omini_base_reset_cycle ) { load_input_flag = 1; } else if(loop_omini_base_cnt==INDEPTH_MINITILE_SIZE) { load_input_flag = 0; } if(loop_omini_base_cnt==loop_omini_base_reset_cycle) { loop_omini_base_cnt=1; loaded_input_stream_tile_number++; stream_pingpong_flag=~stream_pingpong_flag; } else { loop_omini_base_cnt++; } } } void wino6x6_stream_cell( hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in, hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & bottom_stream_out, hls::stream< ap_uint<W_WIDTH*2*36> > &left_stream_in, hls::stream< ap_uint<W_WIDTH*2*36> > &right_stream_out, ap_uint<OUT_WIDTH*2> out_buffer[16][1024], ap_int<16> weight_indepth_load_number, ap_int<16> weight_outdepth_load_number, ap_int<16> weight_indepth_feed_size, ap_int<16> weight_outdepth_feed_size, ap_int<16> row_tile_number #if DEBUG_FILE_PRINT ,int ROW_IDX, int COL_IDX #endif ) { #pragma HLS array_partition variable = out_buffer dim=1 complete ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg0[2]; ap_uint<GTGG_WIDTH_IN*2*36> stream_temp_reg1[2]; #pragma HLS array_partition variable = stream_temp_reg0 complete #pragma HLS array_partition variable = stream_temp_reg1 complete #if DEBUG_FILE_PRINT memset(stream_temp_reg0,0xAA,2*2*36*2); memset(stream_temp_reg1,0xAA,2*2*36*2); #endif ap_int<W_WIDTH> G1[36][2]; #pragma HLS array_partition variable=G1 ap_uint<1> stream_pingpong_flag=0; int write_idx=0; int weight_stream_idx=0; int input_stream_idx =0; #if DEBUG_FILE_PRINT char filename[100]; sprintf(filename,"wino_cell_output_%d_%d.txt",ROW_IDX, COL_IDX); char input_stream_filename[100]; sprintf(input_stream_filename,"wino_cell_input_%d_%d.txt",ROW_IDX, COL_IDX); char weight_stream_filename[100]; sprintf(weight_stream_filename,"wino_cell_weight_%d_%d.txt",ROW_IDX, COL_IDX); #endif top_stream_in>>stream_temp_reg0[0]; bottom_stream_out<<stream_temp_reg0[0]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg0[0],input_stream_idx,input_stream_filename); #endif top_stream_in>>stream_temp_reg0[1]; bottom_stream_out<<stream_temp_reg0[1]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg0[1],input_stream_idx,input_stream_filename); #endif ITER0:for(int iter0=0;iter0<weight_indepth_load_number;iter0++) ITER1:for(int iter1=0;iter1<weight_outdepth_load_number;iter1++) ITER2:for(int iter2=0;iter2<row_tile_number;iter2++) ITER3:for(int iter3=0;iter3<weight_indepth_feed_size/2;iter3++) { ITER4:for(int iter4=0;iter4<weight_outdepth_feed_size;iter4++) { #pragma HLS pipeline #pragma HLS dependence variable=out_buffer inter false #pragma HLS dependence variable=out_buffer intra false ap_int<GTGG_WIDTH_IN> input_tile[2][36][2]; #pragma HLS array_partition variable = input_tile complete for(int i=0;i<36;i++) { #pragma HLS unroll if(stream_pingpong_flag) { input_tile[0][i][0]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); input_tile[0][i][1]=stream_temp_reg1[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); input_tile[1][i][0]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); input_tile[1][i][1]=stream_temp_reg1[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); } else { input_tile[0][i][0]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); input_tile[0][i][1]=stream_temp_reg0[0].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); input_tile[1][i][0]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN-1,i*GTGG_WIDTH_IN*2); input_tile[1][i][1]=stream_temp_reg0[1].range(i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN*2-1,i*GTGG_WIDTH_IN*2+GTGG_WIDTH_IN); } } if(iter4<2 ) { if(stream_pingpong_flag) { top_stream_in>>stream_temp_reg0[iter4]; bottom_stream_out<<stream_temp_reg0[iter4]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg0[iter4],input_stream_idx,input_stream_filename); #endif } else { top_stream_in>>stream_temp_reg1[iter4]; bottom_stream_out<<stream_temp_reg1[iter4]; #if DEBUG_FILE_PRINT wino_cell_stream_input<0>(stream_temp_reg1[iter4],input_stream_idx,input_stream_filename); #endif } } ap_uint<W_WIDTH*2*36> weight_value_temp; left_stream_in>>weight_value_temp; right_stream_out<<weight_value_temp; for(int i=0;i<36;i++) { #pragma HLS unroll G1[i][0]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1,i*W_WIDTH); G1[i][1]=weight_value_temp.range(i*W_WIDTH+W_WIDTH-1+288,i*W_WIDTH +288 ); } ap_int<GTGG_WIDTH_OUT> UV[36][2]; #pragma HLS array_partition variable=UV complete for(int i=0;i<36;i++) { #pragma HLS unroll UV[i][0] = G1[i][0]*input_tile[0][i][0] + G1[i][1]*input_tile[1][i][0]; UV[i][1] = G1[i][0]*input_tile[0][i][1] + G1[i][1]*input_tile[1][i][1]; } ap_int<A_WIDTH_OUT> UVA[6][4][2]; #pragma HLS array_partition variable=UVA complete for(int b_idx=0;b_idx<2;b_idx++) { #pragma HLS unroll for(int i=0;i<6;i++) { #pragma HLS unroll UVA[i][0][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+0][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx]; UVA[i][1][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<1); UVA[i][2][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]+(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<2); UVA[i][3][b_idx]=(ap_int<A_WIDTH_OUT>) UV[i*6+1][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+2][b_idx]+(((ap_int<A_WIDTH_OUT>) UV[i*6+3][b_idx]-(ap_int<A_WIDTH_OUT>) UV[i*6+4][b_idx])<<3)+(ap_int<A_WIDTH_OUT>) UV[i*6+5][b_idx]; } } int address = iter1*row_tile_number* weight_outdepth_feed_size + iter4*row_tile_number + iter2; ap_int<36> outbuffer_value[16]; #pragma HLS array_partition variable= outbuffer_value complete for(int i=0;i<16;i++) { #pragma HLS unroll outbuffer_value[i]=out_buffer[i][address]; } ap_int<AT_WIDTH_OUT> ATUVA[16][2]; #pragma HLS array_partition variable=ATUVA complete for(int b_idx=0;b_idx<2;b_idx++) { #pragma HLS unroll for(int i=0;i<4;i++) { #pragma HLS unroll ATUVA[0+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[0][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx]; ATUVA[4+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<1); ATUVA[8+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]+(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<2); ATUVA[12+i][b_idx]=(ap_int<AT_WIDTH_OUT>) UVA[1][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[2][i][b_idx]+(((ap_int<AT_WIDTH_OUT>) UVA[3][i][b_idx]-(ap_int<AT_WIDTH_OUT>) UVA[4][i][b_idx])<<3) + (ap_int<AT_WIDTH_OUT>) UVA[5][i][b_idx]; } } ap_int<18> out_residual[16][2]; if(iter0 ==0 && iter3 ==0) { for(int i=0;i<16;i++) { #pragma HLS unroll out_residual[i][0]=0; out_residual[i][1]=0; } } else { for(int i=0;i<16;i++) { #pragma HLS unroll out_residual[i][0]=outbuffer_value[i].range(17,0); out_residual[i][1]=outbuffer_value[i].range(35,18); } } ap_int<18> outbuffer_writeback_value[16][2]; #pragma HLS array_partition variable = outbuffer_writeback_value complete dim=1 #pragma HLS array_partition variable = outbuffer_writeback_value complete dim=2 for(int i=0;i<16;i++) { #pragma HLS unroll outbuffer_writeback_value[i][0]=ATUVA[i][0]+out_residual[i][0]; outbuffer_writeback_value[i][1]=ATUVA[i][1]+out_residual[i][1]; } ap_int<18*2> outbuffer_writeback_value_batch[16]; #pragma HLS array_partition variable = outbuffer_writeback_value_batch complete for(int i=0;i<16;i++) { #pragma HLS unroll outbuffer_writeback_value_batch[i]=(outbuffer_writeback_value[i][1],outbuffer_writeback_value[i][0] ); } for(int i=0;i<16;i++) { #pragma HLS unroll out_buffer[i][address]=outbuffer_writeback_value_batch[i]; } } stream_pingpong_flag=~stream_pingpong_flag; } } // //template<int dummy> // void wino6x6_stream_bottomend( // hls::stream< ap_uint<GTGG_WIDTH_IN*2*36> > & top_stream_in, // ap_int<16> weight_indepth_load_number, // ap_int<16> weight_outdepth_load_number, // ap_int<16> weight_indepth_feed_size, // ap_int<16> row_tile_number // ) // { // ap_uint<GTGG_WIDTH_IN*2*36> dummy_temp; // for(int i=0;i<weight_indepth_load_number*weight_outdepth_load_number*weight_indepth_feed_size*row_tile_number+2;i++) // { // top_stream_in.read(dummy_temp); // } // } #endif