#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
#include "wino_IO.cpp"
#include "wino_cell.cpp"
#include "param.h"

/**
 * @brief Unpack the flat parameter block into a ConvDesc_t.
 *
 * Copies 128 32-bit words from mem_params into a local array (pipelined
 * loop) and then assigns words 0..68 to the fields of conv_desc in a fixed
 * order. Words 69..127 are read but not used here.
 *
 * @param mem_params  Source parameter words; at least 128 entries are read.
 * @param conv_desc   Descriptor to fill; every field listed below is overwritten.
 */
void load_params(
    ap_int<32>* mem_params,
    ConvDesc_t &conv_desc
)
{
    ap_int<32> params[128];
    for(int i=0;i<128;i++)
    {
        #pragma HLS pipeline
        params[i]=mem_params[i];
    }

    // Original signal
    conv_desc.inheight=params[0];
    conv_desc.inwidth=params[1];
    conv_desc.indepth=params[2];
    conv_desc.outheight=params[3];
    conv_desc.outwidth=params[4];
    conv_desc.outdepth=params[5];
    conv_desc.kernel_size=params[6];
    conv_desc.pad_size=params[7];
    conv_desc.stride=params[8];

    // Winograd related
    // NOTE(review): the original comment reads "1: 3x3, 0:5x5", which looks
    // inverted relative to the field name -- confirm the encoding.
    conv_desc.wino5x5_flag=params[9]; // 1: 3x3, 0:5x5
    conv_desc.wino_output_tile_size=params[10];

    // Input buffer related
    conv_desc.indepth_align_minitile_size=params[11];
    conv_desc.indepth_align8=params[12];
    conv_desc.indepth_ceildiv8=params[13];
    conv_desc.inwidth_ceildiv_inbufferwidth=params[14];
    conv_desc.inwidth_align8=params[15];
    conv_desc.group_indepth_offset=params[16];
    conv_desc.group_indepth=params[17];
    conv_desc.input_ddr_bytes=params[18];
    conv_desc.input_ddr_128bits=params[19];
    conv_desc.group_indepth_x_inwidth_align8_by8=params[20];
    conv_desc.group_indepth_offset_x_inwidth_align8_by8=params[21];
    conv_desc.input_load_burst_length=params[22];
    conv_desc.buffer_address_mid_increment_step=params[23];
    conv_desc.row_address_bitnumber_flag=params[24];

    // Output buffer related
    conv_desc.outwidth_align8=params[25];
    conv_desc.outdepth_align8=params[26];
    conv_desc.outheight_align4=params[27];
    conv_desc.outdepth_align_minitile_size=params[28];
    conv_desc.group_outdepth_offset=params[29];
    conv_desc.group_outdepth=params[30];
    conv_desc.output_ddr_bytes=params[31];
    conv_desc.output_ddr_128bits=params[32];

    // Weight related
    conv_desc.weightbuffer_load_indepth_number=params[33];
    conv_desc.weightbuffer_load_indepth_step=params[34];
    conv_desc.weightbuffer_load_outdepth_number=params[35];
    conv_desc.weightbuffer_load_outdepth_step=params[36];
    conv_desc.weightbuffer_indepth_minitile_number=params[37];
    conv_desc.weightbuffer_outdepth_minitile_number=params[38];
    conv_desc.weightbuffer_total_load_number=params[39];

    // Weight load hardware
    conv_desc.weightDDR_buffer_burst_length=params[40];
    conv_desc.weightDDR_port_burst_length=params[41];
    conv_desc.weightDDR_burst_number=params[42];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1=params[43];
    conv_desc.loop_start_output_baserowcol_reset_cycle=params[44];
    conv_desc.loop_weight_feed_bound=params[45];

    // Input buffer feeding related
    conv_desc.wino_out_size_by_wino_width=params[46];
    conv_desc.wino_tile_number_in_outwidth=params[47];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle=params[48];
    conv_desc.loop_wino_tile_col_reset_cycle=params[49];
    conv_desc.loop_wino_tile_row_reset_cycle=params[50];
    conv_desc.buffer_address_mid_minitile_depth_step=params[51];
    conv_desc.input_buffer_feeding_loop_bound=params[52];
    conv_desc.input_transform_feeding_loop_bound=params[53];

    // Row-tile calculation; these parameters have to be solved after the
    // weight parameters are decided.
    conv_desc.out_rowstep=params[54];
    conv_desc.wino_tile_number_in_out_rowstep=params[55];

    // Winograd computation
    conv_desc.total_input_stream_tile=params[56];
    conv_desc.loop_omini_base_reset_cycle=params[57];
    conv_desc.loop_wino_cell_bound=params[58];
    conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1=params[59];
    conv_desc.loop_iload_reset_cycle=params[60];
    conv_desc.outbuffer_oload_increment_step=params[61];
    conv_desc.outbuffer_omini_increment_step=params[62];

    // Output write back
    conv_desc.outdepth_ceildiv8=params[63];
    conv_desc.output_burst_length=params[64];
    conv_desc.write_back_flag=params[65];
    conv_desc.wino_col_pix_upper_bound=params[66];
    conv_desc.wino_tile_number_rowcol=params[67];
    conv_desc.out_ddr_increment_step=params[68];
}

/**
 * @brief Dataflow region that processes one row tile.
 *
 * Wires the following stages together through depth-1 hls::stream arrays:
 *   input_feed_underconstruction -> input_tile_stream
 *   input_transform (one call per WINO_WIDTH lane, unrolled)
 *                                -> input_tile_transformed_stream
 *   weight_feed_one_port<0..3>   -> weight_stream[0..3]
 *   wino_stream_block consumes the transformed tiles and the weights and
 *   writes into out_buffer.
 *
 * @param weight_DDR0..3                One weight pointer per weight_feed_one_port instance.
 * @param input_buffer                  Input buffer, completely partitioned on dims 1 and 2.
 * @param out_buffer                    Output buffer, completely partitioned on dims 1-4.
 * @param start_output_row              Not referenced in this body.
 * @param start_row_idx_minus_pad_size  Signed row index forwarded to the input feeder.
 * @param first_flag, last_flag         Forwarded unchanged to each weight feeder.
 * @param conv_desc                     Layer descriptor (declared ap_stable).
 * @param ap_clk_div2                   Forwarded to wino_stream_block.
 */
void wino_systolic_kernel(
    ap_uint<128> *weight_DDR0,
    ap_uint<128> *weight_DDR1,
    ap_uint<128> *weight_DDR2,
    ap_uint<128> *weight_DDR3,
    ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
    ap_uint<OUT_WIDTH*2> out_buffer[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH],
    ap_uint<16> start_output_row,
    ap_int<16> start_row_idx_minus_pad_size,
    ap_uint<1> first_flag,
    ap_uint<1> last_flag,
    ConvDesc_t conv_desc,
    ap_uint<1> ap_clk_div2
)
{
    printf("****wino_systolic_kernel****\n");

    #pragma HLS interface ap_stable port=conv_desc

    #pragma HLS array_partition variable =input_buffer dim=1 complete
    #pragma HLS array_partition variable =input_buffer dim=2 complete

    #pragma HLS array_partition variable =out_buffer dim=4 complete
    #pragma HLS array_partition variable =out_buffer dim=3 complete
    #pragma HLS array_partition variable =out_buffer dim=2 complete
    #pragma HLS array_partition variable =out_buffer dim=1 complete

    #pragma HLS dataflow

    // Raw input tiles, one stream per WINO_WIDTH lane.
    hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_stream[WINO_WIDTH];
    #pragma HLS stream variable=input_tile_stream depth=1

    // Output of input_transform, one stream per WINO_WIDTH lane.
    hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_transformed_stream[WINO_WIDTH];
    #pragma HLS stream variable=input_tile_transformed_stream depth=1

    // Weight streams: first index is the weight port (0..3).
    hls::stream<ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
    #pragma HLS stream variable=weight_stream depth=1

    input_feed_underconstruction(
        input_buffer,
        input_tile_stream,
        // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1,
        // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
        conv_desc.inwidth,
        conv_desc.pad_size,
        conv_desc.weightbuffer_load_outdepth_number,
        conv_desc.wino_output_tile_size,
        conv_desc.input_buffer_feeding_loop_bound,
        conv_desc.loop_wino_tile_row_reset_cycle,
        conv_desc.loop_wino_tile_col_reset_cycle,
        conv_desc.buffer_address_mid_minitile_depth_step,
        conv_desc.wino_out_size_by_wino_width,
        conv_desc.row_address_bitnumber_flag,
        start_row_idx_minus_pad_size
    );

    for(int i=0;i<WINO_WIDTH;i++)
    {
        #pragma HLS unroll
        input_transform(
            input_tile_stream[i],
            input_tile_transformed_stream[i],
            conv_desc.input_transform_feeding_loop_bound,
            i
        );
    }

    // The four weight feeders receive identical control arguments; only the
    // template index, the DDR pointer and the stream row differ.
    weight_feed_one_port<0>(
        weight_DDR0,
        weight_stream[0],
        conv_desc.weightDDR_burst_number,
        conv_desc.weightDDR_buffer_burst_length,
        conv_desc.weightDDR_port_burst_length,
        conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
        conv_desc.loop_start_output_baserowcol_reset_cycle,
        conv_desc.loop_weight_feed_bound,
        conv_desc.weightbuffer_outdepth_minitile_number,
        first_flag,
        last_flag
        #if DEBUG_CONV_DESC
        ,conv_desc
        #endif
    );

    weight_feed_one_port<1>(
        weight_DDR1,
        weight_stream[1],
        conv_desc.weightDDR_burst_number,
        conv_desc.weightDDR_buffer_burst_length,
        conv_desc.weightDDR_port_burst_length,
        conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
        conv_desc.loop_start_output_baserowcol_reset_cycle,
        conv_desc.loop_weight_feed_bound,
        conv_desc.weightbuffer_outdepth_minitile_number,
        first_flag,
        last_flag
        #if DEBUG_CONV_DESC
        ,conv_desc
        #endif
    );

    weight_feed_one_port<2>(
        weight_DDR2,
        weight_stream[2],
        conv_desc.weightDDR_burst_number,
        conv_desc.weightDDR_buffer_burst_length,
        conv_desc.weightDDR_port_burst_length,
        conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
        conv_desc.loop_start_output_baserowcol_reset_cycle,
        conv_desc.loop_weight_feed_bound,
        conv_desc.weightbuffer_outdepth_minitile_number,
        first_flag,
        last_flag
        #if DEBUG_CONV_DESC
        ,conv_desc
        #endif
    );

    weight_feed_one_port<3>(
        weight_DDR3,
        weight_stream[3],
        conv_desc.weightDDR_burst_number,
        conv_desc.weightDDR_buffer_burst_length,
        conv_desc.weightDDR_port_burst_length,
        conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
        conv_desc.loop_start_output_baserowcol_reset_cycle,
        conv_desc.loop_weight_feed_bound,
        conv_desc.weightbuffer_outdepth_minitile_number,
        first_flag,
        last_flag
        #if DEBUG_CONV_DESC
        ,conv_desc
        #endif
    );

    wino_stream_block(
        input_tile_transformed_stream,
        weight_stream,
        out_buffer,
        conv_desc.weightbuffer_outdepth_minitile_number,
        conv_desc.total_input_stream_tile,
        conv_desc.loop_omini_base_reset_cycle,
        conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1,
        conv_desc.loop_iload_reset_cycle,
        conv_desc.loop_wino_cell_bound,
        conv_desc.outbuffer_oload_increment_step,
        conv_desc.outbuffer_omini_increment_step,
        conv_desc.wino5x5_flag
        #if DEBUG_CONV_DESC
        ,conv_desc
        #endif
        ,ap_clk_div2
    );
}

#pragma SDS data zero_copy(input_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR0[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR1[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR2[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR3[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(mem_params[0:128])
#pragma SDS data sys_port(input_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(input_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(input_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(input_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(weight_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(weight_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(weight_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(weight_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(output_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(output_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(output_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(output_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(mem_params:ps_e_S_AXI_HP0_FPD)
/**
 * @brief Top-level accelerator entry point.
 *
 * Sequence, as written below:
 *   1. load_params() fills conv_desc from mem_params.
 *   2. load_input_rowtile_from_ddr() is called once with trailing arguments
 *      (0, 1) to fill input_buffer before the loop.
 *   3. For compute_start_row = 0, out_rowstep, 2*out_rowstep, ... while it is
 *      below outheight, each iteration calls, in this order:
 *        - wino_systolic_kernel() writing one of the two output buffers,
 *        - write_output_to_DDR() reading the OTHER output buffer with
 *          write_start_row (one row step behind compute_start_row),
 *        - load_input_rowtile_from_ddr() with next_start_row and a trailing 0.
 *      pingpong is inverted each iteration to swap the two output buffers.
 *   4. After the loop one more write_output_to_DDR() call drains the buffer
 *      the last kernel call wrote.
 *
 * NOTE(review): on the first iteration write_start_row equals -out_rowstep;
 * write_output_to_DDR is presumably expected to tolerate a negative start
 * row -- confirm against its implementation.
 *
 * NOTE(review): last_flag is computed as next_start_row > outheight. When
 * outheight is an exact multiple of out_rowstep, next_start_row equals
 * outheight on the final iteration and the flag stays 0 -- confirm whether
 * `>=` is intended.
 *
 * @param input_DDR0..3   Input feature pointers passed to load_input_rowtile_from_ddr.
 * @param weight_DDR0..3  Weight pointers passed to wino_systolic_kernel.
 * @param output_DDR0..3  Output pointers passed to write_output_to_DDR.
 * @param mem_params      128-word parameter block, see load_params().
 * @param ap_clk_div2     Only a parameter when __SDSVHLS__ is defined;
 *                        otherwise a local initialised to 0.
 */
void wino_systolic_top(
    ap_uint<128> *input_DDR0,
    ap_uint<128> *input_DDR1,
    ap_uint<128> *input_DDR2,
    ap_uint<128> *input_DDR3,
    ap_uint<128> *weight_DDR0,
    ap_uint<128> *weight_DDR1,
    ap_uint<128> *weight_DDR2,
    ap_uint<128> *weight_DDR3,
    ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR0,
    ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR1,
    ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR2,
    ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR3,
    // ConvDesc_t conv_desc,
    ap_int<32> *mem_params
    #ifdef __SDSVHLS__
    ,ap_uint<1> ap_clk_div2
    #endif
)
{
    #ifndef __SDSVHLS__
    ap_uint<1> ap_clk_div2=0;
    #endif

    // #pragma HLS interface m_axi port= input_DDR3 depth=65535
    // #pragma HLS interface m_axi port= input_DDR2 depth=65535
    // #pragma HLS interface m_axi port= input_DDR1 depth=65535
    // #pragma HLS interface m_axi port= input_DDR0 depth=65535
    // #pragma HLS interface m_axi port= output_DDR3 depth=65535
    // #pragma HLS interface m_axi port= output_DDR2 depth=65535
    // #pragma HLS interface m_axi port= output_DDR1 depth=65535
    // #pragma HLS interface m_axi port= output_DDR0 depth=65535
    // #pragma HLS interface m_axi port= weight_DDR3 depth=65535
    // #pragma HLS interface m_axi port= weight_DDR2 depth=65535
    // #pragma HLS interface m_axi port= weight_DDR1 depth=65535
    // #pragma HLS interface m_axi port= weight_DDR0 depth=65535
    // #pragma HLS interface m_axi port= mem_params depth=128

    // Input buffer declaration
    ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
    #pragma HLS array_partition variable=input_buffer complete dim=1
    #pragma HLS array_partition variable=input_buffer complete dim=2
    #pragma HLS resource variable=input_buffer core=RAM_T2P_BRAM

    // Two output buffers, swapped each iteration via pingpong.
    ap_uint<OUT_WIDTH*2> output_buffer0[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
    #pragma HLS array_partition variable=output_buffer0 complete dim=1
    #pragma HLS array_partition variable=output_buffer0 complete dim=2
    #pragma HLS resource variable=output_buffer0 core=RAM_T2P_BRAM

    ap_uint<OUT_WIDTH*2> output_buffer1[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
    #pragma HLS array_partition variable=output_buffer1 complete dim=1
    #pragma HLS array_partition variable=output_buffer1 complete dim=2
    #pragma HLS resource variable=output_buffer1 core=RAM_T2P_BRAM

    ConvDesc_t conv_desc;
    ap_uint<1> pingpong=0;

    #if DEBUG_FILE_PRINT
    clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
    clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer0);
    clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer1);
    #endif

    load_params(mem_params,conv_desc);

    // Initial input load before the loop; trailing arguments are (0, 1) here
    // and (next_start_row, 0) inside the loop.
    load_input_rowtile_from_ddr(
        input_DDR0,
        input_DDR1,
        input_DDR2,
        input_DDR3,
        input_buffer,
        conv_desc.inheight,
        conv_desc.inwidth,
        conv_desc.stride,
        conv_desc.pad_size,
        conv_desc.inwidth_align8,
        conv_desc.indepth_align8,
        conv_desc.group_indepth_x_inwidth_align8_by8,
        conv_desc.group_indepth_offset_x_inwidth_align8_by8,
        conv_desc.inwidth_ceildiv_inbufferwidth,
        conv_desc.buffer_address_mid_increment_step,
        conv_desc.input_load_burst_length,
        conv_desc.row_address_bitnumber_flag,
        conv_desc.out_rowstep,
        0,
        1);

    #if DEBUG_FILE_PRINT
    attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
    #endif

    // write_start_row trails compute_start_row by one row step,
    // next_start_row leads it by one row step.
    ap_int<16> write_start_row= -conv_desc.out_rowstep;
    ap_int<16> next_start_row= conv_desc.out_rowstep;

    for( ap_int<16> compute_start_row =0; compute_start_row < conv_desc.outheight; compute_start_row+=conv_desc.out_rowstep)
    {
        // Signed: this is negative for the first tile whenever pad_size > 0,
        // and the kernel parameter it feeds is declared ap_int<16>.
        ap_int<16> start_row_idx_minus_pad_size=compute_start_row-conv_desc.pad_size;
        #pragma HLS DEPENDENCE variable=input_buffer intra false

        // Both branches are identical except that output_buffer0 and
        // output_buffer1 swap roles between compute and write-back.
        if(pingpong )
        {
            wino_systolic_kernel(
                weight_DDR0,
                weight_DDR1,
                weight_DDR2,
                weight_DDR3,
                input_buffer,
                output_buffer0,
                compute_start_row,
                start_row_idx_minus_pad_size,
                compute_start_row==0,
                next_start_row > conv_desc.outheight,
                conv_desc,
                ap_clk_div2
            );

            #if DEBUG_FILE_PRINT
            char outfilename[100];
            sprintf(outfilename,"outbuffer.txt");
            attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
                output_buffer0,0,outfilename);
            #endif

            write_output_to_DDR(
                output_DDR0,
                output_DDR1,
                output_DDR2,
                output_DDR3,
                output_buffer1,
                conv_desc.outheight,
                conv_desc.outwidth_align8,
                conv_desc.wino_output_tile_size,
                conv_desc.wino_tile_number_in_outwidth,
                conv_desc.wino_tile_number_in_out_rowstep,
                conv_desc.wino_col_pix_upper_bound,
                conv_desc.wino_tile_number_rowcol,
                conv_desc.output_burst_length,
                conv_desc.out_ddr_increment_step,
                write_start_row,
                write_start_row==0
                #if DEBUG_CONV_DESC
                ,conv_desc
                #endif
            );

            load_input_rowtile_from_ddr(
                input_DDR0,
                input_DDR1,
                input_DDR2,
                input_DDR3,
                input_buffer,
                conv_desc.inheight,
                conv_desc.inwidth,
                conv_desc.stride,
                conv_desc.pad_size,
                conv_desc.inwidth_align8,
                conv_desc.indepth_align8,
                conv_desc.group_indepth_x_inwidth_align8_by8,
                conv_desc.group_indepth_offset_x_inwidth_align8_by8,
                conv_desc.inwidth_ceildiv_inbufferwidth,
                conv_desc.buffer_address_mid_increment_step,
                conv_desc.input_load_burst_length,
                conv_desc.row_address_bitnumber_flag,
                conv_desc.out_rowstep,
                next_start_row,
                0);
        }
        else
        {
            wino_systolic_kernel(
                weight_DDR0,
                weight_DDR1,
                weight_DDR2,
                weight_DDR3,
                input_buffer,
                output_buffer1,
                compute_start_row,
                start_row_idx_minus_pad_size,
                compute_start_row==0,
                next_start_row > conv_desc.outheight,
                conv_desc,
                ap_clk_div2
            );

            #if DEBUG_FILE_PRINT
            char outfilename[100];
            sprintf(outfilename,"outbuffer.txt");
            attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
                output_buffer1,0,outfilename);
            #endif

            write_output_to_DDR(
                output_DDR0,
                output_DDR1,
                output_DDR2,
                output_DDR3,
                output_buffer0,
                conv_desc.outheight,
                conv_desc.outwidth_align8,
                conv_desc.wino_output_tile_size,
                conv_desc.wino_tile_number_in_outwidth,
                conv_desc.wino_tile_number_in_out_rowstep,
                conv_desc.wino_col_pix_upper_bound,
                conv_desc.wino_tile_number_rowcol,
                conv_desc.output_burst_length,
                conv_desc.out_ddr_increment_step,
                write_start_row,
                write_start_row==0
                #if DEBUG_CONV_DESC
                ,conv_desc
                #endif
            );

            load_input_rowtile_from_ddr(
                input_DDR0,
                input_DDR1,
                input_DDR2,
                input_DDR3,
                input_buffer,
                conv_desc.inheight,
                conv_desc.inwidth,
                conv_desc.stride,
                conv_desc.pad_size,
                conv_desc.inwidth_align8,
                conv_desc.indepth_align8,
                conv_desc.group_indepth_x_inwidth_align8_by8,
                conv_desc.group_indepth_offset_x_inwidth_align8_by8,
                conv_desc.inwidth_ceildiv_inbufferwidth,
                conv_desc.buffer_address_mid_increment_step,
                conv_desc.input_load_burst_length,
                conv_desc.row_address_bitnumber_flag,
                conv_desc.out_rowstep,
                next_start_row,
                0);
        }

        pingpong =~pingpong;
        write_start_row+=conv_desc.out_rowstep;
        next_start_row+=conv_desc.out_rowstep;

        #if DEBUG_FILE_PRINT
        attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
        #endif
    }

    // Drain the buffer written by the last kernel call. pingpong was already
    // inverted at the end of that iteration, so the buffer choice here mirrors
    // the write-back choice inside the loop.
    if(pingpong )
    {
        write_output_to_DDR(
            output_DDR0,
            output_DDR1,
            output_DDR2,
            output_DDR3,
            output_buffer1,
            conv_desc.outheight,
            conv_desc.outwidth_align8,
            conv_desc.wino_output_tile_size,
            conv_desc.wino_tile_number_in_outwidth,
            conv_desc.wino_tile_number_in_out_rowstep,
            conv_desc.wino_col_pix_upper_bound,
            conv_desc.wino_tile_number_rowcol,
            conv_desc.output_burst_length,
            conv_desc.out_ddr_increment_step,
            write_start_row,
            write_start_row==0
            #if DEBUG_CONV_DESC
            ,conv_desc
            #endif
        );
    }
    else
    {
        write_output_to_DDR(
            output_DDR0,
            output_DDR1,
            output_DDR2,
            output_DDR3,
            output_buffer0,
            conv_desc.outheight,
            conv_desc.outwidth_align8,
            conv_desc.wino_output_tile_size,
            conv_desc.wino_tile_number_in_outwidth,
            conv_desc.wino_tile_number_in_out_rowstep,
            conv_desc.wino_col_pix_upper_bound,
            conv_desc.wino_tile_number_rowcol,
            conv_desc.output_burst_length,
            conv_desc.out_ddr_increment_step,
            write_start_row,
            write_start_row==0
            #if DEBUG_CONV_DESC
            ,conv_desc
            #endif
        );
    }
}