#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
#include "wino_IO.cpp"
#include "wino_cell.cpp"
#include "../software/param.h"

/**
 * Top-level HLS entry point.
 *
 * In its current state the function only moves input data: it loads one
 * input row tile before the loop and then, for every output row tile,
 * loads the following input row tile via load_input_rowtile_from_ddr().
 * The compute stage (wino_systolic) and the write-back stage
 * (write_output_to_DDR) are present only as disabled (commented-out) code.
 *
 * @param input_DDR0..input_DDR3   m_axi ports handed to
 *                                 load_input_rowtile_from_ddr().
 * @param weight_DDR0..weight_DDR3 m_axi ports; referenced only by the
 *                                 interface pragmas and the disabled code.
 * @param output_DDR0..output_DDR3 m_axi ports; referenced only by the
 *                                 interface pragmas and the disabled code.
 * @param conv_desc                layer descriptor; the fields read here are
 *                                 inheight, inwidth, stride, pad_size,
 *                                 inwidth_align8, indepth_align8,
 *                                 group_indepth_x_inwidth_align8_by8,
 *                                 group_indepth_offset_x_inwidth_align8_by8,
 *                                 input_load_burst_length,
 *                                 wino_output_tile_size and outheight.
 */
void wino_systolic_top(
    ap_uint<128> *input_DDR0,
    ap_uint<128> *input_DDR1,
    ap_uint<128> *input_DDR2,
    ap_uint<128> *input_DDR3,
    ap_uint<128> *weight_DDR0,
    ap_uint<128> *weight_DDR1,
    ap_uint<128> *weight_DDR2,
    ap_uint<128> *weight_DDR3,
    ap_uint<128> *output_DDR0,
    ap_uint<128> *output_DDR1,
    ap_uint<128> *output_DDR2,
    ap_uint<128> *output_DDR3,
    ConvDesc_t &conv_desc
)
{
    #pragma HLS interface m_axi port= input_DDR3 depth=65535
    #pragma HLS interface m_axi port= input_DDR2 depth=65535
    #pragma HLS interface m_axi port= input_DDR1 depth=65535
    #pragma HLS interface m_axi port= input_DDR0 depth=65535
    #pragma HLS interface m_axi port= output_DDR3 depth=65535
    #pragma HLS interface m_axi port= output_DDR2 depth=65535
    #pragma HLS interface m_axi port= output_DDR1 depth=65535
    #pragma HLS interface m_axi port= output_DDR0 depth=65535
    #pragma HLS interface m_axi port= weight_DDR3 depth=65535
    #pragma HLS interface m_axi port= weight_DDR2 depth=65535
    #pragma HLS interface m_axi port= weight_DDR1 depth=65535
    #pragma HLS interface m_axi port= weight_DDR0 depth=65535

    // Input buffer, completely partitioned along its first two dimensions.
    ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
    #pragma HLS array_partition variable=input_buffer complete dim=1
    #pragma HLS array_partition variable=input_buffer complete dim=2

    // Two output buffers. They are referenced only by the disabled
    // ping-pong code below.
    ap_uint<36> output_buffer0[16][16][OUTPUT_BUFFER_DEPTH];
    #pragma HLS array_partition variable=output_buffer0 complete dim=1
    #pragma HLS array_partition variable=output_buffer0 complete dim=2

    ap_uint<36> output_buffer1[16][16][OUTPUT_BUFFER_DEPTH];
    #pragma HLS array_partition variable=output_buffer1 complete dim=1
    #pragma HLS array_partition variable=output_buffer1 complete dim=2

    // Buffer selector for the disabled ping-pong code. Initialized so that
    // the first `if(pingpong)` does not read an indeterminate value once
    // that code is re-enabled.
    ap_uint<1> pingpong = 0;

    #if DEBUG_FILE_PRINT
    clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
    #endif

    // Initial load before the row-tile loop starts.
    // NOTE(review): the last two arguments differ from the in-loop call
    // (tile size / 1 here, start row + tile size / 0 there); presumably a
    // row position and a "first load" flag -- confirm against
    // load_input_rowtile_from_ddr().
    load_input_rowtile_from_ddr(
        input_DDR0,
        input_DDR1,
        input_DDR2,
        input_DDR3,
        input_buffer,
        conv_desc.inheight,
        conv_desc.inwidth,
        conv_desc.stride,
        conv_desc.pad_size,
        conv_desc.inwidth_align8,
        conv_desc.indepth_align8,
        conv_desc.group_indepth_x_inwidth_align8_by8,
        conv_desc.group_indepth_offset_x_inwidth_align8_by8,
        conv_desc.input_load_burst_length,
        conv_desc.wino_output_tile_size,
        1);

    #if DEBUG_FILE_PRINT
    attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
    #endif

    // Declared outside the loop because the disabled write-back code after
    // the loop also reads it.
    ap_uint<16> start_output_row = 0;

    // Walk the output rows one tile (wino_output_tile_size rows) at a time.
    for( ; start_output_row < conv_desc.outheight; start_output_row += conv_desc.wino_output_tile_size)
    {
        // Load the row tile that follows the current one.
        load_input_rowtile_from_ddr(
            input_DDR0,
            input_DDR1,
            input_DDR2,
            input_DDR3,
            input_buffer,
            conv_desc.inheight,
            conv_desc.inwidth,
            conv_desc.stride,
            conv_desc.pad_size,
            conv_desc.inwidth_align8,
            conv_desc.indepth_align8,
            conv_desc.group_indepth_x_inwidth_align8_by8,
            conv_desc.group_indepth_offset_x_inwidth_align8_by8,
            conv_desc.input_load_burst_length,
            start_output_row + conv_desc.wino_output_tile_size,
            0);

        #if DEBUG_FILE_PRINT
        attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
        #endif

        // ---- Disabled: ping-pong compute / write-back ----------------------
        // NOTE(review): this code uses names (input_height, output_height,
        // row_repeat_times, ...) that are not declared in this function, and
        // the else-branch debug dump prints output_buffer0 although it
        // computes into output_buffer1 -- check before re-enabling.
        //
        // if(pingpong)
        // {
        //     wino_systolic(
        //         input_buffer,
        //         output_buffer0,
        //         weight_DDR0,
        //         weight_DDR1,
        //         weight_DDR2,
        //         weight_DDR3,
        //         input_height,
        //         input_width,
        //         input_depth,
        //         input_width_ceildiv_16,
        //         input_depth_align8,
        //         output_height,
        //         output_width,
        //         output_depth,
        //         kernel_window_size,
        //         pad_size,
        //         weight_indepth_load_number,
        //         weight_outdepth_load_number,
        //         weight_outdepth_feed_size,
        //         start_output_row,
        //         weight_total_load_number,
        //         weight_total_feed_size,
        //         ddr_load_length,
        //         ddr_load_length_per_feed,
        //         row_repeat_times,
        //         (start_output_row==0) ,
        //         (start_output_row+wino_output_tile_size >= output_height));
        //     #if DEBUG_FILE_PRINT
        //     attach_output_buffer_content<0>(output_buffer0,"output_buffer_content.txt");
        //     #endif
        //     write_output_to_DDR(
        //         output_DDR0,
        //         output_DDR1,
        //         output_DDR2,
        //         output_DDR3,
        //         output_buffer1,
        //         outdepth_ceil_div8,
        //         start_output_row,
        //         start_output_row+4,
        //         output_height,
        //         output_width,
        //         wino_output_tile_size,
        //         row_repeat_times,
        //         1,
        //         0);
        //     pingpong=0;
        // }
        // else
        // {
        //     wino_systolic(
        //         input_buffer,
        //         output_buffer1,
        //         weight_DDR0,
        //         weight_DDR1,
        //         weight_DDR2,
        //         weight_DDR3,
        //         input_height,
        //         input_width,
        //         input_depth,
        //         input_width_ceildiv_16,
        //         input_depth_align8,
        //         output_height,
        //         output_width,
        //         output_depth,
        //         kernel_window_size,
        //         pad_size,
        //         weight_indepth_load_number,
        //         weight_outdepth_load_number,
        //         weight_outdepth_feed_size,
        //         start_output_row,
        //         weight_total_load_number,
        //         weight_total_feed_size,
        //         ddr_load_length,
        //         ddr_load_length_per_feed,
        //         row_repeat_times,
        //         (start_output_row==0) ,
        //         (start_output_row+wino_output_tile_size >= output_height));
        //     #if DEBUG_FILE_PRINT
        //     attach_output_buffer_content<0>(output_buffer0,"output_buffer_content.txt");
        //     #endif
        //     write_output_to_DDR(
        //         output_DDR0,
        //         output_DDR1,
        //         output_DDR2,
        //         output_DDR3,
        //         output_buffer0,
        //         outdepth_ceil_div8,
        //         start_output_row,
        //         start_output_row+4,
        //         output_height,
        //         output_width,
        //         wino_output_tile_size,
        //         row_repeat_times,
        //         1,
        //         0);
        //     pingpong=1;
        // }
    }

    // ---- Disabled: write-back after the loop -------------------------------
    // if(pingpong)
    // {
    //     write_output_to_DDR(
    //         output_DDR0,
    //         output_DDR1,
    //         output_DDR2,
    //         output_DDR3,
    //         output_buffer1,
    //         outdepth_ceil_div8,
    //         start_output_row,
    //         start_output_row+4,
    //         output_height,
    //         output_width,
    //         wino_output_tile_size,
    //         row_repeat_times,
    //         1,
    //         0);
    // }
    // else
    // {
    //     write_output_to_DDR(
    //         output_DDR0,
    //         output_DDR1,
    //         output_DDR2,
    //         output_DDR3,
    //         output_buffer0,
    //         outdepth_ceil_div8,
    //         start_output_row,
    //         start_output_row+4,
    //         output_height,
    //         output_width,
    //         wino_output_tile_size,
    //         row_repeat_times,
    //         1,
    //         0);
    // }
}