// wino.cpp

#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>


#include "wino_IO.cpp"
#include "wino_cell.cpp"


#include "../software/param.h"




/**
 * Top-level HLS function `wino_systolic_top` (work in progress).
 *
 * What the body currently does, in order:
 *   1. Calls load_input_rowtile_from_ddr() once before the row loop, with
 *      conv_desc.wino_output_tile_size and the literal 1 as its last two arguments.
 *   2. Iterates start_output_row from 0 while it is below conv_desc.outheight,
 *      stepping by conv_desc.wino_output_tile_size. Each iteration calls
 *      load_input_rowtile_from_ddr() again (last two arguments:
 *      start_output_row + wino_output_tile_size and the literal 0), then
 *      input_feed_underconstruction(), then weight_feed_one_port<0..3>().
 *
 * The compute stage and the write-back to the output ports exist only as
 * commented-out code, so nothing in this function writes to output_DDR0..3.
 *
 * @param input_DDR0..input_DDR3    Forwarded to load_input_rowtile_from_ddr().
 * @param weight_DDR0..weight_DDR3  weight_DDR<k> is forwarded to weight_feed_one_port<k>().
 * @param output_DDR0..output_DDR3  Only referenced by the interface pragmas and by
 *                                  commented-out code.
 * @param conv_desc                 Descriptor whose fields supply every size, bound and
 *                                  step value passed to the callees below.
 */
void wino_systolic_top(
    ap_uint<128> *input_DDR0,
    ap_uint<128> *input_DDR1,
    ap_uint<128> *input_DDR2,
    ap_uint<128> *input_DDR3,
    ap_uint<128> *weight_DDR0,
    ap_uint<128> *weight_DDR1,
    ap_uint<128> *weight_DDR2,
    ap_uint<128> *weight_DDR3,
    ap_uint<128> *output_DDR0,
    ap_uint<128> *output_DDR1,
    ap_uint<128> *output_DDR2,
    ap_uint<128> *output_DDR3,
    ConvDesc_t &conv_desc
    )
{
    // Every pointer argument is an m_axi interface with depth 65535.
#pragma HLS interface m_axi port= input_DDR0 depth=65535
#pragma HLS interface m_axi port= input_DDR1 depth=65535
#pragma HLS interface m_axi port= input_DDR2 depth=65535
#pragma HLS interface m_axi port= input_DDR3 depth=65535
#pragma HLS interface m_axi port= weight_DDR0 depth=65535
#pragma HLS interface m_axi port= weight_DDR1 depth=65535
#pragma HLS interface m_axi port= weight_DDR2 depth=65535
#pragma HLS interface m_axi port= weight_DDR3 depth=65535
#pragma HLS interface m_axi port= output_DDR0 depth=65535
#pragma HLS interface m_axi port= output_DDR1 depth=65535
#pragma HLS interface m_axi port= output_DDR2 depth=65535
#pragma HLS interface m_axi port= output_DDR3 depth=65535

    // Input buffer, completely partitioned along its first two dimensions.
    ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=input_buffer complete dim=1 
#pragma HLS array_partition variable=input_buffer complete dim=2 

    // Two output buffers, partitioned the same way. They are only referenced by
    // the commented-out code further down.
    ap_uint<36> output_buffer0[16][16][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer0 complete dim=1 
#pragma HLS array_partition variable=output_buffer0 complete dim=2 
    ap_uint<36> output_buffer1[16][16][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer1 complete dim=1 
#pragma HLS array_partition variable=output_buffer1 complete dim=2 

    // NOTE(review): left uninitialised and never read by live code; it is only
    // used by the commented-out ping-pong logic below.
    ap_uint<1> pingpong;

    #if DEBUG_FILE_PRINT
        clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
    #endif

    // Load issued once, ahead of the row loop (trailing arguments: tile size, 1).
    load_input_rowtile_from_ddr(
        input_DDR0,
        input_DDR1,
        input_DDR2,
        input_DDR3,
        input_buffer,
        conv_desc.inheight,
        conv_desc.inwidth,
        conv_desc.stride,
        conv_desc.pad_size,
        conv_desc.inwidth_align8,
        conv_desc.indepth_align8,
        conv_desc.group_indepth_x_inwidth_align8_by8,
        conv_desc.group_indepth_offset_x_inwidth_align8_by8,
        conv_desc.inwidth_ceildiv_inbufferwidth,
        conv_desc.buffer_address_mid_increment_step,
        conv_desc.input_load_burst_length,
        conv_desc.wino_output_tile_size,
        1);

    #if DEBUG_FILE_PRINT
        attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
    #endif

    // Declared outside the loop header; the commented-out code after the loop
    // also refers to it.
    ap_uint<16> start_output_row = 0;
    for ( ; start_output_row < conv_desc.outheight; start_output_row += conv_desc.wino_output_tile_size)
    {
        // NOTE(review): the result is stored in an unsigned 16-bit value, so it
        // wraps around whenever pad_size exceeds start_output_row -- confirm
        // that the consumer expects that.
        ap_uint<16> start_row_idx_minus_pad_size = start_output_row - conv_desc.pad_size;

        // Per-iteration load (trailing arguments: start row + tile size, 0).
        load_input_rowtile_from_ddr(
            input_DDR0,
            input_DDR1,
            input_DDR2,
            input_DDR3,
            input_buffer,
            conv_desc.inheight,
            conv_desc.inwidth,
            conv_desc.stride,
            conv_desc.pad_size,
            conv_desc.inwidth_align8,
            conv_desc.indepth_align8,
            conv_desc.group_indepth_x_inwidth_align8_by8,
            conv_desc.group_indepth_offset_x_inwidth_align8_by8,
            conv_desc.inwidth_ceildiv_inbufferwidth,
            conv_desc.buffer_address_mid_increment_step,
            conv_desc.input_load_burst_length,
            start_output_row + conv_desc.wino_output_tile_size,
            0);

        #if DEBUG_FILE_PRINT
            attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
        #endif

        input_feed_underconstruction(
            input_buffer,
            // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1, 
            // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
            conv_desc.inheight,
            conv_desc.inwidth,
            conv_desc.pad_size,
            conv_desc.weightbuffer_load_indepth_number,
            conv_desc.weightbuffer_load_outdepth_number,
            conv_desc.wino_output_tile_size,
            conv_desc.input_buffer_feeding_loop_bound,
            conv_desc.loop_wino_tile_col_reset_cycle,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle,
            conv_desc.buffer_address_mid_minitile_depth_step,
            conv_desc.wino_out_size_by_wino_width,
            start_output_row,
            start_row_idx_minus_pad_size
            #if DEBUG_FILE_PRINT
            ,conv_desc
            #endif
        );

        // One group of WEIGHT_FEED_NUMBER_PER_PORT streams per weight port.
        hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> >  weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];

        // The same two flags are handed to all four weight_feed_one_port<k>() calls.
        const bool at_first_row_tile = (start_output_row == 0);
        // NOTE(review): with '>' this flag stays false on the final iteration when
        // outheight is an exact multiple of wino_output_tile_size; the commented-out
        // wino_systolic() calls below use '>=' for the corresponding argument.
        // Presumably this is a "last row tile" flag -- confirm against
        // weight_feed_one_port() before changing the comparison.
        const bool tile_end_exceeds_outheight =
            (start_output_row + conv_desc.wino_output_tile_size > conv_desc.outheight);

        weight_feed_one_port<0>(
            weight_DDR0,
            weight_stream[0],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_basecol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            at_first_row_tile,
            tile_end_exceeds_outheight
            #if DEBUG_FILE_PRINT
            ,conv_desc
            #endif
        );
        weight_feed_one_port<1>(
            weight_DDR1,
            weight_stream[1],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_basecol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            at_first_row_tile,
            tile_end_exceeds_outheight
            #if DEBUG_FILE_PRINT
            ,conv_desc
            #endif
        );
        weight_feed_one_port<2>(
            weight_DDR2,
            weight_stream[2],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_basecol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            at_first_row_tile,
            tile_end_exceeds_outheight
            #if DEBUG_FILE_PRINT
            ,conv_desc
            #endif
        );
        weight_feed_one_port<3>(
            weight_DDR3,
            weight_stream[3],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_basecol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            at_first_row_tile,
            tile_end_exceeds_outheight
            #if DEBUG_FILE_PRINT
            ,conv_desc
            #endif
        );

        #if DEBUG_FILE_PRINT
        // Hand each weight stream to the debug helper, one file name per stream
        // ("weightstream<N>.txt", N = port * WEIGHT_FEED_NUMBER_PER_PORT + feed).
        for (int port = 0; port < 4; port++)
        {
            for (int feed = 0; feed < WEIGHT_FEED_NUMBER_PER_PORT; feed++)
            {
                char filename[100];
                sprintf(filename,"weightstream%d.txt", port*WEIGHT_FEED_NUMBER_PER_PORT + feed);

                attach_weight_stream_content<INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE_SQUARE>(weight_stream[port][feed],filename);
            }
        }
        #endif

        // ---- Disabled: ping-pong compute / write-back stage (kept for reference) ----
        //     if(pingpong)
        //     {
        //         wino_systolic(
        //         input_buffer,
        //         output_buffer0,
        //         weight_DDR0,
        //         weight_DDR1,
        //         weight_DDR2,
        //         weight_DDR3,
        //         input_height,
        //         input_width,
        //         input_depth,
        //         input_width_ceildiv_16,
        //         input_depth_align8,
        //         output_height,
        //         output_width,
        //         output_depth,
        //         kernel_window_size,
        //         pad_size,
        //         weight_indepth_load_number,
        //         weight_outdepth_load_number,
        //         weight_outdepth_feed_size,
        //         start_output_row,
        //         weight_total_load_number,
        //         weight_total_feed_size,
        //         ddr_load_length,
        //         ddr_load_length_per_feed,
        //         row_repeat_times,
        //         (start_output_row==0) ,
        //         (start_output_row+wino_output_tile_size >= output_height));
        //     #if DEBUG_FILE_PRINT
        //         attach_output_buffer_content<0>(output_buffer0,"output_buffer_content.txt");
        //         #endif

        //         write_output_to_DDR(
        //         output_DDR0,
        //         output_DDR1,
        //         output_DDR2,
        //         output_DDR3,
        //         output_buffer1,
        //         outdepth_ceil_div8,
        //         start_output_row,
        //         start_output_row+4,
        //         output_height,
        //         output_width,
        //         wino_output_tile_size,
        //         row_repeat_times,
        //         1,
        //         0);

        //         pingpong=0;
        //     }
        //     else
        //     {
        //         wino_systolic(
        //         input_buffer,
        //         output_buffer1,
        //         weight_DDR0,
        //         weight_DDR1,
        //         weight_DDR2,
        //         weight_DDR3,
        //         input_height,
        //         input_width,
        //         input_depth,
        //         input_width_ceildiv_16,
        //         input_depth_align8,
        //         output_height,
        //         output_width,
        //         output_depth,
        //         kernel_window_size,
        //         pad_size,
        //         weight_indepth_load_number,
        //         weight_outdepth_load_number,
        //         weight_outdepth_feed_size,
        //         start_output_row,
        //         weight_total_load_number,
        //         weight_total_feed_size,
        //         ddr_load_length,
        //         ddr_load_length_per_feed,
        //         row_repeat_times,
        //         (start_output_row==0) ,
        //         (start_output_row+wino_output_tile_size >= output_height));
        //     #if DEBUG_FILE_PRINT
        //         attach_output_buffer_content<0>(output_buffer0,"output_buffer_content.txt");
        //         #endif

        //         write_output_to_DDR(
        //         output_DDR0,
        //         output_DDR1,
        //         output_DDR2,
        //         output_DDR3,
        //         output_buffer0,
        //         outdepth_ceil_div8,
        //         start_output_row,
        //         start_output_row+4,
        //         output_height,
        //         output_width,
        //         wino_output_tile_size,
        //         row_repeat_times,
        //         1,
        //         0);
        //         pingpong=1;
        //     }
    }

    // ---- Disabled: write-back after the row loop (kept for reference) ----
    // if(pingpong)
    // {
    //         write_output_to_DDR(
    //         output_DDR0,
    //         output_DDR1,
    //         output_DDR2,
    //         output_DDR3,
    //         output_buffer1,
    //         outdepth_ceil_div8,
    //         start_output_row,
    //         start_output_row+4,
    //         output_height,
    //         output_width,
    //         wino_output_tile_size,
    //         row_repeat_times,
    //         1,
    //         0);
    // }
    // else
    // {
    //                write_output_to_DDR(
    //         output_DDR0,
    //         output_DDR1,
    //         output_DDR2,
    //         output_DDR3,
    //         output_buffer0,
    //         outdepth_ceil_div8,
    //         start_output_row,
    //         start_output_row+4,
    //         output_height,
    //         output_width,
    //         wino_output_tile_size,
    //         row_repeat_times,
    //         1,
    //         0);
    // }
}