Skip to content
Snippets Groups Projects
wino.cpp 9.91 KiB
Newer Older
  • Learn to ignore specific revisions
  • #include <ap_int.h>
    #include <hls_stream.h>
    
    
    xliu79's avatar
    xliu79 committed
    #include "wino_IO.cpp"
    
    xliu79's avatar
    xliu79 committed
    
    
    
    /**
     * Chains the stages of one Winograd kernel pass: input feed, per-lane input
     * transform, four weight feeders and the final wino_stream_block call.
     *
     * The stages communicate through three local hls::stream arrays declared at
     * the top of the body. The function itself only wires arguments together;
     * all computation happens in the callees (defined outside this block).
     *
     * @param weight_DDR0  Weight pointer handed to weight_feed_one_port<0>.
     * @param weight_DDR1  Weight pointer handed to weight_feed_one_port<1>.
     * @param weight_DDR2  Weight pointer handed to weight_feed_one_port<2>.
     * @param weight_DDR3  Weight pointer handed to weight_feed_one_port<3>.
     * @param input_buffer Input buffer passed to input_feed_underconstruction.
     * @param out_buffer   Output buffer passed to wino_stream_block.
     * @param start_output_row              Forwarded unchanged to input_feed_underconstruction.
     * @param start_row_idx_minus_pad_size  Forwarded unchanged to input_feed_underconstruction.
     * @param first_flag   Forwarded to every weight_feed_one_port call.
     * @param last_flag    Forwarded to every weight_feed_one_port call.
     * @param conv_desc    Layer descriptor (taken by value); its fields are forwarded
     *                     individually, and the whole struct is additionally passed to
     *                     the callees only when DEBUG_FILE_PRINT is enabled.
     * @param ap_clk_div2  Forwarded to wino_stream_block as its last argument.
     *
     * NOTE(review): no HLS pragma is visible inside this function, so whether the
     * stages run concurrently or sequentially depends on directives that are not
     * shown here -- confirm against the project's directive/tcl files.
     */
    void wino_systolic_kernel(
        ap_uint<128> *weight_DDR0,
        ap_uint<128> *weight_DDR1,
        ap_uint<128> *weight_DDR2,
        ap_uint<128> *weight_DDR3,
        ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
    
        ap_uint<OUT_WIDTH*2> out_buffer[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][WINO_OUT_SIZE_SQUARE][OUTPUT_BUFFER_DEPTH],
    
        ap_uint<16> start_output_row,
        ap_int<16> start_row_idx_minus_pad_size,
        ap_uint<1> first_flag,
        ap_uint<1> last_flag,
        ConvDesc_t conv_desc,
        ap_uint<1> ap_clk_div2
    )
    {
        // Inter-stage streams: one raw and one transformed input stream per WINO_WIDTH
        // lane, plus a 4 x WEIGHT_FEED_NUMBER_PER_PORT grid of weight streams (first
        // index = weight port).
        // NOTE(review): the transformed stream width uses the literal 36 while the raw
        // stream uses WINO_DOMAIN_SIZE_SQUARE -- presumably the same value; confirm.
        hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_stream[WINO_WIDTH];
        hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*36> > input_tile_transformed_stream[WINO_WIDTH];
        hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> >  weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
        // Progress trace; flushed immediately so the message is visible even if a
        // later stage hangs or aborts.
        printf("start input_feed_underconstruction\n");
        fflush(stdout);
        // Stage 1: hand input_buffer and the input_tile_stream array to the input feeder,
        // together with the relevant conv_desc fields and the two row arguments.
        input_feed_underconstruction(
            input_buffer,
            input_tile_stream,
            // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1, 
            // hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
            conv_desc.inheight,
            conv_desc.inwidth,
            conv_desc.pad_size,
            conv_desc.weightbuffer_load_indepth_number,
            conv_desc.weightbuffer_load_outdepth_number,
            conv_desc.wino_output_tile_size,
            conv_desc.input_buffer_feeding_loop_bound,
            conv_desc.loop_wino_tile_row_reset_cycle,
            conv_desc.loop_wino_tile_col_reset_cycle,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle,
            conv_desc.buffer_address_mid_minitile_depth_step,
            conv_desc.wino_out_size_by_wino_width,
            conv_desc.row_address_bitnumber_flag,
            start_output_row,
            start_row_idx_minus_pad_size
            #if DEBUG_FILE_PRINT
            ,conv_desc
            #endif
        );
        printf("start input_transform\n");
        fflush(stdout);
        // Stage 2: one input_transform call per lane, pairing input_tile_stream[i]
        // with input_tile_transformed_stream[i]. The lane index is passed only in
        // debug builds.
        for(int i=0;i<WINO_WIDTH;i++)
        {
            input_transform(
                input_tile_stream[i],
                input_tile_transformed_stream[i],
                conv_desc.input_transform_feeding_loop_bound
                #if DEBUG_FILE_PRINT
                , i
                #endif
            );
        }
        printf("start weight stream\n");
        fflush(stdout);
        // Stage 3: four weight feeders, one per weight pointer. The calls are identical
        // except for the template argument, the pointer and the weight_stream row; the
        // port index is a template argument, so the calls are written out explicitly.
        weight_feed_one_port<0>(
            weight_DDR0,
            weight_stream[0],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_baserowcol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            first_flag,
            last_flag
            #if DEBUG_FILE_PRINT
            ,conv_desc	
            #endif
        );
    
        weight_feed_one_port<1>(
            weight_DDR1,
            weight_stream[1],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_baserowcol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            first_flag,
            last_flag
            #if DEBUG_FILE_PRINT
            ,conv_desc	
            #endif
        );
    
        weight_feed_one_port<2>(
            weight_DDR2,
            weight_stream[2],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_baserowcol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            first_flag,
            last_flag
            #if DEBUG_FILE_PRINT
            ,conv_desc	
            #endif
        );
        
        weight_feed_one_port<3>(
            weight_DDR3,
            weight_stream[3],
            conv_desc.weightDDR_burst_number,
            conv_desc.weightDDR_buffer_burst_length,
            conv_desc.weightDDR_port_burst_length,
            conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
            conv_desc.loop_start_output_baserowcol_reset_cycle,
            conv_desc.loop_weight_feed_bound,
            first_flag,
            last_flag
            #if DEBUG_FILE_PRINT
            ,conv_desc	
            #endif
        );
    
    
    
        // Disabled debug aid: would dump every weight_stream[i][j] to
        // "weightstream<idx>.txt" via attach_weight_stream_content. Kept for reference.
        // #if DEBUG_FILE_PRINT
        // for(int i=0;i<4;i++)
        // {
        //     for(int j=0;j<WEIGHT_FEED_NUMBER_PER_PORT;j++)
        //     {
        //         int outdepth_minitile_idx=i*WEIGHT_FEED_NUMBER_PER_PORT+j;
        //         char filename[100];
        //         sprintf(filename,"weightstream%d.txt",outdepth_minitile_idx);
    
        //         attach_weight_stream_content<INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE_SQUARE>(weight_stream[i][j],filename);
        //     }
        // }
        // #endif
        printf("start wino block\n");
        fflush(stdout);
        // Stage 4: pass the transformed input streams, the weight streams and out_buffer
        // to wino_stream_block. ap_clk_div2 is always the last argument, after the
        // optional debug-only conv_desc.
        wino_stream_block(
            input_tile_transformed_stream,
            weight_stream,
            out_buffer,
            conv_desc.total_input_stream_tile,
            conv_desc.loop_omini_base_reset_cycle,
            conv_desc.loop_wino_cell_bound,
            conv_desc.wino5x5_flag
            #if DEBUG_FILE_PRINT
            ,conv_desc	
            #endif
            ,ap_clk_div2
        );
    
    }
    
    
    
    void wino_systolic_top(
        ap_uint<128> *input_DDR0,
        ap_uint<128> *input_DDR1,
        ap_uint<128> *input_DDR2,
        ap_uint<128> *input_DDR3,
        ap_uint<128> *weight_DDR0,
        ap_uint<128> *weight_DDR1,
        ap_uint<128> *weight_DDR2,
        ap_uint<128> *weight_DDR3,
        ap_uint<128> *output_DDR0,
        ap_uint<128> *output_DDR1,
        ap_uint<128> *output_DDR2,
        ap_uint<128> *output_DDR3,
    
        ConvDesc_t &conv_desc,
        ap_uint<1> ap_clk_div2
    
        #pragma HLS interface m_axi port= input_DDR3 depth=65535
        #pragma HLS interface m_axi port= input_DDR2 depth=65535
        #pragma HLS interface m_axi port= input_DDR1 depth=65535
        #pragma HLS interface m_axi port= input_DDR0 depth=65535
        #pragma HLS interface m_axi port= output_DDR3 depth=65535
        #pragma HLS interface m_axi port= output_DDR2 depth=65535
        #pragma HLS interface m_axi port= output_DDR1 depth=65535
        #pragma HLS interface m_axi port= output_DDR0 depth=65535
        #pragma HLS interface m_axi port= weight_DDR3 depth=65535
        #pragma HLS interface m_axi port= weight_DDR2 depth=65535
        #pragma HLS interface m_axi port= weight_DDR1 depth=65535
        #pragma HLS interface m_axi port= weight_DDR0 depth=65535
    
    
    
        //input buffer declaration
    
        ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
    
        #pragma HLS array_partition variable=input_buffer complete dim=1 
        #pragma HLS array_partition variable=input_buffer complete dim=2 
    
        ap_uint<OUT_WIDTH*2> output_buffer0[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][WINO_OUT_SIZE_SQUARE][OUTPUT_BUFFER_DEPTH];
    
        #pragma HLS array_partition variable=output_buffer0 complete dim=1 
        #pragma HLS array_partition variable=output_buffer0 complete dim=2 
    
        ap_uint<OUT_WIDTH*2> output_buffer1[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][WINO_OUT_SIZE_SQUARE][OUTPUT_BUFFER_DEPTH];
    
        #pragma HLS array_partition variable=output_buffer1 complete dim=1 
        #pragma HLS array_partition variable=output_buffer1 complete dim=2 
    
    
        #if DEBUG_FILE_PRINT
    
            clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
    
        printf("conv_desc.row_address_bitnumber_flag %d\n", conv_desc.row_address_bitnumber_flag);
    
    xliu79's avatar
    xliu79 committed
        load_input_rowtile_from_ddr(
    
            input_DDR0,
            input_DDR1,
            input_DDR2,
            input_DDR3,
            input_buffer,
    
    		conv_desc.inheight,
    		conv_desc.inwidth,
    		conv_desc.stride,
            conv_desc.pad_size,
    		conv_desc.inwidth_align8,
    		conv_desc.indepth_align8,
    		conv_desc.group_indepth_x_inwidth_align8_by8,
    		conv_desc.group_indepth_offset_x_inwidth_align8_by8,
    
            conv_desc.inwidth_ceildiv_inbufferwidth,
            conv_desc.buffer_address_mid_increment_step,
    
            conv_desc.row_address_bitnumber_flag,
            conv_desc.out_rowstep,
            0,
    
            1);
    
        #if DEBUG_FILE_PRINT
    
            attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
    
        for( ; start_output_row < conv_desc.outheight; start_output_row+=conv_desc.out_rowstep)
    
            ap_uint<16> start_row_idx_minus_pad_size=start_output_row-conv_desc.pad_size;
    
            printf("conv_desc.row_address_bitnumber_flag %d\n", conv_desc.row_address_bitnumber_flag);
    
            load_input_rowtile_from_ddr(
                input_DDR0,
                input_DDR1,
                input_DDR2,
                input_DDR3,
                input_buffer,
                conv_desc.inheight,
                conv_desc.inwidth,
                conv_desc.stride,
                conv_desc.pad_size,
                conv_desc.inwidth_align8,
                conv_desc.indepth_align8,
                conv_desc.group_indepth_x_inwidth_align8_by8,
                conv_desc.group_indepth_offset_x_inwidth_align8_by8,
                conv_desc.inwidth_ceildiv_inbufferwidth,
                conv_desc.buffer_address_mid_increment_step,
                conv_desc.input_load_burst_length,
    
                conv_desc.row_address_bitnumber_flag,
                conv_desc.out_rowstep,
    
                start_output_row + conv_desc.out_rowstep,
    
            #if DEBUG_FILE_PRINT
    
                attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
    
            wino_systolic_kernel(
                weight_DDR0,
                weight_DDR1,
                weight_DDR2,
                weight_DDR3,
    
                start_row_idx_minus_pad_size,
                start_output_row==0,
                start_output_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
                conv_desc,
                ap_clk_div2