// wino.cpp
// (The web-viewer banner and file-size caption that were pasted here are not part of the source.)
#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
#include "wino_IO.cpp"
#include "wino_cell.cpp"
#include "../software/param.h"
// Top-level HLS function of the "wino" systolic design (the prefix presumably
// stands for Winograd -- confirm against the project documentation).
//
// State of the code as written: only the input-load, input-feed and
// weight-feed stages are active. The compute stage (wino_systolic) and the
// write-back stage (write_output_to_DDR) exist only as commented-out code
// further down, so output_DDR0..3, output_buffer0/1 and pingpong are declared
// but never used by any active statement.
//
// Parameters:
//   input_DDR0..3  - 128-bit-wide pointers forwarded to load_input_rowtile_from_ddr.
//   weight_DDR0..3 - 128-bit-wide pointers; weight_DDRk is forwarded to
//                    weight_feed_one_port<k>.
//   output_DDR0..3 - 128-bit-wide pointers; referenced only in commented-out code.
//   conv_desc      - descriptor whose fields supply every size, stride and loop
//                    bound passed to the helper functions below.
void wino_systolic_top(
ap_uint<128> *input_DDR0,
ap_uint<128> *input_DDR1,
ap_uint<128> *input_DDR2,
ap_uint<128> *input_DDR3,
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<128> *output_DDR0,
ap_uint<128> *output_DDR1,
ap_uint<128> *output_DDR2,
ap_uint<128> *output_DDR3,
ConvDesc_t &conv_desc
)
{
// Every pointer argument is mapped to an AXI master (m_axi) interface.
// No bundle name is given on any of them; how the tool groups the twelve
// ports is therefore left to its defaults -- TODO confirm this is intended.
#pragma HLS interface m_axi port= input_DDR3 depth=65535
#pragma HLS interface m_axi port= input_DDR2 depth=65535
#pragma HLS interface m_axi port= input_DDR1 depth=65535
#pragma HLS interface m_axi port= input_DDR0 depth=65535
#pragma HLS interface m_axi port= output_DDR3 depth=65535
#pragma HLS interface m_axi port= output_DDR2 depth=65535
#pragma HLS interface m_axi port= output_DDR1 depth=65535
#pragma HLS interface m_axi port= output_DDR0 depth=65535
#pragma HLS interface m_axi port= weight_DDR3 depth=65535
#pragma HLS interface m_axi port= weight_DDR2 depth=65535
#pragma HLS interface m_axi port= weight_DDR1 depth=65535
#pragma HLS interface m_axi port= weight_DDR0 depth=65535
//input buffer declaration
// On-chip input buffer, fully partitioned along its first two dimensions;
// only the third (depth) dimension remains an addressable array.
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=input_buffer complete dim=1
#pragma HLS array_partition variable=input_buffer complete dim=2
// Two output buffers with the same shape and partitioning. Together with
// `pingpong` they are only referenced from the commented-out code below
// (which alternates between them); no active statement reads or writes them.
ap_uint<36> output_buffer0[16][16][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer0 complete dim=1
#pragma HLS array_partition variable=output_buffer0 complete dim=2
ap_uint<36> output_buffer1[16][16][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer1 complete dim=1
#pragma HLS array_partition variable=output_buffer1 complete dim=2
// NOTE(review): declared without an initializer. The commented-out code
// tests `if(pingpong)` before ever assigning it, so it must be given an
// initial value if that code is re-enabled.
ap_uint<1> pingpong;
#if DEBUG_FILE_PRINT
// Debug builds only: reset the buffer contents before the first load.
clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
#endif
// Initial load performed once before the row-tile loop. It differs from the
// in-loop call only in its last two arguments: conv_desc.wino_output_tile_size
// and the literal 1 (the in-loop call passes
// start_output_row + wino_output_tile_size and 0).
// Presumably these are a row index/count and a "first load" flag -- the
// callee is defined in another file, TODO confirm.
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.wino_output_tile_size,
1);
#if DEBUG_FILE_PRINT
// Debug builds only: dump the input buffer to input_buffer_content.txt.
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
#endif
// Walk the output rows in steps of conv_desc.wino_output_tile_size, starting
// at row 0 and stopping once start_output_row reaches conv_desc.outheight.
ap_uint<16> start_output_row =0;
for( ; start_output_row < conv_desc.outheight; start_output_row+=conv_desc.wino_output_tile_size)
{
// NOTE(review): the result is stored in an unsigned ap_uint<16>, so whenever
// pad_size > start_output_row (e.g. the first iteration with non-zero padding)
// the value wraps modulo 2^16 instead of going negative. Presumably
// input_feed_underconstruction accounts for this -- TODO confirm.
ap_uint<16> start_row_idx_minus_pad_size=start_output_row-conv_desc.pad_size;
// Per-tile load; same arguments as the initial load except for the last two
// (start_output_row + wino_output_tile_size, and 0 instead of 1).
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
start_output_row + conv_desc.wino_output_tile_size,
0);
#if DEBUG_FILE_PRINT
// Debug builds only: dump the input buffer after this tile's load
// (same file name as the dump after the initial load).
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
#endif
// Input-feed stage. The two stream parameters are commented out in this
// call, so nothing visible here receives its output; the function name
// suggests it is still under construction.
input_feed_underconstruction(
input_buffer,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.pad_size,
conv_desc.weightbuffer_load_indepth_number,
conv_desc.weightbuffer_load_outdepth_number,
conv_desc.wino_output_tile_size,
conv_desc.input_buffer_feeding_loop_bound,
conv_desc.loop_wino_tile_col_reset_cycle,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle,
conv_desc.buffer_address_mid_minitile_depth_step,
conv_desc.wino_out_size_by_wino_width,
start_output_row,
start_row_idx_minus_pad_size
// Debug builds pass the whole descriptor as an extra trailing argument.
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
// One row of WEIGHT_FEED_NUMBER_PER_PORT streams per weight DDR port.
// The array is declared inside the loop body, i.e. once per row tile.
// NOTE(review): in the active code nothing other than the debug dump below
// is handed these streams after the feeders, so there is no visible consumer.
hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
// The four calls below are identical except for the template argument, the
// DDR pointer and the stream row, which all use the same port index. The
// template argument must be a compile-time constant, hence four explicit calls.
// The last two arguments are computed per tile:
//   start_output_row==0                                      -> true on the first tile
//   start_output_row+wino_output_tile_size > outheight       -> see note
// NOTE(review): the commented-out wino_systolic calls below use `>=` for the
// corresponding expression. With `>` the flag stays false on the final tile
// when outheight is an exact multiple of the tile size -- TODO confirm which
// comparison weight_feed_one_port expects.
weight_feed_one_port<0>(
weight_DDR0,
weight_stream[0],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_port_burst_length,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_basecol_reset_cycle,
conv_desc.loop_weight_feed_bound,
start_output_row==0,
start_output_row+conv_desc.wino_output_tile_size > conv_desc.outheight
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
weight_feed_one_port<1>(
weight_DDR1,
weight_stream[1],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_port_burst_length,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_basecol_reset_cycle,
conv_desc.loop_weight_feed_bound,
start_output_row==0,
start_output_row+conv_desc.wino_output_tile_size > conv_desc.outheight
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
weight_feed_one_port<2>(
weight_DDR2,
weight_stream[2],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_port_burst_length,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_basecol_reset_cycle,
conv_desc.loop_weight_feed_bound,
start_output_row==0,
start_output_row+conv_desc.wino_output_tile_size > conv_desc.outheight
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
weight_feed_one_port<3>(
weight_DDR3,
weight_stream[3],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_port_burst_length,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_basecol_reset_cycle,
conv_desc.loop_weight_feed_bound,
start_output_row==0,
start_output_row+conv_desc.wino_output_tile_size > conv_desc.outheight
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
#if DEBUG_FILE_PRINT
// Debug builds only: pass every weight stream to attach_weight_stream_content
// with its own file name, "weightstream<k>.txt", where
// k = port index * WEIGHT_FEED_NUMBER_PER_PORT + stream index within the port.
for(int i=0;i<4;i++)
{
for(int j=0;j<WEIGHT_FEED_NUMBER_PER_PORT;j++)
{
int outdepth_minitile_idx=i*WEIGHT_FEED_NUMBER_PER_PORT+j;
char filename[100];
sprintf(filename,"weightstream%d.txt",outdepth_minitile_idx);
attach_weight_stream_content<INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE_SQUARE>(weight_stream[i][j],filename);
}
}
#endif
// ---------------------------------------------------------------------------
// Disabled legacy code: compute into one output buffer while writing the
// other one back to DDR, then toggle `pingpong`. It refers to scalar names
// (input_height, output_height, row_repeat_times, ...) that are not declared
// anywhere in this function, so it cannot simply be uncommented as-is.
// ---------------------------------------------------------------------------
// if(pingpong)
// {
// wino_systolic(
// input_buffer,
// output_buffer0,
// weight_DDR0,
// weight_DDR1,
// weight_DDR2,
// weight_DDR3,
// input_height,
// input_width,
// input_depth,
// input_width_ceildiv_16,
// input_depth_align8,
// output_height,
// output_width,
// output_depth,
// kernel_window_size,
// pad_size,
// weight_indepth_load_number,
// weight_outdepth_load_number,
// weight_outdepth_feed_size,
// start_output_row,
// weight_total_load_number,
// weight_total_feed_size,
// ddr_load_length,
// ddr_load_length_per_feed,
// row_repeat_times,
// (start_output_row==0) ,
// (start_output_row+wino_output_tile_size >= output_height));
// #if DEBUG_FILE_PRINT
// attach_output_buffer_content<0>(output_buffer0,"output_buffer_content.txt");
// #endif
// write_output_to_DDR(
// output_DDR0,
// output_DDR1,
// output_DDR2,
// output_DDR3,
// output_buffer1,
// outdepth_ceil_div8,
// start_output_row,
// start_output_row+4,
// output_height,
// output_width,
// wino_output_tile_size,
// row_repeat_times,
// 1,
// 0);
// pingpong=0;
// }
// else
// {
// wino_systolic(
// input_buffer,
// output_buffer1,
// weight_DDR0,
// weight_DDR1,
// weight_DDR2,
// weight_DDR3,
// input_height,
// input_width,
// input_depth,
// input_width_ceildiv_16,
// input_depth_align8,
// output_height,
// output_width,
// output_depth,
// kernel_window_size,
// pad_size,
// weight_indepth_load_number,
// weight_outdepth_load_number,
// weight_outdepth_feed_size,
// start_output_row,
// weight_total_load_number,
// weight_total_feed_size,
// ddr_load_length,
// ddr_load_length_per_feed,
// row_repeat_times,
// (start_output_row==0) ,
// (start_output_row+wino_output_tile_size >= output_height));
// #if DEBUG_FILE_PRINT
// attach_output_buffer_content<0>(output_buffer0,"output_buffer_content.txt");
// #endif
// write_output_to_DDR(
// output_DDR0,
// output_DDR1,
// output_DDR2,
// output_DDR3,
// output_buffer0,
// outdepth_ceil_div8,
// start_output_row,
// start_output_row+4,
// output_height,
// output_width,
// wino_output_tile_size,
// row_repeat_times,
// 1,
// 0);
// pingpong=1;
// }
}
// Disabled legacy code: a write_output_to_DDR call placed after the loop,
// selecting the output buffer by `pingpong`.
// if(pingpong)
// {
// write_output_to_DDR(
// output_DDR0,
// output_DDR1,
// output_DDR2,
// output_DDR3,
// output_buffer1,
// outdepth_ceil_div8,
// start_output_row,
// start_output_row+4,
// output_height,
// output_width,
// wino_output_tile_size,
// row_repeat_times,
// 1,
// 0);
// }
// else
// {
// write_output_to_DDR(
// output_DDR0,
// output_DDR1,
// output_DDR2,
// output_DDR3,
// output_buffer0,
// outdepth_ceil_div8,
// start_output_row,
// start_output_row+4,
// output_height,
// output_width,
// wino_output_tile_size,
// row_repeat_times,
// 1,
// 0);
// }
}