// wino.cpp -- parameter loading, wino systolic kernel and top-level entry point.
#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
#include "wino_IO.cpp"
#include "wino_cell.cpp"
#include "param.h"
/*
 * Copy the 128-word parameter block at mem_params into a local array and
 * unpack words 0..68 into the fields of conv_desc. Words 69..127 are read
 * but not assigned to any field here.
 */
void load_params(ap_int<32>* mem_params, ConvDesc_t &conv_desc)
{
    const int kParamWords = 128;
    ap_int<32> word[kParamWords];

    // Stage the whole parameter block locally before unpacking it.
    for (int idx = 0; idx < kParamWords; ++idx)
    {
#pragma HLS pipeline
        word[idx] = mem_params[idx];
    }

    // Original signal
    conv_desc.inheight    = word[0];
    conv_desc.inwidth     = word[1];
    conv_desc.indepth     = word[2];
    conv_desc.outheight   = word[3];
    conv_desc.outwidth    = word[4];
    conv_desc.outdepth    = word[5];
    conv_desc.kernel_size = word[6];
    conv_desc.pad_size    = word[7];
    conv_desc.stride      = word[8];

    // wino related
    conv_desc.wino5x5_flag = word[9];
    // 1: 3x3, 0:5x5
    conv_desc.wino_output_tile_size = word[10];

    // input buffer related
    conv_desc.indepth_align_minitile_size               = word[11];
    conv_desc.indepth_align8                            = word[12];
    conv_desc.indepth_ceildiv8                          = word[13];
    conv_desc.inwidth_ceildiv_inbufferwidth             = word[14];
    conv_desc.inwidth_align8                            = word[15];
    conv_desc.group_indepth_offset                      = word[16];
    conv_desc.group_indepth                             = word[17];
    conv_desc.input_ddr_bytes                           = word[18];
    conv_desc.input_ddr_128bits                         = word[19];
    conv_desc.group_indepth_x_inwidth_align8_by8        = word[20];
    conv_desc.group_indepth_offset_x_inwidth_align8_by8 = word[21];
    conv_desc.input_load_burst_length                   = word[22];
    conv_desc.buffer_address_mid_increment_step         = word[23];
    conv_desc.row_address_bitnumber_flag                = word[24];

    // output buffer related
    conv_desc.outwidth_align8              = word[25];
    conv_desc.outdepth_align8              = word[26];
    conv_desc.outheight_align4             = word[27];
    conv_desc.outdepth_align_minitile_size = word[28];
    conv_desc.group_outdepth_offset        = word[29];
    conv_desc.group_outdepth               = word[30];
    conv_desc.output_ddr_bytes             = word[31];
    conv_desc.output_ddr_128bits           = word[32];

    // weight related
    conv_desc.weightbuffer_load_indepth_number       = word[33];
    conv_desc.weightbuffer_load_indepth_step         = word[34];
    conv_desc.weightbuffer_load_outdepth_number      = word[35];
    conv_desc.weightbuffer_load_outdepth_step        = word[36];
    conv_desc.weightbuffer_indepth_minitile_number   = word[37];
    conv_desc.weightbuffer_outdepth_minitile_number  = word[38];
    conv_desc.weightbuffer_total_load_number         = word[39];

    // weight_load hardware
    conv_desc.weightDDR_buffer_burst_length                      = word[40];
    conv_desc.weightDDR_port_burst_length                        = word[41];
    conv_desc.weightDDR_burst_number                             = word[42];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1  = word[43];
    conv_desc.loop_start_output_baserowcol_reset_cycle           = word[44];
    conv_desc.loop_weight_feed_bound                             = word[45];

    // input buffer feeding related
    conv_desc.wino_out_size_by_wino_width                = word[46];
    conv_desc.wino_tile_number_in_outwidth               = word[47];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle = word[48];
    conv_desc.loop_wino_tile_col_reset_cycle             = word[49];
    conv_desc.loop_wino_tile_row_reset_cycle             = word[50];
    conv_desc.buffer_address_mid_minitile_depth_step     = word[51];
    conv_desc.input_buffer_feeding_loop_bound            = word[52];
    conv_desc.input_transform_feeding_loop_bound         = word[53];

    // row_tile calculation: these parameters have to be solved after the
    // weight parameters are decided.
    conv_desc.out_rowstep                     = word[54];
    conv_desc.wino_tile_number_in_out_rowstep = word[55];

    // wino computation
    conv_desc.total_input_stream_tile                     = word[56];
    conv_desc.loop_omini_base_reset_cycle                 = word[57];
    conv_desc.loop_wino_cell_bound                        = word[58];
    conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1 = word[59];
    conv_desc.loop_iload_reset_cycle                      = word[60];
    conv_desc.outbuffer_oload_increment_step              = word[61];
    conv_desc.outbuffer_omini_increment_step              = word[62];

    // output write back
    conv_desc.outdepth_ceildiv8        = word[63];
    conv_desc.output_burst_length      = word[64];
    conv_desc.write_back_flag          = word[65];
    conv_desc.wino_col_pix_upper_bound = word[66];
    conv_desc.wino_tile_number_rowcol  = word[67];
    conv_desc.out_ddr_increment_step   = word[68];
}
/*
 * One pass of the compute pipeline, declared as an HLS dataflow region.
 *
 * The body wires together, through hls::stream arrays declared with depth=1:
 *   input_feed_underconstruction -> input_tile_stream[WINO_WIDTH]
 *   input_transform (one per i in 0..WINO_WIDTH-1)
 *                                -> input_tile_transformed_stream[WINO_WIDTH]
 *   weight_feed_one_port<0..3>   -> weight_stream[0..3][WEIGHT_FEED_NUMBER_PER_PORT]
 *   wino_stream_block            -> out_buffer
 *
 * Parameters:
 *   weight_DDR0..3   passed to weight_feed_one_port<0..3> respectively.
 *   input_buffer     passed to input_feed_underconstruction; partitioned
 *                    completely in dims 1 and 2.
 *   out_buffer       passed to wino_stream_block; partitioned completely in
 *                    dims 1 to 4.
 *   start_output_row not referenced anywhere in this body.
 *   start_row_idx_minus_pad_size
 *                    forwarded to input_feed_underconstruction.
 *   first_flag, last_flag
 *                    forwarded to every weight_feed_one_port call.
 *   conv_desc        descriptor whose fields are forwarded to the stages;
 *                    declared ap_stable.
 *   ap_clk_div2      forwarded to wino_stream_block.
 *
 * The callee stages are defined outside this file section; their behaviour
 * is not described here.
 */
void wino_systolic_kernel(
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
ap_uint<OUT_WIDTH*2> out_buffer[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH],
ap_uint<16> start_output_row,
ap_int<16> start_row_idx_minus_pad_size,
ap_uint<1> first_flag,
ap_uint<1> last_flag,
ConvDesc_t conv_desc,
ap_uint<1> ap_clk_div2
)
{
// Trace message emitted on every call.
printf("****wino_systolic_kernel****\n");
#pragma HLS interface ap_stable port=conv_desc
#pragma HLS array_partition variable =input_buffer dim=1 complete
#pragma HLS array_partition variable =input_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=4 complete
#pragma HLS array_partition variable =out_buffer dim=3 complete
#pragma HLS array_partition variable =out_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=1 complete
#pragma HLS dataflow
// Streams connecting the stages below; each is declared with depth=1.
hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_stream[WINO_WIDTH];
#pragma HLS stream variable=input_tile_stream depth=1
hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_transformed_stream[WINO_WIDTH];
#pragma HLS stream variable=input_tile_transformed_stream depth=1
hls::stream<ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
#pragma HLS stream variable=weight_stream depth=1
// Stage 1: input_buffer -> input_tile_stream.
input_feed_underconstruction(
input_buffer,
input_tile_stream,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
conv_desc.inwidth,
conv_desc.pad_size,
conv_desc.weightbuffer_load_outdepth_number,
conv_desc.wino_output_tile_size,
conv_desc.input_buffer_feeding_loop_bound,
conv_desc.loop_wino_tile_row_reset_cycle,
conv_desc.loop_wino_tile_col_reset_cycle,
conv_desc.buffer_address_mid_minitile_depth_step,
conv_desc.wino_out_size_by_wino_width,
conv_desc.row_address_bitnumber_flag,
start_row_idx_minus_pad_size
);
// Stage 2: one input_transform instance per stream index (loop fully unrolled).
for(int i=0;i<WINO_WIDTH;i++)
{
#pragma HLS unroll
input_transform(
input_tile_stream[i],
input_tile_transformed_stream[i],
conv_desc.input_transform_feeding_loop_bound,
i
);
}
// Stage 3: four weight feeders, one per weight DDR pointer. The four calls
// differ only in the template index, the DDR pointer and the weight_stream row.
weight_feed_one_port<0>(
weight_DDR0,
weight_stream[0],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
conv_desc.weightbuffer_outdepth_minitile_number,
first_flag,
last_flag
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
weight_feed_one_port<1>(
weight_DDR1,
weight_stream[1],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
conv_desc.weightbuffer_outdepth_minitile_number,
first_flag,
last_flag
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
weight_feed_one_port<2>(
weight_DDR2,
weight_stream[2],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
conv_desc.weightbuffer_outdepth_minitile_number,
first_flag,
last_flag
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
weight_feed_one_port<3>(
weight_DDR3,
weight_stream[3],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
conv_desc.weightbuffer_outdepth_minitile_number,
first_flag,
last_flag
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
// Stage 4: consumes the transformed input streams and all weight streams,
// and is the only stage that receives out_buffer.
wino_stream_block(
input_tile_transformed_stream,
weight_stream,
out_buffer,
conv_desc.weightbuffer_outdepth_minitile_number,
conv_desc.total_input_stream_tile,
conv_desc.loop_omini_base_reset_cycle,
conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1,
conv_desc.loop_iload_reset_cycle,
conv_desc.loop_wino_cell_bound,
conv_desc.outbuffer_oload_increment_step,
conv_desc.outbuffer_omini_increment_step,
conv_desc.wino5x5_flag
#if DEBUG_CONV_DESC
,conv_desc
#endif
,ap_clk_div2
);
}
/*
 * Top-level accelerator entry point.
 *
 * Sequence performed by this function:
 *   1. load_params() unpacks mem_params into a local ConvDesc_t.
 *   2. load_input_rowtile_from_ddr() fills input_buffer for start row 0.
 *   3. For each row step (compute_start_row = 0, out_rowstep, ... while
 *      < outheight): wino_systolic_kernel() computes into one of the two
 *      output buffers, write_output_to_DDR() is called on the other buffer
 *      with write_start_row (one row step behind compute_start_row), then
 *      load_input_rowtile_from_ddr() is called for the next row step and
 *      the ping-pong selector is toggled.
 *   4. After the loop, one more write_output_to_DDR() call is issued for the
 *      buffer computed in the final iteration.
 *
 * Parameters:
 *   input_DDR0..3   passed to load_input_rowtile_from_ddr.
 *   weight_DDR0..3  passed to wino_systolic_kernel.
 *   output_DDR0..3  passed to write_output_to_DDR.
 *   mem_params      128-word parameter block read by load_params.
 *   ap_clk_div2     only a port when __SDSVHLS__ is defined; otherwise a
 *                   local fixed to 0. Forwarded to wino_systolic_kernel.
 */
#pragma SDS data zero_copy(input_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR0[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR1[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR2[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR3[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(mem_params[0:128])
#pragma SDS data sys_port(input_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(input_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(input_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(input_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(weight_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(weight_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(weight_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(weight_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(output_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(output_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(output_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(output_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(mem_params:ps_e_S_AXI_HP0_FPD)
void wino_systolic_top(
ap_uint<128> *input_DDR0,
ap_uint<128> *input_DDR1,
ap_uint<128> *input_DDR2,
ap_uint<128> *input_DDR3,
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR0,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR1,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR2,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR3,
// ConvDesc_t conv_desc,
ap_int<32> *mem_params
#ifdef __SDSVHLS__
,ap_uint<1> ap_clk_div2
#endif
)
{
#ifndef __SDSVHLS__
// Without __SDSVHLS__ there is no ap_clk_div2 port, so a constant 0 is used.
ap_uint<1> ap_clk_div2=0;
#endif
#pragma HLS interface m_axi port= input_DDR3 depth=65535
#pragma HLS interface m_axi port= input_DDR2 depth=65535
#pragma HLS interface m_axi port= input_DDR1 depth=65535
#pragma HLS interface m_axi port= input_DDR0 depth=65535
#pragma HLS interface m_axi port= output_DDR3 depth=65535
#pragma HLS interface m_axi port= output_DDR2 depth=65535
#pragma HLS interface m_axi port= output_DDR1 depth=65535
#pragma HLS interface m_axi port= output_DDR0 depth=65535
#pragma HLS interface m_axi port= weight_DDR3 depth=65535
#pragma HLS interface m_axi port= weight_DDR2 depth=65535
#pragma HLS interface m_axi port= weight_DDR1 depth=65535
#pragma HLS interface m_axi port= weight_DDR0 depth=65535
#pragma HLS interface m_axi port= mem_params depth=128
//input buffer declaration
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=input_buffer complete dim=1
#pragma HLS array_partition variable=input_buffer complete dim=2
#pragma HLS resource variable=input_buffer core=RAM_T2P_BRAM
// Two output buffers used alternately: one is computed into while the other
// is written back.
ap_uint<OUT_WIDTH*2> output_buffer0[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer0 complete dim=1
#pragma HLS array_partition variable=output_buffer0 complete dim=2
#pragma HLS resource variable=output_buffer0 core=RAM_T2P_BRAM
ap_uint<OUT_WIDTH*2> output_buffer1[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer1 complete dim=1
#pragma HLS array_partition variable=output_buffer1 complete dim=2
#pragma HLS resource variable=output_buffer1 core=RAM_T2P_BRAM
ConvDesc_t conv_desc;
// Ping-pong selector. It must start from a defined value: it is read by the
// first loop iteration, and an uninitialized ap_uint holds an indeterminate
// value in C simulation. With 0 the first iteration computes into
// output_buffer1.
ap_uint<1> pingpong=0;
#if DEBUG_FILE_PRINT
clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer0);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer1);
#endif
load_params(mem_params,conv_desc);
// Initial input load for start row 0 (last two arguments: 0, 1).
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
0,
1);
#if DEBUG_FILE_PRINT
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
#endif
ap_int<16> compute_start_row =0;
// Write-back trails compute by one row step, so it starts negative.
// NOTE(review): presumably write_output_to_DDR does nothing useful for a
// negative start row on the first iteration -- confirm in its definition.
ap_int<16> write_start_row= -conv_desc.out_rowstep;
for( ; compute_start_row < conv_desc.outheight; compute_start_row+=conv_desc.out_rowstep,write_start_row+=conv_desc.out_rowstep)
{
// Signed: this value is negative whenever compute_start_row < pad_size, and
// the kernel parameter that receives it is ap_int<16>.
ap_int<16> start_row_idx_minus_pad_size=compute_start_row-conv_desc.pad_size;
// NOTE(review): the last_flag argument below compares against
// wino_output_tile_size while the loop advances by out_rowstep -- confirm
// this is the intended "last iteration" condition.
if(pingpong )
{
// Compute into output_buffer0, write back output_buffer1.
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer0,
compute_start_row,
start_row_idx_minus_pad_size,
compute_start_row==0,
compute_start_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
conv_desc,
ap_clk_div2
);
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer0,0,outfilename);
#endif
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
else
{
// Compute into output_buffer1, write back output_buffer0.
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer1,
compute_start_row,
start_row_idx_minus_pad_size,
compute_start_row==0,
compute_start_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
conv_desc,
ap_clk_div2
);
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer1,0,outfilename);
#endif
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
// Load the input rows for the next row step (last argument 0 here, 1 for the
// initial load above).
// NOTE(review): this also runs on the final iteration, where the next start
// row is >= outheight -- confirm load_input_rowtile_from_ddr tolerates that.
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
compute_start_row + conv_desc.out_rowstep,
0);
#if DEBUG_FILE_PRINT
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
#endif
pingpong =~pingpong;
}
// Drain: pingpong was toggled after the final compute, so the branch taken
// here selects the buffer that the last loop iteration computed into.
if(pingpong )
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
else
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
}