Newer
Older
xliu79
committed
#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
xliu79
committed
xliu79
committed
#include "wino_cell.cpp"
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/**
 * Read the 128-word parameter block at mem_params and unpack its first
 * 69 words (indices 0..68) into the fields of conv_desc.
 *
 * @param mem_params  Pointer to at least 128 ap_int<32> words; all 128 are
 *                    read, although only words 0..68 are consumed here.
 * @param conv_desc   Descriptor that receives the unpacked values. Fields
 *                    not listed below are left untouched.
 */
void load_params(
    ap_int<32>* mem_params,
    ConvDesc_t &conv_desc
)
{
    // Stage the whole block in a local array first, then unpack from it.
    ap_int<32> staged[128];
    for(int word=0;word<128;word++)
    {
#pragma HLS pipeline
        staged[word]=mem_params[word];
    }

    // ---- basic layer geometry -------------------------------------------
    conv_desc.inheight   =staged[0];
    conv_desc.inwidth    =staged[1];
    conv_desc.indepth    =staged[2];
    conv_desc.outheight  =staged[3];
    conv_desc.outwidth   =staged[4];
    conv_desc.outdepth   =staged[5];
    conv_desc.kernel_size=staged[6];
    conv_desc.pad_size   =staged[7];
    conv_desc.stride     =staged[8];

    // ---- Winograd mode ---------------------------------------------------
    // The original note beside this field reads "1: 3x3, 0:5x5".
    conv_desc.wino5x5_flag         =staged[9];
    conv_desc.wino_output_tile_size=staged[10];

    // ---- input buffer ----------------------------------------------------
    conv_desc.indepth_align_minitile_size              =staged[11];
    conv_desc.indepth_align8                           =staged[12];
    conv_desc.indepth_ceildiv8                         =staged[13];
    conv_desc.inwidth_ceildiv_inbufferwidth            =staged[14];
    conv_desc.inwidth_align8                           =staged[15];
    conv_desc.group_indepth_offset                     =staged[16];
    conv_desc.group_indepth                            =staged[17];
    conv_desc.input_ddr_bytes                          =staged[18];
    conv_desc.input_ddr_128bits                        =staged[19];
    conv_desc.group_indepth_x_inwidth_align8_by8       =staged[20];
    conv_desc.group_indepth_offset_x_inwidth_align8_by8=staged[21];
    conv_desc.input_load_burst_length                  =staged[22];
    conv_desc.buffer_address_mid_increment_step        =staged[23];
    conv_desc.row_address_bitnumber_flag               =staged[24];

    // ---- output buffer ---------------------------------------------------
    conv_desc.outwidth_align8             =staged[25];
    conv_desc.outdepth_align8             =staged[26];
    conv_desc.outheight_align4            =staged[27];
    conv_desc.outdepth_align_minitile_size=staged[28];
    conv_desc.group_outdepth_offset       =staged[29];
    conv_desc.group_outdepth              =staged[30];
    conv_desc.output_ddr_bytes            =staged[31];
    conv_desc.output_ddr_128bits          =staged[32];

    // ---- weight buffer ---------------------------------------------------
    conv_desc.weightbuffer_load_indepth_number     =staged[33];
    conv_desc.weightbuffer_load_indepth_step       =staged[34];
    conv_desc.weightbuffer_load_outdepth_number    =staged[35];
    conv_desc.weightbuffer_load_outdepth_step      =staged[36];
    conv_desc.weightbuffer_indepth_minitile_number =staged[37];
    conv_desc.weightbuffer_outdepth_minitile_number=staged[38];
    conv_desc.weightbuffer_total_load_number       =staged[39];

    // ---- weight load hardware --------------------------------------------
    conv_desc.weightDDR_buffer_burst_length                    =staged[40];
    conv_desc.weightDDR_port_burst_length                      =staged[41];
    conv_desc.weightDDR_burst_number                           =staged[42];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1=staged[43];
    conv_desc.loop_start_output_baserowcol_reset_cycle         =staged[44];
    conv_desc.loop_weight_feed_bound                           =staged[45];

    // ---- input buffer feeding --------------------------------------------
    conv_desc.wino_out_size_by_wino_width               =staged[46];
    conv_desc.wino_tile_number_in_outwidth              =staged[47];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle=staged[48];
    conv_desc.loop_wino_tile_col_reset_cycle            =staged[49];
    conv_desc.loop_wino_tile_row_reset_cycle            =staged[50];
    conv_desc.buffer_address_mid_minitile_depth_step    =staged[51];
    conv_desc.input_buffer_feeding_loop_bound           =staged[52];
    conv_desc.input_transform_feeding_loop_bound        =staged[53];

    // ---- row-tile calculation ---------------------------------------------
    // Per the original note, these have to be solved after the weight
    // parameters are decided.
    conv_desc.out_rowstep                    =staged[54];
    conv_desc.wino_tile_number_in_out_rowstep=staged[55];

    // ---- Winograd computation ---------------------------------------------
    conv_desc.total_input_stream_tile                    =staged[56];
    conv_desc.loop_omini_base_reset_cycle                =staged[57];
    conv_desc.loop_wino_cell_bound                       =staged[58];
    conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1=staged[59];
    conv_desc.loop_iload_reset_cycle                     =staged[60];
    conv_desc.outbuffer_oload_increment_step             =staged[61];
    conv_desc.outbuffer_omini_increment_step             =staged[62];

    // ---- output write-back ------------------------------------------------
    conv_desc.outdepth_ceildiv8       =staged[63];
    conv_desc.output_burst_length     =staged[64];
    conv_desc.write_back_flag         =staged[65];
    conv_desc.wino_col_pix_upper_bound=staged[66];
    conv_desc.wino_tile_number_rowcol =staged[67];
    conv_desc.out_ddr_increment_step  =staged[68];
}
// Dataflow region for one compute pass: feeds input tiles and weights into
// streams and hands them to wino_stream_block together with out_buffer.
//
// Parameters (as used in the visible body):
//   weight_DDR0..3  - one pointer per weight_feed_one_port<0..3> instance.
//   input_buffer    - passed to input_feed_underconstruction.
//   out_buffer      - passed to wino_stream_block.
//   start_output_row - not referenced in the visible body.
//   start_row_idx_minus_pad_size - forwarded to input_feed_underconstruction.
//   first_flag, last_flag - forwarded to every weight_feed_one_port call.
//   conv_desc       - layer descriptor; individual fields are forwarded to the
//                     stages below. Declared ap_stable via pragma.
//   ap_clk_div2     - forwarded to wino_stream_block.
//
// NOTE(review): this listing contains five `#endif` lines with no visible
// matching `#if`, and the input_transform call ends in a trailing comma.
// Lines appear to be missing from this view; elsewhere in this file the
// `,conv_desc` argument is wrapped in `#if DEBUG_CONV_DESC` - confirm against
// the repository copy before building.
void wino_systolic_kernel(
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
ap_uint<OUT_WIDTH*2> out_buffer[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH],
ap_uint<16> start_output_row,
ap_int<16> start_row_idx_minus_pad_size,
ap_uint<1> first_flag,
ap_uint<1> last_flag,
ConvDesc_t conv_desc,
ap_uint<1> ap_clk_div2
)
{
printf("****wino_systolic_kernel****\n");
// Interface / memory layout directives: conv_desc is declared stable, and
// both buffers are completely partitioned along the listed dimensions.
#pragma HLS interface ap_stable port=conv_desc
#pragma HLS array_partition variable =input_buffer dim=1 complete
#pragma HLS array_partition variable =input_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=4 complete
#pragma HLS array_partition variable =out_buffer dim=3 complete
#pragma HLS array_partition variable =out_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=1 complete
#pragma HLS dataflow
// Streams connecting the stages: raw input tiles, transformed input tiles
// (depth 1), and weights (4 ports x WEIGHT_FEED_NUMBER_PER_PORT).
hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_stream[WINO_WIDTH];
hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_transformed_stream[WINO_WIDTH];
#pragma HLS stream variable=input_tile_transformed_stream depth=1
hls::stream<ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
// Stage 1: input_buffer -> input_tile_stream.
input_feed_underconstruction(
input_buffer,
input_tile_stream,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
conv_desc.inwidth,
conv_desc.pad_size,
conv_desc.weightbuffer_load_outdepth_number,
conv_desc.wino_output_tile_size,
conv_desc.input_buffer_feeding_loop_bound,
conv_desc.loop_wino_tile_row_reset_cycle,
conv_desc.loop_wino_tile_col_reset_cycle,
conv_desc.buffer_address_mid_minitile_depth_step,
conv_desc.wino_out_size_by_wino_width,
conv_desc.row_address_bitnumber_flag,
start_row_idx_minus_pad_size
);
// Stage 2: one input_transform call per stream index, input_tile_stream[i]
// -> input_tile_transformed_stream[i].
// NOTE(review): the argument list ends with a trailing comma; one or more
// arguments seem to be missing from this view.
for(int i=0;i<WINO_WIDTH;i++)
{
input_transform(
input_tile_stream[i],
input_tile_transformed_stream[i],
);
}
// Stage 3: four weight feeders, one per weight DDR pointer, each filling
// its own row of weight_stream. The four calls differ only in the template
// index, the DDR pointer and the weight_stream row.
weight_feed_one_port<0>(
weight_DDR0,
weight_stream[0],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
// NOTE(review): no matching #if is visible for the #endif below.
,conv_desc
#endif
);
weight_feed_one_port<1>(
weight_DDR1,
weight_stream[1],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
,conv_desc
#endif
);
weight_feed_one_port<2>(
weight_DDR2,
weight_stream[2],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
,conv_desc
#endif
);
weight_feed_one_port<3>(
weight_DDR3,
weight_stream[3],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
,conv_desc
#endif
);
// Stage 4: wino_stream_block receives the transformed input streams, the
// weight streams and out_buffer (presumably it writes results into
// out_buffer - confirm in wino_stream_block's definition).
wino_stream_block(
input_tile_transformed_stream,
weight_stream,
out_buffer,
conv_desc.weightbuffer_outdepth_minitile_number,
conv_desc.total_input_stream_tile,
conv_desc.loop_omini_base_reset_cycle,
conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1,
conv_desc.loop_iload_reset_cycle,
conv_desc.loop_wino_cell_bound,
conv_desc.outbuffer_oload_increment_step,
conv_desc.outbuffer_omini_increment_step,
conv_desc.wino5x5_flag
,conv_desc
#endif
,ap_clk_div2
);
}
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
// Top-level accelerator entry point.
//
// Visible behaviour: declares one input buffer and two output buffers, calls
// the clear_* helpers on them, then iterates over output rows in steps of
// conv_desc.out_rowstep. In each iteration one output buffer is handed to
// wino_systolic_kernel while the other is handed to write_output_to_DDR, and
// load_input_rowtile_from_ddr is called with next_start_row. After the loop a
// last write_output_to_DDR call is made on the buffer selected by `pingpong`.
//
// Parameters:
//   input_DDR0..3  - passed to load_input_rowtile_from_ddr.
//   weight_DDR0..3 - passed to wino_systolic_kernel.
//   output_DDR0..3 - passed to write_output_to_DDR.
//   mem_params     - parameter block; the SDS pragma below sizes it at 128
//                    words. No use of it is visible in this listing
//                    (presumably consumed by load_params - confirm).
//   ap_clk_div2    - only part of the signature when __SDSVHLS__ is defined.
//
// NOTE(review): this listing contains stray non-code lines (`xliu79`,
// `committed`, bare numbers) and several constructs are visibly incomplete:
// no opening brace after the parameter list, two argument lists without a
// callee, a `for` without an opening brace, and no visible declarations of
// `pingpong` or `write_start_row`. Confirm against the repository copy
// before building.
#pragma SDS data zero_copy(input_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR0[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR1[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR2[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR3[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(mem_params[0:128])
#pragma SDS data sys_port(input_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(input_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(input_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(input_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(weight_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(weight_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(weight_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(weight_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(output_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(output_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(output_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(output_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(mem_params:ps_e_S_AXI_HP0_FPD)
void wino_systolic_top(
ap_uint<128> *input_DDR0,
ap_uint<128> *input_DDR1,
ap_uint<128> *input_DDR2,
ap_uint<128> *input_DDR3,
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR0,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR1,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR2,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR3,
ap_int<32> *mem_params
#ifdef __SDSVHLS__
,ap_uint<1> ap_clk_div2
#endif
xliu79
committed
)
// #pragma HLS interface m_axi port= input_DDR3 depth=65535
// #pragma HLS interface m_axi port= input_DDR2 depth=65535
// #pragma HLS interface m_axi port= input_DDR1 depth=65535
// #pragma HLS interface m_axi port= input_DDR0 depth=65535
// #pragma HLS interface m_axi port= output_DDR3 depth=65535
// #pragma HLS interface m_axi port= output_DDR2 depth=65535
// #pragma HLS interface m_axi port= output_DDR1 depth=65535
// #pragma HLS interface m_axi port= output_DDR0 depth=65535
// #pragma HLS interface m_axi port= weight_DDR3 depth=65535
// #pragma HLS interface m_axi port= weight_DDR2 depth=65535
// #pragma HLS interface m_axi port= weight_DDR1 depth=65535
// #pragma HLS interface m_axi port= weight_DDR0 depth=65535
// #pragma HLS interface m_axi port= mem_params depth=128
xliu79
committed
// Input tile buffer, completely partitioned along dimensions 1 and 2.
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=input_buffer complete dim=1
#pragma HLS array_partition variable=input_buffer complete dim=2
// Two identically shaped output buffers; the loop below alternates which one
// the kernel receives and which one write_output_to_DDR receives.
ap_uint<OUT_WIDTH*2> output_buffer0[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer0 complete dim=1
#pragma HLS array_partition variable=output_buffer0 complete dim=2
#pragma HLS resource variable=output_buffer0 core=RAM_T2P_BRAM
ap_uint<OUT_WIDTH*2> output_buffer1[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer1 complete dim=1
#pragma HLS array_partition variable=output_buffer1 complete dim=2
#pragma HLS resource variable=output_buffer1 core=RAM_T2P_BRAM
ConvDesc_t conv_desc;
xliu79
committed
// Call the clear_* helpers (defined elsewhere) on all three buffers.
clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer0);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer1);
// NOTE(review): the argument list below has no visible callee or closing
// parenthesis. Its arguments match the load_input_rowtile_from_ddr call
// inside the loop, so this is presumably the initial input load with start
// row 0 - confirm.
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
xliu79
committed
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
xliu79
committed
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
0,
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
// next_start_row runs one row step ahead of compute_start_row; it is passed
// to the input loader and compared against outheight for the last-pass flag.
ap_int<16> next_start_row= conv_desc.out_rowstep;
// One iteration per row step of the output.
// NOTE(review): no opening brace for this loop is visible in this listing.
for( ap_int<16> compute_start_row =0; compute_start_row < conv_desc.outheight; compute_start_row+=conv_desc.out_rowstep)
// NOTE(review): declared ap_uint<16> here, while wino_systolic_kernel takes
// this value as ap_int<16>; the subtraction can go below zero when
// pad_size > compute_start_row - confirm the intended signedness.
ap_uint<16> start_row_idx_minus_pad_size=compute_start_row-conv_desc.pad_size;
#pragma HLS DEPENDENCE variable=input_buffer intra false
// pingpong selects the buffer roles for this pass: when set, the kernel
// receives output_buffer0 and write_output_to_DDR receives output_buffer1;
// otherwise the roles are swapped.
if(pingpong )
{
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer0,
compute_start_row,
start_row_idx_minus_pad_size,
compute_start_row==0,
next_start_row > conv_desc.outheight,
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
conv_desc,
ap_clk_div2
);
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer0,0,outfilename);
#endif
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
// Input loader call for the next row step (next_start_row).
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
next_start_row,
0);
}
else
{
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer1,
compute_start_row,
start_row_idx_minus_pad_size,
compute_start_row==0,
next_start_row > conv_desc.outheight,
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
conv_desc,
ap_clk_div2
);
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer1,0,outfilename);
#endif
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
// NOTE(review): as above, this argument list has no visible callee, trailing
// arguments or closing parenthesis/brace; it mirrors the
// load_input_rowtile_from_ddr call in the other branch - confirm.
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
// Flip the buffer selector and advance both row cursors by one row step.
// NOTE(review): pingpong's type is not visible here; `~` only acts as a
// toggle for a 1-bit type - confirm.
pingpong =~pingpong;
write_start_row+=conv_desc.out_rowstep;
next_start_row+=conv_desc.out_rowstep;
#if DEBUG_FILE_PRINT
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
#endif
xliu79
committed
}
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
// Final write-back after the loop. pingpong was flipped at the end of the
// last iteration, so the buffer chosen here is the one the last kernel call
// received (assuming the toggle above behaves as a 1-bit flip).
if(pingpong )
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
else
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
xliu79
committed
xliu79
committed
}