Newer
Older
xliu79
committed
#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
xliu79
committed
xliu79
committed
#include "wino_cell.cpp"
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/**
 * Fill a ConvDesc_t from the parameter words stored at mem_params.
 *
 * The first 128 words of mem_params are copied into a local array by a
 * pipelined loop; words 0..68 are then assigned, one word per field, to
 * conv_desc. Words 69..127 are fetched but not consumed by this function.
 *
 * @param mem_params  source of the parameter words (at least 128 readable entries).
 * @param conv_desc   descriptor whose fields are overwritten.
 */
void load_params(
    ap_int<32>* mem_params,
    ConvDesc_t &conv_desc
)
{
    ap_int<32> param_words[128];

    // Stage every word locally first; the field assignments below only read the local copy.
    for(int word=0; word<128; ++word)
    {
#pragma HLS pipeline
        param_words[word]=mem_params[word];
    }

    // ---- words 0..8: layer geometry ----
    conv_desc.inheight                                          = param_words[0];
    conv_desc.inwidth                                           = param_words[1];
    conv_desc.indepth                                           = param_words[2];
    conv_desc.outheight                                         = param_words[3];
    conv_desc.outwidth                                          = param_words[4];
    conv_desc.outdepth                                          = param_words[5];
    conv_desc.kernel_size                                       = param_words[6];
    conv_desc.pad_size                                          = param_words[7];
    conv_desc.stride                                            = param_words[8];

    // ---- words 9..10: Winograd mode ----
    conv_desc.wino5x5_flag                                      = param_words[9];
    // Original note kept here: "1: 3x3, 0:5x5" (which field it annotates is not clear from this file).
    conv_desc.wino_output_tile_size                             = param_words[10];

    // ---- words 11..24: input buffer ----
    conv_desc.indepth_align_minitile_size                       = param_words[11];
    conv_desc.indepth_align8                                    = param_words[12];
    conv_desc.indepth_ceildiv8                                  = param_words[13];
    conv_desc.inwidth_ceildiv_inbufferwidth                     = param_words[14];
    conv_desc.inwidth_align8                                    = param_words[15];
    conv_desc.group_indepth_offset                              = param_words[16];
    conv_desc.group_indepth                                     = param_words[17];
    conv_desc.input_ddr_bytes                                   = param_words[18];
    conv_desc.input_ddr_128bits                                 = param_words[19];
    conv_desc.group_indepth_x_inwidth_align8_by8                = param_words[20];
    conv_desc.group_indepth_offset_x_inwidth_align8_by8         = param_words[21];
    conv_desc.input_load_burst_length                           = param_words[22];
    conv_desc.buffer_address_mid_increment_step                 = param_words[23];
    conv_desc.row_address_bitnumber_flag                        = param_words[24];

    // ---- words 25..32: output buffer ----
    conv_desc.outwidth_align8                                   = param_words[25];
    conv_desc.outdepth_align8                                   = param_words[26];
    conv_desc.outheight_align4                                  = param_words[27];
    conv_desc.outdepth_align_minitile_size                      = param_words[28];
    conv_desc.group_outdepth_offset                             = param_words[29];
    conv_desc.group_outdepth                                    = param_words[30];
    conv_desc.output_ddr_bytes                                  = param_words[31];
    conv_desc.output_ddr_128bits                                = param_words[32];

    // ---- words 33..39: weight buffer ----
    conv_desc.weightbuffer_load_indepth_number                  = param_words[33];
    conv_desc.weightbuffer_load_indepth_step                    = param_words[34];
    conv_desc.weightbuffer_load_outdepth_number                 = param_words[35];
    conv_desc.weightbuffer_load_outdepth_step                   = param_words[36];
    conv_desc.weightbuffer_indepth_minitile_number              = param_words[37];
    conv_desc.weightbuffer_outdepth_minitile_number             = param_words[38];
    conv_desc.weightbuffer_total_load_number                    = param_words[39];

    // ---- words 40..45: weight load hardware ----
    conv_desc.weightDDR_buffer_burst_length                     = param_words[40];
    conv_desc.weightDDR_port_burst_length                       = param_words[41];
    conv_desc.weightDDR_burst_number                            = param_words[42];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1 = param_words[43];
    conv_desc.loop_start_output_baserowcol_reset_cycle          = param_words[44];
    conv_desc.loop_weight_feed_bound                            = param_words[45];

    // ---- words 46..53: input buffer feeding ----
    conv_desc.wino_out_size_by_wino_width                       = param_words[46];
    conv_desc.wino_tile_number_in_outwidth                      = param_words[47];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle        = param_words[48];
    conv_desc.loop_wino_tile_col_reset_cycle                    = param_words[49];
    conv_desc.loop_wino_tile_row_reset_cycle                    = param_words[50];
    conv_desc.buffer_address_mid_minitile_depth_step            = param_words[51];
    conv_desc.input_buffer_feeding_loop_bound                   = param_words[52];
    conv_desc.input_transform_feeding_loop_bound                = param_words[53];

    // ---- words 54..55: row-tile sizing (per the original note, solved after the weight parameters) ----
    conv_desc.out_rowstep                                       = param_words[54];
    conv_desc.wino_tile_number_in_out_rowstep                   = param_words[55];

    // ---- words 56..62: Winograd computation ----
    conv_desc.total_input_stream_tile                           = param_words[56];
    conv_desc.loop_omini_base_reset_cycle                       = param_words[57];
    conv_desc.loop_wino_cell_bound                              = param_words[58];
    conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1       = param_words[59];
    conv_desc.loop_iload_reset_cycle                            = param_words[60];
    conv_desc.outbuffer_oload_increment_step                    = param_words[61];
    conv_desc.outbuffer_omini_increment_step                    = param_words[62];

    // ---- words 63..68: output write-back ----
    conv_desc.outdepth_ceildiv8                                 = param_words[63];
    conv_desc.output_burst_length                               = param_words[64];
    conv_desc.write_back_flag                                   = param_words[65];
    conv_desc.wino_col_pix_upper_bound                          = param_words[66];
    conv_desc.wino_tile_number_rowcol                           = param_words[67];
    conv_desc.out_ddr_increment_step                            = param_words[68];
}
// wino_systolic_kernel: one HLS dataflow region that wires the on-chip input
// buffer, the four weight DDR ports and the output buffer together through
// hls::stream channels.
//
// Calls made, in source order:
//   1. input_feed_underconstruction  - is passed input_buffer and input_tile_stream.
//   2. input_transform               - called once per i in [0, WINO_WIDTH) with
//                                      input_tile_stream[i] and input_tile_transformed_stream[i].
//   3. weight_feed_one_port<0..3>    - one call per weight_DDRn pointer, each with weight_stream[n].
//   4. wino_stream_block             - is passed input_tile_transformed_stream, weight_stream and out_buffer.
//
// Parameters:
//   weight_DDR0..3                - one pointer per weight_feed_one_port instance.
//   input_buffer                  - partitioned completely along dims 1 and 2 (pragmas below).
//   out_buffer                    - partitioned completely along dims 1..4 (pragmas below).
//   start_output_row              - not referenced anywhere in the visible body.
//   start_row_idx_minus_pad_size  - forwarded to input_feed_underconstruction.
//   first_flag, last_flag         - forwarded unchanged to every weight_feed_one_port call.
//   conv_desc                     - passed by value and declared ap_stable; individual fields
//                                   are handed to the callees.
//   ap_clk_div2                   - forwarded to wino_stream_block.
//
// NOTE(review): this listing appears to have lost lines during extraction: the
// input_transform call ends with a trailing comma and every ",conv_desc" argument
// is followed by an #endif that has no visible opening #if. See the notes at
// those spots; the code itself is left untouched.
void wino_systolic_kernel(
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
ap_uint<OUT_WIDTH*2> out_buffer[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH],
ap_uint<16> start_output_row,
ap_int<16> start_row_idx_minus_pad_size,
ap_uint<1> first_flag,
ap_uint<1> last_flag,
ConvDesc_t conv_desc,
ap_uint<1> ap_clk_div2
)
{
// Trace message printed on every invocation.
printf("****wino_systolic_kernel****\n");
#pragma HLS interface ap_stable port=conv_desc
#pragma HLS array_partition variable =input_buffer dim=1 complete
#pragma HLS array_partition variable =input_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=4 complete
#pragma HLS array_partition variable =out_buffer dim=3 complete
#pragma HLS array_partition variable =out_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=1 complete
#pragma HLS dataflow
// Stream channels connecting the calls below: WINO_WIDTH raw-tile streams,
// WINO_WIDTH transformed-tile streams (depth explicitly set to 1), and a
// 4 x WEIGHT_FEED_NUMBER_PER_PORT array of weight streams.
hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_stream[WINO_WIDTH];
hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_transformed_stream[WINO_WIDTH];
#pragma HLS stream variable=input_tile_transformed_stream depth=1
hls::stream<ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
// Input feeding call: receives the input buffer, the raw-tile streams and a set of conv_desc fields.
input_feed_underconstruction(
input_buffer,
input_tile_stream,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
conv_desc.inwidth,
conv_desc.pad_size,
conv_desc.weightbuffer_load_outdepth_number,
conv_desc.wino_output_tile_size,
conv_desc.input_buffer_feeding_loop_bound,
conv_desc.loop_wino_tile_row_reset_cycle,
conv_desc.loop_wino_tile_col_reset_cycle,
conv_desc.buffer_address_mid_minitile_depth_step,
conv_desc.wino_out_size_by_wino_width,
conv_desc.row_address_bitnumber_flag,
start_row_idx_minus_pad_size
);
// One input_transform call per stream lane.
for(int i=0;i<WINO_WIDTH;i++)
{
input_transform(
input_tile_stream[i],
// NOTE(review): the next argument ends with a comma directly before ");" - at least
// one further argument line seems to be missing from this listing; confirm against the repository.
input_tile_transformed_stream[i],
);
}
// Weight feeding: four calls that differ only in the template index, the DDR
// pointer and the weight_stream row they are given.
weight_feed_one_port<0>(
weight_DDR0,
weight_stream[0],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
// NOTE(review): the #endif below has no visible matching #if; other calls in this file
// wrap ",conv_desc" in "#if DEBUG_CONV_DESC" - presumably that line was dropped here.
,conv_desc
#endif
);
weight_feed_one_port<1>(
weight_DDR1,
weight_stream[1],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
// NOTE(review): unmatched #endif, same situation as in the <0> call.
,conv_desc
#endif
);
weight_feed_one_port<2>(
weight_DDR2,
weight_stream[2],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
// NOTE(review): unmatched #endif, same situation as in the <0> call.
,conv_desc
#endif
);
weight_feed_one_port<3>(
weight_DDR3,
weight_stream[3],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
// NOTE(review): unmatched #endif, same situation as in the <0> call.
,conv_desc
#endif
);
// Final call: receives the transformed input streams, all weight streams and the output buffer.
wino_stream_block(
input_tile_transformed_stream,
weight_stream,
out_buffer,
conv_desc.weightbuffer_outdepth_minitile_number,
conv_desc.total_input_stream_tile,
conv_desc.loop_omini_base_reset_cycle,
conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1,
conv_desc.loop_iload_reset_cycle,
conv_desc.loop_wino_cell_bound,
conv_desc.outbuffer_oload_increment_step,
conv_desc.outbuffer_omini_increment_step,
conv_desc.wino5x5_flag
// NOTE(review): unmatched #endif again; ap_clk_div2 after it is passed unconditionally.
,conv_desc
#endif
,ap_clk_div2
);
}
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
// wino_systolic_top: top-level entry of the accelerator.
//
// The SDS pragmas below mark every pointer argument as zero_copy (with the
// FEATURE_PORT_DEPTH / WEIGHT_PORT_DEPTH / 128 element ranges shown) and bind
// each argument to one of the ps_e_S_AXI_HP0..3_FPD system ports; index n of
// input_DDRn, weight_DDRn and output_DDRn shares port HPn, and mem_params uses HP0.
//
// Parameters:
//   input_DDR0..3   - ap_uint<128> pointers handed to load_input_rowtile_from_ddr.
//   weight_DDR0..3  - ap_uint<128> pointers handed to wino_systolic_kernel.
//   output_DDR0..3  - ap_uint<ODDR_WIDTH*BATCH_SIZE*8> pointers handed to write_output_to_DDR.
//   mem_params      - ap_int<32> pointer; not referenced in the visible body
//                     (presumably consumed by a load_params call that is missing here - TODO confirm).
//   ap_clk_div2     - only part of the signature when __SDSVHLS__ is defined.
//
// Visible structure of the body:
//   - declares input_buffer plus two output buffers (output_buffer0 / output_buffer1) and clears all three;
//   - iterates compute_start_row from 0 up to conv_desc.outheight in steps of conv_desc.out_rowstep,
//     with write_start_row trailing it by one out_rowstep;
//   - in each iteration, depending on pingpong, runs wino_systolic_kernel into one output buffer
//     and calls write_output_to_DDR on the other, then calls load_input_rowtile_from_ddr;
//   - after that, calls write_output_to_DDR once more on the buffer selected by pingpong.
//
// NOTE(review): lines such as "xliu79" / "committed" and the runs of bare numbers inside
// this block are not C++; they look like residue from a web blame view. Several real source
// lines also appear to be missing (opening braces, callee names, closing arguments). Everything
// is left byte-for-byte as found; only comments were added.
#pragma SDS data zero_copy(input_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(input_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR0[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR1[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR2[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(weight_DDR3[0:WEIGHT_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR0[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR1[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR2[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(output_DDR3[0:FEATURE_PORT_DEPTH])
#pragma SDS data zero_copy(mem_params[0:128])
#pragma SDS data sys_port(input_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(input_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(input_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(input_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(weight_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(weight_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(weight_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(weight_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(output_DDR0:ps_e_S_AXI_HP0_FPD)
#pragma SDS data sys_port(output_DDR1:ps_e_S_AXI_HP1_FPD)
#pragma SDS data sys_port(output_DDR2:ps_e_S_AXI_HP2_FPD)
#pragma SDS data sys_port(output_DDR3:ps_e_S_AXI_HP3_FPD)
#pragma SDS data sys_port(mem_params:ps_e_S_AXI_HP0_FPD)
void wino_systolic_top(
ap_uint<128> *input_DDR0,
ap_uint<128> *input_DDR1,
ap_uint<128> *input_DDR2,
ap_uint<128> *input_DDR3,
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR0,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR1,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR2,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR3,
ap_int<32> *mem_params
// ap_clk_div2 exists as a parameter only in __SDSVHLS__ builds.
// NOTE(review): the wino_systolic_kernel calls further down pass ap_clk_div2 without a
// visible guard - confirm how the name is provided when __SDSVHLS__ is not defined.
#ifdef __SDSVHLS__
,ap_uint<1> ap_clk_div2
#endif
xliu79
committed
)
// NOTE(review): no opening "{" for the function body is visible after the ")" above.
// All twelve DDR pointers are declared as m_axi interfaces with depth 65535.
#pragma HLS interface m_axi port= input_DDR3 depth=65535
#pragma HLS interface m_axi port= input_DDR2 depth=65535
#pragma HLS interface m_axi port= input_DDR1 depth=65535
#pragma HLS interface m_axi port= input_DDR0 depth=65535
#pragma HLS interface m_axi port= output_DDR3 depth=65535
#pragma HLS interface m_axi port= output_DDR2 depth=65535
#pragma HLS interface m_axi port= output_DDR1 depth=65535
#pragma HLS interface m_axi port= output_DDR0 depth=65535
#pragma HLS interface m_axi port= weight_DDR3 depth=65535
#pragma HLS interface m_axi port= weight_DDR2 depth=65535
#pragma HLS interface m_axi port= weight_DDR1 depth=65535
#pragma HLS interface m_axi port= weight_DDR0 depth=65535
xliu79
committed
// Local buffers: one input buffer and two identically shaped output buffers.
// Each is partitioned completely along dims 1 and 2; the output buffers are
// additionally mapped to the RAM_T2P_BRAM core.
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=input_buffer complete dim=1
#pragma HLS array_partition variable=input_buffer complete dim=2
ap_uint<OUT_WIDTH*2> output_buffer0[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer0 complete dim=1
#pragma HLS array_partition variable=output_buffer0 complete dim=2
#pragma HLS resource variable=output_buffer0 core=RAM_T2P_BRAM
ap_uint<OUT_WIDTH*2> output_buffer1[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer1 complete dim=1
#pragma HLS array_partition variable=output_buffer1 complete dim=2
#pragma HLS resource variable=output_buffer1 core=RAM_T2P_BRAM
// Layer descriptor; its fields are read throughout the rest of the body.
// NOTE(review): no statement that fills conv_desc is visible in this listing.
ConvDesc_t conv_desc;
xliu79
committed
// Selects which output buffer is computed into and which is written back (see the if/else below).
// NOTE(review): neither an initial value nor an update of pingpong is visible in this listing.
ap_uint<1> pingpong;
xliu79
committed
// Clear the input buffer and both output buffers.
clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer0);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer1);
// NOTE(review): the following argument list has no visible callee or closing ");".
// It lines up with the load_input_rowtile_from_ddr call near the end of the loop,
// so this is presumably the initial row-tile load - confirm against the repository.
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
xliu79
committed
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
xliu79
committed
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
0,
// Debug dump of the input buffer to "input_buffer_content.txt" (the later copy of this
// call is wrapped in #if DEBUG_FILE_PRINT; no guard is visible here).
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
// Row-tile loop: compute_start_row runs 0, out_rowstep, 2*out_rowstep, ... while below
// outheight; write_start_row starts at -out_rowstep and advances by the same step.
ap_int<16> compute_start_row =0;
ap_int<16> write_start_row= -conv_desc.out_rowstep;
for( ; compute_start_row < conv_desc.outheight; compute_start_row+=conv_desc.out_rowstep,write_start_row+=conv_desc.out_rowstep)
// NOTE(review): no "{" is visible after the for header.
// NOTE(review): declared ap_uint<16> although compute_start_row-pad_size can be negative and
// the kernel parameter it feeds is ap_int<16> - confirm the wrap-around is intended.
ap_uint<16> start_row_idx_minus_pad_size=compute_start_row-conv_desc.pad_size;
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
// pingpong set: compute into output_buffer0, write output_buffer1 back.
if(pingpong )
{
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer0,
compute_start_row,
start_row_idx_minus_pad_size,
// first_flag: true only for the row tile starting at row 0.
compute_start_row==0,
// last_flag. NOTE(review): the loop advances by out_rowstep but this test adds
// wino_output_tile_size - confirm that is the intended bound.
compute_start_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
conv_desc,
ap_clk_div2
);
// Optional debug dump of the buffer that was just computed, to "outbuffer.txt".
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer0,0,outfilename);
#endif
// Write-back call on the other buffer, addressed by write_start_row.
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
// pingpong clear: same two calls with the buffer roles swapped.
else
{
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer1,
compute_start_row,
start_row_idx_minus_pad_size,
compute_start_row==0,
compute_start_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
conv_desc,
ap_clk_div2
);
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer1,0,outfilename);
#endif
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
// Input reload call issued after the compute/write-back pair.
// NOTE(review): the argument list stops after "conv_desc.out_rowstep," - the remaining
// argument(s) and the closing ");" are not visible in this listing.
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
#if DEBUG_FILE_PRINT
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
#endif
xliu79
committed
// Presumably the end of the row-tile loop body - TODO confirm, the matching "{" is not visible.
}
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
// Final write-back: one more write_output_to_DDR call on the buffer selected by pingpong,
// using the same buffer choice per branch as inside the loop (set -> output_buffer1,
// clear -> output_buffer0).
if(pingpong )
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
else
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
xliu79
committed
xliu79
committed
}