Newer
Older
xliu79
committed
#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
xliu79
committed
xliu79
committed
#include "wino_cell.cpp"
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Unpacks the convolution descriptor from a flat parameter table.
//
// mem_params : source table; the first 128 32-bit words are read.
// conv_desc  : descriptor that receives the values (written in place).
//
// All 128 words are first copied into a local array by a pipelined loop;
// words 0..68 are then assigned, one per field, to conv_desc. Words 69..127
// are copied but not consumed by this function.
void load_params(
        ap_int<32>* mem_params,
        ConvDesc_t &conv_desc
)
{
    const int kParamWordCount = 128;
    ap_int<32> word[kParamWordCount];

    // Stage the whole table locally before decoding it.
    for (int w = 0; w < kParamWordCount; ++w)
    {
#pragma HLS pipeline
        word[w] = mem_params[w];
    }

    // Original signal
    conv_desc.inheight    = word[0];
    conv_desc.inwidth     = word[1];
    conv_desc.indepth     = word[2];
    conv_desc.outheight   = word[3];
    conv_desc.outwidth    = word[4];
    conv_desc.outdepth    = word[5];
    conv_desc.kernel_size = word[6];
    conv_desc.pad_size    = word[7];
    conv_desc.stride      = word[8];

    // Winograd related
    conv_desc.wino5x5_flag = word[9];
    // 1: 3x3, 0:5x5
    conv_desc.wino_output_tile_size = word[10];

    // Input buffer related
    conv_desc.indepth_align_minitile_size                 = word[11];
    conv_desc.indepth_align8                              = word[12];
    conv_desc.indepth_ceildiv8                            = word[13];
    conv_desc.inwidth_ceildiv_inbufferwidth               = word[14];
    conv_desc.inwidth_align8                              = word[15];
    conv_desc.group_indepth_offset                        = word[16];
    conv_desc.group_indepth                               = word[17];
    conv_desc.input_ddr_bytes                             = word[18];
    conv_desc.input_ddr_128bits                           = word[19];
    conv_desc.group_indepth_x_inwidth_align8_by8          = word[20];
    conv_desc.group_indepth_offset_x_inwidth_align8_by8   = word[21];
    conv_desc.input_load_burst_length                     = word[22];
    conv_desc.buffer_address_mid_increment_step           = word[23];
    conv_desc.row_address_bitnumber_flag                  = word[24];

    // Output buffer related
    conv_desc.outwidth_align8              = word[25];
    conv_desc.outdepth_align8              = word[26];
    conv_desc.outheight_align4             = word[27];
    conv_desc.outdepth_align_minitile_size = word[28];
    conv_desc.group_outdepth_offset        = word[29];
    conv_desc.group_outdepth               = word[30];
    conv_desc.output_ddr_bytes             = word[31];
    conv_desc.output_ddr_128bits           = word[32];

    // Weight related
    conv_desc.weightbuffer_load_indepth_number      = word[33];
    conv_desc.weightbuffer_load_indepth_step        = word[34];
    conv_desc.weightbuffer_load_outdepth_number     = word[35];
    conv_desc.weightbuffer_load_outdepth_step       = word[36];
    conv_desc.weightbuffer_indepth_minitile_number  = word[37];
    conv_desc.weightbuffer_outdepth_minitile_number = word[38];
    conv_desc.weightbuffer_total_load_number        = word[39];

    // Weight-load hardware
    conv_desc.weightDDR_buffer_burst_length                     = word[40];
    conv_desc.weightDDR_port_burst_length                       = word[41];
    conv_desc.weightDDR_burst_number                            = word[42];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1 = word[43];
    conv_desc.loop_start_output_baserowcol_reset_cycle          = word[44];
    conv_desc.loop_weight_feed_bound                            = word[45];

    // Input buffer feeding related
    conv_desc.wino_out_size_by_wino_width                = word[46];
    conv_desc.wino_tile_number_in_outwidth               = word[47];
    conv_desc.loop_outdepth_minitile_baseidx_reset_cycle = word[48];
    conv_desc.loop_wino_tile_col_reset_cycle             = word[49];
    conv_desc.loop_wino_tile_row_reset_cycle             = word[50];
    conv_desc.buffer_address_mid_minitile_depth_step     = word[51];
    conv_desc.input_buffer_feeding_loop_bound            = word[52];
    conv_desc.input_transform_feeding_loop_bound         = word[53];

    // Row-tile calculation; these parameters have to be solved after the
    // weight parameters are decided.
    conv_desc.out_rowstep                     = word[54];
    conv_desc.wino_tile_number_in_out_rowstep = word[55];

    // Winograd computation
    conv_desc.total_input_stream_tile                     = word[56];
    conv_desc.loop_omini_base_reset_cycle                 = word[57];
    conv_desc.loop_wino_cell_bound                        = word[58];
    conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1 = word[59];
    conv_desc.loop_iload_reset_cycle                      = word[60];
    conv_desc.outbuffer_oload_increment_step              = word[61];
    conv_desc.outbuffer_omini_increment_step              = word[62];

    // Output write back
    conv_desc.outdepth_ceildiv8        = word[63];
    conv_desc.output_burst_length      = word[64];
    conv_desc.write_back_flag          = word[65];
    conv_desc.wino_col_pix_upper_bound = word[66];
    conv_desc.wino_tile_number_rowcol  = word[67];
    conv_desc.out_ddr_increment_step   = word[68];
}
// Compute stage for one row tile, declared as an HLS dataflow region.
//
// Visible call sequence:
//   1. input_feed_underconstruction : reads input_buffer, writes input_tile_stream[]
//   2. input_transform (one call per index 0..WINO_WIDTH-1) :
//        input_tile_stream[i] -> input_tile_transformed_stream[i]
//   3. weight_feed_one_port<0..3>   : weight_DDR0..3 -> weight_stream[0..3]
//   4. wino_stream_block            : consumes the transformed input streams and
//        the weight streams, and is handed out_buffer
//
// Parameters:
//   weight_DDR0..3               one pointer per weight_feed_one_port instance
//   input_buffer                 completely partitioned along dims 1 and 2
//   out_buffer                   completely partitioned along dims 1..4
//   start_output_row             not referenced in the visible body
//   start_row_idx_minus_pad_size forwarded to input_feed_underconstruction
//   first_flag / last_flag       forwarded to every weight_feed_one_port call
//   conv_desc                    layer descriptor, declared ap_stable
//   ap_clk_div2                  forwarded to wino_stream_block
//
// NOTE(review): this text appears to have lost lines during extraction (see the
// notes on the trailing comma and the unmatched #endif lines below); it does
// not preprocess/compile as shown. Restore from the repository before editing.
void wino_systolic_kernel(
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
ap_uint<OUT_WIDTH*2> out_buffer[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH],
ap_uint<16> start_output_row,
ap_int<16> start_row_idx_minus_pad_size,
ap_uint<1> first_flag,
ap_uint<1> last_flag,
ConvDesc_t conv_desc,
ap_uint<1> ap_clk_div2
)
{
// NOTE(review): unguarded debug print; other debug output in this file sits
// behind DEBUG_FILE_PRINT — confirm whether a guard was lost here.
printf("****wino_systolic_kernel****\n");
#pragma HLS interface ap_stable port=conv_desc
#pragma HLS array_partition variable =input_buffer dim=1 complete
#pragma HLS array_partition variable =input_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=4 complete
#pragma HLS array_partition variable =out_buffer dim=3 complete
#pragma HLS array_partition variable =out_buffer dim=2 complete
#pragma HLS array_partition variable =out_buffer dim=1 complete
#pragma HLS dataflow
// Streams connecting the calls below: raw input tiles, transformed input
// tiles (stream depth set to 1), and weights (4 groups of
// WEIGHT_FEED_NUMBER_PER_PORT streams).
hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_stream[WINO_WIDTH];
hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_transformed_stream[WINO_WIDTH];
#pragma HLS stream variable=input_tile_transformed_stream depth=1
hls::stream<ap_uint<W_WIDTH*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
// Feed input tiles from input_buffer into input_tile_stream.
input_feed_underconstruction(
input_buffer,
input_tile_stream,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
conv_desc.inwidth,
conv_desc.pad_size,
conv_desc.weightbuffer_load_outdepth_number,
conv_desc.wino_output_tile_size,
conv_desc.input_buffer_feeding_loop_bound,
conv_desc.loop_wino_tile_row_reset_cycle,
conv_desc.loop_wino_tile_col_reset_cycle,
conv_desc.buffer_address_mid_minitile_depth_step,
conv_desc.wino_out_size_by_wino_width,
conv_desc.row_address_bitnumber_flag,
start_row_idx_minus_pad_size
);
// One input_transform call per stream index.
for(int i=0;i<WINO_WIDTH;i++)
{
input_transform(
input_tile_stream[i],
input_tile_transformed_stream[i],
// NOTE(review): the argument list ends with a trailing comma — at least one
// argument line appears to be missing here; recover it from the original file.
);
}
// Weight feeders: four calls that differ only in the template index, the DDR
// pointer and the weight_stream row.
weight_feed_one_port<0>(
weight_DDR0,
weight_stream[0],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
// NOTE(review): the #endif below has no visible matching #if. Elsewhere in
// this file ",conv_desc" is wrapped in "#if DEBUG_CONV_DESC" — presumably the
// same guard was dropped here (and in the four similar spots below); confirm.
,conv_desc
#endif
);
weight_feed_one_port<1>(
weight_DDR1,
weight_stream[1],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
,conv_desc
#endif
);
weight_feed_one_port<2>(
weight_DDR2,
weight_stream[2],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
,conv_desc
#endif
);
weight_feed_one_port<3>(
weight_DDR3,
weight_stream[3],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
,conv_desc
#endif
);
// Final call: receives the transformed input streams, the weight streams and
// out_buffer.
wino_stream_block(
input_tile_transformed_stream,
weight_stream,
out_buffer,
conv_desc.weightbuffer_outdepth_minitile_number,
conv_desc.total_input_stream_tile,
conv_desc.loop_omini_base_reset_cycle,
conv_desc.loop_wino_tile_rowcol_self_reset_cycle_min1,
conv_desc.loop_iload_reset_cycle,
conv_desc.loop_wino_cell_bound,
conv_desc.outbuffer_oload_increment_step,
conv_desc.outbuffer_omini_increment_step,
conv_desc.wino5x5_flag
,conv_desc
#endif
,ap_clk_div2
);
}
// Top-level entry point: twelve m_axi pointer ports (4 input, 4 weight,
// 4 output) plus ap_clk_div2.
//
// What the visible text shows:
//   - a local input_buffer and two output buffers (output_buffer0/1), all
//     cleared before use;
//   - a row-tile loop that advances compute_start_row and write_start_row by
//     conv_desc.out_rowstep, with write_start_row starting one step behind;
//   - inside the loop, depending on `pingpong`, wino_systolic_kernel fills one
//     output buffer while write_output_to_DDR is called on the other, followed
//     by load_input_rowtile_from_ddr;
//   - after the loop, one more write_output_to_DDR call selected by `pingpong`.
//
// NOTE(review): this block is damaged by extraction. Lines such as "xliu79",
// "committed" and the bare number runs are web-view residue, not code, and
// several original lines are absent (function opening brace, the head of the
// call whose arguments start at "input_DDR0,", the loop's opening brace, any
// assignment to `pingpong`, any initialisation of conv_desc). Do not edit the
// logic from this text; restore the function from the repository first.
void wino_systolic_top(
ap_uint<128> *input_DDR0,
ap_uint<128> *input_DDR1,
ap_uint<128> *input_DDR2,
ap_uint<128> *input_DDR3,
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR0,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR1,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR2,
ap_uint<ODDR_WIDTH*BATCH_SIZE*8> *output_DDR3,
ap_uint<1> ap_clk_div2
xliu79
committed
)
// Every DDR pointer is exposed as an m_axi interface with depth=65535.
#pragma HLS interface m_axi port= input_DDR3 depth=65535
#pragma HLS interface m_axi port= input_DDR2 depth=65535
#pragma HLS interface m_axi port= input_DDR1 depth=65535
#pragma HLS interface m_axi port= input_DDR0 depth=65535
#pragma HLS interface m_axi port= output_DDR3 depth=65535
#pragma HLS interface m_axi port= output_DDR2 depth=65535
#pragma HLS interface m_axi port= output_DDR1 depth=65535
#pragma HLS interface m_axi port= output_DDR0 depth=65535
#pragma HLS interface m_axi port= weight_DDR3 depth=65535
#pragma HLS interface m_axi port= weight_DDR2 depth=65535
#pragma HLS interface m_axi port= weight_DDR1 depth=65535
#pragma HLS interface m_axi port= weight_DDR0 depth=65535
xliu79
committed
// Local buffers: input_buffer is completely partitioned along dims 1 and 2.
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=input_buffer complete dim=1
#pragma HLS array_partition variable=input_buffer complete dim=2
// Two output buffers with identical shape, partitioning and RAM_T2P_BRAM core.
ap_uint<OUT_WIDTH*2> output_buffer0[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer0 complete dim=1
#pragma HLS array_partition variable=output_buffer0 complete dim=2
#pragma HLS resource variable=output_buffer0 core=RAM_T2P_BRAM
ap_uint<OUT_WIDTH*2> output_buffer1[WINO_OUT_SIZE][WINO_OUT_SIZE][OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer1 complete dim=1
#pragma HLS array_partition variable=output_buffer1 complete dim=2
#pragma HLS resource variable=output_buffer1 core=RAM_T2P_BRAM
// NOTE(review): no visible line fills conv_desc (e.g. via load_params) before
// its fields are read below — presumably lost in extraction; confirm.
ConvDesc_t conv_desc;
xliu79
committed
// NOTE(review): no visible line initialises or toggles pingpong.
ap_uint<1> pingpong;
xliu79
committed
clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer0);
clear_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(output_buffer1);
// NOTE(review): the lines below are an argument list with no visible call
// head or closing ");". The arguments match the load_input_rowtile_from_ddr
// call further down, so this is presumably the initial row-tile load — confirm.
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
xliu79
committed
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
xliu79
committed
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
0,
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
// Row-tile loop counters: the write-back row starts one out_rowstep behind the
// compute row, so it is negative on the first iteration.
ap_int<16> compute_start_row =0;
ap_int<16> write_start_row= -conv_desc.out_rowstep;
for( ; compute_start_row < conv_desc.outheight; compute_start_row+=conv_desc.out_rowstep,write_start_row+=conv_desc.out_rowstep)
// NOTE(review): declared unsigned here but the kernel parameter of the same
// name is ap_int<16>; compute_start_row - pad_size can be negative (e.g. on the
// first iteration when pad_size > 0) — confirm the wrap-around is intended.
ap_uint<16> start_row_idx_minus_pad_size=compute_start_row-conv_desc.pad_size;
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
// pingpong set: compute into output_buffer0, write output_buffer1 to DDR.
if(pingpong )
{
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer0,
compute_start_row,
start_row_idx_minus_pad_size,
compute_start_row==0,
// NOTE(review): last_flag is derived from wino_output_tile_size while the
// loop advances by out_rowstep — confirm this is the intended last-tile test.
compute_start_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
conv_desc,
ap_clk_div2
);
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer0,0,outfilename);
#endif
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
// pingpong clear: the buffer roles are swapped.
else
{
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
input_buffer,
output_buffer1,
compute_start_row,
start_row_idx_minus_pad_size,
compute_start_row==0,
compute_start_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
conv_desc,
ap_clk_div2
);
#if DEBUG_FILE_PRINT
char outfilename[100];
sprintf(outfilename,"outbuffer.txt");
attach_output_buffer_content_uniformed_hw<OUT_WIDTH,BATCH_SIZE,WINO_HEIGHT,WINO_WIDTH,WINO_OUT_SIZE,OUTPUT_BUFFER_DEPTH>(
output_buffer1,0,outfilename);
#endif
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
// Input load issued after the compute/write-back pair.
// NOTE(review): the argument list ends with a trailing comma and has no
// visible closing ");" — the final argument(s) appear to be missing.
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
#if DEBUG_FILE_PRINT
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,(char*) "input_buffer_content.txt");
#endif
xliu79
committed
}
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
// Final write-back after the loop, using the same buffer selection as the
// in-loop write-back (pingpong set -> output_buffer1, else output_buffer0).
if(pingpong )
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer1,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
else
{
write_output_to_DDR(
output_DDR0,
output_DDR1,
output_DDR2,
output_DDR3,
output_buffer0,
conv_desc.outheight,
conv_desc.outwidth_align8,
conv_desc.wino_output_tile_size,
conv_desc.wino_tile_number_in_outwidth,
conv_desc.wino_tile_number_in_out_rowstep,
conv_desc.wino_col_pix_upper_bound,
conv_desc.wino_tile_number_rowcol,
conv_desc.output_burst_length,
conv_desc.out_ddr_increment_step,
write_start_row,
write_start_row==0
#if DEBUG_CONV_DESC
,conv_desc
#endif
);
}
xliu79
committed
xliu79
committed
}