// Defines wino_systolic_kernel and wino_systolic_top.
#include "wino_macro.h"
#include <ap_int.h>
#include <hls_stream.h>
// NOTE: wino_cell.cpp is pulled in as source text, not as a header.
#include "wino_cell.cpp"
// Project parameter header from the sibling software/ directory.
#include "../software/param.h"
void wino_systolic_kernel(
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
ap_uint<OUT_WIDTH*2> out_buffer[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][WINO_OUT_SIZE_SQUARE][OUTPUT_BUFFER_DEPTH],
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
ap_uint<16> start_output_row,
ap_int<16> start_row_idx_minus_pad_size,
ap_uint<1> first_flag,
ap_uint<1> last_flag,
ConvDesc_t conv_desc,
ap_uint<1> ap_clk_div2
)
{
hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > input_tile_stream[WINO_WIDTH];
hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*36> > input_tile_transformed_stream[WINO_WIDTH];
hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[4][WEIGHT_FEED_NUMBER_PER_PORT];
printf("start input_feed_underconstruction\n");
fflush(stdout);
input_feed_underconstruction(
input_buffer,
input_tile_stream,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream1,
// hls::stream< ap_uint<16*BATCH_SIZE*36> > &input_tile_stream2,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.pad_size,
conv_desc.weightbuffer_load_indepth_number,
conv_desc.weightbuffer_load_outdepth_number,
conv_desc.wino_output_tile_size,
conv_desc.input_buffer_feeding_loop_bound,
conv_desc.loop_wino_tile_row_reset_cycle,
conv_desc.loop_wino_tile_col_reset_cycle,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle,
conv_desc.buffer_address_mid_minitile_depth_step,
conv_desc.wino_out_size_by_wino_width,
conv_desc.row_address_bitnumber_flag,
start_output_row,
start_row_idx_minus_pad_size
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
printf("start input_transform\n");
fflush(stdout);
for(int i=0;i<WINO_WIDTH;i++)
{
input_transform(
input_tile_stream[i],
input_tile_transformed_stream[i],
conv_desc.input_transform_feeding_loop_bound
#if DEBUG_FILE_PRINT
, i
#endif
);
}
printf("start weight stream\n");
fflush(stdout);
weight_feed_one_port<0>(
weight_DDR0,
weight_stream[0],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
weight_feed_one_port<1>(
weight_DDR1,
weight_stream[1],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
weight_feed_one_port<2>(
weight_DDR2,
weight_stream[2],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
weight_feed_one_port<3>(
weight_DDR3,
weight_stream[3],
conv_desc.weightDDR_burst_number,
conv_desc.weightDDR_buffer_burst_length,
conv_desc.weightDDR_port_burst_length,
conv_desc.loop_outdepth_minitile_baseidx_reset_cycle_minus1,
conv_desc.loop_start_output_baserowcol_reset_cycle,
conv_desc.loop_weight_feed_bound,
first_flag,
last_flag
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
// #if DEBUG_FILE_PRINT
// for(int i=0;i<4;i++)
// {
// for(int j=0;j<WEIGHT_FEED_NUMBER_PER_PORT;j++)
// {
// int outdepth_minitile_idx=i*WEIGHT_FEED_NUMBER_PER_PORT+j;
// char filename[100];
// sprintf(filename,"weightstream%d.txt",outdepth_minitile_idx);
// attach_weight_stream_content<INDEPTH_MINITILE_SIZE,WINO_DOMAIN_SIZE,WINO_DOMAIN_SIZE_SQUARE>(weight_stream[i][j],filename);
// }
// }
// #endif
printf("start wino block\n");
fflush(stdout);
wino_stream_block(
input_tile_transformed_stream,
weight_stream,
out_buffer,
conv_desc.total_input_stream_tile,
conv_desc.loop_omini_base_reset_cycle,
conv_desc.loop_wino_cell_bound,
conv_desc.wino5x5_flag
#if DEBUG_FILE_PRINT
,conv_desc
#endif
,ap_clk_div2
);
}
// Top-level entry point. As far as the visible text shows, it declares the
// local input/output buffers, clears input_buffer, and then walks
// start_output_row from 0 up to conv_desc.outheight in steps of
// conv_desc.out_rowstep, calling load_input_rowtile_from_ddr and
// wino_systolic_kernel on each step.
//
// NOTE(review): this definition appears to have been damaged by a web
// blame-view extraction. The bare "xliu79" / "committed" lines are not C++,
// and several source lines (braces, a call name, trailing arguments, closing
// parentheses) seem to be missing. Each suspected gap is flagged inline
// below; restore the function from version control before building.
//
// NOTE(review): output_DDR0..3, output_buffer1 and pingpong are declared but
// never referenced in the visible text -- presumably the output write-back
// and ping-pong logic sat on lines that were lost; confirm.
void wino_systolic_top(
ap_uint<128> *input_DDR0,
ap_uint<128> *input_DDR1,
ap_uint<128> *input_DDR2,
ap_uint<128> *input_DDR3,
ap_uint<128> *weight_DDR0,
ap_uint<128> *weight_DDR1,
ap_uint<128> *weight_DDR2,
ap_uint<128> *weight_DDR3,
ap_uint<128> *output_DDR0,
ap_uint<128> *output_DDR1,
ap_uint<128> *output_DDR2,
ap_uint<128> *output_DDR3,
ConvDesc_t &conv_desc,
ap_uint<1> ap_clk_div2
xliu79
committed
)
// NOTE(review): the opening '{' of the function body is not present here.
// Every DDR pointer is declared as an m_axi interface with depth=65535.
#pragma HLS interface m_axi port= input_DDR3 depth=65535
#pragma HLS interface m_axi port= input_DDR2 depth=65535
#pragma HLS interface m_axi port= input_DDR1 depth=65535
#pragma HLS interface m_axi port= input_DDR0 depth=65535
#pragma HLS interface m_axi port= output_DDR3 depth=65535
#pragma HLS interface m_axi port= output_DDR2 depth=65535
#pragma HLS interface m_axi port= output_DDR1 depth=65535
#pragma HLS interface m_axi port= output_DDR0 depth=65535
#pragma HLS interface m_axi port= weight_DDR3 depth=65535
#pragma HLS interface m_axi port= weight_DDR2 depth=65535
#pragma HLS interface m_axi port= weight_DDR1 depth=65535
#pragma HLS interface m_axi port= weight_DDR0 depth=65535
xliu79
committed
// Local input buffer, completely partitioned along dimensions 1 and 2.
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=input_buffer complete dim=1
#pragma HLS array_partition variable=input_buffer complete dim=2
// Two identically shaped output buffers, each completely partitioned along
// dimensions 1 and 2. Only output_buffer0 is used in the visible text.
ap_uint<OUT_WIDTH*2> output_buffer0[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][WINO_OUT_SIZE_SQUARE][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer0 complete dim=1
#pragma HLS array_partition variable=output_buffer0 complete dim=2
ap_uint<OUT_WIDTH*2> output_buffer1[OUTDEPTH_MINITILE_SIZE][WINO_WIDTH][WINO_OUT_SIZE_SQUARE][OUTPUT_BUFFER_DEPTH];
#pragma HLS array_partition variable=output_buffer1 complete dim=1
#pragma HLS array_partition variable=output_buffer1 complete dim=2
xliu79
committed
ap_uint<1> pingpong;
xliu79
committed
clear_buffer_content<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer);
printf("conv_desc.row_address_bitnumber_flag %d\n", conv_desc.row_address_bitnumber_flag);
// NOTE(review): the argument list below has no visible callee. Its order
// matches the load_input_rowtile_from_ddr call inside the loop further down,
// so this is presumably an initial load_input_rowtile_from_ddr( ... ) for
// row 0 -- confirm against the repository.
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
xliu79
committed
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
xliu79
committed
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
0,
// NOTE(review): the trailing comma above implies at least one more argument
// and a closing ");" that are not present here.
xliu79
committed
// Debug helper call: passes input_buffer, 0 and the file name
// "input_buffer_content.txt".
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
xliu79
committed
// Row loop: start_output_row advances by conv_desc.out_rowstep until it
// reaches conv_desc.outheight.
ap_uint<16> start_output_row =0;
for( ; start_output_row < conv_desc.outheight; start_output_row+=conv_desc.out_rowstep)
// NOTE(review): the loop body's opening '{' is not present here; as written
// only the next declaration would belong to the loop.
// NOTE(review): declared unsigned here, while wino_systolic_kernel takes
// ap_int<16> for this value. When start_output_row < pad_size the difference
// is negative -- confirm that the unsigned-to-signed round trip is intended.
ap_uint<16> start_row_idx_minus_pad_size=start_output_row-conv_desc.pad_size;
printf("conv_desc.row_address_bitnumber_flag %d\n", conv_desc.row_address_bitnumber_flag);
// Load call issued with the row index one step ahead of the current one
// (start_output_row + out_rowstep).
load_input_rowtile_from_ddr(
input_DDR0,
input_DDR1,
input_DDR2,
input_DDR3,
input_buffer,
conv_desc.inheight,
conv_desc.inwidth,
conv_desc.stride,
conv_desc.pad_size,
conv_desc.inwidth_align8,
conv_desc.indepth_align8,
conv_desc.group_indepth_x_inwidth_align8_by8,
conv_desc.group_indepth_offset_x_inwidth_align8_by8,
conv_desc.inwidth_ceildiv_inbufferwidth,
conv_desc.buffer_address_mid_increment_step,
conv_desc.input_load_burst_length,
conv_desc.row_address_bitnumber_flag,
conv_desc.out_rowstep,
start_output_row + conv_desc.out_rowstep,
// NOTE(review): as with the first load, the trailing comma implies a missing
// final argument and closing ");".
xliu79
committed
attach_input_buffer_content_uniformed<INBUFFER_HEIGHT,INBUFFER_WIDTH, INPUT_BUFFER_DEPTH>(input_buffer,0,"input_buffer_content.txt");
// Kernel call for the current row step.
// NOTE(review): wino_systolic_kernel's signature expects input_buffer between
// weight_DDR3 and the output buffer; that argument is absent below.
wino_systolic_kernel(
weight_DDR0,
weight_DDR1,
weight_DDR2,
weight_DDR3,
output_buffer0,
start_output_row,
start_row_idx_minus_pad_size,
// first_flag: set only on the first iteration (row 0).
start_output_row==0,
// last_flag: set when the current row plus wino_output_tile_size passes
// outheight. NOTE(review): the loop steps by out_rowstep, not
// wino_output_tile_size -- confirm the two are meant to differ here.
start_output_row+conv_desc.wino_output_tile_size > conv_desc.outheight,
conv_desc,
ap_clk_div2
// NOTE(review): the closing ");" of this call is not present here.
xliu79
committed
}
xliu79
committed
xliu79
committed
}