Newer
Older
#ifndef _WINO_BUFFER_HPP_
#define _WINO_BUFFER_HPP_
#include "../software/param.h"
#include <ap_int.h>
#include <hls_stream.h>
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/**
 * Winograd input-transform stage (6x6 transform domain).
 *
 * For each of `input_transform_feeding_loop_bound` iterations this reads one
 * packed word from `input_tile_stream`, unpacks it into
 * WINO_DOMAIN_SIZE x WINO_DOMAIN_SIZE x BATCH_SIZE signed 8-bit samples,
 * applies the 1-D transform below along the second index (result `DB`) and
 * then along the first index (result `BtDB`), and writes the repacked result
 * to `input_tile_transformed_stream`.
 *
 * 1-D transform applied to a 6-vector x (F(4,3) Winograd B^T):
 *   y0 =  4*x0        - 5*x2        +   x4
 *   y1 =       - 4*x1 - 4*x2 +   x3 +   x4
 *   y2 =         4*x1 - 4*x2 -   x3 +   x4
 *   y3 =       - 2*x1 -   x2 + 2*x3 +   x4
 *   y4 =         2*x1 -   x2 - 2*x3 +   x4
 *   y5 =         4*x1        - 5*x3        + x5
 *
 * Each pass is followed by an arithmetic right shift (DB_QUANT_BIT, then
 * BTB_QUANT_BIT).
 *
 * Packing: element (i, j, k) occupies lane (i*WINO_DOMAIN_SIZE + j)*BATCH_SIZE + k;
 * lanes are 8 bits wide on the input word and BTB_WIDTH bits wide on the
 * output word.
 *
 * @param input_tile_stream                   source of packed 8-bit tiles
 * @param input_tile_transformed_stream       sink for packed BTB_WIDTH-bit transformed tiles
 * @param input_transform_feeding_loop_bound  number of tiles to process
 */
void input_transform(
    hls::stream< ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_stream,
    hls::stream< ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> > &input_tile_transformed_stream,
    int input_transform_feeding_loop_bound
)
{
#if WINO_DOMAIN_SIZE != 6
#error "WINO_DOMAIN_SIZE!=6 not implemented "
#endif
    for(int cycle=0;cycle<input_transform_feeding_loop_bound;cycle++)
    {
#pragma HLS pipeline
        // Same width as the stream element type (was a hard-coded 36).
        ap_uint<8*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> input_tile_stream_data;
        input_tile_stream>>input_tile_stream_data;

        // Unpack the 8-bit lanes into individually addressable registers.
        ap_int<8> in[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
#pragma HLS array_partition variable=in complete dim=0
        for(int i=0;i<WINO_DOMAIN_SIZE;i++)
        {
#pragma HLS unroll
            for(int j=0;j<WINO_DOMAIN_SIZE;j++)
            {
#pragma HLS unroll
                for(int k=0;k<BATCH_SIZE;k++)
                {
#pragma HLS unroll
                    const int lsb=((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*8;
                    in[i][j][k]=input_tile_stream_data.range(lsb+7, lsb);
                }
            }
        }

        // First pass: transform every row of the tile (second index).
        ap_int<DB_WIDTH> DB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
#pragma HLS array_partition variable=DB complete dim=0
        for(int k=0;k<BATCH_SIZE;k++)
        {
#pragma HLS unroll
            for(int i=0;i<WINO_DOMAIN_SIZE;i++)
            {
#pragma HLS unroll
                ap_int<IN_WIDTH+1> a=in[i][0][k]-in[i][2][k];
                ap_int<IN_WIDTH+1> b=in[i][2][k]-in[i][4][k];
                ap_int<IN_WIDTH+3> c=in[i][1][k]*4-in[i][3][k];
                // Fix: pairs with element 4 (was 5), so y1/y2 carry +x4 and no x5 term.
                ap_int<IN_WIDTH+3> d=in[i][2][k]*4-in[i][4][k];
                ap_int<IN_WIDTH+1> e=in[i][1][k]-in[i][3][k];
                ap_int<IN_WIDTH+1> f=in[i][3][k]-in[i][5][k];
                DB[i][0][k]=(a*4-b)>>DB_QUANT_BIT;
                DB[i][1][k]=(-c-d)>>DB_QUANT_BIT;
                DB[i][2][k]=(c-d)>>DB_QUANT_BIT;
                // Fix: y3/y4 subtract b = x2-x4 (was f = x3-x5).
                DB[i][3][k]=(-e*2-b)>>DB_QUANT_BIT;
                DB[i][4][k]=(e*2-b)>>DB_QUANT_BIT;
                DB[i][5][k]=(e*4-f)>>DB_QUANT_BIT;
            }
        }

        // Second pass: same transform down every column (first index).
        ap_int<BTB_WIDTH> BtDB[WINO_DOMAIN_SIZE][WINO_DOMAIN_SIZE][BATCH_SIZE];
#pragma HLS array_partition variable=BtDB complete dim=0
        for(int k=0;k<BATCH_SIZE;k++)
        {
#pragma HLS unroll
            for(int i=0;i<WINO_DOMAIN_SIZE;i++)
            {
#pragma HLS unroll
                ap_int<DB_WIDTH+1> a=DB[0][i][k]-DB[2][i][k];
                ap_int<DB_WIDTH+1> b=DB[2][i][k]-DB[4][i][k];
                ap_int<DB_WIDTH+3> c=DB[1][i][k]*4-DB[3][i][k];
                ap_int<DB_WIDTH+3> d=DB[2][i][k]*4-DB[4][i][k];
                ap_int<DB_WIDTH+1> e=DB[1][i][k]-DB[3][i][k];
                ap_int<DB_WIDTH+1> f=DB[3][i][k]-DB[5][i][k];
                BtDB[0][i][k]=(a*4-b)>>BTB_QUANT_BIT;
                BtDB[1][i][k]=(-c-d)>>BTB_QUANT_BIT;
                BtDB[2][i][k]=(c-d)>>BTB_QUANT_BIT;
                BtDB[3][i][k]=(-e*2-b)>>BTB_QUANT_BIT;
                BtDB[4][i][k]=(e*2-b)>>BTB_QUANT_BIT;
                BtDB[5][i][k]=(e*4-f)>>BTB_QUANT_BIT;
            }
        }

        // Repack. Each lane is BTB_WIDTH bits wide; the previous "+7" upper
        // bound stored only 8 bits per lane and left the rest of the lane
        // unwritten whenever BTB_WIDTH != 8.
        ap_uint<BTB_WIDTH*BATCH_SIZE*WINO_DOMAIN_SIZE_SQUARE> input_tile_transformed_data;
        for(int i=0;i<WINO_DOMAIN_SIZE;i++)
        {
#pragma HLS unroll
            for(int j=0;j<WINO_DOMAIN_SIZE;j++)
            {
#pragma HLS unroll
                for(int k=0;k<BATCH_SIZE;k++)
                {
#pragma HLS unroll
                    const int lsb=((i*WINO_DOMAIN_SIZE+j)*BATCH_SIZE+k)*BTB_WIDTH;
                    input_tile_transformed_data.range(lsb+BTB_WIDTH-1, lsb)=BtDB[i][j][k];
                }
            }
        }
        input_tile_transformed_stream<<input_tile_transformed_data;
    }
}
xliu79
committed
// Reads pixels from the banked on-chip input buffer, assembles one
// WINO_DOMAIN_SIZE x WINO_DOMAIN_SIZE tile per Winograd array column
// (zero-filling rows/columns that fall outside [0,inheight) / [0,inwidth)),
// and pushes the packed tiles to input_tile_stream[0..WINO_WIDTH-1].
//
// Parameters as used by the visible code:
//   input_buffer        banked buffer, indexed [row bank][col bank][address]
//   input_tile_stream   one output stream per Winograd array column
//   inheight, inwidth   exclusive upper bounds for the row / column legality checks
//   pad_size            subtracted when forming the head column index
//   weightbuffer_load_outdepth_number   trip count of the outer loop
//   wino_output_tile_size               scaled by the array-column index to get per-column offsets
//   input_buffer_feeding_loop_bound     trip count of the pipelined inner loop
//   loop_wino_tile_col_reset_cycle / loop_outdepth_minitile_baseidx_reset_cycle
//                       wrap values for the two hand-rolled loop counters
//   buffer_address_mid_minitile_depth_step   increment of the mid-address depth offset
//   wino_out_size_by_wino_width         column stride applied when advancing to the next tile group
//   start_row_idx       only referenced by the DEBUG_FILE_PRINT dump
//   start_row_idx_minus_pad_size        first tile row; source of row bank / legality data
//   weightbuffer_load_indepth_number    not referenced in the visible body
//
// NOTE(review): this listing contains viewer residue (lines reading "xliu79",
// "committed", and bare numbers) and several brace lines appear to be missing,
// so the exact nesting of some statements cannot be confirmed from this text.
void input_feed_underconstruction(
ap_uint<16> input_buffer[INBUFFER_HEIGHT][INBUFFER_WIDTH][INPUT_BUFFER_DEPTH],
hls::stream< ap_uint<8*BATCH_SIZE*36> > input_tile_stream[WINO_WIDTH],
ap_uint<16> inheight,
ap_uint<16> inwidth,
xliu79
committed
ap_uint<16> pad_size,
ap_uint<16> weightbuffer_load_indepth_number,
ap_uint<16> weightbuffer_load_outdepth_number,
ap_uint<16> wino_output_tile_size,
ap_uint<32> input_buffer_feeding_loop_bound,
ap_uint<16> loop_wino_tile_col_reset_cycle,
xliu79
committed
ap_uint<16> loop_outdepth_minitile_baseidx_reset_cycle,
ap_uint<10> buffer_address_mid_minitile_depth_step,
xliu79
committed
ap_uint<16> wino_out_size_by_wino_width,
ap_uint<16> start_row_idx,
ap_int<16> start_row_idx_minus_pad_size
#if DEBUG_FILE_PRINT
,ConvDesc_t conv_desc
#endif
xliu79
committed
)
{
#pragma HLS array_partition variable=input_tile_stream complete
// row_selection preparation
ap_uint<1> row_legal_flag[WINO_DOMAIN_SIZE];
#pragma HLS array_partition variable=row_legal_flag complete
ap_uint<1> row_address_offset[INBUFFER_HEIGHT];
#pragma HLS array_partition variable=row_address_offset complete
ap_uint<INBUFFER_HEIGHT_BITWIDTH> row_bank_idx[WINO_DOMAIN_SIZE];
#pragma HLS array_partition variable=row_bank_idx complete
xliu79
committed
// = wino_output_tile_size<<WINO_WIDTH_BITWIDTH;
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
// Low INBUFFER_HEIGHT_BITWIDTH bits of the first tile row select the row bank
// at which the address bit flips.
ap_uint<INBUFFER_HEIGHT_BITWIDTH> row_breakpoint = start_row_idx_minus_pad_size.range(INBUFFER_HEIGHT_BITWIDTH-1,0);
// Banks below the breakpoint get the inverted bit INBUFFER_HEIGHT_BITWIDTH of
// the first tile row as their 1-bit address offset; the remaining banks get the bit as is.
// NOTE(review): the bound is a literal 8 while the array has INBUFFER_HEIGHT
// entries - confirm INBUFFER_HEIGHT == 8.
for(int i=0;i<8;i++)
{
#pragma HLS unroll
if(i< row_breakpoint)
row_address_offset[i] = ~start_row_idx_minus_pad_size[INBUFFER_HEIGHT_BITWIDTH];
else
row_address_offset[i] = start_row_idx_minus_pad_size[INBUFFER_HEIGHT_BITWIDTH];
}
// Per tile row: source bank (truncated to the bank-index width) and whether the
// absolute row lies inside [0, inheight).
// NOTE(review): the bound is a literal 6 while the arrays have WINO_DOMAIN_SIZE entries.
for(int i=0;i<6;i++)
{
#pragma HLS unroll
row_bank_idx[i] = start_row_idx_minus_pad_size+i;
row_legal_flag[i] = ( start_row_idx_minus_pad_size+i >=0 && start_row_idx_minus_pad_size+i < inheight);
}
// Signed absolute column index of every tile column, per Winograd array column.
ap_int<10> input_col_idx[WINO_WIDTH][WINO_DOMAIN_SIZE];
#pragma HLS array_partition variable=input_col_idx dim=1 complete
#pragma HLS array_partition variable=input_col_idx dim=2 complete
// Column offset of each Winograd array column: wino_output_tile_size * column index.
ap_uint<16> wino_col_offset_constant[WINO_WIDTH];
#pragma HLS array_partition variable=wino_col_offset_constant complete
for(int i=0;i<WINO_WIDTH;i++)
{
#pragma HLS unroll
wino_col_offset_constant[i]=wino_output_tile_size*i;
}
// NOTE(review): first_col_idx, loop_indepth_minitile_baseidx_cnt and
// buffer_address_mid_buffertile_depth_offset are declared below but are not
// referenced again in the visible part of this function.
ap_uint<16> first_col_idx=0;
ap_uint<INDEPTH_MINITILE_SIZE_BITWIDTH> loop_indepth_minitile_idx=0;
// Hand-rolled counters for the flattened loop nest; they count from 1 and wrap
// when they equal their *_reset_cycle argument.
ap_uint<16> loop_wino_tile_col_cnt=1;
ap_uint<16> loop_indepth_minitile_baseidx_cnt =1;
xliu79
committed
ap_uint<16> loop_outdepth_minitile_baseidx_cnt =1;
// loop_wino_tile_col_reset_cycle =conv_desc.wino_tile_number_in_outwidth*conv_desc.weightbuffer_outdepth_minitile_number*INDEPTH_MINITILE_SIZE;
// loop_outdepth_minitile_baseidx_reset_cycle =conv_desc.weightbuffer_outdepth_minitile_number*INDEPTH_MINITILE_SIZE;
ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_minitile_depth_offset=0;
ap_uint<INBUFFER_MID_ADDR_BITWIDTH> buffer_address_mid_buffertile_depth_offset=0;
xliu79
committed
// Column index of the left-most pixel of array column 0 (negative inside the left padding).
ap_int<16> input_head_col_idx=wino_col_offset_constant[0]-pad_size;
for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
{
#pragma HLS unroll
for(int i=0;i<WINO_DOMAIN_SIZE;i++)
{
#pragma HLS unroll
input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col];
}
}
for(ap_uint<16> outdepth_buffertile_idx=0;outdepth_buffertile_idx<weightbuffer_load_outdepth_number;outdepth_buffertile_idx++)
{
xliu79
committed
// The mid-address depth offset restarts from 0 for every outer iteration.
buffer_address_mid_minitile_depth_offset = 0;
xliu79
committed
for(int counter=0;counter<input_buffer_feeding_loop_bound;counter++ )
{
#pragma HLS pipeline II =1
// it is a flattened loop which does following
// for(ap_uint<16> indepth_buffertile_baseidx=0;indepth_buffertile_baseidx<weightbuffer_load_indepth_number;indepth_buffertile_baseidx++)
// for( int indepth_minitile_baseidx=0;indepth_minitile_baseidx<weightbuffer_indepth_minitile_number; indepth_minitile_baseidx ++)
// for(int wino_tile_col_idx =1; wino_tile_col_idx < wino_tile_number_in_outwidth+1 ; wino_tile_col_idx++)
// for(int outdepth_minitile_baseidx=0;outdepth_minitile_baseidx<weightbuffer_outdepth_minitile_number; outdepth_minitile_baseidx ++)
// for(ap_uint<3> indepth_minitile_idx=0; indepth_minitile_idx< INDEPTH_MINITILE_SIZE; indepth_minitile_idx++)
// A tile column is legal when its absolute index lies inside [0, inwidth).
ap_uint<1> col_legal_flag[WINO_WIDTH][WINO_DOMAIN_SIZE];
#pragma HLS array_partition variable=col_legal_flag complete
for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
{
#pragma HLS unroll
for(int i=0;i<WINO_DOMAIN_SIZE;i++)
xliu79
committed
#pragma HLS unroll
col_legal_flag[wino_array_col][i]= ( input_col_idx[wino_array_col][i] >=0 && input_col_idx[wino_array_col][i] < inwidth);
xliu79
committed
}
// Per column bank: mid part of the buffer address. The low bits of the head
// column index give the bank breakpoint, the next bits (plus the depth offset)
// give the base address.
ap_uint<INBUFFER_MID_ADDR_BITWIDTH> col_pix_address_offset[INBUFFER_WIDTH];
ap_uint<INBUFFER_WIDTH_BITWIDTH> col_breakpoint=input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH-1,0);
xliu79
committed
ap_uint<INBUFFER_MID_ADDR_BITWIDTH> input_head_col_address_offset;
input_head_col_address_offset= input_head_col_idx.range(INBUFFER_WIDTH_BITWIDTH+INBUFFER_MID_ADDR_BITWIDTH-1,INBUFFER_WIDTH_BITWIDTH)
+ buffer_address_mid_minitile_depth_offset;
xliu79
committed
// Banks at or after the breakpoint use the base address; banks before it use base+1.
for(int i=0;i<INBUFFER_WIDTH;i++)
{
if(i>=col_breakpoint)
col_pix_address_offset[i] = input_head_col_address_offset;
else
col_pix_address_offset[i] = input_head_col_address_offset+1;
}
xliu79
committed
// Full address per bank: bit-concatenation of (row offset, column mid address, depth index).
ap_uint<INPUT_BUFFER_DEPTH_BITWIDTH> buffer_address[INBUFFER_HEIGHT][INBUFFER_WIDTH];
xliu79
committed
for(int i=0;i<INBUFFER_HEIGHT; i++)
{
#pragma HLS unroll
for(int j=0;j<INBUFFER_WIDTH;j++)
{
#pragma HLS unroll
xliu79
committed
buffer_address[i][j]=(row_address_offset[i],col_pix_address_offset[j],loop_indepth_minitile_idx);
xliu79
committed
}
xliu79
committed
xliu79
committed
// One read from every bank of the input buffer.
ap_uint<16> input_buffer_val[INBUFFER_HEIGHT][INBUFFER_WIDTH];
#pragma HLS array_partition variable=input_buffer_val complete
xliu79
committed
for(int i=0;i<INBUFFER_HEIGHT; i++)
{
#pragma HLS unroll
for(int j=0;j<INBUFFER_WIDTH;j++)
{
#pragma HLS unroll
xliu79
committed
input_buffer_val[i][j]=input_buffer[i][j][buffer_address[i][j]];
xliu79
committed
}
xliu79
committed
// Row selection: pick the bank holding each tile row, or 0 for rows outside the image.
ap_uint<16> input_plane_tile_row[WINO_DOMAIN_SIZE][INBUFFER_WIDTH];
#pragma HLS array_partition variable=input_plane_tile_row dim=1 complete
#pragma HLS array_partition variable=input_plane_tile_row dim=2 complete
xliu79
committed
for(int j=0;j<INBUFFER_WIDTH;j++)
{
#pragma HLS unroll
for(int i=0;i<WINO_DOMAIN_SIZE;i++)
{
#pragma HLS unroll
xliu79
committed
if(row_legal_flag[i])
xliu79
committed
input_plane_tile_row[i][j]=input_buffer_val[row_bank_idx[i]][j];
}
else
{
input_plane_tile_row[i][j]=0;
xliu79
committed
}
// Column selection and packing: element (j,k) of each tile occupies bits
// [(j*6+k)*16+15 : (j*6+k)*16]; columns outside the image are written as 0.
// NOTE(review): input_plane_tile is a one-dimensional array, yet the pragmas
// below also name dim=2 and dim=3.
// NOTE(review): each element is 16*WINO_DOMAIN_SIZE_SQUARE bits wide whereas the
// stream element is 8*BATCH_SIZE*36 bits - these agree only if BATCH_SIZE == 2
// and WINO_DOMAIN_SIZE_SQUARE == 36; confirm.
ap_uint<16*WINO_DOMAIN_SIZE_SQUARE> input_plane_tile[WINO_WIDTH];
#pragma HLS array_partition variable=input_plane_tile complete dim=1
#pragma HLS array_partition variable=input_plane_tile complete dim=2
#pragma HLS array_partition variable=input_plane_tile complete dim=3
xliu79
committed
for(int i=0;i<WINO_WIDTH;i++)
{
#pragma HLS unroll
for(int j=0;j<WINO_DOMAIN_SIZE;j++)
{
#pragma HLS unroll
xliu79
committed
for(int k=0;k<WINO_DOMAIN_SIZE;k++)
{
#pragma HLS unroll
xliu79
committed
if(col_legal_flag[i][k])
input_plane_tile[i].range((j*6+k)*16+15,(j*6+k)*16)=input_plane_tile_row[j][ (ap_uint<INBUFFER_WIDTH_BITWIDTH>) input_col_idx[i][k].range(INBUFFER_WIDTH_BITWIDTH-1,0) ];
xliu79
committed
else
input_plane_tile[i].range((j*6+k)*16+15,(j*6+k)*16)=0;
xliu79
committed
}
// Emit one packed tile per Winograd array column.
for(int i=0;i<WINO_WIDTH;i++)
{
#pragma HLS unroll
input_tile_stream[i]<<input_plane_tile[i];
}
xliu79
committed
#if DEBUG_FILE_PRINT
// Debug only: append the tiles just produced to "instream.txt".
int indepth = buffer_address_mid_minitile_depth_offset/buffer_address_mid_minitile_depth_step*INDEPTH_MINITILE_SIZE
+loop_indepth_minitile_idx;
attach_streaming_content<WINO_WIDTH,WINO_DOMAIN_SIZE>(input_plane_tile, start_row_idx, input_head_col_idx+pad_size, indepth, "instream.txt");
xliu79
committed
#endif
xliu79
committed
// When the tile-column counter wraps, advance the mid-address depth offset by one step.
if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
{
buffer_address_mid_minitile_depth_offset += buffer_address_mid_minitile_depth_step;
}
xliu79
committed
// Column indices: rewind to the initial values when the tile-column counter
// wraps, otherwise step by wino_out_size_by_wino_width when the out-depth counter wraps.
if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
{
input_head_col_idx=wino_col_offset_constant[0]-pad_size;
for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
xliu79
committed
#pragma HLS unroll
for(int i=0;i<WINO_DOMAIN_SIZE;i++)
{
#pragma HLS unroll
xliu79
committed
input_col_idx[wino_array_col][i]=i-pad_size+wino_col_offset_constant[wino_array_col];
xliu79
committed
}
else if(loop_outdepth_minitile_baseidx_cnt==loop_outdepth_minitile_baseidx_reset_cycle)
{
input_head_col_idx+=wino_out_size_by_wino_width;
for(int wino_array_col=0;wino_array_col<WINO_WIDTH;wino_array_col++)
xliu79
committed
#pragma HLS unroll
for(int i=0;i<WINO_DOMAIN_SIZE;i++)
{
#pragma HLS unroll
xliu79
committed
input_col_idx[wino_array_col][i]+=wino_out_size_by_wino_width;
xliu79
committed
}
xliu79
committed
// Wrap or increment the tile-column counter.
if(loop_wino_tile_col_cnt == loop_wino_tile_col_reset_cycle)
{
loop_wino_tile_col_cnt=1;
}
else
{
loop_wino_tile_col_cnt++;
}
xliu79
committed
// Wrap or increment the out-depth counter.
if(loop_outdepth_minitile_baseidx_cnt==loop_outdepth_minitile_baseidx_reset_cycle)
{
loop_outdepth_minitile_baseidx_cnt=1;
}
else
{
loop_outdepth_minitile_baseidx_cnt++;
xliu79
committed
// NOTE(review): brace lines are missing around here, so it is unclear whether
// this increment belongs to the else branch or runs on every iteration.
loop_indepth_minitile_idx++;
xliu79
committed
}
xliu79
committed
// NOTE(review): the function header line is missing from this listing; the
// parameter list below matches the calls to load_weight_ddr_one_port(...) made
// further down in this file. Some brace lines are missing as well and viewer
// residue ("xliu79" / "committed") is interleaved with the code.
//
// Visible behaviour: unless skip_flag is set, reads weightDDR_port_burst_length
// 128-bit words starting at weight_DDR + ddr_address_offset, splits each into
// four 32-bit words and stores them into weight_buff at an address whose top
// bit is pingpong.
ap_uint<128>* weight_DDR,
ap_uint<32> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4][WEIGHT_BUFFER_DEPTH],
ap_uint<16> weightDDR_buffer_burst_length,
ap_uint<16> weightDDR_port_burst_length,
ap_uint<32> ddr_address_offset,
ap_uint<1> pingpong,
ap_uint<1> skip_flag
#if DEBUG_FILE_PRINT
,ConvDesc_t conv_desc
#endif
)
xliu79
committed
#pragma HLS array_partition variable = weight_buff dim=1 complete
#pragma HLS array_partition variable = weight_buff dim=2 complete
xliu79
committed
// Nothing is loaded when the caller asks to skip this transfer.
if(skip_flag)
return;
xliu79
committed
// printf("DDR_offset %d, i%d o%d\n",(int) ddr_address_offset,(int) conv_desc.weightbuffer_load_indepth_number,(int) conv_desc.weightbuffer_load_outdepth_number);
xliu79
committed
ap_uint<128>* offseted_weight_DDR=weight_DDR+ddr_address_offset;
xliu79
committed
// counter selects which group of four consecutive second-dimension banks
// receives the current 128-bit word; it wraps at WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP-1.
ap_uint<WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP_BITWIDTH> counter=0;
xliu79
committed
// port_load_cnt counts words from 1 and wraps at weightDDR_buffer_burst_length.
ap_uint<16> port_load_cnt=1;
ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH-1> buffer_address_offset=0;
// NOTE(review): in the BITWIDTH == 0 branch buffer_idx is declared without an
// initial value, and no update of buffer_idx is visible in this listing.
#if WEIGHT_FEED_NUMBER_PER_PORT_BITWIDTH == 0
ap_uint<1> buffer_idx;
#else
ap_uint<WEIGHT_FEED_NUMBER_PER_PORT_BITWIDTH> buffer_idx=0;
xliu79
committed
#endif
// printf("into the loop %d %d\n", (int) weightDDR_port_burst_length, (int) weightDDR_buffer_burst_length);
// fflush(stdout);
for(int address = 0; address<weightDDR_port_burst_length; address++)
xliu79
committed
xliu79
committed
// Fetch one 128-bit word and slice it into four 32-bit words.
ap_uint<128> temp128 = offseted_weight_DDR[address];
ap_uint<32> temp32[4];
#pragma HLS array_partition variable = temp32 complete
xliu79
committed
for(int i=0;i<4;i++)
xliu79
committed
temp32[i]=temp128.range(i*32+31,i*32);
}
// Write address: pingpong bit concatenated above the running offset.
// NOTE(review): declared as ap_uint<10>; the concatenation is
// 1 + (WEIGHT_BUFFER_DEPTH_BITWIDTH-1) bits wide - confirm the two agree.
ap_uint<10> buffer_address = (pingpong,buffer_address_offset);
xliu79
committed
// Only the four banks with i/4 == counter are written; bank i takes word i%4.
for(int i=0;i<WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4;i++)
xliu79
committed
#pragma HLS unroll
if( i/4==counter)
{
// NOTE(review): this guard tests WEIGHT_FEED_NUMBER_PER_PORT == 0, while the
// declaration of buffer_idx above tests WEIGHT_FEED_NUMBER_PER_PORT_BITWIDTH == 0;
// the two conditions look like they were meant to be the same.
#if WEIGHT_FEED_NUMBER_PER_PORT == 0
weight_buff[0][i][buffer_address]=temp32[i%4];
#else
weight_buff[buffer_idx][i][buffer_address]=temp32[i%4];
#endif
}
}
// Address/bank bookkeeping. NOTE(review): the bodies of these branches are
// partly missing from this listing (only the reset of buffer_address_offset survives).
if(port_load_cnt==weightDDR_buffer_burst_length)
xliu79
committed
buffer_address_offset=0;
xliu79
committed
else if(counter== WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP-1)
xliu79
committed
if(port_load_cnt==weightDDR_buffer_burst_length)
xliu79
committed
port_load_cnt=1;
}
else
{
port_load_cnt++;
}
xliu79
committed
// Wrap or advance the bank-group selector.
if(counter==WEIGHTDDR_INDEPTH_MINITILE_128BIT_STEP-1)
{
counter=0;
}
else
{
counter++;
}
}
}
xliu79
committed
// Streams weights out of the on-chip weight buffer: for each of
// loop_weight_feed_bound iterations, reads one entry from every
// [buffer][bank] of weight_buff at address (pingpong, indepth offset + outdepth offset),
// packs the 32-bit bank words of each buffer into one wide word and pushes it
// to the matching weight_stream.
//
//   loop_outdepth_minitile_baseidx_reset_cycle_minus1  value at which the out-depth offset wraps to 0
//   loop_start_output_basecol_reset_cycle              wrap value of the column counter
//   pingpong                                           top address bit of every read
//
// NOTE(review): the opening brace of the pipelined loop and the closing braces
// of the loop and the function are missing from this listing; viewer residue
// ("xliu79" / "committed") is interleaved with the code.
void weight_streamer(
ap_uint<32> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4][WEIGHT_BUFFER_DEPTH],
hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[WEIGHT_FEED_NUMBER_PER_PORT],
ap_uint<16> loop_outdepth_minitile_baseidx_reset_cycle_minus1,
ap_uint<16> loop_start_output_basecol_reset_cycle,
ap_uint<32> loop_weight_feed_bound,
xliu79
committed
ap_uint<1> pingpong
#if DEBUG_FILE_PRINT
,ConvDesc_t conv_desc
#endif
)
{
// Trace output; printed on every call.
printf("stream pingpong %d\n",(int) pingpong);
#pragma HLS array_partition variable = weight_buff dim=1 complete
#pragma HLS array_partition variable = weight_buff dim=2 complete
xliu79
committed
// int weight_feed_total_size_by2 = weight_feed_total_size/2;
xliu79
committed
// The two offsets are summed to form the low part of the read address.
ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH -1> outdepth_minitile_addr_offset=0;
ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH -1> indepth_minitile_addr_offset=0;
xliu79
committed
// NOTE(review): loop_outdepth_minitile_baseidx_cnt is initialised here but not
// referenced again in the visible body.
ap_uint<16> loop_outdepth_minitile_baseidx_cnt=1;
ap_uint<16> loop_start_output_basecol_cnt=1;
xliu79
committed
// loop_outdepth_minitile_baseidx_reset_cycle_minus1=conv_desc.weightbuffer_outdepth_minitile_number-1;
// loop_start_output_basecol_reset_cycle=conv_desc.weightbuffer_outdepth_minitile_number * conv_desc.wino_tile_number_in_outwidth;
// int loop_weight_feed_bound = conv_desc.weightbuffer_indepth_minitile_number * conv_desc.weightbuffer_outdepth_minitile_number * conv_desc.wino_tile_number_in_outwidth;
xliu79
committed
for(ap_uint<32> cycle=0;cycle < loop_weight_feed_bound; cycle++)
#pragma HLS pipeline
// for(int indepth_minitile_baseidx=0;indepth_minitile_baseidx<conv_desc.weightbuffer_load_indepth_step; indepth_minitile_baseidx += INDEPTH_MINITILE_SIZE)
// for(int start_output_basecol =0; start_output_basecol < conv_desc.outwidth; start_output_basecol+=conv_desc.wino_output_tile_size*WINO_WIDTH)
// for(int outdepth_minitile_baseidx=0;outdepth_minitile_baseidx<conv_desc.weightbuffer_load_outdepth_step; outdepth_minitile_baseidx += OUTDEPTH_MINITILE_SIZE)
// Read address: pingpong bit concatenated above the summed offsets.
ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH-1> weight_buffer_address_right=indepth_minitile_addr_offset+outdepth_minitile_addr_offset;
ap_uint<WEIGHT_BUFFER_DEPTH_BITWIDTH> weight_buffer_address = (pingpong,weight_buffer_address_right);
xliu79
committed
// One 32-bit word from every bank of every buffer.
ap_uint<32> temp18[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4];
#pragma HLS array_partition variable = temp18 complete
xliu79
committed
for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
{
#pragma HLS unroll
for(int j18=0;j18<WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4;j18++)
{
#pragma HLS unroll
temp18[buffer_idx][j18]=weight_buff[buffer_idx][j18][weight_buffer_address];
}
}
xliu79
committed
// Pack the bank words of each buffer into one stream word; bank j18 occupies
// bits [j18*32+31 : j18*32].
ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> temp16x36[WEIGHT_FEED_NUMBER_PER_PORT];
#pragma HLS array_partition variable = temp16x36 complete
xliu79
committed
for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
{
#pragma HLS unroll
for(int j18=0;j18<WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4;j18++)
{
#pragma HLS unroll
temp16x36[buffer_idx].range(j18*32+31,j18*32)=temp18[buffer_idx][j18];
}
}
for(int buffer_idx =0; buffer_idx< WEIGHT_FEED_NUMBER_PER_PORT; buffer_idx++)
{
#pragma HLS unroll
weight_stream[buffer_idx]<<temp16x36[buffer_idx];
}
xliu79
committed
// Advance the in-depth offset once per full column sweep.
// NOTE(review): conv_desc is only a parameter when DEBUG_FILE_PRINT is set, yet
// it is used here unconditionally - this does not compile with
// DEBUG_FILE_PRINT == 0. The commented-out assignment above suggests the same
// value is loop_outdepth_minitile_baseidx_reset_cycle_minus1 + 1; confirm with
// the caller before substituting.
if(loop_start_output_basecol_cnt==loop_start_output_basecol_reset_cycle){
indepth_minitile_addr_offset+=conv_desc.weightbuffer_outdepth_minitile_number;
}
// Wrap or advance the out-depth offset.
if(outdepth_minitile_addr_offset==loop_outdepth_minitile_baseidx_reset_cycle_minus1){
outdepth_minitile_addr_offset=0;
}
else{
outdepth_minitile_addr_offset++;
}
xliu79
committed
// Wrap or advance the column counter (counts from 1).
if(loop_start_output_basecol_cnt==loop_start_output_basecol_reset_cycle){
loop_start_output_basecol_cnt=1;
}
else{
loop_start_output_basecol_cnt++;
xliu79
committed
// Ping-pong weight feeder for one DDR port: keeps a static on-chip weight
// buffer, and for each of weightDDR_burst_number rounds loads the next burst
// into the half selected by ~pingpong while weight_streamer() streams from the
// half selected by pingpong, then toggles pingpong.
//
// NOTE(review): this listing is incomplete - the end of the parameter list,
// the function's opening brace, several loop/if braces and some call arguments
// are missing, and viewer residue ("xliu79" / "committed") is interleaved with
// the code. In particular weightDDR_buffer_burst_length is passed to
// load_weight_ddr_one_port below but is not among the parameters visible here,
// and the first arguments of those calls (DDR pointer, buffer) are not shown.
template<int dummy>
void weight_feed_one_port(
xliu79
committed
hls::stream<ap_uint<8*INDEPTH_MINITILE_SIZE*WINO_DOMAIN_SIZE_SQUARE> > weight_stream[WEIGHT_FEED_NUMBER_PER_PORT],
ap_uint<16> weightDDR_burst_number,
xliu79
committed
ap_uint<16> weightDDR_port_burst_length,
ap_uint<16> loop_outdepth_minitile_baseidx_reset_cycle_minus1,
ap_uint<16> loop_start_output_basecol_reset_cycle,
ap_uint<32> loop_weight_feed_bound,
xliu79
committed
ap_uint<1> first_flag,
ap_uint<1> last_flag
#if DEBUG_FILE_PRINT
,ConvDesc_t conv_desc
xliu79
committed
// Trace output; printed on every call.
printf("\ninto weight feed %d %d %d\n",(int) first_flag, (int) last_flag, (int) weightDDR_burst_number );
// Static state: the buffer contents, DDR position and ping-pong selector
// persist across calls.
static ap_uint<32> weight_buff[WEIGHT_FEED_NUMBER_PER_PORT][WINO_DOMAIN_SIZE_SQUARE*INDEPTH_MINITILE_SIZE/4][WEIGHT_BUFFER_DEPTH];
static ap_uint<16> DDR_offset;
static ap_uint<16> DDR_load_cnt;
static ap_uint<1> pingpong;
// first_flag rewinds the DDR position. NOTE(review): lines are missing after
// "if(first_flag)", so it cannot be confirmed what else is reset here.
if(first_flag)
xliu79
committed
DDR_offset=0;
DDR_load_cnt=0;
xliu79
committed
// Initial load; its skip flag is ~first_flag.
load_weight_ddr_one_port(
weightDDR_buffer_burst_length,
weightDDR_port_burst_length,
xliu79
committed
0,
xliu79
committed
~first_flag
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
xliu79
committed
for(ap_uint<16> cnt=0;cnt< weightDDR_burst_number ;cnt++)
xliu79
committed
// Advance to the next burst, wrapping to burst 0 / offset 0 after the last one.
if(DDR_load_cnt == weightDDR_burst_number-1)
{
DDR_load_cnt = 0;
DDR_offset = 0;
}
else
{
DDR_load_cnt+=1;
DDR_offset+=weightDDR_port_burst_length;
}
xliu79
committed
// Load into the half selected by ~pingpong; the load is skipped when last_flag
// is set and the burst counter has just wrapped to 0.
load_weight_ddr_one_port(
weightDDR_buffer_burst_length,
weightDDR_port_burst_length,
xliu79
committed
~pingpong,
last_flag & (DDR_load_cnt==0)
#if DEBUG_FILE_PRINT
,conv_desc
xliu79
committed
);
xliu79
committed
// Stream from the half selected by pingpong.
weight_streamer(
weight_buff,
weight_stream,
loop_outdepth_minitile_baseidx_reset_cycle_minus1,
loop_start_output_basecol_reset_cycle,
loop_weight_feed_bound,
xliu79
committed
pingpong
#if DEBUG_FILE_PRINT
,conv_desc
#endif
);
// Swap the roles of the two buffer halves.
pingpong = ~pingpong;
}
xliu79
committed