Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hpvm-release
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
llvm
hpvm-release
Commits
16585eb6
Commit
16585eb6
authored
4 years ago
by
Hashim Sharif
Browse files
Options
Downloads
Patches
Plain Diff
Removing commented-out code from approx_techniques.cu (hpvm-tensor-rt)
parent
a42f98e9
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+93
-191
93 additions, 191 deletions
...ts/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
with
93 additions
and
191 deletions
hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+
93
−
191
View file @
16585eb6
...
@@ -169,19 +169,13 @@ __global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ outp
...
@@ -169,19 +169,13 @@ __global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ outp
if
(
n
<
N
)
{
//is thread id within bounds?
if
(
n
<
N
)
{
//is thread id within bounds?
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
//const int ki = c * KH * KW + i;
//const int kj = c * KH * KW + j;
const
int
filter_elem_num
=
(
c
*
KH
+
i
)
*
KW
+
j
;
//index of this filter element
const
int
filter_elem_num
=
(
c
*
KH
+
i
)
*
KW
+
j
;
//index of this filter element
if
((
filter_elem_num
-
skip_offset
)
%
skip_every
)
{
if
((
filter_elem_num
-
skip_offset
)
%
skip_every
)
{
const
int
condition
=
(
filter_elem_num
<
skip_offset
);
const
int
condition
=
(
filter_elem_num
<
skip_offset
);
const
int
output_col
=
condition
*
filter_elem_num
const
int
output_col
=
condition
*
filter_elem_num
+
(
!
condition
)
*
(
filter_elem_num
-
((
filter_elem_num
+
1
-
skip_offset
)
/
skip_every
)
+
(
!
condition
)
*
(
filter_elem_num
-
((
filter_elem_num
+
1
-
skip_offset
)
/
skip_every
)
-
((
filter_elem_num
+
1
-
skip_offset
)
%
skip_every
>
0
));
-
((
filter_elem_num
+
1
-
skip_offset
)
%
skip_every
>
0
));
//if(filter_elem_num % skip_every != skip_offset) {
// int output_col = filter_elem_num -
// (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
//if(skip_every == 1)
// output_col = filter_elem_num;
const
int
out_index
=
((
n
*
reduced_filter_elem
+
output_col
)
*
H_out
+
h
)
*
W_out
+
w
;
const
int
out_index
=
((
n
*
reduced_filter_elem
+
output_col
)
*
H_out
+
h
)
*
W_out
+
w
;
//((output_col*N + n) * H_out + h) * W_out + w;
//((output_col*N + n) * H_out + h) * W_out + w;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
...
@@ -213,22 +207,16 @@ __global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ out
...
@@ -213,22 +207,16 @@ __global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ out
if
(
n
<
N
)
{
//is thread id within bounds?
if
(
n
<
N
)
{
//is thread id within bounds?
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
//const int ki = c * KH * KW + i;
//const int kj = c * KH * KW + j;
const
int
filter_elem_num
=
(
c
*
KH
+
i
)
*
KW
+
j
;
//index of this filter element
const
int
filter_elem_num
=
(
c
*
KH
+
i
)
*
KW
+
j
;
//index of this filter element
if
((
filter_elem_num
-
skip_offset
)
%
skip_every
)
{
if
((
filter_elem_num
-
skip_offset
)
%
skip_every
)
{
const
int
condition
=
(
filter_elem_num
<
skip_offset
);
const
int
condition
=
(
filter_elem_num
<
skip_offset
);
const
int
output_col
=
condition
*
filter_elem_num
const
int
output_col
=
condition
*
filter_elem_num
+
(
!
condition
)
*
(
filter_elem_num
-
((
filter_elem_num
+
1
-
skip_offset
)
/
skip_every
)
+
(
!
condition
)
*
(
filter_elem_num
-
((
filter_elem_num
+
1
-
skip_offset
)
/
skip_every
)
-
((
filter_elem_num
+
1
-
skip_offset
)
%
skip_every
>
0
));
-
((
filter_elem_num
+
1
-
skip_offset
)
%
skip_every
>
0
));
//if(filter_elem_num % skip_every != skip_offset) {
// int output_col = filter_elem_num -
// (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
//if(skip_every == 1)
// output_col = filter_elem_num;
const
int
out_index
=
((
output_col
*
N
+
n
)
*
H_out
+
h
)
*
W_out
+
w
;
const
int
out_index
=
((
output_col
*
N
+
n
)
*
H_out
+
h
)
*
W_out
+
w
;
//((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
//((output_col*N + n) * H_out + h) * W_out + w
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
output
[
out_index
]
=
input
[((
n
*
C
+
c
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
output
[
out_index
]
=
input
[((
n
*
C
+
c
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
else
else
...
@@ -288,15 +276,13 @@ __global__ void convToGemmPerfRow(float * const __restrict__ output,
...
@@ -288,15 +276,13 @@ __global__ void convToGemmPerfRow(float * const __restrict__ output,
}
}
const
int
inH
=
h_index
*
V_stride
-
V_pad
;
const
int
inH
=
h_index
*
V_stride
-
V_pad
;
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
//#pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
const
int
out_index
=
((
n
*
C
*
KH
*
KW
+
filter_elem_num
)
*
H_eff
+
h
)
*
W_out
+
w
;
const
int
out_index
=
((
n
*
C
*
KH
*
KW
+
filter_elem_num
)
*
H_eff
+
h
)
*
W_out
+
w
;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
output
[
out_index
]
=
input
[((
n
*
C
+
c
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
output
[
out_index
]
=
input
[((
n
*
C
+
c
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
else
else
output
[
out_index
]
=
0
;
output
[
out_index
]
=
0
;
...
@@ -357,11 +343,7 @@ __global__ void convToGemmPerfCol(float * const __restrict__ output,
...
@@ -357,11 +343,7 @@ __global__ void convToGemmPerfCol(float * const __restrict__ output,
}
}
const
int
inW
=
w_index
*
H_stride
-
H_pad
;
const
int
inW
=
w_index
*
H_stride
-
H_pad
;
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
//#pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
...
@@ -427,11 +409,8 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
...
@@ -427,11 +409,8 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
}
}
const
int
inH
=
h_index
*
V_stride
-
V_pad
;
const
int
inH
=
h_index
*
V_stride
-
V_pad
;
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
// #pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
...
@@ -465,38 +444,31 @@ __global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
...
@@ -465,38 +444,31 @@ __global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
}
}
const
int
inH
=
h_index
*
V_stride
-
V_pad
;
const
int
inH
=
h_index
*
V_stride
-
V_pad
;
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
// #pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
//
int i =
k
i
/
K
W
;
for
(
int
i
=
0
;
i
<
K
H
;
i
++
)
{
// int j = ki % KW;
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
const
int
out_index
=
((
filter_elem_num
*
N
+
n
)
*
H_eff
+
h
)
*
W_out
+
w
;
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
const
int
out_index
=
((
filter_elem_num
*
N
+
n
)
*
H_eff
+
h
)
*
W_out
+
w
;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
//
((n * C
* KH
*
KW
+
filter_elem_num) * H_eff
+
h
) * W
_out + w
;
output
[
out_index
]
=
input
[
((
n
*
C
+
c
)
*
H
+
(
inH
+
i
)
)
*
W
+
(
inW
+
j
)]
;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
else
output
[
out_index
]
=
input
[((
n
*
C
+
c
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)]
;
output
[
out_index
]
=
0
;
else
output
[
out_index
]
=
0
;
}
}
}
}
}
}
}
}
__global__
void
approxInterpolateRowHalf
(
int
N
,
int
old_h
,
int
j
,
int
c
,
int
h
,
int
w
,
__global__
void
approxInterpolateRowHalf
(
int
N
,
int
old_h
,
int
j
,
int
c
,
int
h
,
int
w
,
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int n = tx / (c * h * w); //output image number
//const int stride = blockDim.x * gridDim.x;
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
if
(
n
<
N
)
{
if
(
n
<
N
)
{
//for(int i = index; i < N; i += stride){
//const int col = ((i % (c * h * w)) % (h * w)) % w;
//const int row = ((i % (c * h * w)) % (h * w)) / w;
//const int ch = (i % (c * h * w)) / (h * w);
// const int n = i / (c * h * w);
const
int
ch
=
tx
%
(
c
*
h
*
w
)
/
(
h
*
w
);
//filter number
const
int
ch
=
tx
%
(
c
*
h
*
w
)
/
(
h
*
w
);
//filter number
const
int
row
=
tx
%
(
h
*
w
)
/
w
;
//output height index (row number)
const
int
row
=
tx
%
(
h
*
w
)
/
w
;
//output height index (row number)
...
@@ -527,17 +499,9 @@ __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
...
@@ -527,17 +499,9 @@ __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
__global__
void
approxInterpolateRowHalf2
(
int
N
,
int
old_h
,
int
b
,
int
c
,
int
h
,
int
w
,
__global__
void
approxInterpolateRowHalf2
(
int
N
,
int
old_h
,
int
b
,
int
c
,
int
h
,
int
w
,
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int n = tx / (c * h * w); //output image numbe
//const int stride = blockDim.x * gridDim.x;
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
if
(
n
<
N
)
{
if
(
n
<
N
)
{
//for(int i = index; i < N; i += stride){
//const int col = ((i % (c * h * w)) % (h * w)) % w;
//const int row = ((i % (c * h * w)) % (h * w)) / w;
//const int ch = (i % (c * h * w)) / (h * w);
//const int n = i / (c * h * w);
const
int
ch
=
tx
%
(
c
*
h
*
w
)
/
(
h
*
w
);
//filter number
const
int
ch
=
tx
%
(
c
*
h
*
w
)
/
(
h
*
w
);
//filter number
const
int
row
=
tx
%
(
h
*
w
)
/
w
;
//output height index (row number)
const
int
row
=
tx
%
(
h
*
w
)
/
w
;
//output height index (row number)
...
@@ -554,13 +518,11 @@ __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
...
@@ -554,13 +518,11 @@ __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
}
else
if
((
row
-
start
)
%
x
==
0
)
{
}
else
if
((
row
-
start
)
%
x
==
0
)
{
const
int
row_index
=
row
-
((
row
+
1
-
start
)
/
x
);
const
int
row_index
=
row
-
((
row
+
1
-
start
)
/
x
);
const
int
output_index
=
ch
*
(
b
*
old_h
*
w
)
+
n
*
(
old_h
*
w
)
+
row_index
*
(
w
)
+
col
;
const
int
output_index
=
ch
*
(
b
*
old_h
*
w
)
+
n
*
(
old_h
*
w
)
+
row_index
*
(
w
)
+
col
;
//n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
__hdiv
(
__hadd
(
old_data
[
output_index
],
old_data
[
output_index
-
w
]),
2
);
__hdiv
(
__hadd
(
old_data
[
output_index
],
old_data
[
output_index
-
w
]),
2
);
}
else
{
}
else
{
const
int
row_index
=
row
-
((
row
+
1
-
start
)
/
x
)
-
((
row
+
1
-
start
)
%
x
>
0
);
const
int
row_index
=
row
-
((
row
+
1
-
start
)
/
x
)
-
((
row
+
1
-
start
)
%
x
>
0
);
const
int
output_index
=
ch
*
(
b
*
old_h
*
w
)
+
n
*
(
old_h
*
w
)
+
row_index
*
(
w
)
+
col
;
const
int
output_index
=
ch
*
(
b
*
old_h
*
w
)
+
n
*
(
old_h
*
w
)
+
row_index
*
(
w
)
+
col
;
//n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
old_data
[
output_index
];
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
old_data
[
output_index
];
}
}
}
}
...
@@ -587,11 +549,7 @@ __global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
...
@@ -587,11 +549,7 @@ __global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
}
}
const
int
inW
=
w_index
*
H_stride
-
H_pad
;
const
int
inW
=
w_index
*
H_stride
-
H_pad
;
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
//#pragma unroll
// for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter element
...
@@ -626,10 +584,8 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
...
@@ -626,10 +584,8 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
}
}
const
int
inW
=
w_index
*
H_stride
-
H_pad
;
const
int
inW
=
w_index
*
H_stride
-
H_pad
;
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
//#pragma unroll
// for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
i
=
0
;
i
<
KH
;
i
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter elemen
const
int
filter_elem_num
=
c
*
KH
*
KW
+
i
*
KW
+
j
;
//index of this filter elemen
...
@@ -647,15 +603,6 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
...
@@ -647,15 +603,6 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
__global__
void
approxInterpolateColHalf
(
int
N
,
int
old_w
,
int
b
,
int
c
,
int
h
,
int
w
,
__global__
void
approxInterpolateColHalf
(
int
N
,
int
old_w
,
int
b
,
int
c
,
int
h
,
int
w
,
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int stride = blockDim.x * gridDim.x;
//for(int i = index; i < N; i += stride){
// const int col = ((i % (c * h * w)) % (h * w)) % w;
// const int row = ((i % (c * h * w)) % (h * w)) / w;
// const int ch = (i % (c * h * w)) / (h * w);
// const int n = i / (c * h * w);
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
if
(
n
<
N
)
{
if
(
n
<
N
)
{
...
@@ -688,14 +635,6 @@ __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
...
@@ -688,14 +635,6 @@ __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
__global__
void
approxInterpolateColHalf2
(
int
N
,
int
old_w
,
int
b
,
int
c
,
int
h
,
int
w
,
__global__
void
approxInterpolateColHalf2
(
int
N
,
int
old_w
,
int
b
,
int
c
,
int
h
,
int
w
,
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
__half
*
old_data
,
__half
*
new_data
,
int
x
,
int
start
)
{
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int stride = blockDim.x * gridDim.x;
// for(int i = index; i < N; i += stride){
// const int col = ((i % (c * h * w)) % (h * w)) % w;
// const int row = ((i % (c * h * w)) % (h * w)) / w;
// const int ch = (i % (c * h * w)) / (h * w);
// const int n = i / (c * h * w);
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
const
int
n
=
tx
/
(
c
*
h
*
w
);
//output image number
if
(
n
<
N
)
{
if
(
n
<
N
)
{
...
@@ -705,25 +644,23 @@ __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
...
@@ -705,25 +644,23 @@ __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
if
(
col
<
start
)
{
if
(
col
<
start
)
{
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
old_data
[
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
old_w
+
col
];
=
old_data
[
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
old_w
+
col
];
//n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
}
else
if
(
col
==
w
-
1
)
{
}
else
if
(
col
==
w
-
1
)
{
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
old_data
[
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
(
old_w
)
+
old_w
-
1
];
old_data
[
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
(
old_w
)
+
old_w
-
1
];
//n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
}
else
if
(
col
==
0
)
{
}
else
if
(
col
==
0
)
{
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
old_data
[
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
(
old_w
)];
old_data
[
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
(
old_w
)];
//n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
}
else
if
((
col
-
start
)
%
x
==
0
)
{
}
else
if
((
col
-
start
)
%
x
==
0
)
{
const
int
col_index
=
col
-
((
col
+
1
-
start
)
/
x
);
const
int
col_index
=
col
-
((
col
+
1
-
start
)
/
x
);
const
int
output_index
=
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
old_w
+
col_index
;
const
int
output_index
=
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
old_w
+
col_index
;
//n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
__hdiv
(
__hadd
(
old_data
[
output_index
],
old_data
[
output_index
-
1
]),
2
);
__hdiv
(
__hadd
(
old_data
[
output_index
],
old_data
[
output_index
-
1
]),
2
);
}
else
{
}
else
{
const
int
col_index
=
col
-
((
col
+
1
-
start
)
/
x
)
-
((
col
+
1
-
start
)
%
x
>
0
);
const
int
col_index
=
col
-
((
col
+
1
-
start
)
/
x
)
-
((
col
+
1
-
start
)
%
x
>
0
);
const
int
output_index
=
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
old_w
+
col_index
;
const
int
output_index
=
ch
*
(
b
*
h
*
old_w
)
+
n
*
(
h
*
old_w
)
+
row
*
old_w
+
col_index
;
//const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
old_data
[
output_index
];
new_data
[
n
*
(
c
*
h
*
w
)
+
ch
*
(
h
*
w
)
+
row
*
(
w
)
+
col
]
=
old_data
[
output_index
];
}
}
}
}
...
@@ -759,6 +696,7 @@ __global__ void convToGemmFullInputRegular(float * const __restrict__ output,
...
@@ -759,6 +696,7 @@ __global__ void convToGemmFullInputRegular(float * const __restrict__ output,
in_index
=
((
fi
-
offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
in_index
=
((
fi
-
offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
fi
-
offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
offset
-
1
;
+
(((
fi
-
offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
offset
-
1
;
}
}
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
j
=
in_index
%
KW
;
const
int
j
=
in_index
%
KW
;
const
int
out_index
=
((
n
*
reduced_filter_elem
+
fi
)
*
H_out
+
h
)
*
W_out
+
w
;
const
int
out_index
=
((
n
*
reduced_filter_elem
+
fi
)
*
H_out
+
h
)
*
W_out
+
w
;
...
@@ -809,13 +747,15 @@ __global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
...
@@ -809,13 +747,15 @@ __global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
}
}
}
}
}
}
}
}
__global__
void
createReducedFiltersFullRegular
(
float
*
output
,
__global__
void
createReducedFiltersFullRegular
(
float
*
output
,
const
float
*
const
__restrict
input
,
const
int
NF
,
const
float
*
const
__restrict
input
,
const
int
NF
,
const
int
num_filter_elem
,
const
int
reduced_filter_elem
,
const
int
num_filter_elem
,
const
int
reduced_filter_elem
,
const
int
channels
,
const
int
channels
,
const
int
skip_every
,
const
int
skip_offset
,
const
float
fac
)
{
const
int
skip_every
,
const
int
skip_offset
,
const
float
fac
)
{
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
fIdx
=
tx
/
reduced_filter_elem
;
//filter index
const
int
fIdx
=
tx
/
reduced_filter_elem
;
//filter index
...
@@ -826,11 +766,13 @@ __global__ void createReducedFiltersFullRegular(float * output,
...
@@ -826,11 +766,13 @@ __global__ void createReducedFiltersFullRegular(float * output,
int
in_index
;
int
in_index
;
if
(
offset
<
channel_offset
)
{
if
(
offset
<
channel_offset
)
{
in_index
=
offset
;
in_index
=
offset
;
}
else
{
}
else
{
in_index
=
((
offset
-
channel_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
in_index
=
((
offset
-
channel_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
offset
-
channel_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
channel_offset
-
1
;
+
(((
offset
-
channel_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
channel_offset
-
1
;
}
}
output
[
fIdx
*
reduced_filter_elem
+
offset
]
=
fac
*
input
[
num_filter_elem
*
fIdx
+
in_index
];
output
[
fIdx
*
reduced_filter_elem
+
offset
]
=
fac
*
input
[
num_filter_elem
*
fIdx
+
in_index
];
}
}
}
}
...
@@ -873,30 +815,23 @@ __global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
...
@@ -873,30 +815,23 @@ __global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
const
int
inH
=
h
*
V_stride
-
V_pad
;
//input height index (row number)
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
const
int
inW
=
w
*
H_stride
-
H_pad
;
//input width index (col number)
#pragma unroll
#pragma unroll
//for(int fi = 0; fi < reduced_filter_elem; fi++) {
//const int ch = (fi * C) / reduced_filter_elem;
for
(
int
ki
=
0
;
ki
<
reduced_filter_elem
/
C
;
ki
++
)
{
for
(
int
ki
=
0
;
ki
<
reduced_filter_elem
/
C
;
ki
++
)
{
const
int
fi
=
ch
*
(
reduced_filter_elem
/
C
)
+
ki
;
const
int
fi
=
ch
*
(
reduced_filter_elem
/
C
)
+
ki
;
const
int
offset
=
(
skip_offset
+
ch
)
%
skip_every
;
const
int
offset
=
(
skip_offset
+
ch
)
%
skip_every
;
//int in_index;
const
bool
condition
=
(
fi
<
offset
);
const
bool
condition
=
(
fi
<
offset
);
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
fi
-
offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
offset
-
1
);
+
(((
fi
-
offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
offset
-
1
);
//if(fi < offset) {
// in_index = fi;
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
//} else {
const
int
j
=
in_index
%
KW
;
// in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
const
int
out_index
=
((
n
*
reduced_filter_elem
+
fi
)
*
H_out
+
h
)
*
W_out
+
w
;
// + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
// }
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
}
else
{
const
int
j
=
in_index
%
KW
;
const
int
out_index
=
((
n
*
reduced_filter_elem
+
fi
)
*
H_out
+
h
)
*
W_out
+
w
;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
}
else
{
output
[
out_index
]
=
0
;
output
[
out_index
]
=
0
;
}
}
}
}
}
}
}
}
...
@@ -922,26 +857,20 @@ __global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
...
@@ -922,26 +857,20 @@ __global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
#pragma unroll
#pragma unroll
for
(
int
ki
=
0
;
ki
<
reduced_filter_elem
/
C
;
ki
++
)
{
for
(
int
ki
=
0
;
ki
<
reduced_filter_elem
/
C
;
ki
++
)
{
const
int
fi
=
ch
*
(
reduced_filter_elem
/
C
)
+
ki
;
//for(int fi = 0; fi < reduced_filter_elem; fi++) {
const
int
fi
=
ch
*
(
reduced_filter_elem
/
C
)
+
ki
;
// const int ch = (fi * C) / reduced_filter_elem;
const
int
offset
=
(
skip_offset
+
ch
)
%
skip_every
;
const
int
offset
=
(
skip_offset
+
ch
)
%
skip_every
;
const
int
condition
=
(
fi
<
offset
);
const
int
condition
=
(
fi
<
offset
);
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
fi
-
offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
offset
-
1
);
+
(((
fi
-
offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
offset
-
1
);
// int in_index;
//if(fi < offset) {
// in_index = fi;
//} else {
// in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
// + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
// }
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
j
=
in_index
%
KW
;
const
int
j
=
in_index
%
KW
;
const
int
out_index
=
((
fi
*
N
+
n
)
*
H_out
+
h
)
*
W_out
+
w
;
const
int
out_index
=
((
fi
*
N
+
n
)
*
H_out
+
h
)
*
W_out
+
w
;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
}
else
{
}
else
{
output
[
out_index
]
=
0
;
output
[
out_index
]
=
0
;
}
}
}
}
...
@@ -971,20 +900,15 @@ __global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
...
@@ -971,20 +900,15 @@ __global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
const
int
condition
=
(
fi
<
skip_offset
);
const
int
condition
=
(
fi
<
skip_offset
);
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
skip_offset
-
1
);
+
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
skip_offset
-
1
);
//int in_index;
//if(fi < skip_offset) {
const
int
ch
=
in_index
/
(
KW
*
KH
);
// in_index = fi;
//} else {
// in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
// + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
// }
const
int
ch
=
in_index
/
(
KW
*
KH
);
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
j
=
in_index
%
KW
;
const
int
j
=
in_index
%
KW
;
const
int
out_index
=
((
n
*
reduced_filter_elem
+
fi
)
*
H_out
+
h
)
*
W_out
+
w
;
const
int
out_index
=
((
n
*
reduced_filter_elem
+
fi
)
*
H_out
+
h
)
*
W_out
+
w
;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
}
else
{
}
else
{
output
[
out_index
]
=
0
;
output
[
out_index
]
=
0
;
}
}
}
}
...
@@ -1013,18 +937,11 @@ __global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output
...
@@ -1013,18 +937,11 @@ __global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output
const
int
condition
=
(
fi
<
skip_offset
);
const
int
condition
=
(
fi
<
skip_offset
);
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
const
int
in_index
=
condition
*
fi
+
(
!
condition
)
*
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
skip_offset
-
1
);
+
(((
fi
-
skip_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
skip_offset
-
1
);
// int in_index;
// if(fi < skip_offset) {
// in_index = fi;
// } else {
// in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
// + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
// }
const
int
ch
=
in_index
/
(
KW
*
KH
);
const
int
ch
=
in_index
/
(
KW
*
KH
);
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
i
=
(
in_index
%
(
KW
*
KH
))
/
KW
;
const
int
j
=
in_index
%
KW
;
const
int
j
=
in_index
%
KW
;
const
int
out_index
=
((
fi
*
N
+
n
)
*
H_out
+
h
)
*
W_out
+
w
;
const
int
out_index
=
((
fi
*
N
+
n
)
*
H_out
+
h
)
*
W_out
+
w
;
//const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
{
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
output
[
out_index
]
=
input
[((
n
*
C
+
ch
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
}
else
{
}
else
{
...
@@ -1042,11 +959,8 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
...
@@ -1042,11 +959,8 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
const
int
skip_every
,
const
int
skip_offset
,
const
float
fac
)
{
const
int
skip_every
,
const
int
skip_offset
,
const
float
fac
)
{
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
//const int stride = blockDim.x * gridDim.x;
const
int
fIdx
=
tx
/
reduced_filter_elem
;
//filter index
//#pragma unroll
//for (int i = tx; i < NF; i += stride) {
const
int
fIdx
=
tx
/
reduced_filter_elem
;
//filter index
if
(
fIdx
<
NF
)
{
if
(
fIdx
<
NF
)
{
const
int
offset
=
tx
%
reduced_filter_elem
;
//offset within filter
const
int
offset
=
tx
%
reduced_filter_elem
;
//offset within filter
const
int
ch
=
(
offset
*
channels
)
/
reduced_filter_elem
;
const
int
ch
=
(
offset
*
channels
)
/
reduced_filter_elem
;
...
@@ -1055,15 +969,9 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
...
@@ -1055,15 +969,9 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
const
int
in_index
=
condition
*
offset
+
(
!
condition
)
*
(((
offset
-
channel_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
const
int
in_index
=
condition
*
offset
+
(
!
condition
)
*
(((
offset
-
channel_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
offset
-
channel_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
channel_offset
-
1
);
+
(((
offset
-
channel_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
channel_offset
-
1
);
// int in_index;
// if(offset < channel_offset) {
// in_index = offset;
//} else {
// in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
// + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
// }
output
[
fIdx
*
reduced_filter_elem
+
offset
]
=
__hmul
(
__float2half_rn
(
fac
),
input
[
num_filter_elem
*
fIdx
+
in_index
]);
output
[
fIdx
*
reduced_filter_elem
+
offset
]
=
__hmul
(
__float2half_rn
(
fac
),
input
[
num_filter_elem
*
fIdx
+
in_index
]);
}
}
}
}
__global__
void
createReducedFiltersHalfIrregular
(
__half
*
output
,
__global__
void
createReducedFiltersHalfIrregular
(
__half
*
output
,
...
@@ -1071,21 +979,20 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
...
@@ -1071,21 +979,20 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
const
int
num_filter_elem
,
const
int
reduced_filter_elem
,
const
int
num_filter_elem
,
const
int
reduced_filter_elem
,
const
int
skip_every
,
const
int
skip_offset
,
const
float
fac
)
{
const
int
skip_every
,
const
int
skip_offset
,
const
float
fac
)
{
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
const
int
tx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
//thread id
//const int stride = blockDim.x * gridDim.x;
const
int
fIdx
=
tx
/
reduced_filter_elem
;
//filter index
//#pragma unroll
//for (int i = tx; i < NF; i += stride) {
const
int
fIdx
=
tx
/
reduced_filter_elem
;
//filter index
if
(
fIdx
<
NF
)
{
if
(
fIdx
<
NF
)
{
const
int
offset
=
tx
%
reduced_filter_elem
;
//offset within filter
const
int
offset
=
tx
%
reduced_filter_elem
;
//offset within filter
const
int
condition
=
(
offset
<
skip_offset
);
const
int
condition
=
(
offset
<
skip_offset
);
int
in_index
=
condition
*
offset
+
(
!
condition
)
*
(((
offset
-
skip_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
int
in_index
=
condition
*
offset
+
(
!
condition
)
*
(((
offset
-
skip_offset
+
1
)
*
skip_every
)
/
(
skip_every
-
1
)
+
(((
offset
-
skip_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
skip_offset
-
1
);
+
(((
offset
-
skip_offset
+
1
)
*
skip_every
)
%
(
skip_every
-
1
)
>
0
)
+
skip_offset
-
1
);
//}
output
[
fIdx
*
reduced_filter_elem
+
offset
]
=
__hmul
(
__float2half_rn
(
fac
),
input
[
num_filter_elem
*
fIdx
+
in_index
]);
output
[
fIdx
*
reduced_filter_elem
+
offset
]
=
__hmul
(
__float2half_rn
(
fac
),
input
[
num_filter_elem
*
fIdx
+
in_index
]);
//}
}
}
}
}
...
@@ -1112,7 +1019,7 @@ __global__ void convToGemmApprox(float * const __restrict__ output,
...
@@ -1112,7 +1019,7 @@ __global__ void convToGemmApprox(float * const __restrict__ output,
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
for
(
int
j
=
0
;
j
<
KW
;
j
++
)
{
const
int
filter_elem_num
=
(
c
*
KH
+
i
)
*
KW
+
j
;
//index of this filter element
const
int
filter_elem_num
=
(
c
*
KH
+
i
)
*
KW
+
j
;
//index of this filter element
if
(
filter_elem_num
%
skip_every
!=
skip_every
-
1
)
{
//are we including this filter element?
if
(
filter_elem_num
%
skip_every
!=
skip_every
-
1
)
{
//are we including this filter element?
const
int
output_col
=
filter_elem_num
-
(
filter_elem_num
/
skip_every
);
//cal
culate
output column, taking skipping into account
const
int
output_col
=
filter_elem_num
-
(
filter_elem_num
/
skip_every
);
//cal output column, taking skipping into account
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
if
(
inH
+
i
>=
0
&&
inH
+
i
<
H
&&
inW
+
j
>=
0
&&
inW
+
j
<
W
)
output
[((
n
*
reduced_filter_elem
+
output_col
)
*
H_out
+
h
)
*
W_out
+
w
]
=
input
[((
n
*
C
+
c
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
output
[((
n
*
reduced_filter_elem
+
output_col
)
*
H_out
+
h
)
*
W_out
+
w
]
=
input
[((
n
*
C
+
c
)
*
H
+
(
inH
+
i
))
*
W
+
(
inW
+
j
)];
else
else
...
@@ -1130,8 +1037,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
...
@@ -1130,8 +1037,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
int
horizontal_stride
,
int
conv_mode
,
int
conv_groups
,
int
horizontal_stride
,
int
conv_mode
,
int
conv_groups
,
int
row
,
int
col
,
int
start
){
int
row
,
int
col
,
int
start
){
//////INFO("*** TensorConvolution (output perforation) \n");
//Event("Conv");
Tensor
*
input
=
(
Tensor
*
)
input_ptr
;
Tensor
*
input
=
(
Tensor
*
)
input_ptr
;
Tensor
*
filter
=
(
Tensor
*
)
filter_ptr
;
Tensor
*
filter
=
(
Tensor
*
)
filter_ptr
;
//FIXME: Current hack to preserve backward compatibilty
//FIXME: Current hack to preserve backward compatibilty
...
@@ -1144,10 +1049,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
...
@@ -1144,10 +1049,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
hostToDeviceCopy
(
input
);
hostToDeviceCopy
(
input
);
hostToDeviceCopy
(
filter
);
hostToDeviceCopy
(
filter
);
//Event("H2F_start");
convertToFP32
(
input
);
convertToFP32
(
input
);
convertToFP32
(
filter
);
convertToFP32
(
filter
);
//Event("H2F_end");
long
int
n
,
c
,
h
,
w
;
// output dimensions
long
int
n
,
c
,
h
,
w
;
// output dimensions
n
=
input
->
dims
.
dim_sizes
[
0
];
n
=
input
->
dims
.
dim_sizes
[
0
];
...
@@ -1221,14 +1124,14 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
...
@@ -1221,14 +1124,14 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
freeTensor
(
output
);
freeTensor
(
output
);
cudaFree
(
convData
);
cudaFree
(
convData
);
}
else
if
(
col
>
1
){
}
else
if
(
col
>
1
){
output
=
(
Tensor
*
)
create4DTensor
((
cudnnDataType_t
)
float_type
,
//input->data_type,
output
=
(
Tensor
*
)
create4DTensor
((
cudnnDataType_t
)
float_type
,
//input->data_type,
CUDNN_TENSOR_NCHW
,
n
,
c
,
h
,
w_eff
);
CUDNN_TENSOR_NCHW
,
n
,
c
,
h
,
w_eff
);
// NOTE: Changing output tensor placement from host to device
// NOTE: Changing output tensor placement from host to device
changeTensorPlacement
(
output
,
DEVICE
);
changeTensorPlacement
(
output
,
DEVICE
);
// NOTE: Necessary to insert the above call for every output tensor
//total number of filter elem
const
long
int
num_filter_elem
=
KH
*
KW
*
input
->
dims
.
dim_sizes
[
1
];
const
long
int
num_filter_elem
=
KH
*
KW
*
input
->
dims
.
dim_sizes
[
1
];
float
*
convData
;
float
*
convData
;
...
@@ -1550,7 +1453,8 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
...
@@ -1550,7 +1453,8 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
cudaFree
(
convData
);
cudaFree
(
convData
);
cudaFree
(
reducedFilter
);
cudaFree
(
reducedFilter
);
}
else
{
}
else
{
INFO
(
"FP32 BASELINE
\n
"
);
//INFO("FP32 BASELINE\n");
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
((
cudnnDataType_t
)
float_type
,
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
((
cudnnDataType_t
)
float_type
,
CUDNN_TENSOR_NCHW
,
n
,
c
,
h
,
w
);
CUDNN_TENSOR_NCHW
,
n
,
c
,
h
,
w
);
changeTensorPlacement
(
output
,
DEVICE
);
changeTensorPlacement
(
output
,
DEVICE
);
...
@@ -1996,14 +1900,12 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
...
@@ -1996,14 +1900,12 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
freeTensor
(
output
);
freeTensor
(
output
);
cudaFree
(
convData
);
cudaFree
(
convData
);
}
}
// INFO("CONV DONE\n");
profileEvent
(
"H2F_start"
);
profileEvent
(
"H2F_start"
);
convertToFP32_offline
(
new_output
);
convertToFP32_offline
(
new_output
);
//convertToFP32(input);
//convertToFP32(filter);
profileEvent
(
"H2F_end"
);
profileEvent
(
"H2F_end"
);
//profileEvent("#Conv_end");
//INFO("CONVOLUTION END\n");
return
new_output
;
return
new_output
;
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment