llvm / hpvm-release / Commits

Commit 9ffb63b8, authored 6 years ago by Huzaifa
parent f9b1d79c

Adding Efficient CUDA-based versions for FP16 and FP32 Depthwise Convolution

No related branches, tags, or merge requests contain this commit.

Showing 1 changed file with 738 additions and 124 deletions:
llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h (+738 −124)

...
@@ -2,72 +2,7 @@
__global__ void depthwise_conv8(float* const __restrict__ y,
  const float* const __restrict__ x,
  const float* const __restrict__ w,
  const int B, const int M,
  const int H, const int W, const int KH,
  const int KW, const int H_out, const int W_out,
  const int H_pad, const int W_pad,
  const int H_stride, const int W_stride)
{

  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]

  const int num = 8;

  const int b = blockIdx.x * num;
  const int m = blockIdx.y; //current filter/channel
  const int tx = threadIdx.x;

  const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
  const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;

  const float* weights = &w[m * KH * KW];

  float c0 = 0;
  float c1 = 0;
  float c2 = 0;
  float c3 = 0;
  float c4 = 0;
  float c5 = 0;
  float c6 = 0;
  float c7 = 0;

  for (int k = 0; k < KH * KW; k++) {
    int p = k / KW;
    int q = k % KW;

    if (start_h + p > -1 && start_h + p < H &&
        start_w + q > -1 && start_w + q < W) {

      c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
      c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
      c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
      c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
      c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
      c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
      c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
      c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
    }
  }

  y4d(b, m, 0, tx) = c0;
  y4d(b + 1, m, 0, tx) = c1;
  y4d(b + 2, m, 0, tx) = c2;
  y4d(b + 3, m, 0, tx) = c3;
  y4d(b + 4, m, 0, tx) = c4;
  y4d(b + 5, m, 0, tx) = c5;
  y4d(b + 6, m, 0, tx) = c6;
  y4d(b + 7, m, 0, tx) = c7;

  #undef y4d
  #undef x4d
}
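A minimal host-side launch sketch for depthwise_conv8, inferred from the kernel's own indexing (blockIdx.x covers the batch in groups of 8, blockIdx.y the channel, threadIdx.x one output pixel). The wrapper name and the assumptions that B is a multiple of 8 and that H_out * W_out fits in one thread block are illustrative only; this wrapper is not part of the commit.

#include <cuda_runtime.h>

// Hypothetical helper; assumes y, x, w already reside in GPU global memory.
static void launch_depthwise_conv8(float* y, const float* x, const float* w,
                                   int B, int M, int H, int W, int KH, int KW,
                                   int H_out, int W_out, int H_pad, int W_pad,
                                   int H_stride, int W_stride) {
  dim3 grid(B / 8, M);        // one block per (group of 8 batch images, channel)
  dim3 block(H_out * W_out);  // one thread per output pixel of that channel
  depthwise_conv8<<<grid, block>>>(y, x, w, B, M, H, W, KH, KW,
                                   H_out, W_out, H_pad, W_pad, H_stride, W_stride);
  cudaDeviceSynchronize();    // sketch only: block here so launch errors surface early
}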
__global__ void depthwise_conv(float* const __restrict__ y,
  ...
@@ -124,60 +59,6 @@ __global__ void depthwise_conv(float* const __restrict__ y,
#undef x4d
}
__global__ void depthwise_conv12(float* const __restrict__ y,
  const float* const __restrict__ x,
  const float* const __restrict__ w,
  const int B, const int M,
  const int H, const int W, const int KH,
  const int KW, const int H_out, const int W_out,
  const int H_pad, const int W_pad,
  const int H_stride, const int W_stride)
{

  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]

  const int num = 12;

  const int b = num * blockIdx.x;
  const int m = blockIdx.y; //current filter/channel
  const int tx = threadIdx.x;

  const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
  const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;

  float C[num] = { 0 };
  const float* weights = &w[m * KH * KW];

  for (int k = 0; k < KH * KW; k++) {
    int p = k / KW;
    int q = k % KW;

    if (start_h + p > -1 && start_h + p < H &&
        start_w + q > -1 && start_w + q < W) {

      #pragma unroll
      for (int i = 0; i < num; i++) {
        //if(b + i < B)
        C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
      }
    }
  }

  #pragma unroll
  for (int i = 0; i < num; i++) {
    //if(b + i < B)
    y4d(b + i, m, 0, tx) = C[i];
  }

  #undef y4d
  #undef x4d
}
__global__ void depthwise_convNew(float* const __restrict__ y,
  const float* const __restrict__ x,
  ...
@@ -400,6 +281,165 @@ __global__ void depthwise_convNew8_half(__half* const __restrict__ y,
#undef x4d
}
__global__ void depthwise_convNew8_half1(__half* const __restrict__ y,
  const __half* const __restrict__ x,
  const __half* const __restrict__ w,
  const int B, const int M,
  const int H, const int W, const int KH,
  const int KW, const int H_out, const int W_out,
  const int H_pad, const int W_pad,
  const int H_stride, const int W_stride)
{

  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]

  const int num = 8;

  const int b = num * blockIdx.x;
  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);

  if (m < M) {
    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);

    const int start_h = (tx / W_out) * H_stride - H_pad;
    const int start_w = (tx % W_out) * W_stride - W_pad;

    __half c0 = 0;
    __half c1 = 0;
    __half c2 = 0;
    __half c3 = 0;
    __half c4 = 0;
    __half c5 = 0;
    __half c6 = 0;
    __half c7 = 0;

    const __half* weights = &w[m * KH * KW];

    for (int k = 0; k < KH * KW; k++) {
      int p = k / KW;
      int q = k % KW;
      if (start_h + p > -1 && start_h + p < H &&
          start_w + q > -1 && start_w + q < W) {
        c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0);
      }
    }

    if (b + 1 < B) {
      for (int k = 0; k < KH * KW; k++) {
        int p = k / KW;
        int q = k % KW;
        if (start_h + p > -1 && start_h + p < H &&
            start_w + q > -1 && start_w + q < W) {
          c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1);
        }
      }
    }

    if (b + 2 < B) {
      for (int k = 0; k < KH * KW; k++) {
        int p = k / KW;
        int q = k % KW;
        if (start_h + p > -1 && start_h + p < H &&
            start_w + q > -1 && start_w + q < W) {
          c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2);
        }
      }
    }

    if (b + 3 < B) {
      for (int k = 0; k < KH * KW; k++) {
        int p = k / KW;
        int q = k % KW;
        if (start_h + p > -1 && start_h + p < H &&
            start_w + q > -1 && start_w + q < W) {
          c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3);
        }
      }
    }

    if (b + 4 < B) {
      for (int k = 0; k < KH * KW; k++) {
        int p = k / KW;
        int q = k % KW;
        if (start_h + p > -1 && start_h + p < H &&
            start_w + q > -1 && start_w + q < W) {
          c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4);
        }
      }
    }

    if (b + 5 < B) {
      for (int k = 0; k < KH * KW; k++) {
        int p = k / KW;
        int q = k % KW;
        if (start_h + p > -1 && start_h + p < H &&
            start_w + q > -1 && start_w + q < W) {
          c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5);
        }
      }
    }

    if (b + 6 < B) {
      for (int k = 0; k < KH * KW; k++) {
        int p = k / KW;
        int q = k % KW;
        if (start_h + p > -1 && start_h + p < H &&
            start_w + q > -1 && start_w + q < W) {
          c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6);
        }
      }
    }

    if (b + 7 < B) {
      for (int k = 0; k < KH * KW; k++) {
        int p = k / KW;
        int q = k % KW;
        if (start_h + p > -1 && start_h + p < H &&
            start_w + q > -1 && start_w + q < W) {
          c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7);
        }
      }
    }

    y4d(b, m, 0, tx) = c0;
    if (b + 1 < B) y4d(b + 1, m, 0, tx) = c1;
    if (b + 2 < B) y4d(b + 2, m, 0, tx) = c2;
    if (b + 3 < B) y4d(b + 3, m, 0, tx) = c3;
    if (b + 4 < B) y4d(b + 4, m, 0, tx) = c4;
    if (b + 5 < B) y4d(b + 5, m, 0, tx) = c5;
    if (b + 6 < B) y4d(b + 6, m, 0, tx) = c6;
    if (b + 7 < B) y4d(b + 7, m, 0, tx) = c7;
  }

  #undef y4d
  #undef x4d
}
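Unlike depthwise_conv8, the "New" / half1 variants flatten channel and output pixel into one index: blockIdx.y * blockDim.x + threadIdx.x is split into m (channel) and tx (output pixel), so a block no longer has to contain exactly H_out * W_out threads, and out-of-range batch images are guarded with b + i < B. A launch sketch consistent with that mapping and with the host-side configuration that appears further down in this diff; the device pointers and the in_h/in_w names are assumed placeholders, not code from the commit.

// Assumed sketch: n = batch, c = channels, in_h/in_w = input H/W, h/w = output H/W.
int blockSize = 128;
dim3 grid((n + 7) / 8,                               // 8 batch images per block in x
          (c * h * w + blockSize - 1) / blockSize);  // flattened (channel, pixel) work in y
dim3 block(blockSize);
depthwise_convNew8_half1<<<grid, block>>>(y_half, x_half, w_half,
                                          n, c, in_h, in_w, KH, KW, h, w,
                                          H_pad, W_pad, H_stride, W_stride);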
__global__ void depthwise_convNew12(float* const __restrict__ y,
  ...
@@ -507,6 +547,374 @@ __global__ void depthwise_convNew12(float* const __restrict__ y,
}
__global__ void depthwise_convNew12_half(__half* const __restrict__ y,
  const __half* const __restrict__ x,
  const __half* const __restrict__ w,
  const int B, const int M,
  const int H, const int W, const int KH,
  const int KW, const int H_out, const int W_out,
  const int H_pad, const int W_pad,
  const int H_stride, const int W_stride)
{

  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]

  const int num = 12;

  const int b = num * blockIdx.x;
  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);

  if (m < M) {
    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);

    const int start_h = (tx / W_out) * H_stride - H_pad;
    const int start_w = (tx % W_out) * W_stride - W_pad;

    __half c0 = 0;
    __half c1 = 0;
    __half c2 = 0;
    __half c3 = 0;
    __half c4 = 0;
    __half c5 = 0;
    __half c6 = 0;
    __half c7 = 0;
    __half c8 = 0;
    __half c9 = 0;
    __half c10 = 0;
    __half c11 = 0;

    const __half* weights = &w[m * KH * KW];

    for (int k = 0; k < KH * KW; k++) {
      int p = k / KW;
      int q = k % KW;
      if (start_h + p > -1 && start_h + p < H &&
          start_w + q > -1 && start_w + q < W) {

        c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0);
        if (b + 1 < B)
          c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1);
        if (b + 2 < B)
          c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2);
        if (b + 3 < B)
          c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3);
        if (b + 4 < B)
          c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4);
        if (b + 5 < B)
          c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5);
        if (b + 6 < B)
          c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6);
        if (b + 7 < B)
          c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7);
        if (b + 8 < B)
          c8 = __hfma(x4d(b + 8, m, start_h + p, start_w + q), weights[k], c8);
        if (b + 9 < B)
          c9 = __hfma(x4d(b + 9, m, start_h + p, start_w + q), weights[k], c9);
        if (b + 10 < B)
          c10 = __hfma(x4d(b + 10, m, start_h + p, start_w + q), weights[k], c10);
        if (b + 11 < B)
          c11 = __hfma(x4d(b + 11, m, start_h + p, start_w + q), weights[k], c11);
      }
    }

    y4d(b, m, 0, tx) = c0;
    if (b + 1 < B) y4d(b + 1, m, 0, tx) = c1;
    if (b + 2 < B) y4d(b + 2, m, 0, tx) = c2;
    if (b + 3 < B) y4d(b + 3, m, 0, tx) = c3;
    if (b + 4 < B) y4d(b + 4, m, 0, tx) = c4;
    if (b + 5 < B) y4d(b + 5, m, 0, tx) = c5;
    if (b + 6 < B) y4d(b + 6, m, 0, tx) = c6;
    if (b + 7 < B) y4d(b + 7, m, 0, tx) = c7;
    if (b + 8 < B) y4d(b + 8, m, 0, tx) = c8;
    if (b + 9 < B) y4d(b + 9, m, 0, tx) = c9;
    if (b + 10 < B) y4d(b + 10, m, 0, tx) = c10;
    if (b + 11 < B) y4d(b + 11, m, 0, tx) = c11;
  }

  #undef y4d
  #undef x4d
}
__global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
  const __half* const __restrict__ x,
  const __half* const __restrict__ w,
  const int B, const int M,
  const int H, const int W, const int KH,
  const int KW, const int H_out, const int W_out,
  const int H_pad, const int W_pad,
  const int H_stride, const int W_stride)
{

  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]

  const int num = 8;

  const int b = num * blockIdx.x;
  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);

  if (m < M) {
    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);

    const int start_h = (tx / W_out) * H_stride - H_pad;
    const int start_w = (tx % W_out) * W_stride - W_pad;

    __half2 c0 = __half2half2(0);
    __half2 c1 = __half2half2(0);
    __half2 c2 = __half2half2(0);
    __half2 c3 = __half2half2(0);

    const __half* weights = &w[m * KH * KW];

    for (int k = 0; k < KH * KW; k++) {
      int p = k / KW;
      int q = k % KW;
      if (start_h + p > -1 && start_h + p < H &&
          start_w + q > -1 && start_w + q < W) {

        __half2 t1;
        __half2 t2;
        __half2 t3;
        __half2 t4;

        if (b + 7 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
          t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q));
        }
        else if (b + 6 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
          t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
        }
        else if (b + 5 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
        }
        else if (b + 4 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
          t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
        }
        else if (b + 3 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
        }
        else if (b + 2 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
        }
        else if (b + 1 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
        }
        else {
          t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
        }

        c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
        c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
        c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
        c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
      }
    }

    y4d(b, m, 0, tx) = __high2half(c0);
    if (b + 1 < B) y4d(b + 1, m, 0, tx) = __low2half(c0);
    if (b + 2 < B) y4d(b + 2, m, 0, tx) = __high2half(c1);
    if (b + 3 < B) y4d(b + 3, m, 0, tx) = __low2half(c1);
    if (b + 4 < B) y4d(b + 4, m, 0, tx) = __high2half(c2);
    if (b + 5 < B) y4d(b + 5, m, 0, tx) = __low2half(c2);
    if (b + 6 < B) y4d(b + 6, m, 0, tx) = __high2half(c3);
    if (b + 7 < B) y4d(b + 7, m, 0, tx) = __low2half(c3);
  }

  #undef y4d
  #undef x4d
}
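The half2 variants pack two adjacent batch images into one __half2 so that each __hfma2 performs two fused multiply-adds at once; the way the kernel pairs its operands, the second argument of __halves2half2 (batch b, b+2, ...) is the one read back with __high2half, and the first argument (batch b+1, b+3, ...) with __low2half. A small self-contained sketch of that packing and unpacking; the kernel name and the values are illustrative, not part of this commit.

#include <cuda_fp16.h>

// Illustrative kernel: packs two inputs the same way depthwise_convNew8_half2 does,
// fuses one multiply-add for both lanes, then unpacks the two results.
__global__ void pack_demo(const __half* x, const __half* w, __half* y) {
  __half2 t  = __halves2half2(x[1], x[0]);     // mirrors __halves2half2(x4d(b+1,...), x4d(b,...))
  __half2 wk = __halves2half2(w[0], w[0]);     // broadcast one filter tap to both lanes
  __half2 c  = __hfma2(t, wk, __half2half2(__float2half(0.0f)));  // two FMAs in one instruction
  y[0] = __high2half(c);                       // result for "batch b"
  y[1] = __low2half(c);                        // result for "batch b+1"
}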
//When stride is 1
__global__ void depthwise_conv4_half3(__half* const __restrict__ y,
  const __half* const __restrict__ x,
  const __half* const __restrict__ w,
  const int B, const int M,
  const int H, const int W, const int KH,
  const int KW, const int H_out, const int W_out,
  const int H_pad, const int W_pad,
  const int C_dim, const int H_dim, const int W_dim)
{

  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]

  const int num = 1;

  const int b = num * blockIdx.x;
  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);

  if (m < M) {
    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);

    const int start_h = (tx / W_out) - H_pad;
    const int start_w = (tx % W_out) - W_pad;

    const int bstart_h = (blockIdx.y * blockDim.x % (H_out * W_out)) / W_out - H_pad;
    const int bstart_w = (blockIdx.y * blockDim.x % (H_out * W_out)) % W_out - H_pad;
    const int bstartm = (blockIdx.y * blockDim.x / (H_out * W_out));

    extern __shared__ __half xdata[];

    for (int i = 0; i < C_dim * H_dim * W_dim; i += blockDim.x) {
      if (i / (H_dim * W_dim) + bstartm < M
          && (i % (H_dim * W_dim)) / W_dim + bstart_h > -1
          && (i % (H_dim * W_dim)) / W_dim + bstart_h < H
          && (i % (H_dim * W_dim)) % W_dim + bstart_w > -1
          && (i % (H_dim * W_dim)) % W_dim + bstart_w < W) {
        xdata[i] = x4d(b, i / (H_dim * W_dim) + bstartm,
                       (i % (H_dim * W_dim)) / W_dim + bstart_h,
                       (i % (H_dim * W_dim)) % W_dim + bstart_w);
      }
    }

    __syncthreads();

    __half c0;
    const __half* weights = &w[m * KH * KW];

    for (int k = 0; k < KH * KW; k++) {
      int p = k / KW;
      int q = k % KW;
      if (start_h + p > -1 && start_h + p < H &&
          start_w + q > -1 && start_w + q < W) {
        __half t1;
        int total = C_dim * H_dim * W_dim;
        t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim + start_w + q - bstart_w];
        c0 = __hfma(t1, weights[k], c0);
      }
    }

    y4d(b, m, 0, tx) = c0;
  }

  #undef y4d
  #undef x4d
}
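depthwise_conv4_half3 stages an input tile in dynamically sized shared memory (extern __shared__ __half xdata[]); the caller, further down in this diff, sizes that tile from the block size and the filter dimensions and passes it as the third launch parameter. A worked example of that sizing with purely illustrative numbers (blockSize = 128, 16x16 output, 3x3 filter), following the same expressions the caller uses.

// Illustrative numbers only; the expressions match the host code below.
int blockSize = 128, h = 16, w = 16, KH = 3, KW = 3;
int C_dim = blockSize / (h * w) + 1 + 1;            // 0 + 2              = 2 channels
int H_dim = blockSize % (h * w) / w + 1 + KH + 1;   // 8 + 1 + 3 + 1      = 13 rows (with filter halo)
int W_dim = blockSize % (h * w) % w + 1 + KW + 1;   // 0 + 1 + 3 + 1      = 5 columns (with filter halo)
size_t smem = sizeof(__half) * C_dim * H_dim * W_dim;  // 2 * 13 * 5 halves = 260 bytes
// launch: depthwise_conv4_half3<<<grid, block, smem>>>(...);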
__global__ void depthwise_convNew4_half2(__half* const __restrict__ y,
  const __half* const __restrict__ x,
  const __half* const __restrict__ w,
  const int B, const int M,
  const int H, const int W, const int KH,
  const int KW, const int H_out, const int W_out,
  const int H_pad, const int W_pad,
  const int H_stride, const int W_stride)
{

  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]

  const int num = 4;

  const int b = num * blockIdx.x;
  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);

  if (m < M) {
    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);

    const int start_h = (tx / W_out) * H_stride - H_pad;
    const int start_w = (tx % W_out) * W_stride - W_pad;

    __half2 c0 = __half2half2(0);
    __half2 c1 = __half2half2(0);

    const __half* weights = &w[m * KH * KW];

    for (int k = 0; k < KH * KW; k++) {
      int p = k / KW;
      int q = k % KW;
      if (start_h + p > -1 && start_h + p < H &&
          start_w + q > -1 && start_w + q < W) {

        __half2 t1;
        __half2 t2;

        if (b + 3 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
        }
        else if (b + 2 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
          t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
        }
        else if (b + 1 < B) {
          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
        }
        else {
          t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
        }

        c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
        c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
      }
    }

    y4d(b, m, 0, tx) = __high2half(c0);
    if (b + 1 < B) y4d(b + 1, m, 0, tx) = __low2half(c0);
    if (b + 2 < B) y4d(b + 2, m, 0, tx) = __high2half(c1);
    if (b + 3 < B) y4d(b + 3, m, 0, tx) = __low2half(c1);
  }

  #undef y4d
  #undef x4d
}
void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
  int vertical_pad, int horizontal_pad,
  int vertical_stride, int horizontal_stride,
  ...
@@ -578,7 +986,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
  */

  int blockSize;
  blockSize = 128;
  blockSize = 64;

  dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize);
  dim3 block(blockSize);
  ...
@@ -797,10 +1205,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
  const int KW = filter->dims.dim_sizes[3];

  int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
  int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;

  DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);

  output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
                                    CUDNN_TENSOR_NCHW, n, c, h, w);
  ...
@@ -816,10 +1223,10 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
  int blockSize;
  blockSize = 128;

  dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize);
  dim3 block(blockSize);

  depthwise_convNew8_half<<<grid, block>>>((__half*) output_half->gpu_data,
  depthwise_convNew8_half2<<<grid, block>>>((__half*) output_half->gpu_data,
                                            (__half*) input_half->gpu_data,
                                            (__half*) filter_half->gpu_data,
                                            input->dims.dim_sizes[0], input->dims.dim_sizes[1],
                                            input->dims.dim_sizes[2], input->dims.dim_sizes[3],
  ...
@@ -934,6 +1341,213 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
}
void* tensorHalfConvCutlass2(void* input_ptr, void* filter_ptr,
  int vertical_pad, int horizontal_pad,
  int vertical_stride, int horizontal_stride,
  int conv_mode, int conv_groups){

  INFO("*** TensorHConvolution \n");
  profileEvent("#Conv");

  Tensor* input = (Tensor*) input_ptr;
  Tensor* filter = (Tensor*) filter_ptr;

  cudnnConvolutionDescriptor_t convDesc;
  cudnnConvolutionFwdAlgo_t convAlgo;
  cudnnConvolutionMode_t mode;
  if (conv_mode == 0)
    mode = CUDNN_CONVOLUTION;
  else if (conv_mode == 1)
    mode = CUDNN_CROSS_CORRELATION;

  // FIXIT: Need to be more aware of the implications of alpha and beta
  float alpha = 1.0f, beta = 0.0f;

  // NOTE: compute in half precision
  cudnnDataType_t computeType = CUDNN_DATA_HALF;

  // NOTE: Moving inputs to GPU global memory
  hostToDeviceCopy(input);
  hostToDeviceCopy(filter);

  /***** CONVERSIONS from FP32 to FP16 - on the GPU */
  size_t* input_dims = input->dims.dim_sizes;
  size_t* filter_dims = filter->dims.dim_sizes;

  profileEvent("F2H_start");

  Tensor* input_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW,
                                                input_dims[0], input_dims[1],
                                                input_dims[2], input_dims[3]);
  changeTensorPlacement(input_half, DEVICE);

  Tensor* filter_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW,
                                                 filter_dims[0], filter_dims[1],
                                                 filter_dims[2], filter_dims[3]);
  changeTensorPlacement(filter_half, DEVICE);

  f2h((float*) input->gpu_data, input->num_elems, (half*) input_half->gpu_data);
  f2h((float*) filter->gpu_data, filter->num_elems, (half*) filter_half->gpu_data);

  /******* END OF INPUT DATA CONVERSIONS*/
  profileEvent("F2H_end");

  Tensor* output;
  Tensor* output_half;

  if (conv_groups > 1 && horizontal_stride == 1 && vertical_stride == 1) {
    int n = input->dims.dim_sizes[0];
    int c = input->dims.dim_sizes[1];
    const int KH = filter->dims.dim_sizes[2];
    const int KW = filter->dims.dim_sizes[3];
    int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;

    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);

    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
                                      CUDNN_TENSOR_NCHW, n, c, h, w);
    // FIXIT: more checks for data types needed
    output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, n, c, h, w);

    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output, DEVICE);
    // NOTE: Necessary to insert the above call for every output tensor

    int blockSize;
    blockSize = 128;

    dim3 grid(((n + 3) / 4), (c * h * w + blockSize - 1) / blockSize);
    dim3 block(blockSize);

    int C_dim = blockSize / (h * w) + 1 + 1;
    int H_dim = blockSize % (h * w) / w + 1 + KH + 1;
    int W_dim = blockSize % (h * w) % w + 1 + KW + 1;

    depthwise_conv4_half3<<<grid, block, sizeof(__half) * C_dim * H_dim * W_dim>>>(
        (__half*) output_half->gpu_data,
        (__half*) input_half->gpu_data,
        (__half*) filter_half->gpu_data,
        input->dims.dim_sizes[0], input->dims.dim_sizes[1],
        input->dims.dim_sizes[2], input->dims.dim_sizes[3],
        KH, KW, h, w,
        vertical_pad, horizontal_pad,
        C_dim, H_dim, W_dim);

    cudaDeviceSynchronize();
  }
  else {
    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));

    //FIXME: Current hack to preserve backward compatibilty
    if (conv_groups == 0) {
      conv_groups = 1;
    }

    // NOTE: Adding support for grouped convolution
    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));

    // FIXIT: Think if upscaling values need to be configurable?
    // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context
    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
                                               vertical_pad, horizontal_pad, // conv padding
                                               vertical_stride, horizontal_stride, // conv strides
                                               1, 1, // upscaling values
                                               mode, // mode is configurable
                                               computeType)); // defines compute precision

    int n, c, h, w; // output dimensions
    // Find dimension of convolution output
    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
                                                     input->tensor_desc,
                                                     filter->filter_desc,
                                                     &n, &c, &h, &w));

    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);

    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
                                      CUDNN_TENSOR_NCHW, n, c, h, w);
    // FIXIT: more checks for data types needed
    output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, n, c, h, w);

    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output, DEVICE);
    // NOTE: Necessary to insert the above call for every output tensor

    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
          output->data_type, output->data_format,
          output->dims.dim_sizes[0], output->dims.dim_sizes[1],
          output->dims.dim_sizes[2], output->dims.dim_sizes[3]);

    if (convDesc == NULL || input->tensor_desc == NULL ||
        filter->filter_desc == NULL || output->tensor_desc == NULL)
      ERROR("NULL descriptor! \n");

    // NOTE: The following algo works with TRUE half precision
    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

    size_t workspace_size;
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
                                                       input_half->tensor_desc,
                                                       filter_half->filter_desc,
                                                       convDesc,
                                                       output_half->tensor_desc,
                                                       convAlgo,
                                                       &workspace_size));

    // Allocating memory for the convolution workspace
    DEBUG("workspace size = %d \n", workspace_size);
    void* workspace;
    checkCudaErrors(cudaMalloc(&workspace, workspace_size));

    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
                                       &alpha,
                                       input_half->tensor_desc, input_half->gpu_data,
                                       filter_half->filter_desc, filter_half->gpu_data,
                                       convDesc, convAlgo,
                                       workspace, workspace_size,
                                       &beta,
                                       output_half->tensor_desc, output_half->gpu_data));
  }

  profileEvent("H2F_start");
  // NOTE: Transforming half precision output to single precision
  h2f((half*) output_half->gpu_data, output->num_elems, (float*) output->gpu_data);
  profileEvent("H2F_end");

  profileEvent("#Conv_end");

  freeTensor(input_half);
  freeTensor(filter_half);
  freeTensor(output_half);

  return output;
}
// Perforated Tensor Conv with 'perforation_rate' parameter
void* tensorConvPerf(void* input, void* filter,
  ...