Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hpvm-release
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
llvm
hpvm-release
Commits
37ae3dca
Commit
37ae3dca
authored
4 years ago
by
Yifan Zhao
Browse files
Options
Downloads
Plain Diff
Merge remote-tracking branch 'origin/approx_hpvm_reorg_akash' into approx_hpvm_reorg
parents
6caf9a1c
e6190395
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+16
-91
16 additions, 91 deletions
...s/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
with
16 additions
and
91 deletions
hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+
16
−
91
View file @
37ae3dca
...
...
@@ -34,9 +34,11 @@
#include
<omp.h>
// Tensor runtime header files
//#include "tensor_cpu.h"
#include
"tensor.h"
#include
"tensor_runtime.h"
#include
"tensor_cpu_runtime.h"
#include
"approx_api.h"
void
llvm_hpvm_initTensorRtCPU
()
{
// NOTE: Do Nothing
...
...
@@ -50,78 +52,6 @@ void hpvm_request_tensorCPU(void *tensor, int destination) {
// NOTE: Do Nothing
}
// Registry of heap pointers scheduled for release: allocateMemCPU() and
// create4DTensorCPU() append to it when their freeMemory flag is set, and
// freeBatchMemory() frees and removes every entry.
std
::
vector
<
void
*>
PtrVect
;
// Frees every pointer recorded in PtrVect, most recently registered first,
// and leaves the vector empty so the next batch starts from a clean slate.
void freeBatchMemory() {
  while (!PtrVect.empty()) {
    free(PtrVect.back());
    PtrVect.pop_back();
  }
}
int getTypeSizeCPU(int data_type) __attribute__((always_inline));

// Returns the width in bytes of one element for the given data-type code:
//   0 -> 4 bytes, 1 -> 2 bytes, any other code -> 1 byte.
inline int getTypeSizeCPU(int data_type) {
  switch (data_type) {
  case 0:
    return 4;
  case 1:
    return 2;
  default:
    return 1;
  }
}
void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems)
    __attribute__((always_inline));

// Writes tensor->size_in_bytes as (element width of data_type) * num_elems.
// The element width comes from getTypeSizeCPU(); no other field of the
// tensor is modified.
inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type,
                              size_t num_elems) {
  const size_t elem_width = getTypeSizeCPU(data_type);
  tensor->size_in_bytes = elem_width * num_elems;
}
void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
                    bool freeMemory = true) __attribute__((always_inline));

// Fills in the tensor's type, element count and byte size, then allocates a
// host buffer of that byte size and stores it in tensor->host_data.
//
// tensor      tensor descriptor to populate (must already exist).
// data_type   data-type code, forwarded to setSizeInBytesCPU().
// num_elems   number of elements the buffer must hold.
// freeMemory  when true, the new buffer is registered in PtrVect so that
//             freeBatchMemory() releases it; when false the caller owns it.
//
// NOTE(review): the malloc result is not checked; host_data may be NULL on
// allocation failure.
inline void allocateMemCPU(struct Tensor *tensor, int data_type,
                           size_t num_elems, bool freeMemory) {
  tensor->data_type = data_type;
  tensor->num_elems = num_elems;
  setSizeInBytesCPU(tensor, data_type, num_elems);

  // Allocate memory on the host
  void *buffer = (void *)malloc(tensor->size_in_bytes);
  tensor->host_data = buffer;

  if (freeMemory) {
    PtrVect.push_back(buffer);
  }
}
void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes)
    __attribute__((always_inline));

// Copies raw bytes from data_ptr into the host buffer of a tensor.
//
// tensor_ptr     pointer to a Tensor whose host_data buffer is the
//                destination.
// data_ptr       source bytes; must not overlap the destination (memcpy).
// size_in_bytes  number of bytes available at data_ptr.
//
// If size_in_bytes differs from tensor->size_in_bytes a diagnostic is
// printed. The copy length is then limited to the smaller of the two sizes,
// so a source larger than the tensor can no longer write past the end of
// host_data (previously the full source size was always copied).
inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr,
                              size_t size_in_bytes) {
  Tensor *tensor = (Tensor *)tensor_ptr;
  size_t copy_bytes = size_in_bytes;

  if (tensor->size_in_bytes != size_in_bytes) {
    printf("The destination and source sizes don't match");
    // Clamp to the destination capacity to avoid a heap buffer overflow.
    if (tensor->size_in_bytes < copy_bytes) {
      copy_bytes = tensor->size_in_bytes;
    }
  }

  memcpy(tensor->host_data, data_ptr, copy_bytes);
}
void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
                        size_t dim2_size, size_t dim3_size, size_t dim4_size,
                        bool freeMemory = true) __attribute__((always_inline));

// Builds a 4-dimensional host tensor and returns it as an opaque pointer.
//
// data_type    data-type code, forwarded to allocateMemCPU().
// data_format  accepted for interface compatibility; not read in this
//              function.
// dimN_size    extent of each of the four dimensions; the element count is
//              their product.
// freeMemory   when true, the Tensor struct (and, via allocateMemCPU, its
//              host buffer) are registered in PtrVect for freeBatchMemory().
//
// Returns a pointer to the new Tensor with dims.num_dims == 4 and
// data_placement == HOST.
//
// NOTE(review): the dim_sizes array is not added to PtrVect here, so
// freeBatchMemory() does not release it; confirm whether it is freed
// elsewhere. malloc results are not checked.
inline void *create4DTensorCPU(int data_type, int data_format,
                               size_t dim1_size, size_t dim2_size,
                               size_t dim3_size, size_t dim4_size,
                               bool freeMemory) {
  struct Tensor *result = (struct Tensor *)malloc(sizeof(Tensor));
  if (freeMemory) {
    PtrVect.push_back(result);
  }

  const size_t total_elems = dim1_size * dim2_size * dim3_size * dim4_size;
  allocateMemCPU(result, data_type, total_elems, freeMemory);

  // Setting the tensor dimensions
  const size_t extents[4] = {dim1_size, dim2_size, dim3_size, dim4_size};
  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
  for (int axis = 0; axis < 4; ++axis) {
    dim_sizes[axis] = extents[axis];
  }

  result->dims.num_dims = 4;
  result->dims.dim_sizes = dim_sizes;
  result->data_placement = HOST;

  return result;
}
void
*
tensorRegularConvolutionCPU
(
void
*
input_ptr
,
void
*
filter_ptr
,
int
vertical_pad
,
int
horizontal_pad
,
int
vertical_stride
,
int
horizontal_stride
,
...
...
@@ -146,7 +76,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
int
num_filter_elem
=
kernel_height
*
kernel_width
*
channels
;
int
output_size
=
output_width
*
output_height
;
printf
(
"--CREATE 4D TENSOR
\n
"
);
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
CPU
(
0
,
0
,
batch_size
,
num_filters
,
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
batch_size
,
num_filters
,
output_height
,
output_width
);
float
*
__restrict__
output_data
=
(
float
*
)
output
->
host_data
;
printf
(
"CREATED 4D TENSOR
\n
"
);
...
...
@@ -235,7 +165,7 @@ void *tensorRegularFilterSamplingConvolutionCPU(
num_filter_elem
-
((
num_filter_elem
-
start
)
/
skip_every
)
-
remainder
;
const
int
output_size
=
output_width
*
output_height
;
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
CPU
(
0
,
0
,
batch_size
,
num_filters
,
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
batch_size
,
num_filters
,
output_height
,
output_width
);
float
*
__restrict__
output_data
=
(
float
*
)
output
->
host_data
;
...
...
@@ -359,7 +289,7 @@ void *tensorIrregularFilterSamplingConvolutionCPU(
num_filter_elem
-
((
num_filter_elem
-
start
)
/
skip_every
)
-
remainder
;
const
int
output_size
=
output_width
*
output_height
;
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
CPU
(
0
,
0
,
batch_size
,
num_filters
,
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
batch_size
,
num_filters
,
output_height
,
output_width
);
float
*
__restrict__
output_data
=
(
float
*
)
output
->
host_data
;
...
...
@@ -478,7 +408,7 @@ void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
int
num_filter_elem
=
kernel_height
*
kernel_width
*
channels
;
int
full_output_size
=
full_output_height
*
full_output_width
;
Tensor
*
full_output
=
(
Tensor
*
)
create4DTensor
CPU
(
Tensor
*
full_output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
batch_size
,
num_filters
,
full_output_height
,
full_output_width
);
float
*
__restrict__
full_output_data
=
(
float
*
)
full_output
->
host_data
;
...
...
@@ -619,7 +549,7 @@ void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
int
num_filter_elem
=
kernel_height
*
kernel_width
*
channels
;
int
full_output_size
=
full_output_height
*
full_output_width
;
Tensor
*
full_output
=
(
Tensor
*
)
create4DTensor
CPU
(
Tensor
*
full_output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
batch_size
,
num_filters
,
full_output_height
,
full_output_width
);
float
*
__restrict__
full_output_data
=
(
float
*
)
full_output
->
host_data
;
...
...
@@ -785,7 +715,6 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
const
int
channels
=
input
->
dims
.
dim_sizes
[
1
];
const
int
image_height
=
input
->
dims
.
dim_sizes
[
2
];
const
int
image_width
=
input
->
dims
.
dim_sizes
[
3
];
const
int
num_filters
=
filter
->
dims
.
dim_sizes
[
0
];
const
int
kernel_height
=
filter
->
dims
.
dim_sizes
[
2
];
const
int
kernel_width
=
filter
->
dims
.
dim_sizes
[
3
];
const
int
output_height
=
...
...
@@ -797,8 +726,8 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
const
int
num_filter_elem
=
filter_dim
*
channels
;
const
int
output_size
=
output_width
*
output_height
;
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
CPU
(
0
,
0
,
batch_size
,
num_filters
,
channels
,
output_height
*
output_width
);
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
batch_size
,
channels
,
output_height
,
output_width
);
float
*
__restrict__
output_data
=
(
float
*
)
output
->
host_data
;
const
long
int
conv_data_size
=
sizeof
(
float
)
*
num_filter_elem
*
...
...
@@ -836,22 +765,18 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
}
}
}
for
(
int
p
=
0
;
p
<
num_filters
;
++
p
)
{
for
(
int
m
=
0
;
m
<
output_size
;
++
m
)
{
for
(
int
m
=
0
;
m
<
output_size
;
++
m
)
{
for
(
int
ch
=
0
;
ch
<
channels
;
ch
++
)
{
float
sum
=
0
;
#pragma omp simd reduction(+ : sum)
for
(
int
k
=
0
;
k
<
filter_dim
;
++
k
)
{
int
input_index
=
k
+
ch
*
filter_dim
+
num_filter_elem
*
m
+
b
*
num_filter_elem
*
output_size
;
sum
+=
host_data
[
input_index
]
*
host_filter
[
p
*
num_filter_elem
+
ch
*
filter_dim
+
k
];
sum
+=
host_data
[
input_index
]
*
host_filter
[
ch
*
filter_dim
+
k
];
}
output_data
[
b
*
(
output_size
*
num_filters
*
channels
)
+
p
*
output_size
*
channels
+
ch
*
output_size
+
m
]
=
sum
;
output_data
[
b
*
(
output_size
*
channels
)
+
ch
*
output_size
+
m
]
=
sum
;
}
}
}
}
}
free
(
host_data
);
...
...
@@ -928,7 +853,7 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
int
x_radius
=
(
window_width
-
1
)
/
2
;
int
y_radius
=
(
window_height
-
1
)
/
2
;
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
CPU
(
0
,
0
,
batch_size
,
channels
,
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
batch_size
,
channels
,
output_height
,
output_width
);
float
*
__restrict__
output_data
=
(
float
*
)
output
->
host_data
;
...
...
@@ -1026,7 +951,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
int
m
=
lhs
->
dims
.
dim_sizes
[
0
];
int
n
=
rhs
->
dims
.
dim_sizes
[
rhs
->
dims
.
num_dims
-
1
];
// output neurons
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
CPU
(
0
,
0
,
m
,
n
,
1
,
1
);
Tensor
*
output
=
(
Tensor
*
)
create4DTensor
(
0
,
0
,
m
,
n
,
1
,
1
);
float
*
__restrict__
lhs_arr
=
(
float
*
)
lhs
->
host_data
;
float
*
__restrict__
rhs_arr
=
(
float
*
)
rhs
->
host_data
;
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment