Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hpvm-release
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
llvm
hpvm-release
Commits
730b473a
Commit
730b473a
authored
5 years ago
by
kotsifa2
Browse files
Options
Downloads
Plain Diff
Merge branch 'approx_hpvm' of gitlab.engr.illinois.edu:llvm/hpvm into approx_hpvm
parents
8ab6c870
5bb2d3f3
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
llvm/projects/gpu_profiler/profiler.cpp
+179
-526
179 additions, 526 deletions
llvm/projects/gpu_profiler/profiler.cpp
with
179 additions
and
526 deletions
llvm/projects/gpu_profiler/profiler.cpp
+
179
−
526
View file @
730b473a
#include
<
cmath
>
#include
<
atomic
>
#include
<chrono>
#include
<chrono>
#include
<cmath>
#include
<
iostream
>
#include
<
condition_variable
>
#include
<fstream>
#include
<fstream>
#include
<iostream>
#include
<string>
#include
<string>
#include
<boost/algorithm/string.hpp>
#include
<thread>
#include
<vector>
#include
<vector>
#include
<map>
#include
<thread>
// Reads power rails at runtime and computes the GPU and DDR energy within a window
#include
<atomic>
// of time, which is delimited by the calls to resume_profiler() and stop_profiler()
#include
<sched.h>
//
// IMPORTANT: Must call exit_profiler() to kill the profiler thread
//
// Public interface methods:
// void initialize();
// void run_profiler();
// void resume_profiler();
// void stop_profiler();
// std::pair<double, double> get_time_energy() const;
// void reset()
// void exit_profiler();
class
Profiler
{
public:
Profiler
()
:
should_run_profiler_
(
false
),
should_exit_profiler_
(
false
)
{
// Open all streams. Not done in initialize() function bc the streams
// should be strictly opened once
cpu_stream_
.
open
(
cpu_power_rail
,
std
::
ifstream
::
in
);
gpu_stream_
.
open
(
gpu_power_rail
,
std
::
ifstream
::
in
);
ddr_stream_
.
open
(
ddr_power_rail
,
std
::
ifstream
::
in
);
soc_stream_
.
open
(
soc_power_rail
,
std
::
ifstream
::
in
);
sys_stream_
.
open
(
sys_power_rail
,
std
::
ifstream
::
in
);
#define NUM_ARGS 4
if
(
!
cpu_stream_
.
is_open
()
or
!
gpu_stream_
.
is_open
()
or
!
ddr_stream_
.
is_open
()
or
!
soc_stream_
.
is_open
()
or
!
sys_stream_
.
is_open
())
{
std
::
cout
<<
"Failed to open one of the power rails for reading
\n
"
;
exit
(
1
);
}
}
// This is a simple power profiler that can sample the power of the various
~
Profiler
()
{
// components in a Jetson TX2. The usage is simple: profile() measures power
cpu_stream_
.
close
();
// for the specified program, and then dumpOutput() prints the readings to a
gpu_stream_
.
close
();
// file. profile() can be called as many times as desired - the internal state
ddr_stream_
.
close
();
// is reset each time and thus the measurements are not cumulative.
soc_stream_
.
close
();
class
Profiler
{
sys_stream_
.
close
();
private:
}
// Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
// we can't use them.
// Reinitializes boolean vars used for control flow and launches the profiler
const
unsigned
core0
=
0
;
// thread. DOES NOT reset other internal data structures.
const
unsigned
core1
=
3
;
// Re-arms the control-flow flags and spawns the background profiler thread.
// DOES NOT reset other internal data structures (see reset() for that).
void initialize() {
    // Clear both flags so a previously-used Profiler object starts fresh.
    should_run_profiler_  = false;
    should_exit_profiler_ = false;
    // Spawn the worker; it parks itself in run_profiler() until resumed.
    profiler_thread_ = std::thread(&Profiler::run_profiler, this);
}
// Runs the profiler thread, keeping it alive by wrapping the functionality
// in an infinite loop.
//
// Lifecycle: the thread sleeps on cond_var_ until resume_profiler() or
// exit_profiler() wakes it; while should_run_profiler_ is set it samples
// power in a tight loop; exit_profiler() breaks it out entirely.
void run_profiler(){
    while (true){
        // Fast-path exit check before touching the mutex.
        if (should_exit_profiler_) {
            break;
        }
        // TODO overhead between calls to obtain_power_reading
        // Need to lock the mutex and check the condition var
        {
            std::unique_lock<std::mutex> mutex_lock(mutex_);
            // Re-check under the lock: the exit flag may have been set
            // between the check above and acquiring the mutex.
            if (should_exit_profiler_) {
                break;
            }
            // Wake the thread up when it's time to run the profiler or exit
            // the profiler
            cond_var_.wait(mutex_lock, [this]{
                return should_run_profiler_ || should_exit_profiler_;
            });
        }
        // Woken either to sample or to quit; quit wins.
        if (should_exit_profiler_) {
            break;
        }
        // One sample per loop iteration while profiling is active.
        obtain_power_reading();
    }
}
// Resumes the profiling of whatever executable's currently running.
// DOES NOT reset any data; readings keep accumulating.
void resume_profiler() {
    {
        // Flag and timestamp are updated under the same mutex the worker
        // uses for its condition-variable wait.
        std::unique_lock<std::mutex> guard(mutex_);
        if (should_run_profiler_) {
            std::cout << "WARNING: resume_profiler was already called\n";
        }
        should_run_profiler_ = true;
        // New measurement window starts now.
        start_time_ = std::chrono::high_resolution_clock::now();
    }
    // Notify outside the critical section so the worker can grab the lock
    // immediately on wakeup.
    cond_var_.notify_one();
}
// sysfs paths for i2c buses of various components
// Stops profiler by putting profiler thread to sleep
const
char
*
const
cpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input"
;
void
stop_profiler
()
{
const
char
*
const
gpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input"
;
{
const
char
*
const
ddr_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input"
;
std
::
unique_lock
<
std
::
mutex
>
mutex_lock
(
mutex_
);
const
char
*
const
soc_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input"
;
if
(
!
should_run_profiler_
){
const
char
*
const
sys_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input"
;
std
::
cout
<<
"WARNING: stop_profiler was already called
\n
"
;
}
should_run_profiler_
=
false
;
}
cond_var_
.
notify_one
();
}
// It takes some time for the GPU's power to return to idle (ms)
// Gets the delta time and total GPU and DDR energy between the last two
const
unsigned
gpu_idle_time
=
0
;
// Gets the delta time and total GPU and DDR energy between the last two
// calls to resume_profiler and stop_profiler.
//
// Energy is integrated with a left-rectangle rule: each reading's power is
// applied over the interval since the previous reading's timestamp.
//
// Returns a pair of <delta time in milliseconds, energy (GPU + DDR)>.
std::pair<double, double> get_time_energy() const {
    double total_energy = 0.0;
    std::chrono::time_point<std::chrono::high_resolution_clock> prev_time =
        start_time_;
    // FIX: iterate by const reference — the old `for (auto reading : ...)`
    // copied every PowerReading struct on each iteration.
    for (const auto &reading : power_readings_) {
        std::chrono::duration<double> duration = reading.time_ - prev_time;
        total_energy += reading.gpu_ * duration.count();
        total_energy += reading.ddr_ * duration.count();
        prev_time = reading.time_;
    }
    // Elapsed time is measured up to the last reading, not "now".
    double delta_time =
        std::chrono::duration<double, std::milli>(prev_time - start_time_)
            .count();
    return std::make_pair(delta_time, total_energy);
}
// Resets all internal data structures, including the vector storing all
// power readings. Safe to call after exit_profiler() or after resume.
void reset() {
    // Drop accumulated samples first, then clear both control flags.
    power_readings_.clear();
    should_run_profiler_  = false;   // ok to call reset after resume
    should_exit_profiler_ = false;   // ok to call reset after exit_profiler()
}
// Exit the profiler and kill the thread
// Must call initialize() to reuse this object after calling exit_profiler()
void
exit_profiler
()
{
std
::
cout
<<
"Exiting profiler
\n
"
;
should_exit_profiler_
=
true
;
cond_var_
.
notify_one
();
profiler_thread_
.
join
();
}
private
:
// Power rails are mounted as files. Keeping the old power rail file names for possible future
// integrations
const
std
::
string
cpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input"
;
const
std
::
string
gpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input"
;
const
std
::
string
ddr_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input"
;
const
std
::
string
soc_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input"
;
const
std
::
string
sys_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input"
;
// An individual power reading
// An individual power reading
struct
PowerReading
{
struct
PowerReading
{
...
@@ -49,81 +164,10 @@ private:
...
@@ -49,81 +164,10 @@ private:
double
sys_
;
double
sys_
;
};
};
// Individual tensor op
// Stores all power readings and is cleared only when reset() is called
struct
TensorOp
{
std
::
vector
<
PowerReading
>
power_readings_
;
std
::
string
name_
;
double
start_
;
double
finish_
;
double
time_
;
double
energy_
;
double
gpu_energy_
;
double
ddr_energy_
;
double
power_
;
double
gpu_power_
;
double
ddr_power_
;
TensorOp
(
std
::
string
name
,
double
start
,
double
finish
)
:
name_
(
name
),
start_
(
start
),
finish_
(
finish
),
time_
(
finish
-
start
),
energy_
(
0.0
),
gpu_energy_
(
0.0
),
ddr_energy_
(
0.0
),
power_
(
0.0
),
gpu_power_
(
0.0
),
ddr_power_
(
0.0
)
{
}
};
// Aggregate tensor info
struct
AggTensorInfo
{
// Op name
std
::
string
name_
;
// Averages
double
average_time_
;
double
average_energy_
;
double
average_gpu_energy_
;
double
average_ddr_energy_
;
double
average_power_
;
double
average_gpu_power_
;
double
average_ddr_power_
;
// Standard deviations
double
time_std_
;
double
energy_std_
;
double
gpu_energy_std_
;
double
ddr_energy_std_
;
double
power_std_
;
double
gpu_power_std_
;
double
ddr_power_std_
;
};
// Total time, energy, and power
struct
TotalInfo
{
double
time_
;
double
energy_
;
double
gpu_energy_
;
double
ddr_energy_
;
double
power_
;
double
gpu_power_
;
double
ddr_power_
;
void
clear
()
{
time_
=
0.0
;
energy_
=
0.0
;
gpu_energy_
=
0.0
;
ddr_energy_
=
0.0
;
power_
=
0.0
;
std
::
chrono
::
time_point
<
std
::
chrono
::
high_resolution_clock
>
start_time_
;
gpu_power_
=
0.0
;
ddr_power_
=
0.0
;
}
};
// For reading the i2c buses via sysfs
// For reading the i2c buses via sysfs
std
::
ifstream
cpu_stream_
;
std
::
ifstream
cpu_stream_
;
...
@@ -132,58 +176,18 @@ private:
...
@@ -132,58 +176,18 @@ private:
std
::
ifstream
soc_stream_
;
std
::
ifstream
soc_stream_
;
std
::
ifstream
sys_stream_
;
std
::
ifstream
sys_stream_
;
// Start time (so graph begins from t=0)
std
::
mutex
mutex_
;
std
::
chrono
::
time_point
<
std
::
chrono
::
high_resolution_clock
>
start_time_
;
std
::
condition_variable
cond_var_
;
// Per-run info
std
::
vector
<
PowerReading
>
power_readings_
;
// Aggregate (across all runs) info
bool
should_run_profiler_
;
// True if we want to resume the profiling thread
std
::
map
<
std
::
string
,
std
::
vector
<
TensorOp
>>
tensor_info_
;
std
::
vector
<
AggTensorInfo
>
agg_tensor_info_
;
TotalInfo
total_info_
;
unsigned
iterations_
;
// Start and stop flags to synchronize the program and profiling threads
std
::
atomic_bool
should_exit_profiler_
;
// Quit profiling
std
::
atomic_bool
start_
;
std
::
atomic_bool
stop_
;
private
:
std
::
thread
profiler_thread_
;
// Resets the cross-run aggregates: per-op tensor data, the aggregated
// per-op statistics, and the run-wide totals.
void resetGlobal() {
    tensor_info_.clear();
    agg_tensor_info_.clear();
    total_info_.clear();
}
// Resets per-run state: drops the sampled power readings and lowers both
// of the start/stop synchronization flags.
void resetLocal() {
    power_readings_.clear();
    start_ = false;
    stop_  = false;
}
// Pins the given thread to the specified core via its native pthread
// handle; logs (but does not abort) on failure.
void pinThread(std::thread &t, const unsigned core) const {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core, &cpuset);
    const int rc =
        pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
    if (rc != 0) {
        std::cout << "Couldn't set thread affinity\n";
    }
}
// Adds a tensor op to the map, creating the per-op vector on first use.
void addTensorOp(std::string &op_name, TensorOp &top) {
    // IDIOM: std::map::operator[] default-constructs an empty vector the
    // first time an op name is seen, which is exactly what the old explicit
    // find()/insert(make empty vector) dance did — in a single lookup.
    tensor_info_[op_name].push_back(top);
}
// Obtains a single power reading from the GPU and DDR rails
// Obtains a single power reading from the GPU and DDR rails
void
getP
ower
R
eading
()
{
void
obtain_p
ower
_r
eading
()
{
PowerReading
reading
;
PowerReading
reading
;
// The order matters here. All the reads have to happen together first
// The order matters here. All the reads have to happen together first
...
@@ -199,385 +203,34 @@ private:
...
@@ -199,385 +203,34 @@ private:
gpu_stream_
.
seekg
(
0
);
gpu_stream_
.
seekg
(
0
);
ddr_stream_
.
seekg
(
0
);
ddr_stream_
.
seekg
(
0
);
}
}
};
// Executes the program to be profiled
/*
void
runProgram
(
const
char
*
const
program
)
{
// TESTS
// Tell the profiling thread to start, execute the program that needs
void resume_pause_profiler(Profiler& profile_wrapper, unsigned long sleep_millis){
// to be profiled, and then tell the profiling thread to stop.
profile_wrapper.resume_profiler();
start_
=
true
;
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis));
const
auto
result
=
std
::
system
(
program
);
profile_wrapper.stop_profiler();
stop_
=
true
;
}
// Records power while the program is running
void
recordPower
()
{
// Obtain the new start time, wait for the start signal, and keep
// profiling until the stop flag is set.
start_time_
=
std
::
chrono
::
high_resolution_clock
::
now
();
while
(
!
start_
);
while
(
!
stop_
)
getPowerReading
();
}
// Calculates stats for the entire execution (CPU+GPU phase) and folds them
// into total_info_. Energy is integrated with a left-rectangle rule over
// the recorded power readings; time is measured up to the last reading.
void updateTotalStats() {
    double energy = 0.0;
    double gpu_energy = 0.0;
    double ddr_energy = 0.0;
    std::chrono::time_point<std::chrono::high_resolution_clock> prev_time =
        start_time_;
    // FIX: iterate by const reference — the old `for (auto reading : ...)`
    // copied every PowerReading struct on each iteration.
    for (const auto &reading : power_readings_) {
        std::chrono::duration<double> duration = reading.time_ - prev_time;
        gpu_energy += reading.gpu_ * duration.count();
        ddr_energy += reading.ddr_ * duration.count();
        prev_time = reading.time_;
    }
    energy = gpu_energy + ddr_energy;
    auto time =
        std::chrono::duration<double>(prev_time - start_time_).count();
    // NOTE(review): if no readings were taken, time == 0 and the power
    // divisions below divide by zero — assumed non-empty in practice.
    total_info_.time_       += time;
    total_info_.energy_     += (gpu_energy + ddr_energy);
    total_info_.gpu_energy_ += gpu_energy;
    total_info_.ddr_energy_ += ddr_energy;
    total_info_.power_      += (energy / time);
    total_info_.gpu_power_  += (gpu_energy / time);
    total_info_.ddr_power_  += (ddr_energy / time);
}
// Calculates energy and power usage of the given tensor operation by
// integrating the recorded power readings that fall inside the op's
// [start_, finish_] window.
//
// BUG FIX: both loops previously indexed power_readings_[i] without (or
// before) checking i against the vector size. The skip loop had no bounds
// check at all, and the accumulate loop evaluated power_readings_[i] in
// its condition *before* testing i < size() — either one reads past the
// end of the vector when an op's window extends beyond the samples.
void calculateTensorEP(TensorOp &top) const {
    auto prev_time = top.start_;
    const auto n = power_readings_.size();
    unsigned i = 0;
    // Skip readings taken before the operation started.
    while (i < n &&
           std::chrono::duration<double>(
               power_readings_[i].time_.time_since_epoch()).count() <
               top.start_) {
        ++i;
    }
    // Accumulate until we pass the op's finish time or run out of readings.
    for (; i < n; ++i) {
        const double curr_time =
            std::chrono::duration<double>(
                power_readings_[i].time_.time_since_epoch()).count();
        if (curr_time > top.finish_) {
            break;
        }
        const auto duration = curr_time - prev_time;
        prev_time = curr_time;
        top.gpu_energy_ += power_readings_[i].gpu_ * duration;
        top.ddr_energy_ += power_readings_[i].ddr_ * duration;
    }
    top.energy_ = top.gpu_energy_ + top.ddr_energy_;
    // NOTE(review): time_ == 0 would divide by zero here — assumed nonzero
    // for any real operation (finish > start in the timestamp file).
    top.power_     = top.energy_     / top.time_;
    top.gpu_power_ = top.gpu_energy_ / top.time_;
    top.ddr_power_ = top.ddr_energy_ / top.time_;
}
// Calculates stats for all the tensors in the timestamp file
// ("profile_data.txt"), building one TensorOp per operation and feeding it
// through calculateTensorEP() / addTensorOp().
void updatePerOpStats() {
    const char *const op_file = "profile_data.txt";
    std::string line;
    std::ifstream ifs(op_file, std::ios::in);
    // Calculate time and energy for each tensor operation. There are two
    // possibilities for the file format:
    // If the line doesn't begin with #, we are looking at FP32 code
    // without any conversions to/from FP16, and each operation occupies
    // two consecutive lines in the timestamp file.
    // If the line does begin with #, we are looking at FP16 code with
    // conversion routines in the middle. In this case, *after* the current
    // line, there will be two lines for F2H, two lines for H2F, and then
    // one line for the end of the operation.
    while (std::getline(ifs, line)) {
        // Each line is "<name-or-#name> <timestamp>", whitespace-separated.
        std::vector<std::string> tokens;
        boost::split(tokens, line, boost::is_any_of(" \t"));
        std::string op_name = tokens[0];
        // FP32
        if (tokens[0][0] != '#') {
            // First line with tensor op name and start time
            // (note: this declaration shadows the outer op_name)
            std::string op_name = tokens[0];
            const auto start = std::stod(tokens[1]);
            // Second line with tensor op end time
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto finish = std::stod(tokens[1]);
            TensorOp top(op_name, start, finish);
            calculateTensorEP(top);
            addTensorOp(op_name, top);
        } else {
            // FP16 path: strip the leading '#' from the op name.
            // First line with tensor op name and start time
            std::string op_name = tokens[0].substr(1);
            const auto start = std::stod(tokens[1]);
            // Second line with f2h
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            std::string f2h_name = op_name + "_f2h";
            const auto f2h_start = std::stod(tokens[1]);
            // Third line with f2h
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto f2h_finish = std::stod(tokens[1]);
            // Add f2h (the float->half conversion is tracked as its own op)
            TensorOp f2h(f2h_name, f2h_start, f2h_finish);
            calculateTensorEP(f2h);
            addTensorOp(f2h_name, f2h);
            // Fourth line with h2f
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            std::string h2f_name = op_name + "_h2f";
            const auto h2f_start = std::stod(tokens[1]);
            // Fifth line with h2f
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto h2f_finish = std::stod(tokens[1]);
            // Add h2f (the half->float conversion is tracked as its own op)
            TensorOp h2f(h2f_name, h2f_start, h2f_finish);
            calculateTensorEP(h2f);
            addTensorOp(h2f_name, h2f);
            // Sixth and final line with tensor op end time
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto finish = std::stod(tokens[1]);
            // Subtract f2h's and h2f's time and energy to get just the
            // computation's info (the op window spans the conversions too)
            TensorOp top(op_name, start, finish);
            calculateTensorEP(top);
            top.time_       -= (f2h.time_ + h2f.time_);
            top.energy_     -= (f2h.energy_ + h2f.energy_);
            top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
            top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
            // Recompute power from the corrected time/energy.
            top.power_      = top.energy_ / top.time_;
            top.gpu_power_  = top.gpu_energy_ / top.time_;
            top.ddr_power_  = top.ddr_energy_ / top.time_;
            addTensorOp(op_name, top);
        }
    }
    ifs.close();
}
// Refreshes both statistics views after a run: per-operation first, then
// the run-wide totals.
void updateStats() {
    updatePerOpStats();
    updateTotalStats();
}
// Calculates the average and standard deviation of each metric of each
// tensor op across all profiled iterations, appending one AggTensorInfo
// per op to agg_tensor_info_.
void calculateAggregateStats() {
    for (auto it = tensor_info_.begin(); it != tensor_info_.end(); ++it) {
        AggTensorInfo ati;
        ati.name_ = it->first;
        // FIX: bind by const reference — the old `auto topv = it->second;`
        // copied the entire std::vector<TensorOp> for every op.
        const auto &topv = it->second;

        double total_time = 0.0;
        double total_energy = 0.0;
        double total_gpu_energy = 0.0;
        double total_ddr_energy = 0.0;
        double total_power = 0.0;
        double total_gpu_power = 0.0;
        double total_ddr_power = 0.0;

        double time_sum = 0.0;
        double energy_sum = 0.0;
        double gpu_energy_sum = 0.0;
        double ddr_energy_sum = 0.0;
        double power_sum = 0.0;
        double gpu_power_sum = 0.0;
        double ddr_power_sum = 0.0;

        // Calculate average.
        // NOTE(review): divisors use iterations_, which assumes each op
        // appears exactly once per iteration — confirm against the
        // timestamp file format.
        for (const auto &top : topv) {
            total_time       += top.time_;
            total_energy     += top.energy_;
            total_gpu_energy += top.gpu_energy_;
            total_ddr_energy += top.ddr_energy_;
            total_power      += top.power_;
            total_gpu_power  += top.gpu_power_;
            total_ddr_power  += top.ddr_power_;
        }
        ati.average_time_       = total_time       / iterations_;
        ati.average_energy_     = total_energy     / iterations_;
        ati.average_gpu_energy_ = total_gpu_energy / iterations_;
        ati.average_ddr_energy_ = total_ddr_energy / iterations_;
        ati.average_power_      = total_power      / iterations_;
        ati.average_gpu_power_  = total_gpu_power  / iterations_;
        ati.average_ddr_power_  = total_ddr_power  / iterations_;

        // Calculate standard deviation (population, divided by iterations_).
        for (const auto &top : topv) {
            auto time_diff = top.time_ - ati.average_time_;
            time_sum += time_diff * time_diff;

            auto energy_diff = top.energy_ - ati.average_energy_;
            energy_sum += energy_diff * energy_diff;

            auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
            gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;

            auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
            ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;

            auto power_diff = top.power_ - ati.average_power_;
            power_sum += power_diff * power_diff;

            auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
            gpu_power_sum += gpu_power_diff * gpu_power_diff;

            auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
            ddr_power_sum += ddr_power_diff * ddr_power_diff;
        }
        ati.time_std_       = std::sqrt(time_sum       / iterations_);
        ati.energy_std_     = std::sqrt(energy_sum     / iterations_);
        ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
        ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
        ati.power_std_      = std::sqrt(power_sum      / iterations_);
        ati.gpu_power_std_  = std::sqrt(gpu_power_sum  / iterations_);
        ati.ddr_power_std_  = std::sqrt(ddr_power_sum  / iterations_);

        agg_tensor_info_.push_back(ati);
    }
}
public
:
Profiler
()
{
cpu_stream_
.
open
(
cpu_power_rail
,
std
::
ifstream
::
in
);
gpu_stream_
.
open
(
gpu_power_rail
,
std
::
ifstream
::
in
);
ddr_stream_
.
open
(
ddr_power_rail
,
std
::
ifstream
::
in
);
soc_stream_
.
open
(
soc_power_rail
,
std
::
ifstream
::
in
);
sys_stream_
.
open
(
sys_power_rail
,
std
::
ifstream
::
in
);
if
(
!
cpu_stream_
.
is_open
()
or
!
gpu_stream_
.
is_open
()
or
!
ddr_stream_
.
is_open
()
or
!
soc_stream_
.
is_open
()
or
!
sys_stream_
.
is_open
())
{
std
::
cout
<<
"Failed to open one of the power rails for reading
\n
"
;
exit
(
1
);
}
}
~
Profiler
()
{
cpu_stream_
.
close
();
gpu_stream_
.
close
();
ddr_stream_
.
close
();
soc_stream_
.
close
();
sys_stream_
.
close
();
}
void
profile
(
const
char
*
const
program
,
const
int
iterations
)
{
iterations_
=
iterations
;
resetGlobal
();
for
(
unsigned
i
=
0
;
i
<
iterations_
;
i
++
)
{
resetLocal
();
// Launch two threads: one for running the program and one for
// profiling it. Pin the threads to specific cores to remove migration
// overhead. Profiling showed that the sampling rate increases slightly
// with pinning.
std
::
thread
prog
(
&
Profiler
::
runProgram
,
this
,
program
);
std
::
thread
power
(
&
Profiler
::
recordPower
,
this
);
pinThread
(
prog
,
core1
);
pinThread
(
power
,
core2
);
prog
.
join
();
power
.
join
();
updateStats
();
// Sleep for some time to bring the GPU back to idle
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
gpu_idle_time
));
}
calculateAggregateStats
();
}
// Writes one CSV row per aggregated tensor op to `filename` (times scaled
// to milliseconds) and echoes each op's average time/energy to stdout.
void dumpTensorInfo(const char *const filename) const {
    const std::string header =
        "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
    std::ofstream ofs;
    ofs.open(filename);
    //ofs << header;
    for (const auto &ati : agg_tensor_info_) {
        // Averages (time converted from seconds to ms)...
        ofs << ati.name_ << ","
            << ati.average_time_ * 1e3 << ","
            << ati.average_energy_ << ","
            << ati.average_gpu_energy_ << ","
            << ati.average_ddr_energy_ << ","
            << ati.average_power_ << ","
            << ati.average_gpu_power_ << ","
            << ati.average_ddr_power_ << ",";
        // ...followed by the standard deviations on the same row.
        ofs << ati.time_std_ * 1e3 << ","
            << ati.energy_std_ << ","
            << ati.gpu_energy_std_ << ","
            << ati.ddr_energy_std_ << ","
            << ati.power_std_ << ","
            << ati.gpu_power_std_ << ","
            << ati.ddr_power_std_ << "\n";
        std::cout << ati.average_time_ * 1e3 << ","
                  << ati.average_energy_ << "\n";
    }
    ofs.close();
}
// Writes every recorded power reading to `filename`, one line per sample:
// "<seconds since start> <gpu power> <ddr power>".
void dumpPowerReadings(const char *const filename) const {
    std::ofstream out;
    out.open(filename);
    for (const auto &sample : power_readings_) {
        // Timestamps are reported relative to start_time_ so plots begin at t=0.
        std::chrono::duration<double> elapsed = sample.time_ - start_time_;
        //std::chrono::duration<double> duration = reading.time_.time_since_epoch();
        out << std::to_string(elapsed.count())
            << " " << sample.gpu_
            << " " << sample.ddr_
            << "\n";
    }
    out.close();
}
void
dumpTotalInfo
()
const
{
auto
total_time
=
total_info_
.
time_
/
iterations_
;
auto
total_energy
=
total_info_
.
energy_
/
iterations_
;
auto
gpu_energy
=
total_info_
.
gpu_energy_
/
iterations_
;
auto
ddr_energy
=
total_info_
.
ddr_energy_
/
iterations_
;
auto
power
=
total_info_
.
power_
/
iterations_
;
auto time_energy_pair = profile_wrapper.get_time_energy();
auto
gpu_power
=
total_info_
.
gpu_power_
/
iterations_
;
profile_wrapper.reset();
auto
ddr_power
=
total_info_
.
ddr_power_
/
iterations_
;
std
::
cout
<<
"-----------------------------------------------------
\n
"
;
printf("time: %f, energy: %f\n", time_energy_pair.first, time_energy_pair.second);
std
::
cout
<<
"Program info (average)
\n
"
;
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis));
std
::
cout
<<
"-----------------------------------------------------
\n
"
;
}
std
::
cout
<<
"
\t
Execution time: "
<<
total_time
<<
" seconds
\n
"
;
std
::
cout
<<
"
\t
Total energy: "
<<
total_energy
<<
" mJ
\n
"
;
std
::
cout
<<
"
\t
GPU: "
<<
gpu_energy
<<
" mJ
\n
"
;
std
::
cout
<<
"
\t
DDR: "
<<
ddr_energy
<<
" mJ
\n
"
;
std
::
cout
<<
"
\t
Power: "
<<
power
<<
" mW
\n
"
;
std
::
cout
<<
"
\t
GPU: "
<<
gpu_power
<<
" mW
\n
"
;
std
::
cout
<<
"
\t
DDR: "
<<
ddr_power
<<
" mW
\n
"
;
std
::
cout
<<
"-----------------------------------------------------
\n
"
;
}
};
int
main
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<
NUM_ARGS
)
{
std
::
cout
<<
"Usage: "
<<
argv
[
0
]
<<
" <program> <iterations> <tensor output file> [power output file]
\n
"
;
exit
(
1
);
}
Profiler
pp
;
int main(){
pp
.
p
rofile
(
argv
[
1
],
std
::
stoi
(
argv
[
2
]))
;
P
rofile
r profile_wrapper
;
pp
.
dumpTensorInfo
(
argv
[
3
]
);
profile_wrapper.initialize(
);
if
(
argc
>
NUM_ARGS
)
unsigned long sleep_millis = 5000;
pp
.
dumpPowerReadings
(
argv
[
4
]);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
// IMPORTANT
profile_wrapper.exit_profiler();
return 0;
return 0;
}
}
*/
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment