Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hpvm-release
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
llvm
hpvm-release
Commits
5febbf3a
"hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp" did not exist on "3a7d5153488f34646254bf2d0aaf729e5b79db56"
Commit
5febbf3a
authored
5 years ago
by
Elizabeth
Browse files
Options
Downloads
Patches
Plain Diff
Removed all fields besides total energy/time from output file
parent
dbfd8a93
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
llvm/projects/gpu_profiler/offline_profiler.cpp
+584
-0
584 additions, 0 deletions
llvm/projects/gpu_profiler/offline_profiler.cpp
with
584 additions
and
0 deletions
llvm/projects/gpu_profiler/offline_profiler.cpp
0 → 100644
+
584
−
0
View file @
5febbf3a
#include <sched.h>

#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

#include <boost/algorithm/string.hpp>

#define NUM_ARGS 4
// This is a simple power profiler that can sample the power of the various
// components in a Jetson TX2. The usage is simple: profile() measures power
// for the specified program, and then dumpOutput() prints the readings to a
// file. profile() can be called as many times as desired - the internal state
// is reset each time and thus the measurements are not cumulative.
class Profiler {
private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;  // used to pin the program-runner thread
    const unsigned core2 = 4;  // used to pin the power-sampling thread
    const unsigned core3 = 5;

    // sysfs paths for i2c buses of various components.
    // NOTE(review): these look like TX2 INA3221 power-monitor rails — the
    // device addresses (0-0040/0-0041) are board-specific; confirm per board.
    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // It takes some time for the GPU's power to return to idle (ms).
    // Currently 0, i.e. no cooldown between iterations.
    const unsigned gpu_idle_time = 0;
// An individual power reading: one sample of every monitored rail,
// tagged with the instant it was taken.
struct PowerReading {
    // Timestamp of the sample (used to integrate power into energy).
    std::chrono::time_point<std::chrono::high_resolution_clock> time_;
    double cpu_;  // CPU rail power at time_
    double gpu_;  // GPU rail power at time_
    double ddr_;  // DDR rail power at time_
    double soc_;  // SoC rail power at time_
    double sys_;  // system (board) rail power at time_
};
// Individual tensor op: one timed occurrence of a named tensor operation,
// together with the energy/power later attributed to it by calculateTensorEP().
struct TensorOp {
    std::string name_;    // operation name from the timestamp file
    double start_;        // start timestamp (seconds)
    double finish_;       // finish timestamp (seconds)
    double time_;         // duration, finish_ - start_

    // Energy/power fields start at zero and are filled in by
    // calculateTensorEP() after the power trace is available.
    double energy_;       // total energy (GPU + DDR)
    double gpu_energy_;
    double ddr_energy_;
    double power_;        // average power over the op's duration
    double gpu_power_;
    double ddr_power_;

    TensorOp(std::string name, double start, double finish)
        : name_(name), start_(start), finish_(finish), time_(finish - start),
          energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
          power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {}
};
// Aggregate tensor info: per-op statistics accumulated across all profiling
// iterations (filled in by calculateAggregateStats()).
struct AggTensorInfo {
    // Op name
    std::string name_;

    // Averages (totals divided by the iteration count)
    double average_time_;
    double average_energy_;
    double average_gpu_energy_;
    double average_ddr_energy_;
    double average_power_;
    double average_gpu_power_;
    double average_ddr_power_;

    // Standard deviations of the same metrics across iterations
    double time_std_;
    double energy_std_;
    double gpu_energy_std_;
    double ddr_energy_std_;
    double power_std_;
    double gpu_power_std_;
    double ddr_power_std_;
};
// Total time, energy, and power
struct
TotalInfo
{
double
time_
;
double
energy_
;
double
gpu_energy_
;
double
ddr_energy_
;
double
power_
;
double
gpu_power_
;
double
ddr_power_
;
void
clear
()
{
time_
=
0.0
;
energy_
=
0.0
;
gpu_energy_
=
0.0
;
ddr_energy_
=
0.0
;
power_
=
0.0
;
gpu_power_
=
0.0
;
ddr_power_
=
0.0
;
}
};
    // For reading the i2c buses via sysfs; opened once in the constructor
    // and rewound (seekg(0)) after every sample.
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    // Start time (so graph begins from t=0); reset by recordPower() each run.
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // Per-run info: the raw power trace of the most recent iteration.
    std::vector<PowerReading> power_readings_;

    // Aggregate (across all runs) info: per-op-name samples, their
    // aggregated statistics, and whole-program totals.
    std::map<std::string, std::vector<TensorOp>> tensor_info_;
    std::vector<AggTensorInfo> agg_tensor_info_;
    TotalInfo total_info_;
    // Number of iterations requested in the current profile() call;
    // used as the divisor for averages and standard deviations.
    unsigned iterations_;

    // Start and stop flags to synchronize the program and profiling threads
    // (the sampler spin-waits on start_ and samples until stop_).
    std::atomic_bool start_;
    std::atomic_bool stop_;
private:
    // Wipes every cross-run accumulator (per-op samples, aggregated
    // statistics, and program totals) ahead of a new profiling session.
    void resetGlobal() {
        total_info_.clear();
        tensor_info_.clear();
        agg_tensor_info_.clear();
    }
// Resets power readings and flags
void
resetLocal
()
{
power_readings_
.
clear
();
start_
=
false
;
stop_
=
false
;
}
// Pins the given thread to the specified core
void
pinThread
(
std
::
thread
&
t
,
const
unsigned
core
)
const
{
cpu_set_t
cpuset
;
CPU_ZERO
(
&
cpuset
);
CPU_SET
(
core
,
&
cpuset
);
if
(
pthread_setaffinity_np
(
t
.
native_handle
(),
sizeof
(
cpu_set_t
),
&
cpuset
)
!=
0
)
std
::
cout
<<
"Couldn't set thread affinity
\n
"
;
}
// Adds a tensor op to the map
void
addTensorOp
(
std
::
string
&
op_name
,
TensorOp
&
top
)
{
// Create a vector if this is the first entry
auto
it
=
tensor_info_
.
find
(
op_name
);
if
(
it
==
tensor_info_
.
end
())
{
tensor_info_
.
insert
(
std
::
pair
<
std
::
string
,
std
::
vector
<
TensorOp
>>
(
op_name
,
std
::
vector
<
TensorOp
>
()));
}
tensor_info_
[
op_name
].
push_back
(
top
);
}
// Obtain's a single power reading from the GPU and DDR rails
void
getPowerReading
()
{
PowerReading
reading
;
// The order matters here. All the reads have to happen together first
// and then all the seeks have to happen together at the end, otherwise
// there will be a significant time difference between the readings of
// the different rails.
reading
.
time_
=
std
::
chrono
::
high_resolution_clock
::
now
();
gpu_stream_
>>
reading
.
gpu_
;
ddr_stream_
>>
reading
.
ddr_
;
power_readings_
.
push_back
(
reading
);
// Reset the input position of the files
gpu_stream_
.
seekg
(
0
);
ddr_stream_
.
seekg
(
0
);
}
// Executes the program to be profiled
void
runProgram
(
const
char
*
const
program
)
{
// Tell the profiling thread to start, execute the program that needs
// to be profiled, and then tell the profiling thread to stop.
start_
=
true
;
const
auto
result
=
std
::
system
(
program
);
stop_
=
true
;
}
    // Records power while the program is running.
    // Runs on its own (pinned) thread: it spin-waits on start_ — a busy
    // loop, deliberate here to keep sampling latency minimal — then samples
    // as fast as possible until runProgram() raises stop_.
    void recordPower() {
        // Obtain the new start time, wait for the start signal, and keep
        // profiling until the stop flag is set.
        start_time_ = std::chrono::high_resolution_clock::now();
        while (!start_);
        while (!stop_)
            getPowerReading();
    }
// Calculates stats for the entire execution (CPU+GPU phase)
void
updateTotalStats
()
{
double
energy
=
0.0
;
double
gpu_energy
=
0.0
;
double
ddr_energy
=
0.0
;
std
::
chrono
::
time_point
<
std
::
chrono
::
high_resolution_clock
>
prev_time
=
start_time_
;
for
(
auto
reading
:
power_readings_
)
{
std
::
chrono
::
duration
<
double
>
duration
=
reading
.
time_
-
prev_time
;
gpu_energy
+=
reading
.
gpu_
*
duration
.
count
();
ddr_energy
+=
reading
.
ddr_
*
duration
.
count
();
prev_time
=
reading
.
time_
;
}
energy
=
gpu_energy
+
ddr_energy
;
auto
time
=
std
::
chrono
::
duration
<
double
>
(
prev_time
-
start_time_
).
count
();
total_info_
.
time_
+=
time
;
total_info_
.
energy_
+=
(
gpu_energy
+
ddr_energy
);
total_info_
.
gpu_energy_
+=
gpu_energy
;
total_info_
.
ddr_energy_
+=
ddr_energy
;
total_info_
.
power_
+=
(
energy
/
time
);
total_info_
.
gpu_power_
+=
(
gpu_energy
/
time
);
total_info_
.
ddr_power_
+=
(
ddr_energy
/
time
);
}
// Calculates energy and power usage of the given tensor operation
void
calculateTensorEP
(
TensorOp
&
top
)
const
{
auto
prev_time
=
top
.
start_
;
unsigned
i
=
0
;
// Skip until we hit the start time of the operation
for
(;
std
::
chrono
::
duration
<
double
>
(
power_readings_
[
i
].
time_
.
time_since_epoch
()).
count
()
<
top
.
start_
;
i
++
);
// Keep going until we hit the finish time of the operation or we run out of readings
for
(
double
curr_time
;
((
curr_time
=
std
::
chrono
::
duration
<
double
>
(
power_readings_
[
i
].
time_
.
time_since_epoch
()).
count
())
<=
top
.
finish_
)
&&
(
i
<
power_readings_
.
size
());
i
++
)
{
auto
duration
=
curr_time
-
prev_time
;
prev_time
=
curr_time
;
top
.
gpu_energy_
+=
power_readings_
[
i
].
gpu_
*
duration
;
top
.
ddr_energy_
+=
power_readings_
[
i
].
ddr_
*
duration
;
}
top
.
energy_
=
top
.
gpu_energy_
+
top
.
ddr_energy_
;
top
.
power_
=
top
.
energy_
/
top
.
time_
;
top
.
gpu_power_
=
top
.
gpu_energy_
/
top
.
time_
;
top
.
ddr_power_
=
top
.
ddr_energy_
/
top
.
time_
;
}
    // Calculates stats for all the tensors in the timestamp file
    // ("profile_data.txt", presumably written by the profiled program —
    // TODO confirm the producer and its exact delimiter).
    void updatePerOpStats() {
        const char * const op_file = "profile_data.txt";
        std::string line;
        std::ifstream ifs(op_file, std::ios::in);
        // Calculate time and energy for each tensor operation. There are two
        // possibilities for the file format:
        // If the line doesn't begin with #, we are looking at FP32 code
        // without any conversions to/from FP16, and each operation occupies
        // two consecutive lines in the timestamp file.
        // If the line does begin with #, we are looking at FP16 code with
        // conversion routines in the middle. In this case, *after* the current
        // line, there will be two lines for F2H, two lines for H2F, and then
        // one line for the end of the operation.
        // NOTE(review): tokens[0]/tokens[1] are indexed without length checks
        // and std::stod may throw — a malformed or blank line aborts here.
        while (std::getline(ifs, line)) {
            std::vector<std::string> tokens;
            boost::split(tokens, line, boost::is_any_of(" \t"));
            // NOTE: this outer op_name is shadowed inside both branches below.
            std::string op_name = tokens[0];

            // FP32: "<name> <start>" followed by "<name> <finish>".
            if (tokens[0][0] != '#') {
                // First line with tensor op name and start time
                std::string op_name = tokens[0];
                const auto start = std::stod(tokens[1]);
                // Second line with tensor op end time
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto finish = std::stod(tokens[1]);

                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);
                addTensorOp(op_name, top);
            } else {
                // First line with tensor op name (minus the leading '#')
                // and start time
                std::string op_name = tokens[0].substr(1);
                const auto start = std::stod(tokens[1]);

                // Second line with f2h (start of the float->half conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                std::string f2h_name = op_name + "_f2h";
                const auto f2h_start = std::stod(tokens[1]);
                // Third line with f2h (end of the conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto f2h_finish = std::stod(tokens[1]);
                // Add f2h as its own op
                TensorOp f2h(f2h_name, f2h_start, f2h_finish);
                calculateTensorEP(f2h);
                addTensorOp(f2h_name, f2h);

                // Fourth line with h2f (start of the half->float conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                std::string h2f_name = op_name + "_h2f";
                const auto h2f_start = std::stod(tokens[1]);
                // Fifth line with h2f (end of the conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto h2f_finish = std::stod(tokens[1]);
                // Add h2f as its own op
                TensorOp h2f(h2f_name, h2f_start, h2f_finish);
                calculateTensorEP(h2f);
                addTensorOp(h2f_name, h2f);

                // Sixth and final line with tensor op end time
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto finish = std::stod(tokens[1]);

                // Subtract f2h's and h2f's time and energy to get just the
                // computation's info (the op's span includes both conversions).
                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);
                top.time_ -= (f2h.time_ + h2f.time_);
                top.energy_ -= (f2h.energy_ + h2f.energy_);
                top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
                top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
                // Recompute average powers from the adjusted energy/time.
                top.power_ = top.energy_ / top.time_;
                top.gpu_power_ = top.gpu_energy_ / top.time_;
                top.ddr_power_ = top.ddr_energy_ / top.time_;
                addTensorOp(op_name, top);
            }
        }
        ifs.close();
    }
    // Refreshes both the per-op statistics (from the timestamp file) and the
    // whole-program totals (from the power trace) for the run that just ended.
    void updateStats() {
        updatePerOpStats();
        updateTotalStats();
    }
    // Calculates the average and standard deviation of each metric of each
    // tensor op, across all iterations, and appends one AggTensorInfo per op
    // to agg_tensor_info_.
    // NOTE(review): divisors are iterations_, not topv.size() — this assumes
    // each op appears exactly once per iteration; verify for multi-call ops.
    void calculateAggregateStats() {
        for (auto it = tensor_info_.begin(); it != tensor_info_.end(); it++) {
            AggTensorInfo ati;
            ati.name_ = it->first;
            // NOTE: copies the sample vector; read-only use.
            auto topv = it->second;

            // Running totals for the means.
            double total_time = 0.0;
            double total_energy = 0.0;
            double total_gpu_energy = 0.0;
            double total_ddr_energy = 0.0;
            double total_power = 0.0;
            double total_gpu_power = 0.0;
            double total_ddr_power = 0.0;

            // Sums of squared deviations for the standard deviations.
            double time_sum = 0.0;
            double energy_sum = 0.0;
            double gpu_energy_sum = 0.0;
            double ddr_energy_sum = 0.0;
            double power_sum = 0.0;
            double gpu_power_sum = 0.0;
            double ddr_power_sum = 0.0;

            // Calculate average
            for (const auto &top : topv) {
                total_time += top.time_;
                total_energy += top.energy_;
                total_gpu_energy += top.gpu_energy_;
                total_ddr_energy += top.ddr_energy_;
                total_power += top.power_;
                total_gpu_power += top.gpu_power_;
                total_ddr_power += top.ddr_power_;
            }
            ati.average_time_ = total_time / iterations_;
            ati.average_energy_ = total_energy / iterations_;
            ati.average_gpu_energy_ = total_gpu_energy / iterations_;
            ati.average_ddr_energy_ = total_ddr_energy / iterations_;
            ati.average_power_ = total_power / iterations_;
            ati.average_gpu_power_ = total_gpu_power / iterations_;
            ati.average_ddr_power_ = total_ddr_power / iterations_;

            // Calculate standard deviation (population form: divide by N,
            // where N is the iteration count).
            for (const auto &top : topv) {
                auto time_diff = top.time_ - ati.average_time_;
                time_sum += time_diff * time_diff;
                auto energy_diff = top.energy_ - ati.average_energy_;
                energy_sum += energy_diff * energy_diff;
                auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
                gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
                auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
                ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;
                auto power_diff = top.power_ - ati.average_power_;
                power_sum += power_diff * power_diff;
                auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
                gpu_power_sum += gpu_power_diff * gpu_power_diff;
                auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
                ddr_power_sum += ddr_power_diff * ddr_power_diff;
            }
            ati.time_std_ = std::sqrt(time_sum / iterations_);
            ati.energy_std_ = std::sqrt(energy_sum / iterations_);
            ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
            ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
            ati.power_std_ = std::sqrt(power_sum / iterations_);
            ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
            ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);

            agg_tensor_info_.push_back(ati);
        }
    }
public:
    // Opens every monitored power rail via sysfs. The process exits with
    // status 1 if any rail cannot be opened (e.g. when not running on a
    // Jetson TX2 or without sufficient permissions).
    Profiler() {
        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
        soc_stream_.open(soc_power_rail, std::ifstream::in);
        sys_stream_.open(sys_power_rail, std::ifstream::in);
        if (!cpu_stream_.is_open() or !gpu_stream_.is_open()
                or !ddr_stream_.is_open() or !soc_stream_.is_open()
                or !sys_stream_.is_open()) {
            std::cout << "Failed to open one of the power rails for reading\n";
            exit(1);
        }
    }
~
Profiler
()
{
cpu_stream_
.
close
();
gpu_stream_
.
close
();
ddr_stream_
.
close
();
soc_stream_
.
close
();
sys_stream_
.
close
();
}
    // Profiles the given shell command `iterations` times, accumulating
    // per-op and whole-program statistics. State from any previous call is
    // discarded first, so results are never cumulative across calls.
    //
    // @param program    command passed to std::system() by the runner thread
    // @param iterations number of times to run and measure the program
    void profile(const char * const program, const int iterations) {
        iterations_ = iterations;
        resetGlobal();
        for (unsigned i = 0; i < iterations_; i++) {
            resetLocal();
            // Launch two threads: one for running the program and one for
            // profiling it. Pin the threads to specific cores to remove
            // migration overhead. Profiling showed that the sampling rate
            // increases slightly with pinning.
            std::thread prog(&Profiler::runProgram, this, program);
            std::thread power(&Profiler::recordPower, this);
            pinThread(prog, core1);
            pinThread(power, core2);
            prog.join();
            power.join();
            updateStats();
            // Sleep for some time to bring the GPU back to idle
            // (gpu_idle_time is currently 0, i.e. no pause).
            std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
        }
        calculateAggregateStats();
    }
    // Writes per-op aggregate info to `filename` as CSV. Only name, average
    // time (ms) and average energy (mJ) are emitted — the remaining columns
    // (and the header) are deliberately commented out (commit: "Removed all
    // fields besides total energy/time from output file"). Each row is also
    // echoed (without the name) to stdout.
    //
    // @param filename path of the CSV output file
    void dumpTensorInfo(const char * const filename) const {
        const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
        std::ofstream ofs;
        ofs.open(filename);
        //ofs << header;
        for (const auto &ati : agg_tensor_info_) {
            ofs << ati.name_
                << "," << ati.average_time_ * 1e3
                << "," << ati.average_energy_
                /*<< "," << ati.average_gpu_energy_
                << "," << ati.average_ddr_energy_
                << "," << ati.average_power_
                << "," << ati.average_gpu_power_
                << "," << ati.average_ddr_power_
                << "," << ati.time_std_ * 1e3
                << "," << ati.energy_std_
                << "," << ati.gpu_energy_std_
                << "," << ati.ddr_energy_std_
                << "," << ati.power_std_
                << "," << ati.gpu_power_std_
                << "," << ati.ddr_power_std_*/
                << "\n";
            std::cout << ati.average_time_ * 1e3 << ","
                      << ati.average_energy_ << "\n";
        }
        ofs.close();
    }
void
dumpPowerReadings
(
const
char
*
const
filename
)
const
{
std
::
ofstream
ofs
;
ofs
.
open
(
filename
);
for
(
const
auto
&
reading
:
power_readings_
)
{
std
::
chrono
::
duration
<
double
>
duration
=
reading
.
time_
-
start_time_
;
//std::chrono::duration<double> duration = reading.time_.time_since_epoch();
ofs
<<
std
::
to_string
(
duration
.
count
())
<<
" "
<<
reading
.
gpu_
<<
" "
<<
reading
.
ddr_
<<
"
\n
"
;
}
ofs
.
close
();
}
    // Prints whole-program statistics to stdout, averaged over the
    // iterations of the last profile() call.
    void dumpTotalInfo() const {
        // Totals were accumulated per iteration; divide to get per-run means.
        auto total_time = total_info_.time_ / iterations_;
        auto total_energy = total_info_.energy_ / iterations_;
        auto gpu_energy = total_info_.gpu_energy_ / iterations_;
        auto ddr_energy = total_info_.ddr_energy_ / iterations_;
        auto power = total_info_.power_ / iterations_;
        auto gpu_power = total_info_.gpu_power_ / iterations_;
        auto ddr_power = total_info_.ddr_power_ / iterations_;

        std::cout << "-----------------------------------------------------\n";
        std::cout << "Program info (average)\n";
        std::cout << "-----------------------------------------------------\n";
        std::cout << "\tExecution time: " << total_time << " seconds\n";
        std::cout << "\tTotal energy: " << total_energy << " mJ\n";
        std::cout << "\tGPU: " << gpu_energy << " mJ\n";
        std::cout << "\tDDR: " << ddr_energy << " mJ\n";
        std::cout << "\tPower: " << power << " mW\n";
        std::cout << "\tGPU: " << gpu_power << " mW\n";
        std::cout << "\tDDR: " << ddr_power << " mW\n";
        std::cout << "-----------------------------------------------------\n";
    }
};
// Entry point. Usage:
//   <program> <iterations> <tensor output file> [power output file]
// Profiles <program> <iterations> times, writes per-op CSV to the tensor
// output file, and optionally dumps the last run's raw power trace.
int main(int argc, char *argv[]) {
    if (argc < NUM_ARGS) {
        std::cout << "Usage: " << argv[0]
                  << " <program> <iterations> <tensor output file> [power output file]\n";
        exit(1);
    }
    // BUG FIX: std::stoi throws std::invalid_argument/std::out_of_range on
    // bad input, which previously terminated the process with an unhandled
    // exception; fail with a diagnostic instead.
    int iterations = 0;
    try {
        iterations = std::stoi(argv[2]);
    } catch (const std::invalid_argument &) {
        std::cout << "Invalid iteration count: " << argv[2] << "\n";
        exit(1);
    } catch (const std::out_of_range &) {
        std::cout << "Invalid iteration count: " << argv[2] << "\n";
        exit(1);
    }
    Profiler pp;
    pp.profile(argv[1], iterations);
    pp.dumpTensorInfo(argv[3]);
    if (argc > NUM_ARGS)
        pp.dumpPowerReadings(argv[4]);
    return 0;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment