diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt index cbabc8bbe0111a0ec6c99520176a8b37a530a4fb..be42ebec07cc5aebad8ad975155f86cc25715dab 100644 --- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -140,8 +140,12 @@ add_tensor_runtime(tensor_runtime_online -DONLINE_PROFILING=true -DFP16_tuning=f add_dependencies(tensor_runtime_online tensor_runtime) # Adding rule for the debugging source -add_executable(unit_tests tests/unit_tests.cc) -target_link_libraries(unit_tests tensor_runtime_online) +add_executable(sampling_tests tests/sampling_tests.cc) +target_link_libraries(sampling_tests tensor_runtime_online) + +add_executable(perforation_tests tests/perforation_tests.cc) +target_link_libraries(perforation_tests tensor_runtime_online) + # -- Compile tensor_runtime.ll if possible if(INDEP_BUILD) diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/jetson_freq_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/jetson_freq_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..7caee936a232516d6b8a4bd5531d09aa3e939ab9 --- /dev/null +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/jetson_freq_utils.h @@ -0,0 +1,125 @@ + +/**** + + This file contains frequency setting routines specific to the Jetson Tx2 + + NOTE: These routines are not used directly in the current code. 
+ + Users testing frequency changes on the Jetson Tx2 (or similar devices) can use/repurpose these routines + +***/ + +#include <fstream> + + +const int available_freqs[] = { + 140250000, // 0 + 229500000, // 1 + 318750000, // 2 + 408000000, // 3 + 497250000, // 4 + 586500000, // 5 + 675750000, // 6 + 765000000, // 7 + 854250000, // 8 + 943500000, // 9 + 1032750000, // 10 + 1122000000, // 11 + 1211250000, // 12 + 1300500000 // 13 +}; + + +// Sets frequency +void setFreq(unsigned freq_index) { + + unsigned target_freq = available_freqs[freq_index]; + + const char *const min_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; + const char *const max_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; + + std::ofstream min_stream; + std::ofstream max_stream; + + min_stream.open(min_freq_file, std::ofstream::out); + max_stream.open(max_freq_file, std::ofstream::out); + + min_stream << target_freq << std::flush; + max_stream << target_freq << std::flush; + + min_stream.close(); + max_stream.close(); +} + +// Records frequency +unsigned recordFreq() { + + // Current frequency file + const char *const cur_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; + std::ifstream cur_stream; + cur_stream.open(cur_freq_file, std::ifstream::in); + + // Get starting frequency + unsigned cur_freq; + cur_stream >> cur_freq; + std::cout << "Starting frequency = " << cur_freq << "\n"; + cur_stream.close(); + + return cur_freq; +} + +// There will be no frequency request for the first batch +// Therefore, we skip the first element by initializing to 1, not 0. 
+FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) + : idx_list(il), rep_factor(rf), count(1), idx(0) {} + +unsigned FrequencyIndexList::getNextIndex() { + if (count == rep_factor) { + count = 0; + idx = (idx + 1) % idx_list.size(); + } + count++; + return idx_list[idx]; +} + + +void RuntimeController::readIterationFrequency() { + if (PI) + PI->readIterationFrequency(); +} + +unsigned long RuntimeController::getIterationFrequency() { + return (PI ? PI->getIterationFrequency() : 0); +} + +void RuntimeController::updateFrequency() { +#ifdef JETSON_EXECUTION + unsigned freq_idx = FIL->getNextIndex(); + //--- updateJetsonGPUFreq(freq_idx); + + setFreq(freq_idx); + +#endif // JETSON_EXECUTION +} + +unsigned long RuntimeController::getLastFrequency() { return g_freq; } + +void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; } + + + +void ProfileInfo::readIterationFrequency() { +#ifdef JETSON_EXECUTION + //----- frequency_current_iteration = readJetsonGPUFreq(); + frequency_current_iteration = recordFreq(); +#else + frequency_current_iteration = 0; +#endif // JETSON_EXECUTION +} + +unsigned long ProfileInfo::getIterationFrequency() { + return frequency_current_iteration; +} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp index bea66370ba073490fe7970014f1005f123e58988..0332313c573bcd28215a4277cd788e63a7820b2a 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp @@ -3,155 +3,35 @@ // //===----------------------------------------------------------------------===// // -// This file contains code for that allows the tensor runtime to adapt -// in response to external changes in conditions (such as frequency changes) -// by helping to choose correct approximation configurations. 
It also provides -// routines for the rest of the runtime to get performance and energy profiling. +// This file contains code for HPVM Dynamic Approximation Control. +// +// The runtime controller: +// * Reads in the configuration file passed to the HPVM binary +// * Constructs a Pareto Curve +// * Based on the selected Mode it switches configurations at runtime +// +// Author: Maria Kotsifakou // //===----------------------------------------------------------------------===// -#include "hpvm-rt-controller.h" -#include "global_data.h" -#include <fstream> - -//-------- Functionality to read and update frequency on Jetson board -------// -/*const char* available_freqs[] = {"140250000", "229500000", "318750000", - "408000000", "497250000", "586500000", - "675750000", "765000000", "854250000", - "943500000", "1032750000", "1122000000", - "1211250000", "1300500000"}; - -*/ - -const int available_freqs[] = { - 140250000, // 0 - 229500000, // 1 - 318750000, // 2 - 408000000, // 3 - 497250000, // 4 - 586500000, // 5 - 675750000, // 6 - 765000000, // 7 - 854250000, // 8 - 943500000, // 9 - 1032750000, // 10 - 1122000000, // 11 - 1211250000, // 12 - 1300500000 // 13 -}; - -/*void updateJetsonGPUFreq(int freq_level) { - - if (freq_level < 0 || freq_level > 13) { - printf("ERROR: Provide freq level between {0, 13} \n\n\n"); - abort(); - } - - const char* freq_val = available_freqs[freq_level]; - printf("freq-val[0] = %s \n", freq_val); - - FILE* max_file = - fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq", "w+"); - if (max_file == NULL) { - printf("Could not min_freq file \n"); - } - fwrite(freq_val, strlen(freq_val), 1, max_file); - fclose(max_file); - - FILE* min_file = - fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+"); - if (min_file == NULL){ - printf("Could not min_freq file \n"); - abort(); - } - fwrite(freq_val, strlen(freq_val), 1, min_file); - fclose(min_file); -} - -unsigned long int readJetsonGPUFreq() { - FILE* 
cur_freq_file = - fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq", "r"); -// fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "r"); - if (cur_freq_file == NULL) { - printf("Could not open cur_freq file \n"); - } - - char buf[50]; - char* ptr; - - fread(buf, 50, 1, cur_freq_file); - unsigned long cur_freq = strtoul(buf, &ptr, 10); - fclose(cur_freq_file); - return cur_freq; -} - -*/ - -// Sets frequency -void setFreq(unsigned freq_index) { - - unsigned target_freq = available_freqs[freq_index]; - - const char *const min_freq_file = - "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; - const char *const max_freq_file = - "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; - - std::ofstream min_stream; - std::ofstream max_stream; - - min_stream.open(min_freq_file, std::ofstream::out); - max_stream.open(max_freq_file, std::ofstream::out); - min_stream << target_freq << std::flush; - max_stream << target_freq << std::flush; +// ***NOTE*** The macro definitions below control the runtime policy - min_stream.close(); - max_stream.close(); -} +//--- llvm_hpvm_invokeRtControl_BASE is the baseline policy (default) that just uses the first config (configuration file) +#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl +//--- llvm_hpvm_invokeRtControl_ADJUST_PR is the probabilistic config selection from Pareto curve - Uncomment to use +//#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl -// Records frequency -unsigned recordFreq() { - // Current frequency file - const char *const cur_freq_file = - "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; - std::ifstream cur_stream; - cur_stream.open(cur_freq_file, std::ifstream::in); - // Get starting frequency - unsigned cur_freq; - cur_stream >> cur_freq; - std::cout << "Starting frequency = " << cur_freq << "\n"; - cur_stream.close(); +#include "hpvm-rt-controller.h" +#include "global_data.h" +#include "jetson_freq_utils.h" 
+#include <fstream> - return cur_freq; -} //---------------------------------------------------------------------------// -/* - * Check if a file exists - * Return true if the file exists, false else - */ -bool fileExists(const std::string &file) { - struct stat buf; - return (stat(file.c_str(), &buf) == 0); -} - -// There will be no frequency request for the first batch -// Therefore, we skip the first element by initializing to 1, not 0. -FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) - : idx_list(il), rep_factor(rf), count(1), idx(0) {} - -unsigned FrequencyIndexList::getNextIndex() { - if (count == rep_factor) { - count = 0; - idx = (idx + 1) % idx_list.size(); - } - count++; - return idx_list[idx]; -} // Functions void ProfileInfo::resetCurrentIterationTime() { @@ -208,18 +88,6 @@ void ProfileInfo::end_iteration() { in_iteration = false; } -void ProfileInfo::readIterationFrequency() { -#ifdef JETSON_EXECUTION - //----- frequency_current_iteration = readJetsonGPUFreq(); - frequency_current_iteration = recordFreq(); -#else - frequency_current_iteration = 0; -#endif // JETSON_EXECUTION -} - -unsigned long ProfileInfo::getIterationFrequency() { - return frequency_current_iteration; -} void ProfileInfo::addToCurrentIterationComputeTime(const char *s, double t) { start_iteration(); @@ -346,8 +214,8 @@ ProfileInfo::ProfileInfo() in_iteration(false) {} Slowdowns::Slowdowns() { - idx = 0; + idx = 0; std::ifstream s_in("slowdowns.txt"); if (!s_in) { DEBUG("slowdowns file not found. 
Initializing slowdowns randomly.\n"); @@ -446,12 +314,8 @@ void RuntimeController::init(const char *Cstr) { // compute3DParetoConfigurationPoints(); Not using 3D curve INFO("Speedup Configurations\n"); printConfigurations(SpeedupConfigurations); - // INFO("Energy Configurations\n"); - // printConfigurations(EnergyConfigurations); - // INFO("3D Configurations\n"); - // printConfigurations(ThreeDCurveConfigurations); - configurationIdx = - 0; // TODO: initialize using pareto curve - findTargetConfiguration ? + + configurationIdx = 0; Configurations = &SpeedupConfigurations; // Initializations for different runtime control strategies @@ -461,10 +325,8 @@ void RuntimeController::init(const char *Cstr) { // Pseudo random variable (when we did few experiments) // or true random numbers for probabilistic control pseudo_rd = 0.0; - std::random_device - rd; // Will be used to obtain a seed for the random number engine - generator = - std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd() + std::random_device rd; // Will be used to obtain a seed for the random number engine + generator = std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd() distr = std::uniform_real_distribution<>(0.0, 1.0); g_freq = available_freqs[13]; @@ -526,24 +388,6 @@ double RuntimeController::getCurrentIterationComputeEnergy() { return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0); } -void RuntimeController::readIterationFrequency() { - if (PI) - PI->readIterationFrequency(); -} - -unsigned long RuntimeController::getIterationFrequency() { - return (PI ? 
PI->getIterationFrequency() : 0); -} - -void RuntimeController::updateFrequency() { -#ifdef JETSON_EXECUTION - unsigned freq_idx = FIL->getNextIndex(); - //--- updateJetsonGPUFreq(freq_idx); - - setFreq(freq_idx); - -#endif // JETSON_EXECUTION -} void RuntimeController::writeProfileInfo() { if (PI) @@ -575,6 +419,7 @@ std::pair<double, double> RuntimeController::fc_profile( const unsigned num_rows_a, const unsigned num_cols_a, const unsigned num_rows_b, const unsigned num_cols_b, const unsigned voltage_swing, const unsigned patch_factor) { + return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b, num_cols_b, voltage_swing, patch_factor) : std::make_pair(0.0, 0.0)); @@ -585,6 +430,7 @@ std::pair<double, double> RuntimeController::conv_profile( const unsigned c_out, const unsigned c_in, const unsigned k_h, const unsigned k_w, const unsigned s_h, const unsigned s_w, const unsigned voltage_swing, const unsigned patch_factor) { + return (promise ? promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing, patch_factor) : std::make_pair(0.0, 0.0)); @@ -593,8 +439,12 @@ std::pair<double, double> RuntimeController::conv_profile( // Constructor and descructor RuntimeController::RuntimeController() { configurationIdx = 0; + + // NOTE: The 14 Frequency levels are specific to NVIDIA Jetson Tx2 + // More Frequency utils (not used by default) present in include/jetson_freq_utils.h FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10); + #ifdef ACTIVE_PROFILING PI = new ProfileInfo(); profiler = new Profiler(); @@ -1052,20 +902,7 @@ void RuntimeController::computeParetoConfigurationPoints() { start_idx = end_idx; } - // All elements in InitialConfigurations whose index is in Indices are no - // longer needed. 
- // for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e = - // Indices.end(); - // idx_it != idx_e; ++idx_it) { - // std::map<std::string, NodeConfiguration * > ConfSetup = - // InitialConfigurations[*idx_it].setup; - // for (std::map<std::string, NodeConfiguration* >::const_iterator it = - // ConfSetup.begin(); - // it != ConfSetup.end(); ++it) { - // delete it->second; - // } - // } - // InitialConfigurations.clear(); + } void RuntimeController::compute3DParetoConfigurationPoints() { @@ -1153,8 +990,9 @@ void RuntimeController::printConfigurations( std::vector<struct Configuration> &Confs) { for (std::vector<struct Configuration>::iterator it = Confs.begin(), - ie = Confs.end(); + ie = Confs.end(); it != ie; ++it) { + it->print(); } } @@ -1163,15 +1001,13 @@ void RuntimeController::printConfigurations( std::vector<struct Configuration *> &Confs) { for (std::vector<struct Configuration *>::iterator it = Confs.begin(), - ie = Confs.end(); + ie = Confs.end(); it != ie; ++it) { + (*it)->print(); } } -unsigned long RuntimeController::getLastFrequency() { return g_freq; } - -void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; } double RuntimeController::getLastSpeedup() { return g_speedup; } @@ -1196,24 +1032,24 @@ void RuntimeController::findTargetConfiguration(float goal, // Assigning one of Pareto configs to 'Configurations' class attribute Configurations = &SpeedupConfigurations; low_it = - std::lower_bound(Configurations->begin(), Configurations->end() - 1, - goal, ConfigurationLessThan_SP()); + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_SP()); configurationIdx = low_it - Configurations->begin(); break; } case ENERGY: { Configurations = &EnergyConfigurations; low_it = - std::lower_bound(Configurations->begin(), Configurations->end() - 1, - goal, ConfigurationLessThan_E()); + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, 
ConfigurationLessThan_E()); configurationIdx = low_it - Configurations->begin(); break; } case ACCURACY_LOSS: { Configurations = &SpeedupConfigurations; low_it = - std::lower_bound(Configurations->begin(), Configurations->end() - 1, - goal, ConfigurationLessThan_AL()); + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_AL()); if ((*low_it)->accuracyLoss > goal) --low_it; configurationIdx = low_it - Configurations->begin(); @@ -1232,6 +1068,11 @@ void RuntimeController::findTargetConfiguration(float goal, configurationIdx); } +/*** This routine takes as input goal (target speedup) and computes the probability of selecting the higher configuration + (one with higher than target speedup) and probability of lower configuration (config with lower than target speedup). + + Motivation: The Pareto curve often does not have a configuration providing the exact required speedup +***/ void RuntimeController::adjustTargetConfiguration(float goal) { DEBUG("adjustTargetConfiguration: goalVal: %f.\n\n", goal); @@ -1245,53 +1086,22 @@ void RuntimeController::adjustTargetConfiguration(float goal) { // Get the two configurations' speedup, and compute the appropriate ranges float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup; float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup; - float sp_diff = curr_conf_speedup - prev_conf_speedup; + // Computation of how far the target speedup is for lower and higher speedup config + float sp_diff = curr_conf_speedup - prev_conf_speedup; float high_range = curr_conf_speedup - goal; float low_range = goal - prev_conf_speedup; // These represent how likely we are to pick the upper or lower configuration float high_pb = 0.0, low_pb = 0.0; + if (configurationIdx == prev_conf_idx) { high_pb = low_pb = 1.0; - } else { + } + else { + // Compute the probability of selection for higher config and lower config high_pb = low_range / sp_diff; low_pb = high_range / sp_diff; - - 
//***--- Probability adjustment strategy 1 ---***// - // No big adjustments at edges of probability range - // float adjust_val = 0.0; - // if (low_pb < high_pb) { - // adjust_val = low_pb * 0.2; - // } else { - // adjust_val = high_pb * 0.2; - // } - // low_pb -= adjust_val; - // high_pb += adjust_val; - //***--- ---***// - - //***--- Probability adjustment strategy 2 ---***// - // No big adjustment at high edge of probability range - // float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2; - // low_pb -= adjust_val; - // high_pb += adjust_val; - //***--- ---***// - - //***--- Probability adjustment strategy 3 ---***// - // Similar to 2, but higher always increases, more significantly - // float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5; - // low_pb -= adjust_val; - // high_pb += adjust_val; - //***--- ---***// - - //***--- Probability adjustment strategy 4 ---***// - // Similar to 2, but higher always increases, more significantly - // Low end, high end a bit less aggressive than total range - float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6; - adjust_val = adjust_val > high_pb ? 
high_pb : adjust_val; - low_pb -= adjust_val; - high_pb += adjust_val; - //***--- ---***// } DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: " @@ -1373,13 +1183,6 @@ uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, fclose(file); } - // int num_labels = end - start; - // uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels); - // for (unsigned i = start; i < end; i++) { - // labels[i-start] = labels_from_file[i]; - // } - // return labels; - // Return pointer to labels return &labels_from_file[start]; } @@ -1387,7 +1190,7 @@ uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, static float average_accuracy = 0.0; static int num_executations = 0; -//*** Copied from dnn_sources/include/utils.h ***// + float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { struct Tensor *result = (struct Tensor *)result_ptr; @@ -1433,10 +1236,8 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { return accuracy; } -#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl -//#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl -//#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl - +// This routine is used when llvm_hpvm_invokeRtControl macro is set to llvm_hpvm_invokeRtControl_BASE +// This is the default config selection routine - it selects the first configuration in the config-file extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str, int start, int end) { @@ -1462,92 +1263,12 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str, RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str, - int start, int end) { - - uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double 
current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->resume_profiler(); - RC->findNextConfiguration(); - // Still use findNext configuration, to update the configurationIdx, - // to point to next location - enum SEARCH_KIND k = ACCURACY_LOSS; - float goalVal = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss; - RC->findTargetConfiguration(goalVal, k); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); - - // Note the end of iteration - RC->end_iteration(); -} - -extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str, - int start, int end) { - - uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - RC->readIterationFrequency(); - - RC->resume_profiler(); - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double target_speedup; - if (RC->getLastFrequency() == RC->getIterationFrequency()) { - target_speedup = RC->getLastSpeedup(); - } else { - double baseline_time = RC->getBaselineTime(); - // Relative to current configuration - target_speedup = current_iteration_time / baseline_time; - // Adjust to baseline - target_speedup *= RC->getCurrentConfigurationSpeedup(); - RC->setLastFrequency(RC->getIterationFrequency()); - RC->setLastSpeedup(target_speedup); - } - RC->findTargetConfiguration(target_speedup, SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - 
RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - //* * - //*Needed for the frequency variation experiment - not part of the control * - RC->resume_profiler(); - RC->updateFrequency(); - RC->pause_profiler(); - - std::pair<double, double> pinfo2 = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationConfigTime(pinfo2.first); - RC->addToCurrentIterationConfigEnergy(pinfo2.second); - //* */ - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("target speedup = %lf\n\n", target_speedup); - - // Note the end of iteration - RC->end_iteration(); -} +/// This routine is used when `llvm_hpvm_invokeRtControl` macro is set to `llvm_hpvm_invokeRtControl_ADJUST_PR` +/// This routine does probabilistic selection of configurations from the Pareto curve extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, - const char *str, int start, + const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); @@ -1555,22 +1276,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, // Read stats for iteration that was just completed double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - RC->readIterationFrequency(); RC->resume_profiler(); double current_iteration_time = RC->getCurrentIterationComputeTime(); double target_speedup; - if (RC->getLastFrequency() == RC->getIterationFrequency()) { - target_speedup = RC->getLastSpeedup(); - } else { - double baseline_time = RC->getBaselineTime(); - // Relative to current configuration - target_speedup = current_iteration_time / baseline_time; - // Adjust to baseline - target_speedup *= RC->getCurrentConfigurationSpeedup(); - RC->setLastFrequency(RC->getIterationFrequency()); - RC->setLastSpeedup(target_speedup); - } + + double baseline_time = 
RC->getBaselineTime(); + // Relative to current configuration + target_speedup = current_iteration_time / baseline_time; + // Adjust to baseline + target_speedup *= RC->getCurrentConfigurationSpeedup(); + RC->findTargetConfiguration(target_speedup, SPEEDUP); RC->adjustTargetConfiguration(target_speedup); RC->pause_profiler(); @@ -1580,18 +1296,14 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - //* * - //*Needed for the frequency variation experiment - not part of the control * RC->resume_profiler(); - RC->updateFrequency(); RC->pause_profiler(); std::pair<double, double> pinfo2 = RC->get_time_energy(); RC->reset_profiler(); RC->addToCurrentIterationConfigTime(pinfo2.first); RC->addToCurrentIterationConfigEnergy(pinfo2.second); - //* */ - + INFO("current iteration time = %f, current iteration energy = %f\n", current_iteration_time, current_iteration_energy); INFO("target speedup = %lf\n\n", target_speedup); @@ -1600,119 +1312,4 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result, - const char *str, int start, - int end) { - - uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - std::string prev_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - - RC->resume_profiler(); - float slowdown = RC->getSlowdowns()->getNextSlowdown(); - RC->findTargetConfiguration(slowdown, SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - 
RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - std::string next_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - float next_conf_speedup = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("slowdown (target speedup) = %f\n", slowdown); - INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); - - // Note the end of iteration - RC->end_iteration(); -} - -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result, - const char *str, - int start, int end) { - - uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - std::string prev_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - - RC->resume_profiler(); - float slowdown = RC->getSlowdowns()->getNextSlowdown(); - RC->findTargetConfiguration(slowdown, SPEEDUP); - RC->adjustTargetConfiguration(slowdown); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - std::string next_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - float next_conf_speedup = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - 
INFO("slowdown (target speedup) = %f\n", slowdown); - INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); - - // Note the end of iteration - RC->end_iteration(); -} -extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str, - int start, int end) { - - uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->resume_profiler(); - RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); - - // Note the end of iteration - RC->end_iteration(); -} - -template <typename T> -static void writeVectorToFile(const char *path, const std::vector<T> &vec) { - std::ofstream of(path, std::ofstream::out | std::ofstream::app); - if (!of.good()) - ERROR("Cannot write to %s file", path); - for (float f : vec) - of << f << ' '; - of << '\n'; -} diff --git a/hpvm/projects/hpvm-tensor-rt/tests/perforation_tests.cc b/hpvm/projects/hpvm-tensor-rt/tests/perforation_tests.cc new file mode 100644 index 0000000000000000000000000000000000000000..c36a2c81d04a64398e106006c49746e0dd70037b --- /dev/null +++ b/hpvm/projects/hpvm-tensor-rt/tests/perforation_tests.cc @@ -0,0 +1,16 @@ + +#include "tests.h" + + +int main() { + + llvm_hpvm_initTensorRt(0); + + UnitTestResults unitTestResults; + + testPerforation(unitTestResults); + + 
unitTestResults.printSummary(); + + return 0; +} diff --git a/hpvm/projects/hpvm-tensor-rt/tests/sampling_tests.cc b/hpvm/projects/hpvm-tensor-rt/tests/sampling_tests.cc new file mode 100644 index 0000000000000000000000000000000000000000..087511413e56c7b8653ea5cb5d9798839af88ebb --- /dev/null +++ b/hpvm/projects/hpvm-tensor-rt/tests/sampling_tests.cc @@ -0,0 +1,16 @@ + +#include "tests.h" + + +int main() { + + llvm_hpvm_initTensorRt(0); + + UnitTestResults unitTestResults; + + testSampling(unitTestResults); + + unitTestResults.printSummary(); + + return 0; +} diff --git a/hpvm/projects/hpvm-tensor-rt/tests/tests.h b/hpvm/projects/hpvm-tensor-rt/tests/tests.h new file mode 100644 index 0000000000000000000000000000000000000000..e2cf1d70de2e640568232c61abe39248d1eb9bc6 --- /dev/null +++ b/hpvm/projects/hpvm-tensor-rt/tests/tests.h @@ -0,0 +1,451 @@ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <vector> +#include <string.h> +#include "tensor_runtime.h" +#include "tensor_cpu_runtime.h" +#include "tensorUtils.h" +#include "tensor_custom_ops_cpu.h" + +using namespace std; + + +class UnitTestResults { + +private: + unsigned int total_tests; + unsigned int failed_tests; + unsigned int passed_tests; + std::vector<string> failed_test_ids; + +public: + UnitTestResults() { + total_tests = 0; + failed_tests = 0; + passed_tests = 0; + } + + void evalTestResult(Tensor *res, const float *expected_result, + size_t num_elems, float epsilon, string test_name) { + + total_tests += 1; + if (res->num_elems != num_elems) { + failed_tests += 1; + failed_test_ids.push_back(test_name); + return; + } + + float *data_ptr = (float *)res->host_data; + for (unsigned int i = 0; i < res->num_elems; i++) { + if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) { + failed_tests += 1; + failed_test_ids.push_back(test_name); + return; + } + } + + passed_tests += 1; + } + + void compareTensors(Tensor *res, Tensor *gold_res, float epsilon, + string test_name) { + + 
const float *expected_result = (float *)gold_res->host_data; + unsigned int num_elems = res->num_elems; + + evalTestResult(res, expected_result, num_elems, epsilon, test_name); + } + + void printSummary() { + + printf("\n\n\n ************* Printing Results Summary ********** \n\n"); + printf("-- Total tests := %d \n", total_tests); + printf("-- Tests Passed := %d \n", passed_tests); + printf("-- Tests Failed := %d \n", failed_tests); + + printf("\n\n Tests that failed : \n\n"); + for (int i = 0; i < failed_test_ids.size(); i++) { + printf("*** Test = %s \n", failed_test_ids[i].c_str()); + } + + if (failed_test_ids.size() > 0){ + + printf("Some Tests Failed. Aborting"); + exit(1); + } + + } +}; + + + + + +void testSampleFilter() { + + printf("***** Tensor Sample Filter ***** \n\n"); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + + fillWithOnesAndTwos(input); + + Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, + 3, 2, 32, 32); + fillTensorWithVal(input2, 1); + printTensorValues(input); + + void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1); + printTensorValues(exact_res); + + void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0); + + printTensorValues(res); +} + +void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w, + int stride_h, int stride_w, int row, int col, + UnitTestResults &unitTestResults) { + + float interpolation_rate = 1.0; + for (int offset = 0; offset < 2; offset++) { + + printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " + "row = %d col = %d offset= %d \n\n", + pad_h, pad_w, stride_h, stride_w, row, col, offset); + + void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1); + + printf("tensorConvolution Result :"); + printTensorValues(res_exact); + + void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, 1, 1); + + 
printf("\nBaseline Result :"); + printTensorValues(res_exact2); + + void *res_exact3 = tensorConvApproxHalf2( + input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); + convertToFP32((struct Tensor *)res_exact3); + + printf("\nFP16_Baseline Result :"); + printTensorValues(res_exact3); + + void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, row, col, offset); + + printf("\nConvPerfCuda Result :"); + printTensorValues(res_sim); + + void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, row, col, 1, offset); + + printf("\nConvApprox Result :"); + printTensorValues(res); + + hpvm_request_tensor(input, HOST); + hpvm_request_tensor(filter, HOST); + + void *res_cpu = tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, row, col, 1, offset); + + printf("\nConvApproxCPU Result :"); + printTensorValues(res_cpu); + + void *res_half = + tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, + 1, 1, row, col, 1, offset); + + convertToFP32((struct Tensor *)res_half); + + printf("\nConvApproxHalf2 Result :"); + printTensorValues(res_half); + + std::string suffix = + std::string(" pad_h = ") + std::to_string(pad_h) + + std::string(" pad_w = ") + std::to_string(pad_w) + + std::string(" stride_h = ") + std::to_string(stride_h) + + std::string(" stride_w = ") + std::to_string(stride_w) + + std::string(" row = ") + std::to_string(row) + std::string(" col = ") + + std::to_string(col) + std::string(" offset = ") + + std::to_string(offset); + + std::string test_name = std::string("PERF_FP32 ") + suffix; + + unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05, + test_name); + + std::string fp16_test_name = std::string("PERF_FP16 ") + suffix; + unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1, + fp16_test_name); + + std::string cpu_test_name = std::string("PERF_CPU ") + suffix; + unitTestResults.compareTensors((Tensor 
*)res_cpu, (Tensor *)res_sim, 0.05, + cpu_test_name); + } + + printf("\n\n\n--- End of Test \n\n\n"); +} + +/**** Tests Perforation for a set of different inputs */ +void testPerforation(UnitTestResults &unitTestResults) { + + printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n"); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + fillTensorWithVal(input, 1); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + fillTensorWithVal(filter, 1); + + + testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2, unitTestResults); + + testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1, unitTestResults); + + testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3, unitTestResults); + + testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1, unitTestResults); + + testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4, unitTestResults); + + testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1, unitTestResults); +} + +void testSampling() { + + printf("***** Testing Sampling ***** \n\n"); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + fillTensorWithVal(input, 1); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + fillTensorWithVal(filter, 1); + + float *host_ptr = (float *)((struct Tensor *)filter)->host_data; + host_ptr[0] = 2; + host_ptr[2] = 2; + host_ptr[4] = 2; + host_ptr[6] = 2; + host_ptr[8] = 2; + host_ptr[10] = 2; + host_ptr[12] = 2; + host_ptr[14] = 2; + host_ptr[16] = 2; + host_ptr[18] = 2; + host_ptr[20] = 2; + host_ptr[22] = 2; + host_ptr[24] = 2; + host_ptr[26] = 2; + // printTensorValues(input); + + void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); + + printTensorValues(res); + + void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1); + + printTensorValues(res2); + + void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 
1, 1, 2, 0); + + printTensorValues(res2_sim); + + void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0); + + printTensorValues(res3); + + void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); + + printTensorValues(res4); + + void *res4_half = + tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); + + convertToFP32((struct Tensor *)res4_half); + + printTensorValues(res4_half); +} + +void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w, + int stride_h, int stride_w, int skip_every, + std::string filter_string, + UnitTestResults &unitTestResults) { + + float interpolation_rate = 1.0; + for (int offset = 0; offset < 2; offset++) { + + printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " + "skip_every = %d offset= %d interpolation_rate = %f \n\n", + pad_h, pad_w, stride_h, stride_w, skip_every, offset, + interpolation_rate); + + void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1); + + printf("tensorConvolution Result :"); + printTensorValues(res_exact); + + void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, 1, 1); + + printf("\nBaseline Result :"); + printTensorValues(res_exact2); + + void *res_exact3 = tensorConvApproxHalf2( + input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); + convertToFP32((struct Tensor *)res_exact3); + + printf("\nFP16_Baseline Result :"); + printTensorValues(res_exact3); + + void *res_sim = + tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1, + 1, skip_every, offset, interpolation_rate); + + printf("\nConvSampSim Result :"); + printTensorValues(res_sim); + + void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, skip_every, offset); + + printf("\nConvApprox Result :"); + printTensorValues(res); + + hpvm_request_tensor(input, HOST); + hpvm_request_tensor(filter, HOST); + + void *res_cpu = + 
tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, stride_w, 1, + 1, 1, 1, skip_every, offset); + + printf("\nConvApproxCPU Result :"); + printTensorValues(res_cpu); + + void *res_half = + tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, + 1, 1, 1, 1, skip_every, offset); + + convertToFP32((struct Tensor *)res_half); + + printf("\nConvApproxHalf2 Result :"); + printTensorValues(res_half); + + std::string suffix = + "filter = " + std::string(filter_string) + std::string(" pad_h = ") + + std::to_string(pad_h) + std::string(" pad_w = ") + + std::to_string(pad_w) + std::string(" stride_h = ") + + std::to_string(stride_h) + std::string(" stride_w = ") + + std::to_string(stride_w) + std::string(" skip_every = ") + + std::to_string(skip_every) + std::string(" offset = ") + + std::to_string(offset); + + std::string test_name = std::string("SAMP_FP32 ") + suffix; + + unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05, + test_name); + + std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; + unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1, + fp16_test_name); + + std::string cpu_test_name = std::string("SAMP_CPU ") + suffix; + unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05, + cpu_test_name); + } + + printf("\n\n\n --- End of Test \n\n\n"); +} + +/**** Tests Sample for a sample 3 * 3 Filter */ +void testSampling_3_3(UnitTestResults &unitTestResults) { + + printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n"); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + fillTensorWithVal(input, 1); + // fillWithOnesAndTwos(input); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + fillTensorWithVal(filter, 1); + + float *host_ptr = (float *)((struct Tensor *)filter)->host_data; + host_ptr[0] = 10; + host_ptr[2] = 2; + host_ptr[4] = 2; + host_ptr[6] = 2; + host_ptr[8] = 2; 
+ host_ptr[10] = 2; + host_ptr[12] = 2; + host_ptr[14] = 2; + host_ptr[16] = 2; + host_ptr[18] = 2; + host_ptr[20] = 2; + host_ptr[22] = 2; + host_ptr[24] = 2; + host_ptr[26] = 10; + + // Tests with padding = 0 stride = 1 + testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "3_3", unitTestResults); + + testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "3_3", unitTestResults); + + testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "3_3", unitTestResults); + + // Tests with padding = 1 stride = 1 + testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "3_3", unitTestResults); + + testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "3_3", unitTestResults); + + testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "3_3", unitTestResults); + + // Tests with padding = 1 stride = 2 + testSamplingCalls(input, filter, 1, 1, 2, 2, 2, "3_3", unitTestResults); + + testSamplingCalls(input, filter, 1, 1, 2, 2, 3, "3_3", unitTestResults); + + testSamplingCalls(input, filter, 1, 1, 2, 2, 4, "3_3", unitTestResults); +} + +/**** Tests Sample for a sample 1 * 1 Filter */ +void testSampling_1_1(UnitTestResults &unitTestResults) { + + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2); + fillTensorWithVal(input, 2); + // fillWithOnesAndTwos(input); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1); + fillTensorWithVal(filter, 2); + + // Tests with padding = 0 stride = 1 + testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "1_1", unitTestResults); + + testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "1_1", unitTestResults); + + testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "1_1", unitTestResults); + + // Tests with padding = 1 stride = 1 + testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "1_1", unitTestResults); + + testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "1_1", unitTestResults); + + testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "1_1", unitTestResults); +} + + + +void testSampling(UnitTestResults &unitTestResults){ + 
+ testSampling_3_3(unitTestResults); + testSampling_1_1(unitTestResults); +} + diff --git a/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc deleted file mode 100644 index ffb4c3a809b3e936f6c27ebd7c11aef5c4460104..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc +++ /dev/null @@ -1,1120 +0,0 @@ - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <vector> -#include <string.h> -#include "tensor_runtime.h" -#include "tensor_cpu_runtime.h" -#include "tensorUtils.h" -#include "tensor_custom_ops_cpu.h" - -using namespace std; - -class UnitTestResults { - -private: - unsigned int total_tests; - unsigned int failed_tests; - unsigned int passed_tests; - std::vector<string> failed_test_ids; - -public: - UnitTestResults() { - total_tests = 0; - failed_tests = 0; - passed_tests = 0; - } - - void evalTestResult(Tensor *res, const float *expected_result, - size_t num_elems, float epsilon, string test_name) { - - total_tests += 1; - if (res->num_elems != num_elems) { - failed_tests += 1; - failed_test_ids.push_back(test_name); - return; - } - - float *data_ptr = (float *)res->host_data; - for (unsigned int i = 0; i < res->num_elems; i++) { - // printf("**diff value = %f ", std::abs(data_ptr[i] - - // expected_result[i])); - if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) { - failed_tests += 1; - failed_test_ids.push_back(test_name); - return; - } - } - - passed_tests += 1; - } - - void compareTensors(Tensor *res, Tensor *gold_res, float epsilon, - string test_name) { - - const float *expected_result = (float *)gold_res->host_data; - unsigned int num_elems = res->num_elems; - - evalTestResult(res, expected_result, num_elems, epsilon, test_name); - } - - void printSummary() { - - printf("\n\n\n ************* Printing Results Summary ********** \n\n"); - printf("-- Total tests := %d \n", total_tests); - printf("-- Tests Passed := %d \n", passed_tests); - 
printf("-- Tests Failed := %d \n", failed_tests); - - printf("\n\n Tests that failed : \n\n"); - for (int i = 0; i < failed_test_ids.size(); i++) { - printf("*** Test = %s \n", failed_test_ids[i].c_str()); - } - } -}; - -void testTensorHgemm(UnitTestResults &unitTestResults) { - - printf("***** TensorHgemm ***** \n\n"); - void *lhs_ptr = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); - struct Tensor *lhs = (struct Tensor *)lhs_ptr; - fillTensorWithOnes(lhs); - - float *data_arr = (float *)lhs->host_data; - for (int i = 0; i < lhs->num_elems; i++) { - data_arr[i] = (i / 4) + 1; - } - - void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); - fillTensorWithOnes(rhs); - - void *output = tensorHalfGemm(lhs, rhs); - convertToFP32((struct Tensor *)output); - - printTensorValues(output); - - const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, - 12, 16, 16, 16, 20, 20, 20}; - - unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01, - "Hgemm"); -} - -void testTensorSgemm(UnitTestResults &unitTestResults) { - - printf("***** TensorSgemm ***** \n\n"); - void *lhs_ptr = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); - struct Tensor *lhs = (struct Tensor *)lhs_ptr; - fillTensorWithOnes(lhs); - - float *data_arr = (float *)lhs->host_data; - for (int i = 0; i < lhs->num_elems; i++) { - data_arr[i] = (i / 4) + 1; - } - - void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); - fillTensorWithOnes(rhs); - - void *output = tensorGemmGPU(lhs, rhs); - printTensorValues(output); - - const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, - 12, 16, 16, 16, 20, 20, 20}; - - unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01, - "Sgemm"); -} - -void testTensorConcatAndSplit() { - - int conv_mode = 1; // CROSS_CORRELATION mode - int compute_precision = 0; // floating point precision - - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 
2, 2, 3, 3); - fillWithOnesAndTwos(input); - void **splits = tensorSplit(input, 2, 1); - - void *conv2W = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); - fillTensorWithOnes(conv2W); - - void **conv2fils = tensorSplit(conv2W, 2, 0); - - void *conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, 1, 1, - conv_mode, compute_precision); - printTensorDims(conv2a_out); - - void *conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, 1, 1, - conv_mode, compute_precision); - printTensorDims(conv2b_out); - - void *conv2_outs[2]; - conv2_outs[0] = conv2a_out; - conv2_outs[1] = conv2b_out; - - void *conv2_concat_out = tensorConcat(conv2_outs, 2, 1); - printTensorDims(conv2_concat_out); - printTensorValues(conv2_concat_out); -} - -void testLRN() { - - void *input = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20); - fillTensorWithOnes(input); - - unsigned LRN_window = 5; - double LRN_alpha = 2e-05; - printf("LRN_alpha = %f \n", LRN_alpha); - - double LRN_beta = 0.75; - double LRN_k = 1.0; - - // TEST-point - Compare TF vs CUDNN - void *lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k); - printTensorDims(lrn1out); - dumpWeightsToFile("tensors_out/lrn1_test.out", lrn1out); - - void *input2 = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7); - fillTensorWithOnes(input2); - - LRN_window = 5; - LRN_alpha = 0.5 * LRN_window; - - LRN_beta = 0.75; - LRN_k = 1.0; - - void *lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k); - printTensorDims(lrn2out); - dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out); -} - -void testTensorAdd() { - - // Tensor add with equal dimensions - void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); - void *bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); - fillTensorWithOnes(x); - fillTensorWithOnes(bias); - - printTensorValues(x); - printTensorValues(bias); - - tensorAdd(x, bias); - 
printTensorValues(x); - - // Tensor addd with matching channel dimension - void *x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2); - void *bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1); - fillTensorWithOnes(x2); - fillTensorWithOnes(bias2); - - tensorAdd(x2, bias2); - printTensorValues(x2); -} - -void testTensorConv() { - - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void *filter = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); - - fillTensorWithOnes(input); - fillTensorWithOnes(filter); - - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION - int compute_precision = 0; // floating point precision for conv - - void *conv_out = tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, - compute_precision); - printTensorValues(conv_out); -} - -void testTensorHalfConv() { - - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void *filter = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); - - fillTensorWithOnes(input); - fillTensorWithOnes(filter); - - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION - int compute_precision = 0; // floating point precision for conv - - void *conv_out = tensorHalfConvolution(input, filter, 0, 0, 1, 1, conv_mode, - compute_precision); - printTensorValues(conv_out); -} - -void testTensorGroupConv() { - - // NOTE: The input channel count value (param2 to Tensor and Filter) must be - // the same - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void *filter = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); - - // FIXIT: fillTensor* calls should be replaced with initTensorValue(tenosor, - // val) - fillTensorWithOnes(input); - fillTensorWithOnes(filter); - - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION - int conv_groups = 2; - - void *conv_out = - tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups); - 
printTensorValues(conv_out); -} - -void testTensorHalfGroupConv() { - - // NOTE: The input channel count value (param2 to Tensor and Filter) must be - // the same - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void *filter = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); - - fillTensorWithOnes(input); - fillTensorWithOnes(filter); - - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION - int conv_groups = 2; - - void *conv_out = - tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups); - - convertToFP32((struct Tensor *)conv_out); - - printTensorValues(conv_out); -} - -void testTensorPooling() { - - void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); - fillTensorWithOnes(x); - - float *data_arr = (float *)((Tensor *)x)->host_data; - for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) { - data_arr[i] = i; - } - - void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); - printTensorValues(output); -} - -void testTensorHalfPooling() { - - void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); - fillTensorWithOnes(x); - - float *data_arr = (float *)((Tensor *)x)->host_data; - for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) { - data_arr[i] = i; - } - - void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); - convertToFP32((struct Tensor *)output); - - printTensorValues(output); -} - -void testTensorBatchNorm() { - - void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); - fillTensorWithVal(x, 3); - - void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - fillTensorWithVal(gamma, 1); - - void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - fillTensorWithVal(beta, 0); - - void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - fillTensorWithVal(mean, 1); - - void *variance = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - 
fillTensorWithVal(variance, 1); - - double epsilon = 1; - // NOTE: result = X - mean / sqrt(epsilon + variance) - void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); - - printTensorValues(output); -} - -void testTensorHalfBatchNorm() { - - void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); - fillTensorWithVal(x, 3); - - void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - fillTensorWithVal(gamma, 1); - - void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - fillTensorWithVal(beta, 0); - - void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - fillTensorWithVal(mean, 1); - - void *variance = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); - fillTensorWithVal(variance, 1); - - double epsilon = 1; - // NOTE: result = X - mean / sqrt(epsilon + variance) - void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); - convertToFP32((struct Tensor *)output); - - printTensorValues(output); -} - -void testTensorRelu() { - - // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match - printf("***** TensorRelu ***** \n\n"); - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); - fillTensorWithNegOnes(input); - - void *output = tensorRelu(input); - printTensorValues(output); -} - -void testTensorSoftmax() { - - printf("***** TensorSoftmax ***** \n\n"); - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1); - - float *host_ptr = (float *)((struct Tensor *)input)->host_data; - host_ptr[0] = 0.1; - host_ptr[1] = 0.2; - host_ptr[2] = 0.3; - host_ptr[3] = 0.4; - host_ptr[4] = 0.5; - host_ptr[5] = 0.6; - host_ptr[6] = 0.7; - host_ptr[7] = 2.5; - - void *output = tensorSoftmax(input); - printTensorValues(output); -} - -void testSoftmaxOutput(void *output_ptr) { - - struct Tensor *output = (struct Tensor *)output_ptr; - - size_t batch_dim = output->dims.dim_sizes[0]; - size_t 
channels = output->dims.dim_sizes[1]; - - float *data = (float *)output->host_data; - for (int i = 0; i < batch_dim; i++) { - float sum = 0.0; - for (int j = 0; j < channels; j++) { - sum += data[i * channels + j]; - } - printf("output_sum = %f \n", sum); - } -} - -void testPromiseError() { - - printf("***** TensorQuantize ***** \n\n"); - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float *host_ptr = (float *)((struct Tensor *)input)->host_data; - - void *gold_tensor = - create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float *gold_ptr = (float *)((struct Tensor *)gold_tensor)->host_data; - - gold_ptr[0] = -1; - gold_ptr[1] = -2; - gold_ptr[2] = -3; - gold_ptr[3] = -4; - gold_ptr[4] = -5; - gold_ptr[5] = 0; - gold_ptr[6] = 5; - gold_ptr[7] = 4; - gold_ptr[8] = 3; - gold_ptr[9] = 2; - gold_ptr[10] = 1; - gold_ptr[11] = 1; - - int num_elems = 12; - int num_runs = 1000; - - float *result_ptr = (float *)malloc(sizeof(float) * num_elems); - - for (int swing = 1; swing <= 7; swing++) { - - for (int j = 0; j < num_elems; j++) { - result_ptr[j] = 0; - } - - float error_sum = 0.0; - - for (int i = 0; i < 1000; i++) { - host_ptr[0] = -1; - host_ptr[1] = -2; - host_ptr[2] = -3; - host_ptr[3] = -4; - host_ptr[4] = -5; - host_ptr[5] = 0; - host_ptr[6] = 5; - host_ptr[7] = 4; - host_ptr[8] = 3; - host_ptr[9] = 2; - host_ptr[10] = 1; - host_ptr[11] = 1; - - void *error_out = addPromiseError(input, swing); - // printTensorValues(error_out); - - // Move result data back to the host - hpvm_request_tensor(input, 0); - float *error_out_ptr = (float *)((struct Tensor *)input)->host_data; - - for (int j = 0; j < num_elems; j++) { - result_ptr[j] += error_out_ptr[j]; - error_sum += - (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]); - } - } - - printf("\n\n - Swing %d results : \n", swing); - for (int j = 0; j < num_elems; j++) { - result_ptr[j] = result_ptr[j] / num_runs; - printf(" %f ", result_ptr[j]); - } - - 
printf("mean_error = %f \n", error_sum / num_runs); - - printf(" \n"); - } -} - -void testQuantization() { - - printf("***** TensorQuantize ***** \n\n"); - void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - - float *host_ptr = (float *)((struct Tensor *)input)->host_data; - host_ptr[0] = -0.1; - host_ptr[1] = -25; - host_ptr[2] = 0.2; - host_ptr[3] = -0.4; - host_ptr[4] = 1.7; - host_ptr[5] = -2.9; - host_ptr[6] = 0.7; - host_ptr[7] = 0.99; - host_ptr[8] = 7; - host_ptr[9] = 7.2; - host_ptr[10] = 2.5; - host_ptr[11] = 3; - - void *quantize_result1 = quantizeTensorPromise(input, -4, 6); - - printf("\n ** quantizing with range min = %d max = %d \n", -4, 6); - printTensorValues(quantize_result1); - - host_ptr[0] = -0.1; - host_ptr[1] = -25; - host_ptr[2] = 0.2; - host_ptr[3] = -0.4; - host_ptr[4] = 1.7; - host_ptr[5] = -2.9; - host_ptr[6] = 0.7; - host_ptr[7] = 0.99; - host_ptr[8] = 7; - host_ptr[9] = 7.2; - host_ptr[10] = 2.5; - host_ptr[11] = 3; - - void *quantize_result2 = quantizeTensorPromise(input, -2, 2); - - printf("\n ** quantizing with range min = %d max = %d \n", -2, 2); - printTensorValues(quantize_result2); - - host_ptr[0] = -0.1; - host_ptr[1] = -25; - host_ptr[2] = 0.2; - host_ptr[3] = -0.4; - host_ptr[4] = 1.7; - host_ptr[5] = -2.9; - host_ptr[6] = 0.7; - host_ptr[7] = 0.99; - host_ptr[8] = 7; - host_ptr[9] = 7.2; - host_ptr[10] = 2.5; - host_ptr[11] = 3; - - void *quantize_result3 = quantizeTensorPromise(input, -25, 8); - - printf("\n ** quantizing with range min = %d max = %d \n", -25, 8); - printTensorValues(quantize_result3); - - printf("\n ** quantizing with range min = %d max = %d \n", -10, 10); - - host_ptr[0] = -0.1; - host_ptr[1] = -25; - host_ptr[2] = 0.2; - host_ptr[3] = -0.4; - host_ptr[4] = 1.7; - host_ptr[5] = -2.9; - host_ptr[6] = 0.7; - host_ptr[7] = 0.99; - host_ptr[8] = 7; - host_ptr[9] = 7.2; - host_ptr[10] = 2.5; - host_ptr[11] = 3; - - void *quantize_result4 = quantizeTensorPromise(input, -10, 10); - 
printTensorValues(quantize_result4); - - void *quantize_result5 = quantizeTensorPromise(input, -10, 10); - printTensorValues(quantize_result5); - - // void* error_out = addPromiseError(quantize_result, 1); - // printTensorValues(error_out); -} - -void testSampleFilter() { - - printf("***** Tensor Sample Filter ***** \n\n"); - Tensor *input = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); - // fillTensorWithVal(input, 3); - fillWithOnesAndTwos(input); - - Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, - 3, 2, 32, 32); - fillTensorWithVal(input2, 1); - - /* float* host_ptr = (float*) ((struct Tensor*) input)->host_data; - host_ptr[0] = -0.1; - host_ptr[1] = -25; - host_ptr[2] = 0.2; - host_ptr[3] = -0.4; - host_ptr[4] = 1.7; - host_ptr[5] = -2.9; - host_ptr[6] = 0.7; - host_ptr[7] = 0.99; - */ - - printTensorValues(input); - - /* printf("\n\n"); - - hpvm_request_tensor(input, DEVICE); - - sampleFilter(input, 2, 1); - - hpvm_request_tensor(input, HOST); - - printTensorValues(input); - */ - - void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1); - printTensorValues(exact_res); - - void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0); - - // void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3); - - printTensorValues(res); -} - -void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w, - int stride_h, int stride_w, int row, int col, - UnitTestResults &unitTestResults) { - - float interpolation_rate = 1.0; - for (int offset = 0; offset < 2; offset++) { - - printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " - "row = %d col = %d offset= %d \n\n", - pad_h, pad_w, stride_h, stride_w, row, col, offset); - - void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, - stride_w, 1, 1); - - printf("tensorConvolution Result :"); - printTensorValues(res_exact); - - void *res_exact2 = tensorConvApprox(input, 
filter, pad_h, pad_w, stride_h, - stride_w, 1, 1, 1, 1, 1, 1); - - printf("\nBaseline Result :"); - printTensorValues(res_exact2); - - void *res_exact3 = tensorConvApproxHalf2( - input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); - convertToFP32((struct Tensor *)res_exact3); - - printf("\nFP16_Baseline Result :"); - printTensorValues(res_exact3); - - void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h, - stride_w, 1, 1, row, col, offset); - - printf("\nConvPerfCuda Result :"); - printTensorValues(res_sim); - - void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, - stride_w, 1, 1, row, col, 1, offset); - - printf("\nConvApprox Result :"); - printTensorValues(res); - - hpvm_request_tensor(input, HOST); - hpvm_request_tensor(filter, HOST); - - void *res_cpu = tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, - stride_w, 1, 1, row, col, 1, offset); - - printf("\nConvApproxCPU Result :"); - printTensorValues(res_cpu); - - void *res_half = - tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, - 1, 1, row, col, 1, offset); - - convertToFP32((struct Tensor *)res_half); - - printf("\nConvApproxHalf2 Result :"); - printTensorValues(res_half); - - std::string suffix = - std::string(" pad_h = ") + std::to_string(pad_h) + - std::string(" pad_w = ") + std::to_string(pad_w) + - std::string(" stride_h = ") + std::to_string(stride_h) + - std::string(" stride_w = ") + std::to_string(stride_w) + - std::string(" row = ") + std::to_string(row) + std::string(" col = ") + - std::to_string(col) + std::string(" offset = ") + - std::to_string(offset); - - std::string test_name = std::string("PERF_FP32 ") + suffix; - - unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05, - test_name); - - std::string fp16_test_name = std::string("PERF_FP16 ") + suffix; - unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1, - fp16_test_name); - - std::string cpu_test_name = 
std::string("PERF_CPU ") + suffix; - unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05, - cpu_test_name); - } - - printf("\n\n\n--- End of Test \n\n\n"); -} - -/**** Tests Perforation for a set of different inputs */ -void testPerforation(UnitTestResults &unitTestResults) { - - printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n"); - Tensor *input = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); - fillTensorWithVal(input, 1); - - Tensor *filter = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); - fillTensorWithVal(filter, 1); - - /* - float* host_ptr = (float*) ((struct Tensor*) filter)->host_data; - host_ptr[0] = 2; - host_ptr[2] = 2; - host_ptr[4] = 2; - host_ptr[6] = 2; - host_ptr[8] = 2; - host_ptr[10] = 2; - host_ptr[12] = 2; - host_ptr[14] = 2; - host_ptr[16] = 2; - host_ptr[18] = 2; - host_ptr[20] = 2; - host_ptr[22] = 2; - host_ptr[24] = 2; - host_ptr[26] = 2; - */ - - testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2, unitTestResults); - - testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1, unitTestResults); - - testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3, unitTestResults); - - testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1, unitTestResults); - - testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4, unitTestResults); - - testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1, unitTestResults); -} - -void testSampling() { - - printf("***** Testing Sampling ***** \n\n"); - Tensor *input = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); - fillTensorWithVal(input, 1); - // fillWithOnesAndTwos(input); - - Tensor *filter = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); - fillTensorWithVal(filter, 1); - - float *host_ptr = (float *)((struct Tensor *)filter)->host_data; - host_ptr[0] = 2; - host_ptr[2] = 2; - host_ptr[4] = 2; - host_ptr[6] = 2; - host_ptr[8] = 2; - host_ptr[10] = 2; - 
host_ptr[12] = 2; - host_ptr[14] = 2; - host_ptr[16] = 2; - host_ptr[18] = 2; - host_ptr[20] = 2; - host_ptr[22] = 2; - host_ptr[24] = 2; - host_ptr[26] = 2; - // printTensorValues(input); - - void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - - printTensorValues(res); - - void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1); - - printTensorValues(res2); - - void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0); - - printTensorValues(res2_sim); - - void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0); - - printTensorValues(res3); - - void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - - printTensorValues(res4); - - void *res4_half = - tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - - convertToFP32((struct Tensor *)res4_half); - - printTensorValues(res4_half); -} - -void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w, - int stride_h, int stride_w, int skip_every, - std::string filter_string, - UnitTestResults &unitTestResults) { - - float interpolation_rate = 1.0; - for (int offset = 0; offset < 2; offset++) { - - printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " - "skip_every = %d offset= %d interpolation_rate = %f \n\n", - pad_h, pad_w, stride_h, stride_w, skip_every, offset, - interpolation_rate); - - void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, - stride_w, 1, 1); - - printf("tensorConvolution Result :"); - printTensorValues(res_exact); - - void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, - stride_w, 1, 1, 1, 1, 1, 1); - - printf("\nBaseline Result :"); - printTensorValues(res_exact2); - - void *res_exact3 = tensorConvApproxHalf2( - input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); - convertToFP32((struct Tensor *)res_exact3); - - printf("\nFP16_Baseline Result :"); - printTensorValues(res_exact3); - - void 
*res_sim = - tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1, - 1, skip_every, offset, interpolation_rate); - - printf("\nConvSampSim Result :"); - printTensorValues(res_sim); - - void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, - stride_w, 1, 1, 1, 1, skip_every, offset); - - printf("\nConvApprox Result :"); - printTensorValues(res); - - hpvm_request_tensor(input, HOST); - hpvm_request_tensor(filter, HOST); - - void *res_cpu = - tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, stride_w, 1, - 1, 1, 1, skip_every, offset); - - printf("\nConvApproxCPU Result :"); - printTensorValues(res_cpu); - - void *res_half = - tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, - 1, 1, 1, 1, skip_every, offset); - - convertToFP32((struct Tensor *)res_half); - - printf("\nConvApproxHalf2 Result :"); - printTensorValues(res_half); - - std::string suffix = - "filter = " + std::string(filter_string) + std::string(" pad_h = ") + - std::to_string(pad_h) + std::string(" pad_w = ") + - std::to_string(pad_w) + std::string(" stride_h = ") + - std::to_string(stride_h) + std::string(" stride_w = ") + - std::to_string(stride_w) + std::string(" skip_every = ") + - std::to_string(skip_every) + std::string(" offset = ") + - std::to_string(offset); - - std::string test_name = std::string("SAMP_FP32 ") + suffix; - - unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05, - test_name); - - std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; - unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1, - fp16_test_name); - - std::string cpu_test_name = std::string("SAMP_CPU ") + suffix; - unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05, - cpu_test_name); - } - - printf("\n\n\n --- End of Test \n\n\n"); -} - -/**** Tests Sample for a sample 3 * 3 Filter */ -void testSampling_3_3(UnitTestResults &unitTestResults) { - - printf("***** Tests Sample for a sample 3 * 3 
Filter ***** \n\n"); - Tensor *input = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); - fillTensorWithVal(input, 1); - // fillWithOnesAndTwos(input); - - Tensor *filter = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); - fillTensorWithVal(filter, 1); - - float *host_ptr = (float *)((struct Tensor *)filter)->host_data; - host_ptr[0] = 10; - host_ptr[2] = 2; - host_ptr[4] = 2; - host_ptr[6] = 2; - host_ptr[8] = 2; - host_ptr[10] = 2; - host_ptr[12] = 2; - host_ptr[14] = 2; - host_ptr[16] = 2; - host_ptr[18] = 2; - host_ptr[20] = 2; - host_ptr[22] = 2; - host_ptr[24] = 2; - host_ptr[26] = 10; - - // Tests with padding = 0 stride = 1 - testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "3_3", unitTestResults); - - testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "3_3", unitTestResults); - - testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "3_3", unitTestResults); - - // Tests with padding = 1 stride = 1 - testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "3_3", unitTestResults); - - testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "3_3", unitTestResults); - - testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "3_3", unitTestResults); - - // Tests with padding = 1 stride = 2 - testSamplingCalls(input, filter, 1, 1, 2, 2, 2, "3_3", unitTestResults); - - testSamplingCalls(input, filter, 1, 1, 2, 2, 3, "3_3", unitTestResults); - - testSamplingCalls(input, filter, 1, 1, 2, 2, 4, "3_3", unitTestResults); -} - -/**** Tests Sample for a sample 1 * 1 Filter */ -void testSampling_1_1(UnitTestResults &unitTestResults) { - - Tensor *input = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2); - fillTensorWithVal(input, 2); - // fillWithOnesAndTwos(input); - - Tensor *filter = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1); - fillTensorWithVal(filter, 2); - - // Tests with padding = 0 stride = 1 - testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "1_1", unitTestResults); - - 
testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "1_1", unitTestResults); - - testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "1_1", unitTestResults); - - // Tests with padding = 1 stride = 1 - testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "1_1", unitTestResults); - - testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "1_1", unitTestResults); - - testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "1_1", unitTestResults); -} - -void *testTensorArgMax() { - - Tensor *input = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1); - - float *host_ptr = (float *)((struct Tensor *)input)->host_data; - - // Input 0 - host_ptr[0] = 1; - host_ptr[1] = 7; // highest - max index = 1 - host_ptr[2] = 3; - - // Input 1 - host_ptr[3] = 3; - host_ptr[4] = 3; - host_ptr[5] = 8; // highest - max index = 2 - - // Input 2 - host_ptr[6] = 2; - host_ptr[7] = 5; - host_ptr[8] = 9; // highest - max index = 2 - - // Input 3 - host_ptr[9] = 11; // highest - max index = 0 - host_ptr[10] = 2; - host_ptr[11] = 8; - - void *argmax_out = tensorArgMax(input); - - // Expect Output of call below to be: - // 1 2 2 0 - printTensorValues(argmax_out); - - return argmax_out; -} - -void *testTensorSelect(void *argmax_out) { - - void *select_out = tensorSelect(argmax_out, 2); - printf("***** tensorSelect output \n"); - - printTensorValues(select_out); - - return select_out; -} - -void testTensorContract(void *select_out) { - - Tensor *input = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1); - float *host_ptr = (float *)((struct Tensor *)input)->host_data; - - // Input 0 - host_ptr[0] = 1; - host_ptr[1] = 1; - host_ptr[2] = 1; - host_ptr[3] = 1; - - // Input 1 - host_ptr[4] = 2; - host_ptr[5] = 2; - host_ptr[6] = 2; - host_ptr[7] = 2; - - // Input 2 - host_ptr[8] = 3; - host_ptr[9] = 3; - host_ptr[10] = 3; - host_ptr[11] = 3; - - // Input 3 - host_ptr[12] = 4; - host_ptr[13] = 4; - host_ptr[14] = 4; - host_ptr[15] = 4; - - void *contract_out = 
tensorContract(input, select_out); - printf("***** tensorContract output \n"); - - printTensorValues(contract_out); -} - -void testNewTensorOps() { - - void *argmax_out = testTensorArgMax(); - void *select_out = testTensorSelect(argmax_out); - testTensorContract(select_out); -} - -int main() { - - llvm_hpvm_initTensorRt(0); - - UnitTestResults unitTestResults; - - // Function call per unit test - testTensorHgemm(unitTestResults); - testTensorSgemm(unitTestResults); - - /* - testTensorConv(); - testTensorHalfConv(); - - testTensorGroupConv(); - testTensorHalfGroupConv(); - - testTensorBatchNorm(); - testTensorHalfBatchNorm(); - - testTensorPooling(); - testTensorHalfPooling(); - - */ - - testSampling_3_3(unitTestResults); - testSampling_1_1(unitTestResults); - - testPerforation(unitTestResults); - - unitTestResults.printSummary(); - - // testTensorError(); - // testQuantization(); - // testTensorGemm(); - // testTensorGemmGPU(); - // testTensorGemmBias(); - // testTensorConv2(); - // testTensorConv3(); - // testLRN(); - // testSampleFilter(); - // testNewTensorOps(); - // testQuantization(); - // testPromiseError(); - - return 0; -}