diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h index 138ddd0887b57ce583b8f5cfeaba19ad7d20eb4e..330d97600e6cdcf44bb93dbf28625cca8051c3ec 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h @@ -3,48 +3,44 @@ #ifndef APPROXHPVM_RUNTIME_UTILS #define APPROXHPVM_RUNTIME_UTILS + +#include "tensor_runtime.h" +#include "tensor_cpu_runtime.h" #include "configuration.h" #include "hpvm-rt-controller.h" -#include "tensor_runtime.h" #include "approx_knob_utils.h" // Utilities header for ApproxHPVM runtime API (wrapper runtime API) -void *handleTensorAddApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input, void *bias) { +//----------------------------------------------------------------------------// +//--- CPU Approximation handling ---// +//----------------------------------------------------------------------------// - if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; +void* handleTensorAddApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, void* bias) { + +if (approxTuples.size() == 1) { + enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorAdd(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfAdd(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorAddCPU(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -57,42 +53,32 @@ void *handleTensorAddApproximationTuples( return NULL; } -void *handleTensorMulApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *lhs, void *rhs) { +void* handleTensorMulApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* lhs, void* rhs) { if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + enum CPUNodeConfiguration::APPROX approx = 
approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorGemmGPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfGemmGPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorGemmCPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here - } + } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); abort(); @@ -103,90 +89,79 @@ void *handleTensorMulApproximationTuples( return NULL; } -void *handleTensorConvApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input, void *filter, int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w) { +void* handleTensorConvApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, void* filter, + int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w) { if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = - tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", - pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::PERFORATION: - case GPUNodeConfiguration::APPROX::PERFORATION_HP: { - PerfParams params = perfParamSet->getPerfParams(param); - // PerfParams 
params = PerfParamSet().getPerfParams(param); - INFO("perforation param = %i\n", param); - INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", - params.row, params.col, params.skip_offset); - void *t_out; - RC->resume_profiler(); - t_out = tensorConvApproxHalf2( - input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, - 1, 1, params.row, params.col, 1, params.skip_offset); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", - pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", - pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::INPUT_SAMPLING: - case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: { - SampParams params = sampParamSet->getSampParams(param); - // SampParams params = SampParamSet().getSampParams(param); - INFO("sampling param = %i\n", param); - INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate, - params.skip_offset); - void *t_out; - RC->resume_profiler(); - t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, 1, 1, - params.skip_rate, params.skip_offset); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", - pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", - pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorConvApproxCPU(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 1, + 1, 1, 1, 1); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); + return t_out; + } + case CPUNodeConfiguration::APPROX::PERFORATION : + { + PerfParams params = perfParamSet->getPerfParams(param); + INFO("perforation param = %i\n", param); + INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", + params.row, params.col, params.skip_offset); + void* t_out; + RC->resume_profiler(); + t_out = tensorConvApproxCPU(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 1, + params.row, params.col, 1, params.skip_offset); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", pinfo.second); + return t_out; + } + case CPUNodeConfiguration::APPROX::INPUT_SAMPLING : + { + SampParams params = sampParamSet->getSampParams(param); + INFO("sampling param = %i\n", param); + INFO("params.skip_rate = %i, params.skip_offset = %i\n", + params.skip_rate, params.skip_offset); + void* t_out; + RC->resume_profiler(); + t_out = tensorConvApproxCPU(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 1, + 1, 1, + params.skip_rate, params.skip_offset); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + 
RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -199,49 +174,75 @@ void *handleTensorConvApproximationTuples( return NULL; } -void *handleTensorGroupConvApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input, void *filter, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, int conv_mode, - int conv_groups) { +void* handleTensorGroupConvApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, void* filter, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups) { if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorConvCutlass(input, filter, vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, conv_mode, - conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfConvCutlass(input, filter, vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", - pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", - pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorConvCutlassCPU(input, filter, + vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, + conv_mode, conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorBatchNormApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input_ptr, void* gamma_ptr, void* beta_ptr, + void* mean_ptr, void* variance_ptr, double epsilon) { + + if (approxTuples.size() == 1) { + enum CPUNodeConfiguration::APPROX approx = 
approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr, + mean_ptr, variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); @@ -253,44 +254,211 @@ void *handleTensorGroupConvApproximationTuples( return NULL; } -void *handleTensorBatchNormApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr, - void *variance_ptr, double epsilon) { +void* handleTensorReluApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, - variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); - return t_out; + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorReluCPU(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, - variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", - pinfo.second); - return t_out; + return NULL; +} + +void* handleTensorClippedReluApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, float min, float max) { + + if (approxTuples.size() == 1) { + enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = 
tensorRelu2CPU(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorTanhApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + + if (approxTuples.size() == 1) { + enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorTanhCPU(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); + return NULL; +} + +void* handleTensorPoolingApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input_ptr, int poolFunction, + int window_height, int window_width, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { + + if (approxTuples.size() == 1) { + enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case CPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorPoolingCPU(input_ptr, + poolFunction, + window_height, window_width, + vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); abort(); + } + return NULL; +} + +void* handleTensorSoftmaxApproximationTuples_CPU( + std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input_ptr) { + void* t_out; + RC->resume_profiler(); + t_out = tensorSoftmaxCPU(input_ptr); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorSoftmaxCPU", pinfo.first); + 
RC->addToCurrentIterationComputeEnergy("tensorSoftmaxCPU", pinfo.second); + return t_out; +} + +//----------------------------------------------------------------------------// +//--- GPU Approximation handling ---// +//----------------------------------------------------------------------------// + +void* handleTensorAddApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, void* bias) { + + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorAdd(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfAdd(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -303,42 +471,44 @@ void *handleTensorBatchNormApproximationTuples( return NULL; } -void *handleTensorReluApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input) { +void* handleTensorMulApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* lhs, void* rhs) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorRelu(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfRelu(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorGemmGPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfGemmGPU(lhs, rhs); + RC->pause_profiler(); + 
std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here - } + } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); abort(); @@ -349,40 +519,100 @@ void *handleTensorReluApproximationTuples( return NULL; } -void *handleTensorClippedReluApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input, float min, float max) { +void* handleTensorConvApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, void* filter, + int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorRelu2(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfRelu2(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorConvApprox(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 1, + 1, 1, 1, 1); + + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorConvApproxHalf2(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 1, + 1, 1, 1, 1); + + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::PERFORATION : + case GPUNodeConfiguration::APPROX::PERFORATION_HP : + { + PerfParams params = perfParamSet->getPerfParams(param); + INFO("perforation param = %i\n", param); + INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", + params.row, params.col, params.skip_offset); + void* t_out; + RC->resume_profiler(); + t_out = tensorConvApproxHalf2(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 1, + params.row, params.col, 1, 
params.skip_offset); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::INPUT_SAMPLING : + case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP : + { + SampParams params = sampParamSet->getSampParams(param); + INFO("sampling param = %i\n", param); + INFO("params.skip_rate = %i, params.skip_offset = %i\n", + params.skip_rate, params.skip_offset); + void* t_out; + RC->resume_profiler(); + t_out = tensorConvApproxHalf2(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 1, + 1, 1, + params.skip_rate, params.skip_offset); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -395,41 +625,103 @@ void *handleTensorClippedReluApproximationTuples( return NULL; } -void *handleTensorTanhApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input) { +void* handleTensorGroupConvApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, void* filter, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorTanh(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfTanh(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorConvCutlass(input, filter, + vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, + conv_mode, conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfConvCutlass(input, filter, + vertical_pad, horizontal_pad, 
+ vertical_stride, horizontal_stride, + conv_mode, conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorBatchNormApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input_ptr, void* gamma_ptr, void* beta_ptr, + void* mean_ptr, void* variance_ptr, double epsilon) { + + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, + mean_ptr, variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, + mean_ptr, variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); @@ -441,64 +733,215 @@ void *handleTensorTanhApproximationTuples( return NULL; } -void *handleTensorPoolingApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input_ptr, int poolFunction, int window_height, int window_width, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride) { +void* handleTensorReluApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorPooling(input_ptr, poolFunction, window_height, - window_width, vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); - return t_out; + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorRelu(input); + RC->pause_profiler(); 
+ std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfRelu(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorHalfPooling(input_ptr, poolFunction, window_height, - window_width, vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); - return t_out; + return NULL; +} + +void* handleTensorClippedReluApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input, float min, float max) { + + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorRelu2(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfRelu2(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); + return NULL; +} + +void* handleTensorTanhApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorTanh(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = 
RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfTanh(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); abort(); + } + return NULL; +} + +void* handleTensorPoolingApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input_ptr, int poolFunction, + int window_height, int window_width, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { + + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorPooling(input_ptr, + poolFunction, + window_height, window_width, + vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorHalfPooling(input_ptr, + poolFunction, + window_height, window_width, + vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); - abort(); - } return NULL; } -void *handleTensorSoftmaxApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input_ptr) { - // TODO: if approximation choices are added for softmax operation, +void* handleTensorSoftmaxApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input_ptr) { + //TODO: if approximation choices are added for softmax operation, // implement this like the other handle* functions - void *t_out; + void* t_out; RC->resume_profiler(); t_out = tensorSoftmax(input_ptr); RC->pause_profiler(); @@ -509,4 +952,5 @@ void *handleTensorSoftmaxApproximationTuples( 
return t_out; } + #endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h index 2067609c5a476291a27763b80a558da099e62e60..d27f463e789fae2e2c41bf31ea6498b47fd5240f 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h @@ -11,7 +11,7 @@ // Describes the internal choices made for an ApproxHPVM node class NodeConfiguration { public: - enum NODE_CONFIGURATION_TARGET { PROMISE, GPU, END }; + enum NODE_CONFIGURATION_TARGET { PROMISE, GPU, CPU, END }; protected: enum NODE_CONFIGURATION_TARGET NODE_CONFIGURATION_TARGET_ID; @@ -21,6 +21,8 @@ public: bool isGPUNodeConfiguration(); + bool isCPUNodeConfiguration(); + virtual void print() = 0; }; @@ -108,6 +110,57 @@ public: void print() override; }; +class CPUNodeConfiguration : public NodeConfiguration { +public: + // Approximation methods available for this HW type + enum APPROX { + FP32, + PERFORATION, + INPUT_SAMPLING, + // ADDITIONAL_APPROXIMATION_METHOD + APPROX_END + }; + + // Operations to be approximated in the node using this configuration + enum TENSOR_OP { + ADD, + BATCHNORM, + CONV, + GROUP_CONV, + MUL, + RELU, + CLIPPED_RELU, + TANH, + POOL_MAX, + POOL_MEAN, + POOL_MIN, + SOFTMAX, + // ADDITIONAL_TENSOR_OPERATION + TENSOR_OP_END + }; + +private: + // A vector, containing pairs of approximation method and tunable parameter + // (expressed as int, or ignored when not applicable) for each operation + std::vector< + std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>> + ApproxChoices; + +public: + void pushNewTensorOperation(enum TENSOR_OP top); + + void pushNewApproximationChoiceForOperation(enum APPROX approx, int u); + + std::vector< + std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>> & + getApproxChoices(); + + CPUNodeConfiguration(); + ~CPUNodeConfiguration(); + + void print() override; +}; + // Configuration : Includes configuration information : // - name // - speedup @@ -140,8 +193,8 @@ struct Configuration { // Comparison operator definition, in increasing accuracy loss // (for std sort, used in pareto optimal computation) struct ConfigurationLessThan { - bool operator()(const struct Configuration &a, - const struct Configuration &b) const; + bool operator()( + const struct Configuration &a, const struct Configuration &b) const; }; // Comparison operator definition, in increasing accuracy loss diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h index 24f69c03903faf29b074284482f172efa334549f..31214eaaa799d5cd8d4de5b0935b41aa7fce617d 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h @@ -1,64 +1,75 @@ -#include <cmath> +#include <stdio.h> #include <cstdlib> +#include <cmath> #include <memory> -#include <stdio.h> #include <string> -#ifndef CUDNN_HEADER -#define CUDNN_HEADER -extern "C" { -/**** Initialization Routine - Must be inserted at program start (in the - * backend) ****/ -void llvm_hpvm_initTensorRt(int gpuid = 0); -void llvm_hpvm_cleanupTensorRt(); +#ifndef TENSOR_CPU_HEADER +#define TENSOR_CPU_HEADER -// Routine to moving tensor data (from and to GPU,CPU) -void hpvm_request_tensor(void *tensor, int destination); -// NOTE: Currently only using 4-D tensors - 2D and 3D tensors 
not supported for -// cuDNN operations NOTE: The only data format supported as of now is: NCHW -// (batch_dimension, channels, Height, Width) -void *create4DTensor(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size, - bool freeMemory = true); +extern "C"{ + /**** Initialization Routine - Must be inserted at program start (in the backend) ****/ + void llvm_hpvm_initTensorRtCPU(); + void llvm_hpvm_cleanupTensorRtCPU(); -void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes); + // Routine to moving tensor data (from and to GPU,CPU) + void hpvm_request_tensorCPU(void* tensor, int destination); -/********** Tensor Operation API ******/ -// NOTE: For conv_mode, only value '1' is supported -void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int compute_precision, int row, int col, - int skip_every, int start); + // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations + // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width) + //void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size, + /// size_t dim3_size, size_t dim4_size, bool freeMemory = true); + + void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes); -void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int conv_groups); + /********** Tensor Operation API ******/ -void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, - void *mean_ptr, void *variance_ptr, double epsilon); + // NOTE: For conv_mode, only value '1' is supported +void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int row, int col, int skip_every, int start); -void *tensorPoolingCPU(void *input, int poolFunction, int window_height, - int window_width, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride); +void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int row, int col, int skip_every, int start); -void *tensorGemmCPU(void *lhs, void *rhs); +void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups); + + void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr, + void* mean_ptr, void* variance_ptr, double epsilon); -void *tensorAddCPU(void *x, void *bias); -void *tensorReluCPU(void *input); + void* tensorPoolingCPU(void* input, + int poolFunction, + int window_height, int window_width, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride); -void *tensorRelu2CPU(void *input, float min, float max); + void* tensorGemmCPU(void* lhs, void* rhs); -void *tensorTanhCPU(void *input); + void* tensorAddCPU(void* x, void* bias); -void *tensorSoftmaxCPU(void *input); + void* tensorReluCPU(void* input); + + void* tensorRelu2CPU(void* input, float min, float max); + + void* tensorTanhCPU(void* input); + + void* tensorSoftmaxCPU(void* input); + } + /* void dummyFunction(){ 
@@ -92,8 +103,9 @@ void dummyFunction(){ void* tensorTanhPtr = (void*) &tensorTanh; void* tensorHalfTanhPtr = (void*) &tensorHalfTanh; void* tensorSoftmaxPtr = (void*) &tensorSoftmax; - void* tensorAddErrorPtr = (void*) &tensorAddError; + void* tensorAddErrorPtr = (void*) &tensorAddError; } */ + #endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp index 517b7c7009645c43c7f2af6fa733b4205590efd8..d9d598d2a64cd898bc6c2b51607e1fb92b9afb8a 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp @@ -2,7 +2,9 @@ using P_APPROX = PROMISENodeConfiguration::APPROX; using G_APPROX = GPUNodeConfiguration::APPROX; +using C_APPROX = CPUNodeConfiguration::APPROX; using G_TENSOR_OP = GPUNodeConfiguration::TENSOR_OP; +using C_TENSOR_OP = CPUNodeConfiguration::TENSOR_OP; bool NodeConfiguration::isPROMISENodeConfiguration() { return NODE_CONFIGURATION_TARGET_ID == PROMISE; @@ -12,8 +14,12 @@ bool NodeConfiguration::isGPUNodeConfiguration() { return NODE_CONFIGURATION_TARGET_ID == GPU; } -void PROMISENodeConfiguration::pushNewApproximationChoice(P_APPROX approx, - int u) { +bool NodeConfiguration::isCPUNodeConfiguration() { + return NODE_CONFIGURATION_TARGET_ID == CPU; +} + +void PROMISENodeConfiguration::pushNewApproximationChoice( + P_APPROX approx, int u) { ApproxChoices.push_back(std::make_pair(approx, u)); } @@ -28,7 +34,7 @@ PROMISENodeConfiguration::PROMISENodeConfiguration() { PROMISENodeConfiguration::~PROMISENodeConfiguration() {} -void GPUNodeConfiguration::pushNewTensorOperation(enum TENSOR_OP top) { +void GPUNodeConfiguration::pushNewTensorOperation(G_TENSOR_OP top) { std::vector<std::pair<G_APPROX, int>> emptyVec; ApproxChoices.push_back(std::make_pair(top, emptyVec)); } @@ -36,8 +42,9 @@ void GPUNodeConfiguration::pushNewTensorOperation(enum TENSOR_OP top) { void GPUNodeConfiguration::pushNewApproximationChoiceForOperation( G_APPROX approx, int u) { unsigned size = ApproxChoices.size(); - CUSTOM_ASSERT(size >= 1 && - "Cannot apply approximation choice to non existent operation."); + CUSTOM_ASSERT( + size >= 1 && + "Cannot apply approximation choice to non existent operation."); ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u)); } @@ -51,8 +58,32 @@ GPUNodeConfiguration::GPUNodeConfiguration() { } GPUNodeConfiguration::~GPUNodeConfiguration() {} -Configuration::Configuration(std::string &n, float f, float e, float a, - float al) +void CPUNodeConfiguration::pushNewTensorOperation(C_TENSOR_OP top) { + std::vector<std::pair<C_APPROX, int>> emptyVec; + ApproxChoices.push_back(std::make_pair(top, emptyVec)); +} + +void CPUNodeConfiguration::pushNewApproximationChoiceForOperation( + C_APPROX approx, int u) { + unsigned size = ApproxChoices.size(); + CUSTOM_ASSERT( + size >= 1 && + "Cannot apply approximation choice to non existent operation."); + ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u)); +} + +std::vector<std::pair<C_TENSOR_OP, std::vector<std::pair<C_APPROX, int>>>> & +CPUNodeConfiguration::getApproxChoices() { + return ApproxChoices; +} + +CPUNodeConfiguration::CPUNodeConfiguration() { + NODE_CONFIGURATION_TARGET_ID = CPU; +} +CPUNodeConfiguration::~CPUNodeConfiguration() {} + +Configuration::Configuration( + std::string &n, float f, float e, float a, float al) : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {} float Configuration::getSpeedup() { 
return speedup; } @@ -62,20 +93,20 @@ float Configuration::getEnergy() { return energy; } float Configuration::getAccuracy() { return accuracy; } float Configuration::getAccuracyLoss() { return accuracyLoss; } -bool ConfigurationLessThan::operator()(const struct Configuration &a, - const struct Configuration &b) const { +bool ConfigurationLessThan:: +operator()(const struct Configuration &a, const struct Configuration &b) const { return (a.accuracyLoss < b.accuracyLoss); } -bool ConfigurationLessThan_AL::operator()(const struct Configuration *a, - const float &b) const { +bool ConfigurationLessThan_AL:: +operator()(const struct Configuration *a, const float &b) const { return (a->accuracyLoss < b); } -bool ConfigurationLessThan_SP::operator()(const struct Configuration *a, - const float &b) const { +bool ConfigurationLessThan_SP:: +operator()(const struct Configuration *a, const float &b) const { return (a->speedup < b); } -bool ConfigurationLessThan_E::operator()(const struct Configuration *a, - const float &b) const { +bool ConfigurationLessThan_E:: +operator()(const struct Configuration *a, const float &b) const { return (a->energy < b); } @@ -92,10 +123,10 @@ void PROMISENodeConfiguration::print() { case P_APPROX::SWING_LEVEL: printf("swing_level"); break; + // TODO additional approx methods to be printed here default: ERROR("Unknown approximation option"); break; - // TODO additional approx methods to be printed here } printf(" %d", it.second); } @@ -110,64 +141,64 @@ void GPUNodeConfiguration::print() { printf(" "); switch (it.first) { - case TENSOR_OP::ADD: + case G_TENSOR_OP::ADD: printf("add"); break; - case TENSOR_OP::BATCHNORM: + case G_TENSOR_OP::BATCHNORM: printf("batchnorm"); break; - case TENSOR_OP::CONV: + case G_TENSOR_OP::CONV: printf("conv"); break; - case TENSOR_OP::GROUP_CONV: + case G_TENSOR_OP::GROUP_CONV: printf("group_conv"); break; - case TENSOR_OP::MUL: + case G_TENSOR_OP::MUL: printf("mul"); break; - case TENSOR_OP::RELU: + case G_TENSOR_OP::RELU: printf("relu"); break; - case TENSOR_OP::CLIPPED_RELU: + case G_TENSOR_OP::CLIPPED_RELU: printf("clipped_relu"); break; - case TENSOR_OP::TANH: + case G_TENSOR_OP::TANH: printf("tanh"); break; - case TENSOR_OP::POOL_MAX: + case G_TENSOR_OP::POOL_MAX: printf("pool_max"); break; - case TENSOR_OP::POOL_MEAN: + case G_TENSOR_OP::POOL_MEAN: printf("pool_mean"); break; - case TENSOR_OP::POOL_MIN: + case G_TENSOR_OP::POOL_MIN: printf("pool_min"); break; - case TENSOR_OP::SOFTMAX: + case G_TENSOR_OP::SOFTMAX: printf("softmax"); break; - case TENSOR_OP::FFT: + case G_TENSOR_OP::FFT: printf("fft"); break; - case TENSOR_OP::REDUCE: + case G_TENSOR_OP::REDUCE: printf("reduce"); break; - case TENSOR_OP::PROJECTIVE_T: + case G_TENSOR_OP::PROJECTIVE_T: printf("projectiveT"); break; - case TENSOR_OP::MAP1: + case G_TENSOR_OP::MAP1: printf("map1"); break; - case TENSOR_OP::MAP2: + case G_TENSOR_OP::MAP2: printf("map2"); break; - case TENSOR_OP::MAP3: + case G_TENSOR_OP::MAP3: printf("map3"); break; + // TODO additional operations to be printed here default: ERROR("Unknown tensor operation."); break; - // TODO additional operations to be printed here } auto &approxVec = it.second; @@ -195,10 +226,85 @@ void GPUNodeConfiguration::print() { case G_APPROX::REDUCTION_SAMPLING: printf("red_samp"); break; + // TODO additional approx methods to be printed here default: ERROR("Unknown approximation option"); break; + } + + printf(" %d", inner_it.second); + } + } + + printf("\n"); +} + +void CPUNodeConfiguration::print() { + + printf(" cpu"); + 
for (auto &it : ApproxChoices) { + + printf(" "); + switch (it.first) { + case C_TENSOR_OP::ADD: + printf("add"); + break; + case C_TENSOR_OP::BATCHNORM: + printf("batchnorm"); + break; + case C_TENSOR_OP::CONV: + printf("conv"); + break; + case C_TENSOR_OP::GROUP_CONV: + printf("group_conv"); + break; + case C_TENSOR_OP::MUL: + printf("mul"); + break; + case C_TENSOR_OP::RELU: + printf("relu"); + break; + case C_TENSOR_OP::CLIPPED_RELU: + printf("clipped_relu"); + break; + case C_TENSOR_OP::TANH: + printf("tanh"); + break; + case C_TENSOR_OP::POOL_MAX: + printf("pool_max"); + break; + case C_TENSOR_OP::POOL_MEAN: + printf("pool_mean"); + break; + case C_TENSOR_OP::POOL_MIN: + printf("pool_min"); + break; + case C_TENSOR_OP::SOFTMAX: + printf("softmax"); + break; + // TODO additional operations to be printed here + default: + ERROR("Unknown tensor operation."); + break; + } + + auto &approxVec = it.second; + for (auto &inner_it : approxVec) { + printf(" "); + switch (inner_it.first) { + case C_APPROX::FP32: + printf("fp32"); + break; + case C_APPROX::PERFORATION: + printf("perf"); + break; + case C_APPROX::INPUT_SAMPLING: + printf("samp"); + break; // TODO additional approx methods to be printed here + default: + ERROR("Unknown approximation option"); + break; } printf(" %d", inner_it.second); @@ -211,8 +317,9 @@ void GPUNodeConfiguration::print() { void Configuration::print() { printf("+++++\n"); - printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, - accuracyLoss); + printf( + "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, + accuracyLoss); for (std::map<std::string, NodeConfiguration *>::const_iterator it = setup.begin(); it != setup.end(); ++it) { diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp index fd308a3409dc679a07b5374238e7150fb3c34beb..2dcbf9dcef6f049439f1f7332662db33f532e7a6 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp @@ -1,35 +1,37 @@ #include "hpvm-rt-controller.h" -#include "global_data.h" #include "img_tensor_utils.h" +#include "global_data.h" #include <fstream> //-------- Functionality to read and update frequency on Jetson board -------// /*const char* available_freqs[] = {"140250000", "229500000", "318750000", - "408000000", "497250000", "586500000", + "408000000", "497250000", "586500000", "675750000", "765000000", "854250000", "943500000", "1032750000", "1122000000", "1211250000", "1300500000"}; */ + const int available_freqs[] = { - 140250000, // 0 - 229500000, // 1 - 318750000, // 2 - 408000000, // 3 - 497250000, // 4 - 586500000, // 5 - 675750000, // 6 - 765000000, // 7 - 854250000, // 8 - 943500000, // 9 - 1032750000, // 10 - 1122000000, // 11 - 1211250000, // 12 - 1300500000 // 13 +140250000, // 0 +229500000, // 1 +318750000, // 2 +408000000, // 3 +497250000, // 4 +586500000, // 5 +675750000, // 6 +765000000, // 7 +854250000, // 8 +943500000, // 9 +1032750000,// 10 +1122000000,// 11 +1211250000,// 12 +1300500000 // 13 }; + /*void updateJetsonGPUFreq(int freq_level) { if (freq_level < 0 || freq_level > 13) { @@ -37,7 +39,7 @@ const int available_freqs[] = { abort(); } - const char* freq_val = available_freqs[freq_level]; + const char* freq_val = available_freqs[freq_level]; printf("freq-val[0] = %s \n", freq_val); FILE* max_file = @@ -47,7 +49,7 @@ const int available_freqs[] = { } fwrite(freq_val, strlen(freq_val), 
1, max_file); fclose(max_file); - + FILE* min_file = fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+"); if (min_file == NULL){ @@ -68,7 +70,7 @@ unsigned long int readJetsonGPUFreq() { char buf[50]; char* ptr; - + fread(buf, 50, 1, cur_freq_file); unsigned long cur_freq = strtoul(buf, &ptr, 10); fclose(cur_freq_file); @@ -77,15 +79,14 @@ unsigned long int readJetsonGPUFreq() { */ + // Sets frequency void setFreq(unsigned freq_index) { unsigned target_freq = available_freqs[freq_index]; - - const char *const min_freq_file = - "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; - const char *const max_freq_file = - "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; + + const char * const min_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; + const char * const max_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; std::ofstream min_stream; std::ofstream max_stream; @@ -104,8 +105,7 @@ void setFreq(unsigned freq_index) { unsigned recordFreq() { // Current frequency file - const char *const cur_freq_file = - "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; + const char * const cur_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; std::ifstream cur_stream; cur_stream.open(cur_freq_file, std::ifstream::in); @@ -118,6 +118,10 @@ unsigned recordFreq() { return cur_freq; } + + + + //---------------------------------------------------------------------------// /* @@ -131,13 +135,13 @@ bool fileExists(const std::string &file) { // There will be no frequency request for the first batch // Therefore, we skip the first element by initializing to 1, not 0. -FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) - : idx_list(il), rep_factor(rf), count(1), idx(0) {} +FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) : + idx_list(il), rep_factor(rf), count(1), idx(0) {} unsigned FrequencyIndexList::getNextIndex() { if (count == rep_factor) { count = 0; - idx = (idx + 1) % idx_list.size(); + idx = (idx+1) % idx_list.size(); } count++; return idx_list[idx]; @@ -204,7 +208,7 @@ void ProfileInfo::readIterationFrequency() { frequency_current_iteration = recordFreq(); #else frequency_current_iteration = 0; -#endif // JETSON_EXECUTION +#endif //JETSON_EXECUTION } unsigned long ProfileInfo::getIterationFrequency() { @@ -271,14 +275,15 @@ void ProfileInfo::printToFile() { // to have equal sizes, in outer and inner vectors both, // and all time_info and energy_info vectors must have the same size. 
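// (Editorial note, not part of the patch: every vector checked below -
// tensor/control/config time and energy plus frequency_info - is indexed by
// iteration, so a length mismatch means some iteration was only partially
// recorded before printToFile() ran.)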
unsigned iterations = tensor_time_info.size(); - CUSTOM_ASSERT((tensor_time_info.size() == iterations) && - (tensor_energy_info.size() == iterations) && - (control_time_info.size() == iterations) && - (control_energy_info.size() == iterations) && - (config_time_info.size() == iterations) && - (config_energy_info.size() == iterations) && - (frequency_info.size() == iterations) && - "time_info, energy_info, frequency_info size: \ + CUSTOM_ASSERT( + (tensor_time_info.size() == iterations) && + (tensor_energy_info.size() == iterations) && + (control_time_info.size() == iterations) && + (control_energy_info.size() == iterations) && + (config_time_info.size() == iterations) && + (config_energy_info.size() == iterations) && + (frequency_info.size() == iterations) && + "time_info, energy_info, frequency_info size: \ iteration number does not match."); for (unsigned i = 0; i < tensor_time_info.size(); i++) { @@ -328,8 +333,8 @@ ProfileInfo::ProfileInfo() time_control_current_iteration(0.0), time_config_current_iteration(0.0), energy_compute_current_iteration(0.0), energy_control_current_iteration(0.0), - energy_config_current_iteration(0.0), frequency_current_iteration(0), - in_iteration(false) {} + energy_config_current_iteration(0.0), + frequency_current_iteration(0), in_iteration(false) {} Slowdowns::Slowdowns() { idx = 0; @@ -371,37 +376,37 @@ void RuntimeController::stop_profiler() { profiler->stop_profiler(); } // For testing purposes only - do not use widely -std::vector<struct Configuration *> & -RuntimeController::getSpeedupConfigurations() { +std::vector<struct Configuration *> &RuntimeController:: +getSpeedupConfigurations() { return SpeedupConfigurations; } // For testing purposes only - do not use widely -std::vector<struct Configuration *> & -RuntimeController::getEnergyConfigurations() { +std::vector<struct Configuration *> &RuntimeController:: +getEnergyConfigurations() { return EnergyConfigurations; } // For testing purposes only - do not use widely -std::vector<struct Configuration *> & -RuntimeController::getThreeDCurveConfigurations() { +std::vector<struct Configuration *> &RuntimeController:: +getThreeDCurveConfigurations() { return ThreeDCurveConfigurations; } // For testing purposes only - do not use widely unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; } double RuntimeController::getCurrentConfigurationSpeedup() { - return (double)(*Configurations)[configurationIdx]->speedup; + return (double) (*Configurations)[configurationIdx]->speedup; } double RuntimeController::getCurrentConfigurationEnergy() { - return (double)(*Configurations)[configurationIdx]->energy; + return (double) (*Configurations)[configurationIdx]->energy; } double RuntimeController::getCurrentConfigurationAccuracy() { - return (double)(*Configurations)[configurationIdx]->accuracy; + return (double) (*Configurations)[configurationIdx]->accuracy; } double RuntimeController::getCurrentConfigurationAccuracyLoss() { - return (double)(*Configurations)[configurationIdx]->accuracyLoss; + return (double) (*Configurations)[configurationIdx]->accuracyLoss; } std::vector<float> &RuntimeController::getQuantizationRanges(const char *data) { @@ -443,10 +448,8 @@ void RuntimeController::init(const char *Cstr, const char *Qstr) { // Pseudo random variable (when we did few experiments) // or true random numbers for probabilistic control pseudo_rd = 0.0; - std::random_device - rd; // Will be used to obtain a seed for the random number engine - generator = - std::mt19937(rd()); // Standard 
mersenne_twister_engine seeded with rd() + std::random_device rd; //Will be used to obtain a seed for the random number engine + generator = std::mt19937 (rd()); //Standard mersenne_twister_engine seeded with rd() distr = std::uniform_real_distribution<>(0.0, 1.0); g_freq = available_freqs[13]; @@ -468,8 +471,8 @@ void RuntimeController::end_iteration() { PI->end_iteration(); } -void RuntimeController::addToCurrentIterationComputeTime(const char *s, - double t) { +void RuntimeController::addToCurrentIterationComputeTime( + const char *s, double t) { if (PI) PI->addToCurrentIterationComputeTime(s, t); } @@ -484,8 +487,8 @@ void RuntimeController::addToCurrentIterationConfigTime(double t) { PI->addToCurrentIterationConfigTime(t); } -void RuntimeController::addToCurrentIterationComputeEnergy(const char *s, - double e) { +void RuntimeController::addToCurrentIterationComputeEnergy( + const char *s, double e) { if (PI) PI->addToCurrentIterationComputeEnergy(s, e); } @@ -523,8 +526,8 @@ void RuntimeController::updateFrequency() { //--- updateJetsonGPUFreq(freq_idx); setFreq(freq_idx); - -#endif // JETSON_EXECUTION + +#endif //JETSON_EXECUTION } void RuntimeController::writeProfileInfo() { @@ -557,9 +560,11 @@ std::pair<double, double> RuntimeController::fc_profile( const unsigned num_rows_a, const unsigned num_cols_a, const unsigned num_rows_b, const unsigned num_cols_b, const unsigned voltage_swing, const unsigned patch_factor) { - return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b, - num_cols_b, voltage_swing, patch_factor) - : std::make_pair(0.0, 0.0)); + return ( + promise ? promise->fc_profile( + num_rows_a, num_cols_a, num_rows_b, num_cols_b, + voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)); } std::pair<double, double> RuntimeController::conv_profile( @@ -567,16 +572,17 @@ std::pair<double, double> RuntimeController::conv_profile( const unsigned c_out, const unsigned c_in, const unsigned k_h, const unsigned k_w, const unsigned s_h, const unsigned s_w, const unsigned voltage_swing, const unsigned patch_factor) { - return (promise ? promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w, - s_h, s_w, voltage_swing, patch_factor) - : std::make_pair(0.0, 0.0)); + return ( + promise ? promise->conv_profile( + n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing, + patch_factor) + : std::make_pair(0.0, 0.0)); } // Constructor and descructor RuntimeController::RuntimeController() { configurationIdx = 0; - FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - 10); + FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10); #ifdef ACTIVE_PROFILING PI = new ProfileInfo(); profiler = new Profiler(); @@ -713,13 +719,14 @@ void RuntimeController::readConfigurationFile(const char *str) { std::getline(qin, first_line); DEBUG("first_line: %s\n", first_line.c_str()); - try { + try{ baseline_time = std::stod(first_line); DEBUG("Baseline time: %lf\n\n", baseline_time); - } catch (...) { + } + catch(...){ ERROR("Please Add/Fix Baseline Time at Top of Config File.. 
"); } - + for (std::string line; std::getline(qin, line);) { DEBUG("line: %s\n", line.c_str()); @@ -751,9 +758,9 @@ void RuntimeController::readConfigurationFile(const char *str) { if (readingFirstLine) { // Read first line, to create the new configuration struct readingFirstLine = false; - InitialConfigurations.push_back( - Configuration(tokens[0], std::stof(tokens[1]), std::stof(tokens[2]), - std::stof(tokens[3]), std::stof(tokens[4]))); + InitialConfigurations.push_back(Configuration( + tokens[0], std::stof(tokens[1]), std::stof(tokens[2]), + std::stof(tokens[3]), std::stof(tokens[4]))); continue; } @@ -761,8 +768,8 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG("Found promise configuration\n"); // There must be at least one approximation option - CUSTOM_ASSERT((tokens.size() >= 2) && - "Not enough approximation options."); + CUSTOM_ASSERT( + (tokens.size() >= 2) && "Not enough approximation options."); PROMISENodeConfiguration *NodeConf = new PROMISENodeConfiguration(); InitialConfigurations.back().setup.insert( @@ -785,8 +792,9 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG("Found gpu configuration\n"); // There must be at least one operation, with an approximation option - CUSTOM_ASSERT((tokens.size() >= 5) && - "Not enough operations - approximation options."); + CUSTOM_ASSERT( + (tokens.size() >= 5) && + "Not enough operations - approximation options."); GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration(); InitialConfigurations.back().setup.insert( @@ -938,6 +946,106 @@ void RuntimeController::readConfigurationFile(const char *str) { // TODO: other approximation options handled here } + } else if (tokens[1] == "cpu") { + DEBUG("Found gpu configuration\n"); + + // There must be at least one operation, with an approximation option + CUSTOM_ASSERT( + (tokens.size() >= 5) && + "Not enough operations - approximation options."); + + CPUNodeConfiguration *NodeConf = new CPUNodeConfiguration(); + InitialConfigurations.back().setup.insert( + std::make_pair(tokens[0], NodeConf)); + + unsigned idx = 2; + while (idx < tokens.size()) { + if (tokens[idx] == "add") { + DEBUG("Found add operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::ADD); + idx++; + } else if (tokens[idx] == "batchnorm") { + DEBUG("Found batchnorm operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::BATCHNORM); + idx++; + } else if (tokens[idx] == "conv") { + DEBUG("Found conv operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::CONV); + idx++; + } else if (tokens[idx] == "group_conv") { + DEBUG("Found group_conv operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::GROUP_CONV); + idx++; + } else if (tokens[idx] == "mul") { + DEBUG("Found mul operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::MUL); + idx++; + } else if (tokens[idx] == "relu") { + DEBUG("Found relu operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::RELU); + idx++; + } else if (tokens[idx] == "clipped_relu") { + DEBUG("Found clipped_relu operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU); + idx++; + } else if (tokens[idx] == "tanh") { + DEBUG("Found tanh operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::TANH); + idx++; + } else if (tokens[idx] == "pool_max") { + DEBUG("Found pool_max operation\n"); + 
NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::POOL_MAX); + idx++; + } else if (tokens[idx] == "pool_mean") { + DEBUG("Found pool_mean operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::POOL_MEAN); + idx++; + } else if (tokens[idx] == "pool_min") { + DEBUG("Found pool_min operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::POOL_MIN); + idx++; + } else if (tokens[idx] == "softmax") { + DEBUG("Found softmax operation\n"); + NodeConf->pushNewTensorOperation( + CPUNodeConfiguration::TENSOR_OP::SOFTMAX); + idx++; + } else /*Not a new operation. This means an approximation option*/ + if (tokens[idx] == "fp32") { + DEBUG("Found fp32 option\n"); + int fp32 = std::stoi(tokens[idx + 1]); + DEBUG("fp32 parameter: %d, ignoring\n", fp32); + NodeConf->pushNewApproximationChoiceForOperation( + CPUNodeConfiguration::APPROX::FP32, fp32); + idx += 2; + } else if (tokens[idx] == "perf") { + DEBUG("Found perf option\n"); + int perf = std::stoi(tokens[idx + 1]); + DEBUG("perf parameter: %d\n", perf); + NodeConf->pushNewApproximationChoiceForOperation( + CPUNodeConfiguration::APPROX::PERFORATION, perf); + idx += 2; + } else if (tokens[idx] == "samp") { + DEBUG("Found samp option\n"); + int samp = std::stoi(tokens[idx + 1]); + DEBUG("samp parameter: %d\n", samp); + NodeConf->pushNewApproximationChoiceForOperation( + CPUNodeConfiguration::APPROX::INPUT_SAMPLING, samp); + idx += 2; + } + // TODO: other approximation options handled here + } + } else { DEBUG("Invalid Configuration File\n"); exit(1); @@ -960,8 +1068,9 @@ void RuntimeController::computeParetoConfigurationPoints() { // Sort the configurations according to accuracy loss INFO("Sorting autotuner configurations...\n"); - std::sort(InitialConfigurations.begin() + 1, InitialConfigurations.end(), - ConfigurationLessThan()); + std::sort( + InitialConfigurations.begin() + 1, InitialConfigurations.end(), + ConfigurationLessThan()); INFO("Done sorting.\n"); for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) { @@ -995,12 +1104,14 @@ void RuntimeController::computeParetoConfigurationPoints() { en_idx = i; } } - DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n", - InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); + DEBUG( + "accuracy loss = %f, speedup = %f, at sp_idx = %d\n", + InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); // Found best speedup for this accuracy point (not dominated by any of // these). - DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n", - InitialConfigurations[en_idx].accuracyLoss, en, en_idx); + DEBUG( + "accuracy loss = %f, energy = %f, at en_idx = %d\n", + InitialConfigurations[en_idx].accuracyLoss, en, en_idx); // Found best energy for this accuracy point (not dominated by any of // these). 
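// --- Illustrative aside (not part of the patch) ------------------------------
// A minimal, self-contained sketch of the selection step the comments above
// describe: after sorting by accuracy loss, each run of configurations with the
// same accuracy loss keeps only its best-speedup entry (the energy curve is
// built in the same spirit, using energy instead of speedup). The Cfg struct
// and the function name are illustrative assumptions, not the runtime's actual
// Configuration type or API.
#include <algorithm>
#include <cstddef>
#include <vector>

struct Cfg { float accuracyLoss, speedup, energy; };

std::vector<Cfg> keepBestSpeedupPerAccuracyLoss(std::vector<Cfg> cfgs) {
  // Sort by accuracy loss, mirroring the std::sort + ConfigurationLessThan above.
  std::sort(cfgs.begin(), cfgs.end(),
            [](const Cfg &a, const Cfg &b) { return a.accuracyLoss < b.accuracyLoss; });
  std::vector<Cfg> curve;
  for (std::size_t start = 0; start < cfgs.size();) {
    std::size_t end = start;
    while (end < cfgs.size() && cfgs[end].accuracyLoss == cfgs[start].accuracyLoss)
      ++end;                                   // group of equal accuracy loss
    std::size_t best = start;
    for (std::size_t i = start; i < end; ++i)  // keep the best speedup in the group
      if (cfgs[i].speedup > cfgs[best].speedup)
        best = i;
    curve.push_back(cfgs[best]);               // one representative per accuracy level
    start = end;
  }
  return curve;
}
// The real pass also drops points dominated across groups; this sketch only
// shows the per-accuracy-level selection that the DEBUG lines above report.
// -----------------------------------------------------------------------------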
@@ -1070,8 +1181,9 @@ void RuntimeController::compute3DParetoConfigurationPoints() { // Sort the configurations according to accuracy loss INFO("Sorting autotuner configurations...\n"); - std::sort(InitialConfigurations.begin(), InitialConfigurations.end(), - ConfigurationLessThan()); + std::sort( + InitialConfigurations.begin(), InitialConfigurations.end(), + ConfigurationLessThan()); INFO("Done sorting.\n"); for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) { @@ -1105,10 +1217,11 @@ void RuntimeController::compute3DParetoConfigurationPoints() { } } if (!dominated) { - DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", - InitialConfigurations[i].accuracyLoss, - InitialConfigurations[i].speedup, InitialConfigurations[i].energy, - i); + DEBUG( + "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", + InitialConfigurations[i].accuracyLoss, + InitialConfigurations[i].speedup, InitialConfigurations[i].energy, + i); Indices.push_back(i); } } @@ -1167,22 +1280,31 @@ void RuntimeController::printConfigurations( } } -unsigned long RuntimeController::getLastFrequency() { return g_freq; } +unsigned long RuntimeController::getLastFrequency() { + return g_freq; +} -void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; } +void RuntimeController::setLastFrequency(unsigned long f) { + g_freq = f; +} -double RuntimeController::getLastSpeedup() { return g_speedup; } +double RuntimeController::getLastSpeedup() { + return g_speedup; +} -void RuntimeController::setLastSpeedup(double s) { g_speedup = s; } +void RuntimeController::setLastSpeedup(double s) { + g_speedup = s; +} void RuntimeController::findNextConfiguration() { configurationIdx = (configurationIdx + 1) % Configurations->size(); - DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n", - configurationIdx); + DEBUG( + "findNextConfiguration: Updated configurationIdx to %u.\n", + configurationIdx); } -void RuntimeController::findTargetConfiguration(float goal, - enum SEARCH_KIND sk) { +void RuntimeController::findTargetConfiguration( + float goal, enum SEARCH_KIND sk) { // We search in range begin(), end()-1 . It is OK to decrement end(), because // the configurations vector always points to one of the pareto curves, and // they are never empty - we have always pushed at least one configuration. 
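// --- Illustrative aside (not part of the patch) ------------------------------
// A minimal sketch of the search described in the comment above: std::lower_bound
// over a pareto curve sorted by the search key, restricted to [begin, end-1) so
// the returned iterator always lands on a real element even when the goal exceeds
// every configuration's value. Cfg and findBySpeedup are assumed stand-ins, not
// the runtime's Configuration / ConfigurationLessThan_SP types.
#include <algorithm>
#include <vector>

struct Cfg { float speedup; };

unsigned findBySpeedup(const std::vector<Cfg> &curve, float goal) {
  // Precondition (guaranteed in the runtime): curve is non-empty and sorted by
  // increasing speedup, so decrementing end() is always safe.
  auto low_it = std::lower_bound(
      curve.begin(), curve.end() - 1, goal,
      [](const Cfg &c, float g) { return c.speedup < g; });
  return static_cast<unsigned>(low_it - curve.begin()); // configurationIdx analogue
}
// If goal is larger than every speedup, lower_bound over [begin, end-1) returns
// end-1, i.e. the last (fastest) configuration, which is the intended fallback.
// -----------------------------------------------------------------------------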
@@ -1192,25 +1314,25 @@ void RuntimeController::findTargetConfiguration(float goal, switch (sk) { case SPEEDUP: { Configurations = &SpeedupConfigurations; - low_it = - std::lower_bound(Configurations->begin(), Configurations->end() - 1, - goal, ConfigurationLessThan_SP()); + low_it = std::lower_bound( + Configurations->begin(), Configurations->end() - 1, goal, + ConfigurationLessThan_SP()); configurationIdx = low_it - Configurations->begin(); break; } case ENERGY: { Configurations = &EnergyConfigurations; - low_it = - std::lower_bound(Configurations->begin(), Configurations->end() - 1, - goal, ConfigurationLessThan_E()); + low_it = std::lower_bound( + Configurations->begin(), Configurations->end() - 1, goal, + ConfigurationLessThan_E()); configurationIdx = low_it - Configurations->begin(); break; } case ACCURACY_LOSS: { Configurations = &SpeedupConfigurations; - low_it = - std::lower_bound(Configurations->begin(), Configurations->end() - 1, - goal, ConfigurationLessThan_AL()); + low_it = std::lower_bound( + Configurations->begin(), Configurations->end() - 1, goal, + ConfigurationLessThan_AL()); if ((*low_it)->accuracyLoss > goal) --low_it; configurationIdx = low_it - Configurations->begin(); @@ -1225,8 +1347,9 @@ void RuntimeController::findTargetConfiguration(float goal, // After search, low_it points to the Configuration to the element with the // goal value or the immediately lower value if it does not exist - DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n", - configurationIdx); + DEBUG( + "findTargetConfiguration: Updated configurationIdx to %u.\n", + configurationIdx); } void RuntimeController::adjustTargetConfiguration(float goal) { @@ -1237,8 +1360,8 @@ void RuntimeController::adjustTargetConfiguration(float goal) { // Find configuration before the selected one. // There is always one, unless goal is 1. Then, we would pick baseline, and // both upper and lower should be the same configuration, at index 0. - unsigned prev_conf_idx = - configurationIdx > 0 ? configurationIdx - 1 : configurationIdx; + unsigned prev_conf_idx = configurationIdx > 0 ? configurationIdx - 1 + : configurationIdx; // Get the two configurations' speedup, and compute the appropriate ranges float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup; float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup; @@ -1257,32 +1380,32 @@ void RuntimeController::adjustTargetConfiguration(float goal) { //***--- Probability adjustment strategy 1 ---***// // No big adjustments at edges of probability range - // float adjust_val = 0.0; - // if (low_pb < high_pb) { - // adjust_val = low_pb * 0.2; - // } else { - // adjust_val = high_pb * 0.2; - // } - // low_pb -= adjust_val; - // high_pb += adjust_val; +// float adjust_val = 0.0; +// if (low_pb < high_pb) { +// adjust_val = low_pb * 0.2; +// } else { +// adjust_val = high_pb * 0.2; +// } +// low_pb -= adjust_val; +// high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 2 ---***// // No big adjustment at high edge of probability range - // float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2; - // low_pb -= adjust_val; - // high_pb += adjust_val; +// float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2; +// low_pb -= adjust_val; +// high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 3 ---***// - // Similar to 2, but higher always increases, more significantly - // float adjust_val = low_pb * 0.5 > 0.1 ? 
0.1 : low_pb * 0.5; - // low_pb -= adjust_val; - // high_pb += adjust_val; + //Similar to 2, but higher always increases, more significantly +// float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5; +// low_pb -= adjust_val; +// high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 4 ---***// - // Similar to 2, but higher always increases, more significantly + //Similar to 2, but higher always increases, more significantly // Low end, high end a bit less aggressive than total range float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6; adjust_val = adjust_val > high_pb ? high_pb : adjust_val; @@ -1291,18 +1414,20 @@ void RuntimeController::adjustTargetConfiguration(float goal) { //***--- ---***// } - DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: " - "%f.\n", - ((*Configurations)[configurationIdx]->name).c_str(), high_pb); - DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: " - "%f.\n\n", - ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb); + DEBUG( + "**---- adjustTargetConfiguration: upper conf = %s with probability: " + "%f.\n", + ((*Configurations)[configurationIdx]->name).c_str(), high_pb); + DEBUG( + "**---- adjustTargetConfiguration: lower conf = %s with probability: " + "%f.\n\n", + ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb); // Select a random number from 0 to 1 // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1) // to the upper // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ; - // float rd = pseudo_rd; + //float rd = pseudo_rd; float rd = distr(generator); if (rd < low_pb) { // If the probability is in the low range @@ -1322,8 +1447,8 @@ double RuntimeController::getBaselineTime() { return baseline_time; } Slowdowns *RuntimeController::getSlowdowns() { return slowdowns; } // Functions to be inserted with initializeTensorRT and clearTensorRT -extern "C" void llvm_hpvm_initializeRuntimeController(const char *ConfigFile, - const char *QRangeFile) { +extern "C" void llvm_hpvm_initializeRuntimeController( + const char *ConfigFile, const char *QRangeFile) { RC = new RuntimeController(); RC->init(ConfigFile, QRangeFile); return; @@ -1337,8 +1462,8 @@ extern "C" void llvm_hpvm_clearRuntimeController() { //*** Methods to compute accuracy of a tensor by the runtime controller ***// uint32_t *labels_from_file = NULL; -uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, - int end) { +uint32_t * +hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) { // Initialize buffer if (!labels_from_file) { @@ -1423,12 +1548,13 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { return accuracy; } + //#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl -#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl -//#define llvm_hpvm_invokeRtControl_ADJUST llvm_hpvm_invokeRtControl +//#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl +#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl -extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str, - int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_BASE( + void *result, const char *str, int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1445,15 +1571,16 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str, 
RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO( + "current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str, - int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ITERATE( + void *result, const char *str, int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1477,15 +1604,16 @@ extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str, RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO( + "current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str, - int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ADJUST( + void *result, const char *str, int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1528,17 +1656,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str, RC->addToCurrentIterationConfigEnergy(pinfo2.second); //* */ - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO( + "current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("target speedup = %lf\n\n", target_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, - const char *str, int start, - int end) { +extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR( + void *result, const char *str, int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1582,17 +1710,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, RC->addToCurrentIterationConfigEnergy(pinfo2.second); //* */ - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO( + "current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("target speedup = %lf\n\n", target_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result, - const char *str, int start, - int end) { +extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN( + void *result, const char *str, int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1619,20 +1747,21 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result, float next_conf_speedup = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - INFO("current iteration time = %f, current iteration energy = 
%f\n", - current_iteration_time, current_iteration_energy); + INFO( + "current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("slowdown (target speedup) = %f\n", slowdown); INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); + INFO( + "Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), next_conf_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result, - const char *str, - int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR( + void *result, const char *str, int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1660,19 +1789,21 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result, float next_conf_speedup = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO( + "current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("slowdown (target speedup) = %f\n", slowdown); INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); + INFO( + "Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), next_conf_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str, - int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_RAND( + void *result, const char *str, int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1690,8 +1821,9 @@ extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str, RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO( + "current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); @@ -1702,13 +1834,12 @@ static void writeVectorToFile(const char *path, const std::vector<T> &vec) { std::ofstream of(path, std::ofstream::out | std::ofstream::app); if (!of.good()) ERROR("Cannot write to %s file", path); - for (float f : vec) + for (float f: vec) of << f << ' '; of << '\n'; } -extern "C" void llvm_hpvm_imgInvokeRtControl(void *result, void *gold, - int start, int end) { +extern "C" void llvm_hpvm_imgInvokeRtControl(void* result, void *gold, int start, int end) { RC->resume_profiler(); if (gold != nullptr) { diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc index 98fd30ba9ee0ec7e0b81cfcaa9b3a699ec8e57b0..898d92c18cb8ad0b2df7a6d0c9d905c9649c53c1 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc @@ -1,12 +1,10 @@ -/* This 
file includes the API implementation of the HPVM tensor runtime built -*for CPU +/* This file includes the API implementation of the HPVM tensor runtime built for CPU ** ** Author: Hashim Sharif ** Email: hsharif3@illinois.edu */ #include <algorithm> -#include <bits/stdc++.h> #include <cfloat> #include <cmath> #include <cstdio> @@ -16,1152 +14,1101 @@ #include <iostream> #include <limits> #include <map> -#include <math.h> +#include <cmath> #include <memory> -#include <omp.h> -#include <pthread.h> +#include <vector> #include <sstream> #include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string> #include <vector> +#include <math.h> +#include<bits/stdc++.h> +#include <pthread.h> +#include <omp.h> // Tensor runtime header files -#include "tensor_cpu.h" +//#include "tensor_cpu.h" +#include "tensor.h" #include "tensor_cpu_runtime.h" -void llvm_hpvm_initTensorRt(int) { - // NOTE: Do Nothing +void llvm_hpvm_initTensorRtCPU() { + // NOTE: Do Nothing } -void llvm_hpvm_cleanupTensorRt() { - // NOTE: Do Nothing +void llvm_hpvm_cleanupTensorRtCPU() { + // NOTE: Do Nothing } -void hpvm_request_tensor(void *tensor, int destination) { - // NOTE: Do Nothing +void hpvm_request_tensorCPU(void *tensor, int destination) { + // NOTE: Do Nothing } - + std::vector<void *> PtrVect; void freeBatchMemory() { - for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { - free(*it); - } - PtrVect.erase(PtrVect.begin(), PtrVect.end()); + for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { + free(*it); + } + PtrVect.erase(PtrVect.begin(), PtrVect.end()); } -inline int getTypeSize(int data_type) { - return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1); + +int getTypeSizeCPU(int data_type) __attribute__((always_inline)); +inline int getTypeSizeCPU(int data_type) { + return (data_type == 0) ? 4 : ((data_type == 1) ? 
2 : 1); } -void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) - __attribute__((always_inline)); -inline void setSizeInBytes(struct Tensor *tensor, int data_type, - size_t num_elems) { - int type_size = getTypeSize(data_type); - size_t size_in_bytes = type_size * num_elems; - tensor->size_in_bytes = size_in_bytes; +void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); +inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) { + int type_size = getTypeSizeCPU(data_type); + size_t size_in_bytes = type_size * num_elems; + tensor->size_in_bytes = size_in_bytes; } -void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, - bool freeMemory = true) __attribute__((always_inline)); -inline void allocateMemCPU(struct Tensor *tensor, int data_type, - size_t num_elems, bool freeMemory) { - setSizeInBytes(tensor, data_type, num_elems); - tensor->data_type = data_type; - tensor->num_elems = num_elems; - tensor->host_data = - (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host - if (freeMemory) - PtrVect.push_back(tensor->host_data); +void allocateMemCPU(struct Tensor *tensor, int data_type, + size_t num_elems, bool freeMemory = true) __attribute__((always_inline)); +inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) { + setSizeInBytesCPU(tensor, data_type, num_elems); + tensor->data_type = data_type; + tensor->num_elems = num_elems; + tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host + if(freeMemory) + PtrVect.push_back(tensor->host_data); } -void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { - Tensor *tensor = (Tensor *)tensor_ptr; - if (tensor->size_in_bytes != size_in_bytes) { - printf("The destination and source sizes don't match"); - } - memcpy(tensor->host_data, data_ptr, - size_in_bytes); // Is this efficient enough? +void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) __attribute__((always_inline)); +inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { + Tensor *tensor = (Tensor *)tensor_ptr; + if (tensor->size_in_bytes != size_in_bytes) { + printf("The destination and source sizes don't match"); + } + memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough? 
} -// void *create4DTensor(int data_type, int data_format, size_t dim1_size, -// size_t dim2_size, size_t dim3_size, size_t dim4_size, -// bool freeMemory = true) __attribute__((always_inline)); -inline void *create4DTensor(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, - size_t dim4_size, bool freeMemory) { - struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - if (freeMemory) - PtrVect.push_back(tensor); - allocateMemCPU(tensor, data_type, num_elems, freeMemory); - - // Setting the tensor dimensions - size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - dim_sizes[3] = dim4_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 4; - - return tensor; +void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, size_t dim4_size, + bool freeMemory = true) __attribute__((always_inline)); +inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, + size_t dim4_size, bool freeMemory) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + if(freeMemory) + PtrVect.push_back(tensor); + allocateMemCPU(tensor, data_type, num_elems, freeMemory); + + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + dim_sizes[3] = dim4_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 4; + tensor->data_placement = HOST; + return tensor; } -void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *__restrict__ host_image = (float *)input->host_data; - float *__restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) / - horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int output_size = output_width * output_height; - - Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters, - output_height, output_width); - float *__restrict__ output_data = (float *)output->host_data; - - long int conv_data_size = sizeof(float) * num_filter_elem * output_height * - output_width * batch_size; - float *host_data = (float *)malloc(conv_data_size); - // printf("number of batches: %d\n", batch_size); - omp_set_num_threads(4); -#pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - for (int h = 0; h < output_height; h++) { - for (int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * 
horizontal_stride - horizontal_pad; - for (int i = 0; i < kernel_height; i++) { - for (int j = 0; j < kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size + - output_index * num_filter_elem + - filter_elem_num; - if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && - inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height + - (inH + i)) * - image_width + - (inW + j)]; - } else { - host_data[out_index] = 0; - } +void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int output_size = output_width * output_height; + printf("--CREATE 4D TENSOR\n"); + Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float * __restrict__ output_data = (float *)output->host_data; + printf("CREATED 4D TENSOR\n"); + long int conv_data_size = + sizeof(float) * num_filter_elem * output_height * output_width * batch_size; + float *host_data = (float *) malloc(conv_data_size); + printf("host data: %p\n", host_data); + printf("number of batches: %d\n", batch_size); + omp_set_num_threads(4); + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int ch = 0; ch < channels; ch++) { + for(int h = 0; h < output_height; h++) { + for(int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for(int i = 0; i < kernel_height; i++) { + for(int j = 0; j < kernel_width; j++) { + const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + filter_elem_num; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } } - } } - } - } - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; -#pragma omp simd reduction(+ : sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = - k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd 
reduction(+:sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } } - output_data[b * (output_size * num_filters) + p * output_size + m] = - sum; - } } - } - free(host_data); - printf("END: %p\n", output); - return output; + free(host_data); + printf("END: %p\n", output); + return output; } -void *tensorRegularFilterSamplingConvolutionCPU( - void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, int conv_mode, - int compute_precision, int skip_every, int start) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *__restrict__ host_image = (float *)input->host_data; - float *__restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + - ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int num_filter_elem = kernel_height * kernel_width * channels; - - const int remainder = ((num_filter_elem - start) % skip_every > 0); - const int reduced_num_filter_elem = - num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters, - output_height, output_width); - float *__restrict__ output_data = (float *)output->host_data; - - const long int host_data_size = sizeof(float) * reduced_num_filter_elem * - output_height * output_width * batch_size; - float *host_data = (float *)malloc(host_data_size); - - const int reduced_filer_size = - sizeof(float) * num_filters * reduced_num_filter_elem; - float *reduced_kernels = (float *)malloc(reduced_filer_size); - - float fac = (((float)skip_every) / ((float)skip_every - 1)); - int reduced_filter_dim = reduced_num_filter_elem / channels; - - // Create reduced filter - omp_set_num_threads(4); -#pragma omp parallel for - for (int f = 0; f < num_filters; f++) { - for (int i = 0; i < reduced_num_filter_elem; i++) { - int ch = i / reduced_filter_dim; - int offset = (start + ch) % skip_every; - int in_index; - if (i < offset) { - in_index = i; - } else { - in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) + - (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + - offset - 1; - } - reduced_kernels[f * reduced_num_filter_elem + i] = - fac * host_filter[num_filter_elem * f + in_index]; - } - } - - omp_set_num_threads(4); -#pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int h = 0; h < output_height; h++) { - for (int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for (int fi = 0; fi < reduced_num_filter_elem; fi++) { - int in_index; - const int ch = fi / reduced_filter_dim; - const int offset = (start + ch) % skip_every; - if (fi < offset) { 
- in_index = fi; - } else { - in_index = - ((fi - offset + 1) * skip_every) / (skip_every - 1) + - (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + - offset - 1; - } - const int i = - (in_index % (kernel_width * kernel_height)) / kernel_width; - const int j = in_index % kernel_width; - const int output_index = h * output_width + w; - const int out_index = b * reduced_num_filter_elem * output_size + - output_index * reduced_num_filter_elem + fi; - if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && - inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height + (inH + i)) * - image_width + - (inW + j)]; - } else { - host_data[out_index] = 0; - } +void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float * __restrict__ output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem + * output_height * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); + + const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *) malloc(reduced_filer_size); + + float fac = (((float) skip_every) / ((float) skip_every - 1)); + int reduced_filter_dim = reduced_num_filter_elem / channels; + + // Create reduced filter + omp_set_num_threads(4); + #pragma omp parallel for + for(int f = 0; f < num_filters; f++) { + for(int i = 0; i < reduced_num_filter_elem; i++) { + int ch = i / reduced_filter_dim; + int offset = (start + ch) % skip_every; + int in_index; + if(i < offset) { + in_index = i; + } else { + in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) + + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1; + } + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; } - } } - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; -#pragma omp simd reduction(+ : sum) - for (int k = 0; k < reduced_num_filter_elem; ++k) { - int input_index = k + reduced_num_filter_elem * m + - b * 
reduced_num_filter_elem * output_size; - sum += host_data[input_index] * - reduced_kernels[p * reduced_num_filter_elem + k]; + omp_set_num_threads(4); + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int h = 0; h < output_height; h++) { + for(int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for(int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + const int ch = fi / reduced_filter_dim; + const int offset = (start + ch) % skip_every; + if(fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; + } + const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + output_index * reduced_num_filter_elem + fi; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } } - output_data[b * (output_size * num_filters) + p * output_size + m] = - sum; - } - } - } - free(reduced_kernels); - free(host_data); - - return output; -} -void *tensorIrregularFilterSamplingConvolutionCPU( - void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, int conv_mode, - int compute_precision, int skip_every, int start) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *__restrict__ host_image = (float *)input->host_data; - float *__restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + - ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int num_filter_elem = kernel_height * kernel_width * channels; - - const int remainder = ((num_filter_elem - start) % skip_every > 0); - const int reduced_num_filter_elem = - num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters, - output_height, output_width); - float *__restrict__ output_data = (float *)output->host_data; - - const long int host_data_size = sizeof(float) * reduced_num_filter_elem * - output_height * output_width * batch_size; - float *host_data = (float *)malloc(host_data_size); - - const int reduced_filer_size = - sizeof(float) * num_filters * reduced_num_filter_elem; - float *reduced_kernels = (float *)malloc(reduced_filer_size); - - float fac = (((float)skip_every) / ((float)skip_every - 1)); - int reduced_filter_dim = reduced_num_filter_elem / channels; - - // Create Reduced filter - omp_set_num_threads(4); -#pragma omp parallel for - for (int f = 0; f < 
num_filters; f++) { - for (int i = 0; i < start; i++) { - reduced_kernels[f * reduced_num_filter_elem + i] = - host_filter[num_filter_elem * f + i]; - } -#pragma omp simd - for (int i = start; i < reduced_num_filter_elem; i++) { - int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) + - (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + - start - 1; - reduced_kernels[f * reduced_num_filter_elem + i] = - fac * host_filter[num_filter_elem * f + in_index]; - } - } - -#pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int h = 0; h < output_height; h++) { - for (int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for (int fi = 0; fi < reduced_num_filter_elem; fi++) { - int in_index; - int offset = start; - if (fi < offset) { - in_index = fi; - } else { - in_index = - ((fi - offset + 1) * skip_every) / (skip_every - 1) + - (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + - offset - 1; - } - const int ch = in_index / (kernel_width * kernel_height); - const int i = - (in_index % (kernel_width * kernel_height)) / kernel_width; - const int j = in_index % kernel_width; - const int output_index = h * output_width + w; - const int out_index = b * reduced_num_filter_elem * output_size + - output_index * reduced_num_filter_elem + fi; - if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && - inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height + (inH + i)) * - image_width + - (inW + j)]; - } else { - host_data[out_index] = 0; - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] + * reduced_kernels[p * reduced_num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } } - } - } - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; -#pragma omp simd reduction(+ : sum) - for (int k = 0; k < reduced_num_filter_elem; ++k) { - int input_index = k + reduced_num_filter_elem * m + - b * reduced_num_filter_elem * output_size; - sum += host_data[input_index] * - reduced_kernels[p * reduced_num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = - sum; - } } - } - free(reduced_kernels); - free(host_data); - - return output; + free(reduced_kernels); + free(host_data); + + return output; } -void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, int row, - int start) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *__restrict__ host_image = (float *)input->host_data; - float *__restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - - int full_output_height = - 1 + 
((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int full_output_width = - 1 + - ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int full_output_size = full_output_height * full_output_width; - - Tensor *full_output = (Tensor *)create4DTensor( - 0, 0, batch_size, num_filters, full_output_height, full_output_width); - float *__restrict__ full_output_data = (float *)full_output->host_data; - - int remainder = (full_output_height - start) % row > 0; - int output_height = - full_output_height - ((full_output_height - start) / row) - remainder; - - int output_width = full_output_width; - float *output_data = (float *)malloc( - sizeof(float) * batch_size * num_filters * output_height * output_width); - int output_size = output_width * output_height; - long int host_data_size = sizeof(float) * num_filter_elem * output_height * - output_width * batch_size; - float *host_data = (float *)malloc(host_data_size); - - omp_set_num_threads(4); -#pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - for (int h = 0; h < output_height; h++) { - int inH; - if (h < start) { - inH = h * vertical_stride - vertical_pad; - } else { - int h_index = ((h - start + 1) * row) / (row - 1) + - (((h - start + 1) * row) % (row - 1) > 0) + start - 1; - inH = h_index * vertical_stride - vertical_pad; +void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float * __restrict__ output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem + * output_height * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); + + const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *) malloc(reduced_filer_size); + + float fac = (((float) skip_every) / ((float) skip_every - 1)); + int reduced_filter_dim = reduced_num_filter_elem / channels; + + // Create Reduced filter + omp_set_num_threads(4); + #pragma omp parallel for + for(int f = 0; f < 
num_filters; f++) { + for(int i = 0; i < start; i++) { + reduced_kernels[f * reduced_num_filter_elem + i] = + host_filter[num_filter_elem * f + i]; } - for (int w = 0; w < output_width; w++) { - int inW = w * horizontal_stride - horizontal_pad; - for (int i = 0; i < kernel_height; i++) { - for (int j = 0; j < kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size + - output_index * num_filter_elem + - filter_elem_num; - if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && - inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height + - (inH + i)) * - image_width + - (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } + #pragma omp simd + for(int i = start; i < reduced_num_filter_elem; i++) { + int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) + + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1; + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; } - } } - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; -#pragma omp simd reduction(+ : sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = - k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int h = 0; h < output_height; h++) { + for(int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for(int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + int offset = start; + if(fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; + } + const int ch = in_index / (kernel_width * kernel_height); + const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + output_index * reduced_num_filter_elem + fi; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } } - output_data[b * (output_size * num_filters) + p * output_size + m] = - sum; - } - } - // Interpolate - for (int p = 0; p < num_filters; ++p) { - for (int h = 0; h < full_output_height; h++) { - for (int w = 0; w < full_output_width; w++) { - int full_output_index = b * num_filters * full_output_size + - p * full_output_size + h * full_output_width + - w; - if (h < start) { - int output_index = b * num_filters * output_size + p * output_size + - h * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if (h == full_output_height - 1) { - int output_index = b * num_filters * output_size + p * output_size + - (output_height - 1) * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if (h == 0) { - int output_index = b * num_filters * output_size + p * output_size + - 0 * output_width 
+ w; - full_output_data[full_output_index] = output_data[output_index]; - } else if ((h - start) % row == 0) { - int row_index = h - ((h + 1 - start) / row); - int output_index = b * num_filters * output_size + p * output_size + - row_index * output_width + w; - full_output_data[full_output_index] = - (output_data[output_index] + - output_data[output_index - output_width]) / - 2; - } else { - int remainder = ((h + 1 - start) % row) > 0; - int row_index = h - ((h + 1 - start) / row) - remainder; - int output_index = b * num_filters * output_size + p * output_size + - row_index * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] + * reduced_kernels[p * reduced_num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } } - } - } - } - free(output_data); - free(host_data); - return full_output; + } + free(reduced_kernels); + free(host_data); + + return output; } -void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, int col, - int start) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *__restrict__ host_image = (float *)input->host_data; - float *__restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - int full_output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int full_output_width = - 1 + - ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int full_output_size = full_output_height * full_output_width; - - Tensor *full_output = (Tensor *)create4DTensor( - 0, 0, batch_size, num_filters, full_output_height, full_output_width); - float *__restrict__ full_output_data = (float *)full_output->host_data; - - int remainder = (full_output_width - start) % col > 0; - int output_width = - full_output_width - ((full_output_width - start) / col) - remainder; - - int output_height = full_output_height; - float *output_data = (float *)malloc( - sizeof(float) * batch_size * num_filters * output_height * output_width); - int output_size = output_width * output_height; - long int host_data_size = sizeof(float) * num_filter_elem * output_height * - output_width * batch_size; - float *host_data = (float *)malloc(host_data_size); - - omp_set_num_threads(4); -#pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - for (int h = 0; h < output_height; h++) { - int inH = h * vertical_stride - vertical_pad; - for (int w = 0; w < output_width; w++) { - int inW; - if (w < start) { - inW = w * horizontal_stride - horizontal_pad; - } else { - int w_index = ((w - start + 1) * col) / (col - 1) + - (((w - start + 1) 
* col) % (col - 1) > 0) + start - 1; - inW = w_index * horizontal_stride - horizontal_pad; - } - for (int i = 0; i < kernel_height; i++) { - for (int j = 0; j < kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size + - output_index * num_filter_elem + - filter_elem_num; - if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && - inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height + - (inH + i)) * - image_width + - (inW + j)]; - } else { - host_data[out_index] = 0; - } +void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int row, int start) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, + full_output_height, full_output_width); + float * __restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_height - start) % row > 0; + int output_height = + full_output_height - ((full_output_height - start) / row) - remainder; + + int output_width = full_output_width; + float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters + * output_height * output_width); + int output_size = output_width * output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height + * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); + + omp_set_num_threads(4); + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int ch = 0; ch < channels; ch++) { + for(int h = 0; h < output_height; h++) { + int inH; + if(h < start) { + inH = h * vertical_stride - vertical_pad; + } else { + int h_index = ((h - start + 1) * row) / (row - 1) + + (((h - start + 1) * row) % (row - 1) > 0) + start - 1; + inH = h_index * vertical_stride - vertical_pad; + } + for(int w = 0; w < output_width; w++) { + int inW = w * horizontal_stride - horizontal_pad; + for(int i = 0; i < kernel_height; i++) { + for(int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + filter_elem_num; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + 
} else { + host_data[out_index] = 0; + } + } + } + } } - } } - } - } - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; -#pragma omp simd reduction(+ : sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = - k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } } - output_data[b * (output_size * num_filters) + p * output_size + m] = - sum; - } - } - // Interpolate - for (int p = 0; p < num_filters; ++p) { - for (int h = 0; h < full_output_height; h++) { - for (int w = 0; w < full_output_width; w++) { - int full_output_index = b * num_filters * full_output_size + - p * full_output_size + h * full_output_width + - w; - if (w < start) { - int output_index = b * num_filters * output_size + p * output_size + - h * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if (w == full_output_width - 1) { - int output_index = b * num_filters * output_size + p * output_size + - h * output_width + output_width - 1; - full_output_data[full_output_index] = output_data[output_index]; - } else if (w == 0) { - int output_index = b * num_filters * output_size + p * output_size + - h * output_width + 0; - full_output_data[full_output_index] = output_data[output_index]; - } else if ((w - start) % col == 0) { - int col_index = w - ((w + 1 - start) / col); - int output_index = b * num_filters * output_size + p * output_size + - h * output_width + col_index; - full_output_data[full_output_index] = - (output_data[output_index] + output_data[output_index - 1]) / 2; - } else { - int remainder = ((w + 1 - start) % col) > 0; - int col_index = w - ((w + 1 - start) / col) - remainder; - int output_index = b * num_filters * output_size + p * output_size + - h * output_width + col_index; - full_output_data[full_output_index] = output_data[output_index]; - } - } - } + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for(int h = 0; h < full_output_height; h++) { + for(int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters * full_output_size + + p * full_output_size + h * full_output_width + w; + if(h < start) { + int output_index = b * num_filters * output_size + + p * output_size + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if(h == full_output_height - 1) { + int output_index = b * num_filters * output_size + p * output_size + + (output_height - 1) * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if(h == 0) { + int output_index = b * num_filters * output_size + + p * output_size + 0 * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if((h - start) % row == 0) { + int row_index = h - ((h + 1 - start) / row); + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = + (output_data[output_index] + output_data[output_index - 
output_width]) / 2; + } else { + int remainder = ((h + 1 - start) % row) > 0; + int row_index = h - ((h + 1 - start) / row) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } + } + } + } } - } - free(output_data); - free(host_data); + free(output_data); + free(host_data); - return full_output; + return full_output; } -void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int compute_precision, int row, int col, int skip_every, - int start) { - if (row > 1) { - printf("ROW PERFORATION\n"); - return tensorRowPerfConvolutionCPU( - input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, compute_precision, row, start); - } - if (col > 1) { - printf("COL PERFORATION\n"); - return tensorColPerfConvolutionCPU( - input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, compute_precision, col, start); - } - if (skip_every > 1) { - printf("INPUT FILTERING\n"); +void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int col, int start) { + Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, + full_output_height, full_output_width); + float * __restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_width - start) % col > 0; + int output_width = full_output_width - ((full_output_width - start) / col) - remainder; + + int output_height = full_output_height; + float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters + * output_height * output_width); + int output_size = output_width * output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height + * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; + omp_set_num_threads(4); + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int ch = 0; ch < channels; ch++) { + for(int h = 0; h < output_height; h++) { + int inH = h * vertical_stride - vertical_pad; + for(int w = 0; w < output_width; w++) { + int inW; + if(w < start) { + inW = w * horizontal_stride - horizontal_pad; + } else { + int w_index = ((w - start + 1) * col) / (col - 1) + + (((w - start + 1) * col) 
% (col - 1) > 0) + start - 1; + inW = w_index * horizontal_stride - horizontal_pad; + } + for(int i = 0; i < kernel_height; i++) { + for(int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + filter_elem_num; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } + } + } - if (!(kernel_height * kernel_width % skip_every)) { - return tensorRegularFilterSamplingConvolutionCPU( - input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, compute_precision, skip_every, start); - } - return tensorIrregularFilterSamplingConvolutionCPU( - input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, compute_precision, skip_every, start); - } - printf("REGULAR CONV\n"); - return tensorRegularConvolutionCPU( - input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, compute_precision); -} + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = k + num_filter_elem * m + + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } + } -void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int conv_groups) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *__restrict__ host_image = (float *)input->host_data; - float *__restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) / - horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int output_size = output_width * output_height; - - Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters, - output_height, output_width); - float *__restrict__ output_data = (float *)output->host_data; - - long int conv_data_size = sizeof(float) * num_filter_elem * output_height * - output_width * batch_size; - float *host_data = (float *)malloc(conv_data_size); - - omp_set_num_threads(4); -#pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - for (int h = 0; h < output_height; h++) { - for (int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for (int i = 0; i < kernel_height; i++) { - for (int j = 0; j < 
kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size + - output_index * num_filter_elem + - filter_elem_num; - if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && - inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height + - (inH + i)) * - image_width + - (inW + j)]; - } else { - host_data[out_index] = 0; - } + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for(int h = 0; h < full_output_height; h++) { + for(int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters * full_output_size + + p * full_output_size + h * full_output_width + w; + if(w < start) { + int output_index = b * num_filters * output_size + + p * output_size + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if(w == full_output_width - 1) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + output_width - 1; + full_output_data[full_output_index] = output_data[output_index]; + } else if(w == 0) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + 0; + full_output_data[full_output_index] = output_data[output_index]; + } else if((w - start) % col == 0) { + int col_index = w - ((w + 1 - start) / col); + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = + (output_data[output_index] + output_data[output_index - 1]) / 2; + } else { + int remainder = ((w + 1 - start) % col) > 0; + int col_index = w - ((w + 1 - start) / col) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = output_data[output_index]; + } + } } - } } - } } - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; -#pragma omp simd reduction(+ : sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = - k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + free(output_data); + free(host_data); + + return full_output; +} + +void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int row, int col, int skip_every, int start) { + if(row > 1) { + printf("ROW PERFORATION\n"); + return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, + horizontal_pad, vertical_stride, horizontal_stride, conv_mode, + compute_precision, row, start); + } + if(col > 1) { + printf("COL PERFORATION\n"); + return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, + horizontal_pad, vertical_stride, horizontal_stride, conv_mode, + compute_precision, col, start); + } + if(skip_every > 1) { + printf("INPUT FILTERING\n"); + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + + if(!(kernel_height * kernel_width % skip_every)) { + return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, + vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, + compute_precision, skip_every, 
start); } - output_data[b * (output_size * num_filters) + p * output_size + m] = - sum; - } + return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, + vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, + compute_precision, skip_every, start); } - } - free(host_data); - return output; + printf("---REGULAR CONV\n"); + return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad, + horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision); } -void *tensorAddCPU(void *x_ptr, void *bias_ptr) { - Tensor *x = (Tensor *)x_ptr; - Tensor *bias = (Tensor *)bias_ptr; - - float *__restrict__ x_data = (float *)x->host_data; - float *__restrict__ bias_data = (float *)bias->host_data; - int n = x->dims.dim_sizes[0]; - int c = x->dims.dim_sizes[1]; - int h = x->dims.dim_sizes[2]; - int w = x->dims.dim_sizes[3]; - - if (x->num_elems == bias->num_elems) { - int const1 = c * h * w; - int const2 = h * w; +void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups){ + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int filter_dim = kernel_height * kernel_width; + const int num_filter_elem = filter_dim * channels; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, channels, + output_height * output_width); + float * __restrict__ output_data = (float *)output->host_data; + + const long int conv_data_size = + sizeof(float) * num_filter_elem * output_height * output_width * batch_size; + float *host_data = (float *) malloc(conv_data_size); + omp_set_num_threads(4); -#pragma omp parallel for - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { -#pragma omp simd collapse(2) - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * const1 + j * const2 + (k * w) + l] += - bias_data[i * const1 + j * const2 + (k * w) + l]; - } + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int ch = 0; ch < channels; ch++) { + for(int h = 0; h < output_height; h++) { + for(int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for(int i = 0; i < kernel_height; i++) { + for(int j = 0; j < kernel_width; j++) { + const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + filter_elem_num; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b 
* channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } + } } - } - } - } else { - omp_set_num_threads(4); -#pragma omp parallel for - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { -#pragma omp simd collapse(2) - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; - } + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + for (int ch = 0; ch < channels; ch++) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < filter_dim; ++k) { + int input_index = k + ch * filter_dim + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + ch * filter_dim + k]; + } + output_data[b * (output_size * num_filters * channels) + p * output_size * channels + ch * output_size + m] = sum; + } + } } - } } - } - return x; + free(host_data); + return output; +} + +void* tensorAddCPU(void *x_ptr, void *bias_ptr) { + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; + + float * __restrict__ x_data = (float *)x->host_data; + float * __restrict__ bias_data = (float *)bias->host_data; + int n = x->dims.dim_sizes[0]; + int c = x->dims.dim_sizes[1]; + int h = x->dims.dim_sizes[2]; + int w = x->dims.dim_sizes[3]; + + if(x->num_elems == bias->num_elems) { + int const1 = c * h * w; + int const2 = h * w; + omp_set_num_threads(4); + #pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + #pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * const1 + j * const2 + (k * w) + l] += + bias_data[i * const1 + j * const2 + (k*w) + l]; + } + } + } + } + } else { + omp_set_num_threads(4); + #pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + #pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; + } + } + } + } + } + + return x; } float max(float v1, float v2) __attribute__((always_inline)); -inline float maximum(float v1, float v2) { return (v1 < v2) ? v2 : v1; } +inline float maximum(float v1, float v2){ + return (v1 < v2) ? 
v2 : v1; +} void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, - int window_width, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { - - Tensor *input = (Tensor *)input_ptr; - float *__restrict__ input_data = (float *)input->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - - int output_height = - 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); - int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) / - horizontal_stride); - - int center_x = (window_width - 1) / 2 - horizontal_pad; - int center_y = (window_height - 1) / 2 - vertical_pad; - int x_radius = (window_width - 1) / 2; - int y_radius = (window_height - 1) / 2; - - Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels, - output_height, output_width); - float *__restrict__ output_data = (float *)output->host_data; - - omp_set_num_threads(4); -#pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - int ii = 0, jj = 0; - for (int r = center_y; r < image_height + vertical_pad - y_radius; - r += vertical_stride) { - for (int c = center_x; c < image_width + horizontal_pad - x_radius; - c += horizontal_stride) { - float val = (poolFunction == 0) ? -3.40282e+38 : 0; - int y_radius_var = y_radius - r; - int y_radius_var_max = y_radius_var + image_height; - int x_radius_var = x_radius - c; - int x_radius_var_max = x_radius_var + image_width; - int ki_min = - (y_radius_var > 0) - ? ((y_radius_var < window_height) ? y_radius_var : -1) - : 0; - int ki_max = (y_radius_var_max < window_height) - ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1) - : window_height; - int kj_min = (x_radius_var > 0) - ? ((x_radius_var < window_width) ? x_radius_var : -1) - : 0; - int kj_max = (x_radius_var_max < window_width) - ? ((x_radius_var_max >= 0) ? 
x_radius_var_max : -1) - : window_width; - - if (ki_min != ki_max && kj_min != kj_max && ki_min != -1 && - ki_max != -1 && kj_min != -1 && kj_max != -1) { - if (!poolFunction) { - for (int ki = 0; ki < window_height; ki++) { - for (int kj = 0; kj < window_width; kj++) { - val = maximum( - val, - input_data[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - (r - y_radius + ki) * image_width + - (c - x_radius + kj)]); - } - } - } else { - for (int ki = 0; ki < window_height; ki++) { - for (int kj = 0; kj < window_width; kj++) { - val += - input_data[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - (r - y_radius + ki) * image_width + - (c - x_radius + kj)]; + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { + + Tensor *input = (Tensor *)input_ptr; + float * __restrict__ input_data = (float *)input->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + + int output_height = + 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); + int output_width = + 1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride); + + int center_x = (window_width - 1) / 2 - horizontal_pad; + int center_y = (window_height - 1) / 2 - vertical_pad; + int x_radius = (window_width - 1) / 2; + int y_radius = (window_height - 1) / 2; + + Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, channels, + output_height, output_width); + float * __restrict__ output_data = (float *)output->host_data; + + omp_set_num_threads(4); + #pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + int ii = 0, jj = 0; + for (int r = center_y; r < image_height + vertical_pad - y_radius; + r += vertical_stride) { + for (int c = center_x; c < image_width + horizontal_pad - x_radius; + c += horizontal_stride) { + float val = (poolFunction == 0) ? -3.40282e+38 : 0; + int y_radius_var = y_radius - r; + int y_radius_var_max = y_radius_var + image_height; + int x_radius_var = x_radius - c; + int x_radius_var_max = x_radius_var + image_width; + int ki_min = (y_radius_var > 0) ? + ((y_radius_var < window_height) ? y_radius_var : -1) : 0; + int ki_max = (y_radius_var_max < window_height) ? + ((y_radius_var_max >= 0) ? y_radius_var_max : -1) : window_height; + int kj_min = (x_radius_var > 0) ? + ((x_radius_var < window_width) ? x_radius_var : -1) : 0; + int kj_max = (x_radius_var_max < window_width) ? + ((x_radius_var_max >= 0) ? 
x_radius_var_max : -1) : window_width; + + if(ki_min != ki_max && kj_min != kj_max && ki_min != -1 + && ki_max != -1 && kj_min != -1 && kj_max != -1) { + if(!poolFunction) { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val = maximum( + val, + input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + (c - x_radius + kj)]); + } + } + } else { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val += input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + (c - x_radius + kj)]; + } + } + } + } + if (poolFunction == 1) { + val /= window_height * window_width; + } + output_data[b * (channels * output_height * output_width) + + ch * (output_height * output_width) + ii * output_width + jj] = val; + jj++; + if (jj == output_width) { + jj = 0; + ii++; + } } - } } - } - if (poolFunction == 1) { - val /= window_height * window_width; - } - output_data[b * (channels * output_height * output_width) + - ch * (output_height * output_width) + ii * output_width + - jj] = val; - jj++; - if (jj == output_width) { - jj = 0; - ii++; - } } - } } - } - - return output; + + return output; } void *tensorTanhCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - - omp_set_num_threads(4); -#pragma omp parallel for - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = tanhf(input_data[i]); - } - - return input; + Tensor *input = (Tensor *)input_ptr; + + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + + omp_set_num_threads(4); + #pragma omp parallel for + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = tanhf(input_data[i]); + } + + return input; } void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { - Tensor *lhs = (Tensor *)lhs_ptr; - Tensor *rhs = (Tensor *)rhs_ptr; - // printf("GEMM lhs_ptr: %p\n", lhs_ptr); - // printf("GEMM rhs_ptr: %p\n", rhs_ptr); - - int m = lhs->dims.dim_sizes[0]; - int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; - - Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1); - - float *__restrict__ lhs_arr = (float *)lhs->host_data; - float *__restrict__ rhs_arr = (float *)rhs->host_data; - float *__restrict__ output_arr = (float *)output->host_data; - - int k = 1; -#pragma unroll 4 // Can we unroll more??? 
- for (int j = 1; j < lhs->dims.num_dims; j++) { - k = k * lhs->dims.dim_sizes[j]; // input neurons - } - // printf("unroll\n"); - float *tran_rhs = (float *)malloc(sizeof(float) * k * n); - // printf("tran_rhs: %p\n", tran_rhs); - // printf("rhs_arr: %p\n", rhs_arr); - // printf("lhs_arr: %p\n", lhs_arr); - omp_set_num_threads(4); -#pragma omp parallel for simd - for (int l = 0; l < k; l++) { - for (int j = 0; j < n; j++) { - tran_rhs[j * k + l] = rhs_arr[l * n + j]; + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; + + int m = lhs->dims.dim_sizes[0]; + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1); + + float * __restrict__ lhs_arr = (float *)lhs->host_data; + float * __restrict__ rhs_arr = (float *)rhs->host_data; + float * __restrict__ output_arr = (float *)output->host_data; + + int k = 1; + #pragma unroll 4 // Can we unroll more??? + for (int j = 1; j < lhs->dims.num_dims; j++) { + k = k * lhs->dims.dim_sizes[j]; // input neurons } - } -// printf("TRANS\n"); -#pragma omp parallel for - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - float sum = 0.0; -#pragma omp simd reduction(+ : sum) - for (int l = 0; l < k; l++) { - sum += lhs_arr[i * k + l] * tran_rhs[j * k + l]; - } - output_arr[i * n + j] = sum; + float *tran_rhs = (float *) malloc(sizeof(float) * k * n); + omp_set_num_threads(4); + #pragma omp parallel for simd + for (int l = 0; l < k; l++) { + for (int j = 0; j < n; j++) { + tran_rhs[j * k + l] = rhs_arr[l * n + j]; + } + } + + #pragma omp parallel for + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0.0; + #pragma omp simd reduction(+:sum) + for (int l = 0; l < k; l++) { + sum += lhs_arr[i * k + l] * tran_rhs[j * k + l]; + } + output_arr[i * n + j] = sum; + } } - } - free(tran_rhs); - // printf("GEMM OUTPUT: %p\n", output); - return output; + free(tran_rhs); + return output; } void *tensorSoftmaxCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - - float *logits = (float *)input->host_data; - int n = input->dims.dim_sizes[0]; - int c = input->dims.dim_sizes[1]; - - omp_set_num_threads(4); -#pragma omp parallel for - for (int i = 0; i < n; i++) { - float x = 0; - for (int j = i * c; j < c + i * c; j++) { - logits[j] = expf(logits[j]); - } - -#pragma omp simd reduction(+ : x) - for (int j = i * c; j < i * c + c; j++) { - x += logits[j]; - } - -#pragma omp simd - for (int j = i * c; j < i * c + c; j++) { - logits[j] /= x; + Tensor *input = (Tensor *)input_ptr; + + float *logits = (float *)input->host_data; + int n = input->dims.dim_sizes[0]; + int c = input->dims.dim_sizes[1]; + + omp_set_num_threads(4); + #pragma omp parallel for + for (int i = 0; i < n; i++) { + float x = 0; + for(int j = i*c; j < c + i*c; j++) { + logits[j] = expf(logits[j]); + } + + #pragma omp simd reduction(+:x) + for(int j = i*c; j < i*c+c; j++) { + x += logits[j]; + } + + #pragma omp simd + for(int j = i*c; j < i*c + c; j++) { + logits[j] /= x; + } } - } - return input; + return input; } -void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, - void *mean_ptr, void *variance_ptr, double epsilon) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *gamma = (Tensor *)gamma_ptr; - Tensor *beta = (Tensor *)beta_ptr; - Tensor *mean = (Tensor *)mean_ptr; - Tensor *variance = (Tensor *)variance_ptr; - - float *__restrict__ host_image = (float *)input->host_data; - 
float *__restrict__ host_beta = (float *)beta->host_data;
-  float *__restrict__ host_gamma = (float *)gamma->host_data;
-  float *__restrict__ host_mean = (float *)mean->host_data;
-  float *__restrict__ host_variance = (float *)variance->host_data;
-
-  float alpha_val = 1.0f, beta_val = 0.0f;
-  size_t num_elems = input->num_elems;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-  int image_dim = image_height * image_width;
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      float mean = 0;
-#pragma omp simd reduction(+ : mean)
-      for (int i = 0; i < image_dim; i++) {
-        int index = b * channels * image_dim + ch * image_dim + i;
-        mean += host_image[index];
-      }
-      mean = mean / channels;
-
-      float variance = 0;
-#pragma omp simd reduction(+ : variance)
-      for (int i = 0; i < image_dim; i++) {
-        int index = b * channels * image_dim + ch * image_dim + i;
-        float tmp = host_image[index] - mean;
-        variance += (tmp * tmp);
-      }
-      variance = variance / channels;
-
-#pragma omp simd
-      for (int i = 0; i < image_dim; i++) {
-        int index = b * channels * image_dim + ch * image_dim + i;
-        host_image[index] =
-            host_beta[ch] + (host_gamma[ch] * ((host_image[index] - mean) /
-                                               sqrt(epsilon + variance)));
-      }
+void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
+                         void* mean_ptr, void* variance_ptr, double epsilon) {
+
+  Tensor* input = (Tensor*) input_ptr;
+  Tensor* gamma = (Tensor*) gamma_ptr;
+  Tensor* beta = (Tensor*) beta_ptr;
+  Tensor* mean = (Tensor*) mean_ptr;
+  Tensor* variance = (Tensor*) variance_ptr;
+
+  float * __restrict__ host_image = (float *)input->host_data;
+  float * __restrict__ host_beta = (float *)beta->host_data;
+  float * __restrict__ host_gamma = (float *)gamma->host_data;
+  float * __restrict__ host_mean = (float *)mean->host_data;
+  float * __restrict__ host_variance = (float *)variance->host_data;
+
+  float alpha_val = 1.0f, beta_val = 0.0f;
+  size_t num_elems = input->num_elems;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int image_dim = image_height * image_width;
+
+  omp_set_num_threads(4);
+  #pragma omp parallel for
+  for(int b = 0; b < batch_size; b++) {
+    for(int ch = 0; ch < channels; ch++) {
+      // Per-channel batch statistics; normalize over the spatial dimension
+      // (image_dim elements are summed below, not channels).
+      float mean = 0;
+      #pragma omp simd reduction(+:mean)
+      for(int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        mean += host_image[index];
+      }
+      mean = mean / image_dim;
+
+      float variance = 0;
+      #pragma omp simd reduction(+:variance)
+      for(int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        float tmp = host_image[index] - mean;
+        variance += (tmp * tmp);
+      }
+      variance = variance / image_dim;
+
+      #pragma omp simd
+      for(int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        host_image[index] = host_beta[ch] +
+          (host_gamma[ch] * ((host_image[index] - mean) / sqrt(epsilon + variance)));
+      }
+    }
   }
-  }
-  return input;
+  return input;
 }
 
-void *tensorReluCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-
-#pragma omp simd
-  for (size_t i = 0; i < num_elems; i++) {
-    input_data[i] = 
(input_data[i] < 0) ? 0 : input_data[i];
-  }
+void *tensorReluCPU(void *input_ptr) {
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+  #pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+  }
 
-  return input;
+  return input;
 }
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
-  Tensor *input = (Tensor *)input_ptr;
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-
-#pragma omp simd
-  for (size_t i = 0; i < num_elems; i++) {
-    input_data[i] = (input_data[i] < min)
-                        ? min
-                        : ((input_data[i] > max) ? max : input_data[i]);
-  }
-
-  return input;
-}
\ No newline at end of file
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+  #pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ?
+                    max : input_data[i]);
+  }
+
+  return input;
+}
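A minimal sketch of how the CPU entry points added in this patch could be chained for one convolution layer. The wrapper/driver function name and the knob values are illustrative only; tensorConvApproxCPU, tensorAddCPU, and tensorReluCPU are the functions defined above, and tensorRegularConvolutionCPU / create4DTensorCPU are assumed from tensor_cpu_runtime as in the rest of the patch.

// Illustrative sketch, not part of the patch: drives the new CPU path end to end.
// Assumes input, filter, and bias are Tensors whose host_data is already populated,
// as expected by the kernels above.
void* convLayerReluCPU_example(void* input, void* filter, void* bias) {
  // row = col = skip_every = 1 disables perforation and filter sampling, so
  // tensorConvApproxCPU falls through to tensorRegularConvolutionCPU.
  void* conv_out = tensorConvApproxCPU(input, filter,
                                       /*vertical_pad=*/1, /*horizontal_pad=*/1,
                                       /*vertical_stride=*/1, /*horizontal_stride=*/1,
                                       /*conv_mode=*/1, /*compute_precision=*/0,
                                       /*row=*/1, /*col=*/1, /*skip_every=*/1,
                                       /*start=*/0);
  void* biased = tensorAddCPU(conv_out, bias); // element-wise or per-channel bias add
  return tensorReluCPU(biased);                // in-place ReLU on the biased output
}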