Commit 58487a52 authored by Hashim Sharif

Separating tensor runtime initialization routines into a different source file

parent 2cdc4519
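
For orientation, the routines being split out form the tensor runtime's initialization/cleanup surface. Below is a minimal, hypothetical host-side sketch of how these entry points might be called; the main() wrapper, GPU id, and call order are illustrative assumptions, not code from this commit.

#include "init_api.h"

int main() {
  // Select GPU 0, create the cuDNN/cuBLAS handles, and enable the
  // ApproxHPVM runtime mode.
  llvm_hpvm_initApproxhpvmRt(0);

  // ... build tensors and run the network ...

  // Tear down: llvm_hpvm_cleanupTensorRt() dumps the "accuracy_summary" file.
  llvm_hpvm_cleanupApproxhpvmRt();
  llvm_hpvm_cleanupTensorRt();
  return 0;
}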
@@ -64,6 +64,7 @@ set(
  tensor_utils.cu
  wrapper_runtime.cu
  approx_knobs_utils.cc
  init_api.cc
)
foreach(FILE ${RUNTIME_SRCS_FILENAME})
  list(APPEND RUNTIME_SRCS "tensor_runtime/src/${FILE}")
...

init_api.h (new file):

#include <stdio.h>
#include <stdarg.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cudnn.h>
#include <cublas_api.h>
#include <cuda_fp16.h>
// Tensor runtime header files
#include "tensor_runtime.h"
#include "tensor_utils.h"
#include "debug.h"
#include "profiling.h"
#include "global_data.h"
#include "error.h"
#include "tensor.h"
#include "op_overheads.h"
#include "approx_simulation.h"
void llvm_hpvm_initTensorRt(int gpuid);
void llvm_hpvm_cleanupTensorRt();
void llvm_hpvm_initApproxhpvmRt(int gpuid);
void llvm_hpvm_cleanupApproxhpvmRt();
void dumpAccuracyNorms();
// Returns the number of GPUs active on the platform
unsigned int getGPUCount();
void clearTensorMap();
void startMemTracking();
void freeOutputTensors();
void clearOpCounter();
void freeBatchMemory();

init_api.cc (new file):

#include <stdio.h>
#include <stdarg.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cudnn.h>
#include <cublas_api.h>
#include <cuda_fp16.h>
// Tensor runtime header files
#include "tensor_runtime.h"
#include "tensor_utils.h"
#include "debug.h"
#include "profiling.h"
#include "global_data.h"
#include "error.h"
#include "tensor.h"
#include "op_overheads.h"
#include "approx_simulation.h"
#include "init_api.h"
void llvm_hpvm_initTensorRt(int gpuid){

  if(!runtime_initialized){

    printf("INITIALIZING GPU %d \n", gpuid);
    // NOTE: Setting the target GPU. Can we use multiple GPUs?
    checkCudaErrors(cudaSetDevice(gpuid));

    // Initializing cuDNN and cuBLAS handles
    checkCudaErrors(cublasCreate(&cublasHandle));
    checkCUDNN(cudnnCreate(&cudnnHandle));

    printf("CREATED HANDLES %d \n", gpuid);

#ifdef PROMISE_TUNER_ENABLED
    // readOpenTunerFlags("opentuner_flags");
    readOpenTunerFlags("promise_flags");
    initializeAutotuner();
    printf("Read PROMISE FLAGS %d \n", gpuid);
#endif

#ifdef ERROR_INJECTION_ENABLED
    readOpenTunerFlags("opentuner_flags");
#endif

    runtime_initialized = true;
  }

  printf("DONE INITIALIZING GPU %d \n", gpuid);
}
void llvm_hpvm_cleanupTensorRt(){
  DEBUG("**** llvm_hpvm_cleanupTensorRt ***\n");
  dumpAccuracyNorms();
}

void llvm_hpvm_initApproxhpvmRt(int gpuid){
  llvm_hpvm_initTensorRt(gpuid);
  approxhpvm_runtime_mode = true;
}

void llvm_hpvm_cleanupApproxhpvmRt(){
}

void dumpAccuracyNorms(){
#ifdef ERROR_INJECTION_ENABLED
#endif
  dump_result("accuracy_summary");
}
// Returns the number of GPUs active on the platform
unsigned int getGPUCount(){
  int num_gpus;
  checkCudaErrors(cudaGetDeviceCount(&num_gpus));
  return num_gpus;
}

void clearTensorMap(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}

void startMemTracking(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}
void freeOutputTensors(){
  DEBUG("**** Freeing Output Tensors *** \n");

  for (void *ptr: tensors_ptr)
    cudaFree(ptr);

  for (void *ptr: host_ptr)
    free(ptr);

  for (void *ptr: obj_ptr)
    free(ptr);

  clearTensorMap();
}
void clearOpCounter(){
  total_ops = 0;
  op_counter = 0;
  op_accuracies.clear();
}

void freeBatchMemory(){
  // Free allocated memory for the current mini-batch
  freeOutputTensors();
  // Reinitialize counter for OpenTuner flags - next mini-batch of execution
  op_counter = 0;
  // Clearing profiling data map
  func_counters.clear();
}
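
The memory-tracking helpers above are intended to bracket mini-batch execution: startMemTracking() resets the tracked-pointer sets and freeBatchMemory() releases everything allocated for the current batch. A sketch of that per-batch pattern follows; the loop structure and batch count are illustrative assumptions, not code from this commit.

int num_batches = 10;                  // illustrative batch count
startMemTracking();                    // reset tracked device/host pointer sets
for (int batch = 0; batch < num_batches; batch++) {
  // ... launch the tensor operations for this mini-batch ...
  freeBatchMemory();                   // free this batch's output tensors, reset the
                                       // OpenTuner op_counter, clear profiling counters
}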
@@ -35,6 +35,7 @@
// Tensor runtime header files
#include "tensor_runtime.h"
#include "tensor_utils.h"
#include "init_api.h"
#include "debug.h"
#include "profiling.h"
#include "fp16_conversion.h"
@@ -48,135 +49,6 @@
void llvm_hpvm_initTensorRt(int gpuid){

  if(!runtime_initialized){

    printf("INITIALIZING GPU %d \n", gpuid);
    // NOTE: Setting the target GPU. Can we use multiple GPUs?
    checkCudaErrors(cudaSetDevice(gpuid));

    // Initializing cuDNN and cuBLAS handles
    checkCudaErrors(cublasCreate(&cublasHandle));
    checkCUDNN(cudnnCreate(&cudnnHandle));

    printf("CREATED HANDLES %d \n", gpuid);

#ifdef PROMISE_TUNER_ENABLED
    // readOpenTunerFlags("opentuner_flags");
    readOpenTunerFlags("promise_flags");
    initializeAutotuner();
    printf("Read PROMISE FLAGS %d \n", gpuid);
#endif

#ifdef ERROR_INJECTION_ENABLED
    readOpenTunerFlags("opentuner_flags");
#endif

    runtime_initialized = true;
  }

  printf("DONE INITIALIZING GPU %d \n", gpuid);
}
void llvm_hpvm_cleanupTensorRt(){
  DEBUG("**** llvm_hpvm_cleanupTensorRt ***\n");
  dumpAccuracyNorms();
}

void llvm_hpvm_initApproxhpvmRt(int gpuid){
  llvm_hpvm_initTensorRt(gpuid);
  approxhpvm_runtime_mode = true;
}

void llvm_hpvm_cleanupApproxhpvmRt(){
}

void dumpAccuracyNorms(){
#ifdef ERROR_INJECTION_ENABLED
#endif
  dump_result("accuracy_summary");
}

// Returns the number of GPUs active on the platform
int getGPUCount(){
  int num_gpus;
  checkCudaErrors(cudaGetDeviceCount(&num_gpus));
  return num_gpus;
}
void clearTensorMap(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}

void startMemTracking(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}

void freeOutputTensors(){
  DEBUG("**** Freeing Output Tensors *** \n");

  for (void *ptr: tensors_ptr)
    cudaFree(ptr);

  for (void *ptr: host_ptr)
    free(ptr);

  for (void *ptr: obj_ptr)
    free(ptr);

  clearTensorMap();
}
void clearOpCounter(){
  total_ops = 0;
  op_counter = 0;
  op_accuracies.clear();
}

void freeBatchMemory(){
  // Free allocated memory for the current mini-batch
  freeOutputTensors();
  // Reinitialize counter for OpenTuner flags - next mini-batch of execution
  op_counter = 0;
  // Clearing profiling data map
  func_counters.clear();
}
// FIXIT: Fix any assumptions on the NCHW format
// TODO: benchmark split performance and check if it is prohibitively high?
...