diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
index b3b6cb6e179ee39c82459f13b1734f8c0e626e01..562ca1538dfddc812c32d84ff8b8595473289974 100644
--- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -64,6 +64,7 @@ set(
   tensor_utils.cu
   wrapper_runtime.cu
   approx_knobs_utils.cc
+  init_api.cc
 )
 foreach(FILE ${RUNTIME_SRCS_FILENAME})
   list(APPEND RUNTIME_SRCS "tensor_runtime/src/${FILE}")
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/init_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/init_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac742876b054c88d50634ecae306b25d471f5c06
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/init_api.h
@@ -0,0 +1,53 @@
+#pragma once
+
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <cublas_api.h>
+#include <cuda_fp16.h>
+
+// Tensor runtime header files
+#include "tensor_runtime.h"
+#include "tensor_utils.h"
+#include "debug.h"
+#include "profiling.h"
+#include "global_data.h"
+#include "error.h"
+#include "tensor.h"
+#include "op_overheads.h"
+#include "approx_simulation.h"
+
+
+
+void llvm_hpvm_initTensorRt(int gpuid);
+
+void llvm_hpvm_cleanupTensorRt();
+
+void llvm_hpvm_initApproxhpvmRt(int gpuid);
+
+void llvm_hpvm_cleanupApproxhpvmRt();
+
+void dumpAccuracyNorms();
+
+// Returns the number of GPUs active on the platform
+unsigned int getGPUCount();
+
+void clearTensorMap();
+
+void startMemTracking();
+
+void freeOutputTensors();
+
+void clearOpCounter();
+
+void freeBatchMemory();
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b311f50f99bf6ffc8ec508300d3e92bd9b314796
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
@@ -0,0 +1,157 @@
+
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <cublas_api.h>
+#include <cuda_fp16.h>
+
+// Tensor runtime header files
+#include "tensor_runtime.h"
+#include "tensor_utils.h"
+#include "debug.h"
+#include "profiling.h"
+#include "global_data.h"
+#include "error.h"
+#include "tensor.h"
+#include "op_overheads.h"
+#include "approx_simulation.h"
+#include "init_api.h"
+
+
+void llvm_hpvm_initTensorRt(int gpuid){
+
+  if(!runtime_initialized){
+    
+    printf("INITIALIZING GPU %d \n", gpuid);
+    // NOTE: Setting the target GPU. Can we use multiple GPUs?
+    checkCudaErrors(cudaSetDevice(gpuid));
+    // Initializing cuDNN and cuBlas handles
+    checkCudaErrors(cublasCreate(&cublasHandle));
+    checkCUDNN(cudnnCreate(&cudnnHandle));
+
+    printf("CREATED HANDLES %d \n", gpuid);
+
+
+#ifdef PROMISE_TUNER_ENABLED
+    //    readOpenTunerFlags("opentuner_flags");
+    
+    readOpenTunerFlags("promise_flags");
+    initializeAutotuner();
+
+    printf("Read PROMISE FLAGS %d \n", gpuid);
+
+#endif
+
+
+#ifdef ERROR_INJECTION_ENABLED
+    readOpenTunerFlags("opentuner_flags");
+#endif
+
+    
+    runtime_initialized = true;
+  }
+
+  printf("DONE INITIALIZING GPU %d \n", gpuid);
+
+}
+
+
+void llvm_hpvm_cleanupTensorRt(){
+  DEBUG("**** llvm_hpvm_cleanupTensorRt ***\n");
+  dumpAccuracyNorms();
+}
+
+
+void llvm_hpvm_initApproxhpvmRt(int gpuid){
+  llvm_hpvm_initTensorRt(gpuid);
+  approxhpvm_runtime_mode = true;
+}
+
+void llvm_hpvm_cleanupApproxhpvmRt(){
+
+}
+
+
+
+void dumpAccuracyNorms(){
+
+  #ifdef ERROR_INJECTION_ENABLED
+  
+  
+  #endif
+
+  dump_result("accuracy_summary");
+
+}
+
+
+// Returns the number of GPUs active on the platform
+unsigned int getGPUCount(){
+  int num_gpus;
+  checkCudaErrors(cudaGetDeviceCount(&num_gpus));
+  return num_gpus;
+}
+
+
+
+void clearTensorMap(){
+  tensors_ptr.clear();
+  host_ptr.clear();
+  obj_ptr.clear();
+  tracked_tensors.clear();
+}
+
+
+void startMemTracking(){
+  tensors_ptr.clear();
+  host_ptr.clear();
+  obj_ptr.clear();
+
+  tracked_tensors.clear();
+}
+
+
+void freeOutputTensors(){
+
+  DEBUG("**** Freeing Output Tensors *** \n");
+  for (void *ptr: tensors_ptr)
+    cudaFree(ptr);
+
+  for(void *ptr: host_ptr)
+    free(ptr);
+  
+  for(void *ptr: obj_ptr)
+    free(ptr);
+  
+  clearTensorMap();
+}
+
+
+
+void clearOpCounter(){
+  total_ops = 0;
+  op_counter = 0;
+  op_accuracies.clear();
+}
+
+
+
+void freeBatchMemory(){
+  // Free allocated memory for the current mini-batch
+  freeOutputTensors();
+  // Reinitialize counter for OpenTuner flags - next mini-batch of execution
+  op_counter = 0;
+  // Clearing profiling data map
+  func_counters.clear();
+}
+
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 59d3b5618389871d4e0463d4b36ed5bb93f12452..201b75429daf152cdd35989da7ff32bfdc2fb604 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -35,6 +35,7 @@
 // Tensor runtime header files
 #include "tensor_runtime.h"
 #include "tensor_utils.h"
+#include "init_api.h"
 #include "debug.h"
 #include "profiling.h"
 #include "fp16_conversion.h"
@@ -48,135 +49,6 @@
 
 
 
-void llvm_hpvm_initTensorRt(int gpuid){
-
-  if(!runtime_initialized){
-    
-    printf("INITIALIZING GPU %d \n", gpuid);
-    // NOTE: Setting the target GPU. Can we use multiple GPUs?
-    checkCudaErrors(cudaSetDevice(gpuid));
-    // Initializing cuDNN and cuBlas handles
-    checkCudaErrors(cublasCreate(&cublasHandle));
-    checkCUDNN(cudnnCreate(&cudnnHandle));
-
-    printf("CREATED HANDLES %d \n", gpuid);
-
-
-#ifdef PROMISE_TUNER_ENABLED
-    //    readOpenTunerFlags("opentuner_flags");
-    
-    readOpenTunerFlags("promise_flags");
-    initializeAutotuner();
-
-    printf("Read PROMISE FLAGS %d \n", gpuid);
-
-#endif
-
-
-#ifdef ERROR_INJECTION_ENABLED
-    readOpenTunerFlags("opentuner_flags");
-#endif
-
-    
-    runtime_initialized = true;
-  }
-
-  printf("DONE INTIALIZING GPU %d \n", gpuid);
-
-}
-
-
-void llvm_hpvm_cleanupTensorRt(){
-  DEBUG("**** llvm_hpvm_cleanupTensorRt ***\n");
-  dumpAccuracyNorms();
-}
-
-
-void llvm_hpvm_initApproxhpvmRt(int gpuid){
-  llvm_hpvm_initTensorRt(gpuid);
-  approxhpvm_runtime_mode = true;
-}
-
-void llvm_hpvm_cleanupApproxhpvmRt(){
-
-}
-
-
-
-void dumpAccuracyNorms(){
-
-  #ifdef ERROR_INJECTION_ENABLED
-  
-  
-  #endif
-
-  dump_result("accuracy_summary");
-
-}
-
-
-// Returns the number of GPUs active on the platform
-int getGPUCount(){
-  int num_gpus;
-  checkCudaErrors(cudaGetDeviceCount(&num_gpus));
-  return num_gpus;
-}
-
-
-
-void clearTensorMap(){
-  tensors_ptr.clear();
-  host_ptr.clear();
-  obj_ptr.clear();
-  tracked_tensors.clear();
-}
-
-
-void startMemTracking(){
-  tensors_ptr.clear();
-  host_ptr.clear();
-  obj_ptr.clear();
-
-  tracked_tensors.clear();
-}
-
-
-void freeOutputTensors(){
-
-  DEBUG("**** Freeing Ouput Tensors *** \n");
-  for (void *ptr: tensors_ptr)
-    cudaFree(ptr);
-
-  for(void *ptr: host_ptr)
-    free(ptr);
-  
-  for(void *ptr: obj_ptr)
-    free(ptr);
-  
-  clearTensorMap();
-}
-
-
-
-void clearOpCounter(){
-  total_ops = 0;
-  op_counter = 0;
-  op_accuracies.clear();
-}
-
-
-
-void freeBatchMemory(){
-  // Free allocated memory for the current mini-batch
-  freeOutputTensors();
-  // Reinitialize couter for OpenTuner flags - next mini-batch of execution
-  op_counter = 0;
-  // Clearing profiling data map
-  func_counters.clear();
-}
-
-
-
 
 // FIXIT: Fix any assumptions on the NCHW format
 // TODO: benchmark split performance and check if it is prohibitively high?