Commit 58487a52 authored by Hashim Sharif

Separating tensor runtime initialization routines into a different source file

parent 2cdc4519
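
For orientation, the routines being split out form the tensor runtime's initialization/cleanup surface. Below is a minimal, hypothetical host-side sketch of how these entry points might be called; the main() wrapper, GPU id, and call order are illustrative assumptions, not code from this commit.

#include "init_api.h"

int main() {
  // Select GPU 0, create the cuDNN/cuBLAS handles, and enable the
  // ApproxHPVM runtime mode.
  llvm_hpvm_initApproxhpvmRt(0);

  // ... build tensors and run the network ...

  // Tear down: llvm_hpvm_cleanupTensorRt() dumps the "accuracy_summary" file.
  llvm_hpvm_cleanupApproxhpvmRt();
  llvm_hpvm_cleanupTensorRt();
  return 0;
}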
@@ -64,6 +64,7 @@ set(
  tensor_utils.cu
  wrapper_runtime.cu
  approx_knobs_utils.cc
  init_api.cc
)
foreach(FILE ${RUNTIME_SRCS_FILENAME})
  list(APPEND RUNTIME_SRCS "tensor_runtime/src/${FILE}")
...

init_api.h (new file):

#include <stdio.h>
#include <stdarg.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cudnn.h>
#include <cublas_api.h>
#include <cuda_fp16.h>
// Tensor runtime header files
#include "tensor_runtime.h"
#include "tensor_utils.h"
#include "debug.h"
#include "profiling.h"
#include "global_data.h"
#include "error.h"
#include "tensor.h"
#include "op_overheads.h"
#include "approx_simulation.h"
void llvm_hpvm_initTensorRt(int gpuid);
void llvm_hpvm_cleanupTensorRt();
void llvm_hpvm_initApproxhpvmRt(int gpuid);
void llvm_hpvm_cleanupApproxhpvmRt();
void dumpAccuracyNorms();
// Returns the number of GPUs active on the platform
unsigned int getGPUCount();
void clearTensorMap();
void startMemTracking();
void freeOutputTensors();
void clearOpCounter();
void freeBatchMemory();

init_api.cc (new file):

#include <stdio.h>
#include <stdarg.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cudnn.h>
#include <cublas_api.h>
#include <cuda_fp16.h>
// Tensor runtime header files
#include "tensor_runtime.h"
#include "tensor_utils.h"
#include "debug.h"
#include "profiling.h"
#include "global_data.h"
#include "error.h"
#include "tensor.h"
#include "op_overheads.h"
#include "approx_simulation.h"
#include "init_api.h"
void llvm_hpvm_initTensorRt(int gpuid){

  if(!runtime_initialized){

    printf("INITIALIZING GPU %d \n", gpuid);
    // NOTE: Setting the target GPU. Can we use multiple GPUs?
    checkCudaErrors(cudaSetDevice(gpuid));

    // Initializing cuDNN and cuBLAS handles
    checkCudaErrors(cublasCreate(&cublasHandle));
    checkCUDNN(cudnnCreate(&cudnnHandle));

    printf("CREATED HANDLES %d \n", gpuid);

#ifdef PROMISE_TUNER_ENABLED
    // readOpenTunerFlags("opentuner_flags");
    readOpenTunerFlags("promise_flags");
    initializeAutotuner();
    printf("Read PROMISE FLAGS %d \n", gpuid);
#endif

#ifdef ERROR_INJECTION_ENABLED
    readOpenTunerFlags("opentuner_flags");
#endif

    runtime_initialized = true;
  }

  printf("DONE INITIALIZING GPU %d \n", gpuid);
}
void llvm_hpvm_cleanupTensorRt(){
  DEBUG("**** llvm_hpvm_cleanupTensorRt ***\n");
  dumpAccuracyNorms();
}

void llvm_hpvm_initApproxhpvmRt(int gpuid){
  llvm_hpvm_initTensorRt(gpuid);
  approxhpvm_runtime_mode = true;
}

void llvm_hpvm_cleanupApproxhpvmRt(){
}

void dumpAccuracyNorms(){
#ifdef ERROR_INJECTION_ENABLED
#endif
  dump_result("accuracy_summary");
}
// Returns the number of GPUs active on the platform
unsigned int getGPUCount(){
  int num_gpus;
  checkCudaErrors(cudaGetDeviceCount(&num_gpus));
  return num_gpus;
}

void clearTensorMap(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}

void startMemTracking(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}
void freeOutputTensors(){
  DEBUG("**** Freeing Output Tensors *** \n");

  for (void *ptr: tensors_ptr)
    cudaFree(ptr);

  for (void *ptr: host_ptr)
    free(ptr);

  for (void *ptr: obj_ptr)
    free(ptr);

  clearTensorMap();
}
void clearOpCounter(){
  total_ops = 0;
  op_counter = 0;
  op_accuracies.clear();
}

void freeBatchMemory(){
  // Free allocated memory for the current mini-batch
  freeOutputTensors();
  // Reinitialize counter for OpenTuner flags - next mini-batch of execution
  op_counter = 0;
  // Clearing profiling data map
  func_counters.clear();
}
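
The memory-tracking helpers above are intended to bracket mini-batch execution: startMemTracking() resets the tracked-pointer sets and freeBatchMemory() releases everything allocated for the current batch. A sketch of that per-batch pattern follows; the loop structure and batch count are illustrative assumptions, not code from this commit.

int num_batches = 10;                  // illustrative batch count
startMemTracking();                    // reset tracked device/host pointer sets
for (int batch = 0; batch < num_batches; batch++) {
  // ... launch the tensor operations for this mini-batch ...
  freeBatchMemory();                   // free this batch's output tensors, reset the
                                       // OpenTuner op_counter, clear profiling counters
}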
@@ -35,6 +35,7 @@
// Tensor runtime header files
#include "tensor_runtime.h"
#include "tensor_utils.h"
#include "init_api.h"
#include "debug.h"
#include "profiling.h"
#include "fp16_conversion.h"
@@ -48,135 +49,6 @@
void llvm_hpvm_initTensorRt(int gpuid){

  if(!runtime_initialized){

    printf("INITIALIZING GPU %d \n", gpuid);
    // NOTE: Setting the target GPU. Can we use multiple GPUs?
    checkCudaErrors(cudaSetDevice(gpuid));

    // Initializing cuDNN and cuBLAS handles
    checkCudaErrors(cublasCreate(&cublasHandle));
    checkCUDNN(cudnnCreate(&cudnnHandle));

    printf("CREATED HANDLES %d \n", gpuid);

#ifdef PROMISE_TUNER_ENABLED
    // readOpenTunerFlags("opentuner_flags");
    readOpenTunerFlags("promise_flags");
    initializeAutotuner();
    printf("Read PROMISE FLAGS %d \n", gpuid);
#endif

#ifdef ERROR_INJECTION_ENABLED
    readOpenTunerFlags("opentuner_flags");
#endif

    runtime_initialized = true;
  }

  printf("DONE INITIALIZING GPU %d \n", gpuid);
}
void llvm_hpvm_cleanupTensorRt(){
  DEBUG("**** llvm_hpvm_cleanupTensorRt ***\n");
  dumpAccuracyNorms();
}

void llvm_hpvm_initApproxhpvmRt(int gpuid){
  llvm_hpvm_initTensorRt(gpuid);
  approxhpvm_runtime_mode = true;
}

void llvm_hpvm_cleanupApproxhpvmRt(){
}

void dumpAccuracyNorms(){
#ifdef ERROR_INJECTION_ENABLED
#endif
  dump_result("accuracy_summary");
}

// Returns the number of GPUs active on the platform
int getGPUCount(){
  int num_gpus;
  checkCudaErrors(cudaGetDeviceCount(&num_gpus));
  return num_gpus;
}
void clearTensorMap(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}

void startMemTracking(){
  tensors_ptr.clear();
  host_ptr.clear();
  obj_ptr.clear();
  tracked_tensors.clear();
}

void freeOutputTensors(){
  DEBUG("**** Freeing Output Tensors *** \n");

  for (void *ptr: tensors_ptr)
    cudaFree(ptr);

  for (void *ptr: host_ptr)
    free(ptr);

  for (void *ptr: obj_ptr)
    free(ptr);

  clearTensorMap();
}
void clearOpCounter(){
  total_ops = 0;
  op_counter = 0;
  op_accuracies.clear();
}

void freeBatchMemory(){
  // Free allocated memory for the current mini-batch
  freeOutputTensors();
  // Reinitialize counter for OpenTuner flags - next mini-batch of execution
  op_counter = 0;
  // Clearing profiling data map
  func_counters.clear();
}
// FIXIT: Fix any assumptions on the NCHW format
// TODO: benchmark split performance and check if it is prohibitively high?
...