Skip to content
Snippets Groups Projects
Commit 706182bf authored by Hashim Sharif's avatar Hashim Sharif
Browse files

Starting with Auto Data conversions for FP16/FP32 - Incomplete

parent 939b99c8
No related branches found
No related tags found
No related merge requests found
#ifndef FP16_UTILS_HEADER
#define FP16_UTILS_HEADER
#include <iostream>
#include <string>
#include <cublas_v2.h>
......@@ -270,3 +275,5 @@ void hgemm(const float * const __restrict__ af,
return 0;
}
*/
#endif
\ No newline at end of file
......@@ -41,8 +41,11 @@ struct Tensor{
data_location_t data_placement; // Maintains the location of the tensor {host, device...}
cudnnTensorDescriptor_t tensor_desc;      // FP32 tensor descriptor
cudnnFilterDescriptor_t filter_desc;      // FP32 filter descriptor - FIXIT: Rethink if this should be in tensor struct
cudnnTensorDescriptor_t tensor_half_desc; // FP16 tensor descriptor (mirrors tensor_desc with CUDNN_DATA_HALF)
cudnnFilterDescriptor_t filter_half_desc; // FP16 filter descriptor - FIXIT: Rethink if this should be in tensor struct
void* host_data;      // Pointer to the host copy of the data
void* gpu_data;       // Pointer to GPU FP32 data
void* gpu_half_data;  // Pointer to GPU FP16 data (NULL when no FP16 copy exists)
size_t num_elems; // Total elements
size_t size_in_bytes; // Total size in bytes
struct Dimension dims;
......
......@@ -35,6 +35,7 @@
#include "debug.h"
#include "tensor.h"
#include "global_data.h"
#include "fp16_gemm.cu"
// used to map HPVM tensors to runtime tensors (with extra runtime-specific information)
......@@ -118,6 +119,7 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
tensor->num_elems = num_elems;
tensor->host_data = (void*) malloc(tensor->size_in_bytes); // Allocate memory on the host
tensor->data_placement = HOST; // By default data is on the host
//printf("Allocating Sizes = %lu \n", tensor->size_in_bytes);
checkCudaErrors(cudaMalloc(&tensor->gpu_data, tensor->size_in_bytes)); // Allocate memory on GPU
tensors_ptr.push_back(tensor->gpu_data); // Track the device allocation - presumably freed in bulk later; TODO confirm ownership
......@@ -150,6 +152,9 @@ void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
setCudnnDataFormat(tensor, data_format);
checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_desc));
checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_half_desc));
checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_desc,
(cudnnDataType_t) tensor->data_type,
......@@ -157,7 +162,16 @@ void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
dim1_size,
dim2_size,
dim3_size,
dim4_size));
// Initialize the FP16 filter descriptor with the same layout/dims as the FP32 one.
// NOTE(review): the original passed tensor->data_type as the data type and cast
// CUDNN_DATA_HALF to a tensor *format*; per the cuDNN API the half descriptor
// must receive CUDNN_DATA_HALF as the data type and the tensor's format as format.
checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_half_desc,
CUDNN_DATA_HALF,
(cudnnTensorFormat_t) tensor->data_format,
dim1_size,
dim2_size,
dim3_size,
dim4_size));
}
......@@ -169,15 +183,24 @@ void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_desc));
checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_half_desc));
// For certain operations, the strides may need to change - in which case the descriptor
// needs to be reinitialized
// FIXIT: Only specific to floats - make generic and test
// NOTE(review): these two calls previously discarded their cudnnStatus_t while every
// other cuDNN call in this file goes through checkCUDNN - wrap them for consistency.
checkCUDNN(cudnnSetTensor4dDescriptor(tensor->tensor_desc,
(cudnnTensorFormat_t) tensor->data_format, // Data format
(cudnnDataType_t) tensor->data_type, // Data type
dim1_size, dim2_size,
dim3_size, dim4_size));

// FP16 mirror of the descriptor: same layout and dims, CUDNN_DATA_HALF data type
checkCUDNN(cudnnSetTensor4dDescriptor(tensor->tensor_half_desc,
(cudnnTensorFormat_t) tensor->data_format, // Data format
(cudnnDataType_t) CUDNN_DATA_HALF, // Data type
dim1_size, dim2_size,
dim3_size, dim4_size));
cudnnDataType_t dType;
int nStride, cStride, hStride, wStride;
int size1, size2, size3, size4;
......@@ -374,11 +397,38 @@ extern "C"{
}
}
// Creates (or reuses) the tensor's GPU FP16 buffer and converts the GPU FP32
// data into it via the f2h kernel.
// Assumes gpu_data holds num_elems floats, hence the FP16 copy needs half the
// bytes; assumes gpu_half_data is NULL whenever no FP16 copy exists (the
// protocol convertToFP32 maintains) - TODO confirm it is NULL-initialized at
// tensor creation.
void convertToFP16(struct Tensor* tensor){

size_t size_in_bytes = tensor->size_in_bytes / 2; // FP16 is half the width of FP32

// Reuse an existing FP16 buffer instead of leaking it on repeated conversions
if(tensor->gpu_half_data == NULL)
  checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU

f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data);
}
// Converts the tensor's GPU FP16 data back into the FP32 buffer via the h2f
// kernel, then releases the FP16 copy and marks it absent (gpu_half_data = NULL).
// No-op when no FP16 copy exists.
void convertToFP32(struct Tensor* tensor){

// Guard: nothing to convert if the FP16 buffer was never created; also
// prevents h2f from reading a NULL pointer and a double cudaFree.
if(tensor->gpu_half_data == NULL)
  return;

h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);

checkCudaErrors(cudaFree(tensor->gpu_half_data)); // file convention: check all CUDA API returns
tensor->gpu_half_data = NULL;
}
// Runtime helper that records where a tensor's live data currently resides
// (e.g. flipping an output tensor's placement from host to device).
void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement){

if(tensor == NULL){
  ERROR("Tensor == NULL");
}

tensor->data_placement = data_placement;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment