Skip to content
Snippets Groups Projects
Commit 706182bf authored by Hashim Sharif's avatar Hashim Sharif
Browse files

Starting with Auto Data conversions for FP16/FP32 - Incomplete

parent 939b99c8
No related branches found
No related tags found
No related merge requests found
#ifndef FP16_UTILS_HEADER
#define FP16_UTILS_HEADER
#include <iostream>
#include <string>
#include <cublas_v2.h>
......@@ -270,3 +275,5 @@ void hgemm(const float * const __restrict__ af,
return 0;
}
*/
#endif
\ No newline at end of file
......@@ -41,8 +41,11 @@ struct Tensor{
data_location_t data_placement; // Maintains the location of the tensor {host, device...}
cudnnTensorDescriptor_t tensor_desc;      // FP32 tensor descriptor
cudnnFilterDescriptor_t filter_desc;      // FP32 filter descriptor - FIXIT: Rethink if this should be in tensor struct
cudnnTensorDescriptor_t tensor_half_desc; // FP16 tensor descriptor (mirrors tensor_desc with CUDNN_DATA_HALF)
cudnnFilterDescriptor_t filter_half_desc; // FP16 filter descriptor - FIXIT: Rethink if this should be in tensor struct
void* host_data;      // Pointer to the host copy of the data
void* gpu_data;       // Pointer to GPU FP32 data
void* gpu_half_data;  // Pointer to GPU FP16 data (NULL when no FP16 copy exists)
size_t num_elems; // Total elements
size_t size_in_bytes; // Total size in bytes
struct Dimension dims;
......
......@@ -35,6 +35,7 @@
#include "debug.h"
#include "tensor.h"
#include "global_data.h"
#include "fp16_gemm.cu"
// used to map HPVM tensors to runtime tensors (with extra runtime-specific information)
......@@ -118,6 +119,7 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
tensor->num_elems = num_elems;
tensor->host_data = (void*) malloc(tensor->size_in_bytes); // Allocate memory on the host
tensor->data_placement = HOST; // By default data is on the host
//printf("Allocating Sizes = %lu \n", tensor->size_in_bytes);
checkCudaErrors(cudaMalloc(&tensor->gpu_data, tensor->size_in_bytes)); // Allocate memory on GPU
tensors_ptr.push_back(tensor->gpu_data); // Track the device allocation - presumably freed in bulk later; TODO confirm ownership
......@@ -150,6 +152,9 @@ void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
setCudnnDataFormat(tensor, data_format);
checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_desc));
checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_half_desc));
checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_desc,
(cudnnDataType_t) tensor->data_type,
......@@ -157,7 +162,16 @@ void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
dim1_size,
dim2_size,
dim3_size,
dim4_size));
// Initialize the FP16 filter descriptor with the same layout/dims as the FP32 one.
// NOTE(review): the original passed tensor->data_type as the data type and cast
// CUDNN_DATA_HALF to a tensor *format*; per the cuDNN API the half descriptor
// must receive CUDNN_DATA_HALF as the data type and the tensor's format as format.
checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_half_desc,
CUDNN_DATA_HALF,
(cudnnTensorFormat_t) tensor->data_format,
dim1_size,
dim2_size,
dim3_size,
dim4_size));
}
......@@ -169,15 +183,24 @@ void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_desc));
checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_half_desc));
// For certain operations, the strides may need to change - in which case the descriptor
// needs to be reinitialized
// FIXIT: Only specific to floats - make generic and test
// NOTE(review): these two calls previously discarded their cudnnStatus_t while every
// other cuDNN call in this file goes through checkCUDNN - wrap them for consistency.
checkCUDNN(cudnnSetTensor4dDescriptor(tensor->tensor_desc,
(cudnnTensorFormat_t) tensor->data_format, // Data format
(cudnnDataType_t) tensor->data_type, // Data type
dim1_size, dim2_size,
dim3_size, dim4_size));

// FP16 mirror of the descriptor: same layout and dims, CUDNN_DATA_HALF data type
checkCUDNN(cudnnSetTensor4dDescriptor(tensor->tensor_half_desc,
(cudnnTensorFormat_t) tensor->data_format, // Data format
(cudnnDataType_t) CUDNN_DATA_HALF, // Data type
dim1_size, dim2_size,
dim3_size, dim4_size));
cudnnDataType_t dType;
int nStride, cStride, hStride, wStride;
int size1, size2, size3, size4;
......@@ -374,11 +397,38 @@ extern "C"{
}
}
// Creates (or reuses) the tensor's GPU FP16 buffer and converts the GPU FP32
// data into it via the f2h kernel.
// Assumes gpu_data holds num_elems floats, hence the FP16 copy needs half the
// bytes; assumes gpu_half_data is NULL whenever no FP16 copy exists (the
// protocol convertToFP32 maintains) - TODO confirm it is NULL-initialized at
// tensor creation.
void convertToFP16(struct Tensor* tensor){

size_t size_in_bytes = tensor->size_in_bytes / 2; // FP16 is half the width of FP32

// Reuse an existing FP16 buffer instead of leaking it on repeated conversions
if(tensor->gpu_half_data == NULL)
  checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU

f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data);
}
// Converts the tensor's GPU FP16 data back into the FP32 buffer via the h2f
// kernel, then releases the FP16 copy and marks it absent (gpu_half_data = NULL).
// No-op when no FP16 copy exists.
void convertToFP32(struct Tensor* tensor){

// Guard: nothing to convert if the FP16 buffer was never created; also
// prevents h2f from reading a NULL pointer and a double cudaFree.
if(tensor->gpu_half_data == NULL)
  return;

h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);

checkCudaErrors(cudaFree(tensor->gpu_half_data)); // file convention: check all CUDA API returns
tensor->gpu_half_data = NULL;
}
// Runtime helper that records where a tensor's live data currently resides
// (e.g. flipping an output tensor's placement from host to device).
void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement){

if(tensor == NULL){
  ERROR("Tensor == NULL");
}

tensor->data_placement = data_placement;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment