diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
index 6e81c7a3fbfbe4cae3cd1c40f43a4c7d5ea2d7c8..4d89d38ad193164027c2e7fde78764df3cdd7a92 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
@@ -33,14 +33,11 @@ struct Tensor {
   int data_type;
   int cur_type;
   int data_format;
-  data_location_t
-      data_placement; // Maintains the location of the tensor {host, device...}
-  cudnnTensorDescriptor_t tensor_desc;
-  cudnnFilterDescriptor_t
-      filter_desc; // FIXIT: Rethink if this should be in tensor struct
-  cudnnTensorDescriptor_t tensor_half_desc;
-  cudnnFilterDescriptor_t
-      filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
+  data_location_t data_placement; // Maintains the location of the tensor {host, device...}
+  cudnnTensorDescriptor_t tensor_desc;
+  cudnnFilterDescriptor_t filter_desc; // FIXIT: Rethink if this should be in tensor struct
+  cudnnTensorDescriptor_t tensor_half_desc;
+  cudnnFilterDescriptor_t filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
   void *host_data;
   void *gpu_data;      // Pointer to GPU FP32 data
   void *gpu_half_data; // Pointer to GPU FP16 data
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
index 22f202ef039ed45eb0d2690f2c4a35c9b73e4bc5..698ab026dac0324cb456f69b4398111ac708412d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
@@ -201,8 +201,7 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   long int size_in_bytes =
       type_size * dim1_size * dim2_size * dim3_size * dim4_size;
   float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
-  //printf("size_in_bytes = %lu \n", size_in_bytes);
-
+
   int file_header_size = 0;
 
   FILE *file = fopen(file_name, "rb");
@@ -220,12 +219,13 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
       data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
-  // compareValues(weights, tensor_data, num_elems);
+  free(tensor_data);
 
   return weights;
 }
 
+
 struct Tensor *readInputBatch(const char *file_name, long data_type,
                               long start, long end, long dim2_size,
                               long dim3_size, long dim4_size) {
@@ -252,8 +252,7 @@ struct Tensor *readInputBatch(const char *file_name, long data_type,
 
   fclose(file);
 
-  struct Tensor *weights = (struct Tensor *)create4DTensor(
-      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+  struct Tensor *weights = (struct Tensor *) create4DTensor(data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
   free(tensor_data);
@@ -589,3 +588,4 @@ void dumpOutput(void *output_ptr, const char *file_name) {
 }
 
 #endif
+
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 9f17a3dc5134e576ec0ba9819821247bc1e5fa6c..2d0ba3288b6e440046655a0acef9454b335ce55c 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -1191,7 +1191,7 @@ static int num_executations = 0;
 
 float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
-  struct Tensor *result = (struct Tensor *)result_ptr;
+  struct Tensor *result = (struct Tensor *) result_ptr;
 
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t num_classes = result->dims.dim_sizes[1];
@@ -1204,6 +1204,7 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
     int chosen = 0;
     for (int id = 1; id < num_classes; ++id) {
+      //printf(" check = %f \n ", data[i * num_classes + id]);
       if (data[i * num_classes + chosen] < data[i * num_classes + id])
         chosen = id;
     }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 95c17752fd3fd0e441f2c6335591eeb741105db9..dcc6e2e7ce832afc24b560006cb07272f08842cf 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -873,8 +873,9 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
   int x_radius = (window_width - 1) / 2;
   int y_radius = (window_height - 1) / 2;
 
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
+  Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, channels,
                                             output_height, output_width);
+
   float *__restrict__ output_data = (float *)output->host_data;
 
   omp_set_num_threads(4);
@@ -891,10 +892,10 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
           int y_radius_var_max = y_radius_var + image_height;
           int x_radius_var = x_radius - c;
           int x_radius_var_max = x_radius_var + image_width;
-          int ki_min =
-              (y_radius_var > 0)
-                  ? ((y_radius_var < window_height) ? y_radius_var : -1)
-                  : 0;
+          int ki_min = (y_radius_var > 0)
+                           ? ((y_radius_var < window_height) ? y_radius_var : -1)
+                           : 0;
+
           int ki_max = (y_radius_var_max < window_height)
                            ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
                            : window_height;
@@ -977,11 +978,11 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
 
   int m = lhs->dims.dim_sizes[0];
   int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
-  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
+  Tensor *output = (Tensor *) create4DTensor(0, 0, m, n, 1, 1);
 
-  float *__restrict__ lhs_arr = (float *)lhs->host_data;
-  float *__restrict__ rhs_arr = (float *)rhs->host_data;
-  float *__restrict__ output_arr = (float *)output->host_data;
+  float *__restrict__ lhs_arr = (float *) lhs->host_data;
+  float *__restrict__ rhs_arr = (float *) rhs->host_data;
+  float *__restrict__ output_arr = (float *) output->host_data;
 
   int k = 1;
 #pragma unroll 4 // Can we unroll more???
@@ -1009,26 +1010,35 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
       output_arr[i * n + j] = sum;
     }
   }
+  free(tran_rhs);
+
   return output;
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
+  Tensor *input = (Tensor *) input_ptr;
   deviceToHostCopy(input);
 
-  float *logits = (float *)input->host_data;
+  float *logits = (float *) input->host_data;
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
+
+  float max = logits[0];
+  for (unsigned int i = 0; i < n * c; i++) {
+    if (logits[i] > max) {
+      max = logits[i];
+    }
+  }
 
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (int i = 0; i < n; i++) {
-    float x = 0;
+    double x = 0;
     for (int j = i * c; j < c + i * c; j++) {
-      logits[j] = expf(logits[j]);
+      logits[j] = exp(logits[j] - max); // subtract the max logit for numerical stability
     }
 
 #pragma omp simd reduction(+ : x)
@@ -1036,11 +1046,14 @@ void *tensorSoftmaxCPU(void *input_ptr) {
       x += logits[j];
     }
 
+    //printf("x = %f \n ", x);
+
 #pragma omp simd
     for (int j = i * c; j < i * c + c; j++) {
       logits[j] /= x;
     }
   }
+
   return input;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index f6bfe700b44c88fea06c6a76267b49af4a523716..4934f5834f07a37d91575a7b5821ec243762b52a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -289,6 +289,7 @@ void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
 
 void *create4DTensor(int data_type, int data_format, size_t dim1_size,
                      size_t dim2_size, size_t dim3_size, size_t dim4_size) {
+
   struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
   size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
   allocateMem(tensor, data_type, num_elems);
@@ -308,15 +309,16 @@ void *create4DTensor(int data_type, int data_format, size_t dim1_size,
   set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
                         dim4_size);
 
+  changeTensorPlacement(tensor, HOST);
+
   return tensor;
 }
 
 void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
-  Tensor *tensor = (Tensor *)tensor_ptr;
-
+  Tensor *tensor = (Tensor *) tensor_ptr;
   size_t host_size_in_bytes = tensor->num_elems * 4;
-  // if(tensor->size_in_bytes != size_in_bytes){
+
   if (host_size_in_bytes != size_in_bytes) {
     ERROR("The destination and source sizes don't match");
   }
@@ -330,29 +332,33 @@ void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
 void hostToDeviceCopy(struct Tensor *tensor) {
 
+  DEBUG("** HostToDevice *** \n");
   if (tensor->data_placement != DEVICE) {
     cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
                cudaMemcpyHostToDevice);
     DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
     tensor->data_placement = DEVICE;
-  } else {
+  }
+  else {
    DEBUG("No data movement required - Data on Device \n");
   }
 }
 
 void deviceToHostCopy(struct Tensor *tensor) {
 
+  DEBUG("*** DeviceToHost *** ");
   if (tensor->data_placement != HOST) {
     cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
                cudaMemcpyDeviceToHost);
     DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
     tensor->data_placement = HOST;
-  } else {
+  }
+  else {
     DEBUG("No data movement required - Data on Host \n");
   }
 }
 
-// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
+
 void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
 
@@ -364,7 +370,8 @@ void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
                srcTensor->size_in_bytes);
     DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
     dstTensor->data_placement = HOST;
-  } else if (srcTensor->data_placement == DEVICE) {
+  }
+  else if (srcTensor->data_placement == DEVICE) {
     cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data,
                srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice);
     DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
@@ -382,7 +389,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyDeviceToHost);
       DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
       tensor->data_placement = HOST;
-    } else {
+    }
+    else {
       DEBUG("No data movement required - Data on Host \n");
     }
   }
@@ -394,7 +402,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyHostToDevice);
       DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
       tensor->data_placement = DEVICE;
-    } else {
+    }
+    else {
      DEBUG("No data movement required - Data on Device \n");
     }
   }
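
--
Note on the tensorSoftmaxCPU hunk: it applies the standard max-subtraction
trick. Softmax is invariant to adding a constant to a row's logits, so
subtracting the maximum logit before exponentiating keeps every exponent
non-positive and prevents exp() from overflowing, without changing the result.
A minimal standalone sketch of the technique in plain C (the names
stable_softmax, rows, and cols are illustrative, not part of this runtime):

#include <math.h>

/* Row-wise softmax over a rows x cols matrix, stabilized by subtracting
 * each row's maximum logit before exponentiating. */
static void stable_softmax(float *data, int rows, int cols) {
  for (int i = 0; i < rows; i++) {
    float *row = data + (long)i * cols;

    /* Find the row maximum. */
    float max_val = row[0];
    for (int j = 1; j < cols; j++)
      if (row[j] > max_val)
        max_val = row[j];

    /* Exponentiate shifted logits; arguments are <= 0, so no overflow. */
    double sum = 0.0;
    for (int j = 0; j < cols; j++) {
      row[j] = expf(row[j] - max_val);
      sum += row[j];
    }

    /* Normalize to probabilities. */
    for (int j = 0; j < cols; j++)
      row[j] /= (float)sum;
  }
}

As in the patch's change of x from float to double, the sketch accumulates the
normalizer in double to reduce rounding error when cols is large.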