From 7c7949d804c73bbe30ade8ee485226bbe9de5c7b Mon Sep 17 00:00:00 2001 From: Yasmin <ycs4@cornell.edu> Date: Sat, 7 Dec 2019 18:42:13 -0500 Subject: [PATCH] sampling accuracy discrepancy --- .../dnn_sources/src/alexnet2_approxhalf.cc | 148 +++++++ .../dnn_sources/src/alexnet2_sampsim.cc | 148 +++++++ .../tensor_runtime/include/approx_api.h | 7 + .../include/approx_techniques2.h | 370 ++++++++++++++++++ 4 files changed, 673 insertions(+) create mode 100644 llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc create mode 100644 llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc new file mode 100644 index 0000000000..bd604a334a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc @@ -0,0 +1,148 @@ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + + + +/* NOTE: Reference Architecture to use for profiling */ +void testCifarNet(){ + + printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); + + std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("norm_cifar_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + + void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin", + float_type, 32, 3, 3, 3); + void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin", + float_type, 32, 32, 3, 3); + void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin", + float_type, 1, 32, 1, 1); + void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin", + float_type, 64, 32, 3, 3); + void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin", + float_type, 1, 64, 1, 1); + void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin", + float_type, 64, 64, 3, 3); + void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin", + float_type, 1, 64, 1, 1); + void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin", + float_type, 128, 64, 3, 3); + void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin", + float_type, 1, 128, 1, 1); + void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin", + float_type, 128, 128, 3, 3); + void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin", + float_type, 1, 128, 1, 1); + + void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin", + float_type, 1, 1, 2048, 10); + void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin", + float_type, 1, 10, 1, 1); + + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. 
FIXIT: use enum + + + startMemTracking(); + + int test_input_size = 10000; + int batch_size = 2500; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + void* conv1out = tensorConvApproxHalf2(input, conv1_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 2, 1); + + + tensorAdd(conv1out, conv1_bias); + void* conv1_tanh = tensorTanh(conv1out); + + // 2nd Layer + void* conv2out = tensorConvApproxHalf(conv1_tanh, conv2_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv2out, conv2_bias); + void* conv2_tanh = tensorTanh(conv2out); + void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 3rd Layer + void* conv3out = tensorConvApproxHalf(pool2out, conv3_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv3out, conv3_bias); + void* conv3_tanh = tensorTanh(conv3out); + + // 4th Layer + void* conv4out = tensorConvApproxHalf(conv3_tanh, conv4_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv4out, conv4_bias); + void* conv4_tanh = tensorTanh(conv4out); + void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 5th Layer + void* conv5out = tensorConvApproxHalf(pool4out, conv5_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv5out, conv5_bias); + void* conv5_tanh = tensorTanh(conv5out); + + // 6th Layer + void* conv6out = tensorConvApproxHalf(conv5_tanh, conv6_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv6out, conv6_bias); + + void* conv6_tanh = tensorTanh(conv6out); + void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + + // final FC Layer + void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + void* result = tensorSoftmax(gemm1biasout); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, result); + final_accuracy += accuracy; + + freeBatchMemory(); + } + + stopProfiling(); + + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); + +} + + +int main(int argc, char* argv[]){ + + llvm_hpvm_initTensorRt(0); + + testCifarNet(); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc new file mode 100644 index 0000000000..a0ac48b5ef --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc @@ -0,0 +1,148 @@ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + + + +/* NOTE: Reference Architecture to use for profiling */ +void testCifarNet(){ + + printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); + + std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("norm_cifar_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + + void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin", + float_type, 32, 3, 3, 
3); + void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin", + float_type, 32, 32, 3, 3); + void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin", + float_type, 1, 32, 1, 1); + void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin", + float_type, 64, 32, 3, 3); + void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin", + float_type, 1, 64, 1, 1); + void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin", + float_type, 64, 64, 3, 3); + void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin", + float_type, 1, 64, 1, 1); + void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin", + float_type, 128, 64, 3, 3); + void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin", + float_type, 1, 128, 1, 1); + void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin", + float_type, 128, 128, 3, 3); + void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin", + float_type, 1, 128, 1, 1); + + void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin", + float_type, 1, 1, 2048, 10); + void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin", + float_type, 1, 10, 1, 1); + + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + + + startMemTracking(); + + int test_input_size = 10000; + int batch_size = 2500; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + void* conv1out = tensorConvSampSim(input, conv1_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 2, 0); + + + tensorAdd(conv1out, conv1_bias); + void* conv1_tanh = tensorTanh(conv1out); + + // 2nd Layer + void* conv2out = tensorConvApproxHalf(conv1_tanh, conv2_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv2out, conv2_bias); + void* conv2_tanh = tensorTanh(conv2out); + void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 3rd Layer + void* conv3out = tensorConvApproxHalf(pool2out, conv3_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv3out, conv3_bias); + void* conv3_tanh = tensorTanh(conv3out); + + // 4th Layer + void* conv4out = tensorConvApproxHalf(conv3_tanh, conv4_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv4out, conv4_bias); + void* conv4_tanh = tensorTanh(conv4out); + void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 5th Layer + void* conv5out = tensorConvApproxHalf(pool4out, conv5_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv5out, conv5_bias); + void* conv5_tanh = tensorTanh(conv5out); + + // 6th Layer + void* conv6out = tensorConvApproxHalf(conv5_tanh, conv6_filter, 1, 1, 1, 1, + conv_mode, conv_precision, 1, 1, 1, 1); + tensorAdd(conv6out, conv6_bias); + + void* conv6_tanh = tensorTanh(conv6out); + void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + + 
// final FC Layer + void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + void* result = tensorSoftmax(gemm1biasout); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, result); + final_accuracy += accuracy; + + freeBatchMemory(); + } + + stopProfiling(); + + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); + +} + + +int main(int argc, char* argv[]){ + + llvm_hpvm_initTensorRt(0); + + testCifarNet(); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} + diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h index 811fa4090f..50438eb22c 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h @@ -67,4 +67,11 @@ extern "C"{ int row, int col, int skip_every, int skip_offset); + void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups, + int row, int col, + int skip_every, int skip_offset); + } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h index 6042cc7dae..97c7b9070e 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h @@ -1671,3 +1671,373 @@ void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr, return new_output; } + + +//Functions for correct offset numbering + +//produces COL MAJOR matrix with reduced_filter_elem rows and NF cols +__global__ void createReducedFiltersHalfNew(__half * output, + const __half * const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, + const float fac) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int fIdx = tx / num_filter_elem; //filter index + const int offset = tx % num_filter_elem; //offset within filter + if(fIdx < NF) { //is thread id within bounds? + if(offset % skip_every != skip_offset) { //are we including this filter element? 
+ const int output_row = offset - + (offset/skip_every + (offset % skip_every > skip_offset)); + output[fIdx*reduced_filter_elem + output_row] = + __hmul(__float2half_rn(fac), input[tx]); + } + } +} + +//COL Major matrix with N*H*W columns and reduced_filter_elem rows +//skip_every = 1 means no perforation +__global__ void convToGemmHalfInputNew(__half * const __restrict__ output, + const __half * const __restrict input, + const int N, const int C, + const int H, const int W, + const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, + const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_out * W_out); //output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number + const int h = tx % (H_out * W_out) / W_out; //output height index (row number) + const int w = tx % W_out; //output width index (col number) + const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? + for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + + if(filter_elem_num % skip_every != skip_offset) { + int output_col = filter_elem_num - + (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); + if(skip_every == 1) + output_col = filter_elem_num; + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col*N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col*N + n) * H_out + h) * W_out + w] = 0; + } + } + } + } +} + + +//COL Major matrix with N*H*W columns and reduced_filter_elem rows +//Can only be used when skipping every other element in input sampling +__global__ void convToGemmHalfInput2New(__half * const __restrict__ output, + const __half * const __restrict input, + const int N, const int C, + const int H, const int W, + const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, + const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_out * W_out); //output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number + const int h = tx % (H_out * W_out) / W_out; //output height index (row number) + const int w = tx % W_out; //output width index (col number) + const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? 
+    const int filter_elem_num = c * KH * KW;
+
+    for(int l = (filter_elem_num % 2 == skip_offset); l < KH * KW; l+=2) {
+      int i = l / KW;
+      int j = l % KW;
+
+      const int new_idx = filter_elem_num + i * KW + j;
+      const int output_col = new_idx -
+        (new_idx / 2 + (new_idx % 2 > skip_offset)); //new output column
+
+      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+        output[((output_col*N + n) * H_out + h) * W_out + w] =
+          input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+      else
+        output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
+
+    }
+  }
+}
+
+void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
+                            int vertical_pad, int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode, int conv_groups,
+                            int row, int col, int skip_every, int offset){
+
+  INFO("*** TensorConvolution half approximation \n");
+  profileEvent("#Conv");
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  profileEvent("F2H_start");
+  convertToFP16(input);
+  convertToFP16(filter);
+  profileEvent("F2H_end");
+
+  /******* END OF INPUT DATA CONVERSIONS*/
+
+  profileEvent("Conv");
+
+  int n, c, h, w; // output dimensions
+  n = input->dims.dim_sizes[0];
+  c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+
+  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  int h_eff = h - h / row;
+  if(h % row > row - 1 - offset)
+    h_eff = h_eff - 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  int w_eff = w - w / col;
+  if(w % col > col - 1 - offset)
+    w_eff = w_eff - 1;
+
+
+  INFO("input: %d %d %d %d\n", input->dims.dim_sizes[0], input->dims.dim_sizes[1],
+       input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
+  INFO("filter: %d %d %d %d\n", filter->dims.dim_sizes[0], filter->dims.dim_sizes[1],
+       filter->dims.dim_sizes[2], filter->dims.dim_sizes[3]);
+  INFO("output: %d %d %d %d\n", n, c, h, w);
+
+
+  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+                                               CUDNN_TENSOR_NCHW, n, c, h, w);
+  // NOTE: Changing output tensor placement from host to device
+  changeTensorPlacement(new_output, DEVICE);
+
+  if(row > 1){
+    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+                                                  CUDNN_TENSOR_NCHW,
+                                                  n, c, h_eff, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+                                                   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+                                                   input->dims.dim_sizes[3], KH, KW, vertical_pad,
+                                                   horizontal_pad, h, w, vertical_stride,
+                                                   horizontal_stride, row, offset, h_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+
checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + n * h_eff * w, c, num_filter_elem, + alpha_half, + convData, CUDA_R_16F, n * h_eff * w, + (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, + (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + + //interpolate + int numBlocks = (n * c * h * w + 255) / 256; + approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, + row, offset); + cudaDeviceSynchronize(); + + freeTensor(output_half); + cudaFree(convData); + } + else if(col > 1){ + Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output_half, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + + __half * convData; + int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, + horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); + + + checkCudaErrors(cudaDeviceSynchronize()); + + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + + + checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + n * h * w_eff, c, num_filter_elem, + alpha_half, + convData, CUDA_R_16F, n * h * w_eff, + (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, + (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + + //interpolate + int numBlocks = (n * c * h * w + 255) / 256; + approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, + col, offset); + + cudaDeviceSynchronize(); + + freeTensor(output_half); + cudaFree(convData); + + } + else{ + Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + //reduced number after skipping + int reduced_filter_elem; + if(offset != skip_every){ + reduced_filter_elem = num_filter_elem - (num_filter_elem/skip_every); + if(num_filter_elem % skip_every > offset) + reduced_filter_elem = reduced_filter_elem - 1; + } + else + reduced_filter_elem = num_filter_elem; + + __half * convData; + int convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + __half * reducedFilter; + checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); + const int filtBlockSize = 128; + const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize; + if(offset != skip_every){ + float fac = (skip_every * 1.0) / (skip_every - 1); + 
createReducedFiltersHalfNew<<<filtGridSize, filtBlockSize>>>(reducedFilter, + (__half *)filter->gpu_half_data, + c, num_filter_elem, reduced_filter_elem, + skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + } + + const int blockSize = 256; + const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + if(skip_every == 2){ + convToGemmHalfInput2New<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], + input->dims.dim_sizes[2], + input->dims.dim_sizes[3], + KH, KW, vertical_pad, horizontal_pad, + h, w, vertical_stride, horizontal_stride, + reduced_filter_elem, skip_every, + offset); + } + else{ + convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], + input->dims.dim_sizes[2], + input->dims.dim_sizes[3], + KH, KW, vertical_pad, horizontal_pad, + h, w, vertical_stride, horizontal_stride, + reduced_filter_elem, skip_every, + offset); + } + + checkCudaErrors(cudaDeviceSynchronize()); + //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW] + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + + if(offset != skip_every) + checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + n * h * w, c, reduced_filter_elem, + alpha_half, + convData, CUDA_R_16F, n * h * w, + reducedFilter, CUDA_R_16F, reduced_filter_elem, + beta_half, + (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + else + checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + n * h * w, c, reduced_filter_elem, + alpha_half, + convData, CUDA_R_16F, n * h * w, + (__half*) filter->gpu_half_data, CUDA_R_16F, + reduced_filter_elem, + beta_half, + (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + + + int numBlocks = (n * c * h * w + 255) / 256; + switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, + (__half *)output->gpu_half_data, + (__half *)new_output->gpu_half_data); + + checkCudaErrors(cudaDeviceSynchronize()); + + cudaFree(convData); + cudaFree(reducedFilter); + freeTensor(output); + + } + + profileEvent("H2F_start"); + convertToFP32_offline(new_output); + profileEvent("H2F_end"); + + profileEvent("#Conv_end"); + + + return new_output; +} -- GitLab
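Reviewer note, not part of the patch: the new kernels and tensorConvApproxHalf2 all rely on the same linear-index bookkeeping for the retained filter elements, which is what the "Functions for correct offset numbering" comment refers to. The host-side sketch below mirrors that arithmetic so the mapping can be sanity-checked on the CPU; the helper names reduced_index and count_reduced_filter_elem, and the main() driver, are illustrative only and do not exist in the tensor runtime.

#include <cstdio>

// Mirrors createReducedFiltersHalfNew / convToGemmHalfInputNew:
// a filter element at linear index idx is dropped when
// idx % skip_every == skip_offset, and otherwise lands at
// idx - (idx / skip_every + (idx % skip_every > skip_offset)).
static int reduced_index(int idx, int skip_every, int skip_offset) {
  return idx - (idx / skip_every + (idx % skip_every > skip_offset));
}

// Mirrors the reduced_filter_elem computation in tensorConvApproxHalf2.
static int count_reduced_filter_elem(int num_filter_elem, int skip_every,
                                     int skip_offset) {
  int reduced = num_filter_elem - num_filter_elem / skip_every;
  if (num_filter_elem % skip_every > skip_offset)
    reduced -= 1;
  return reduced;
}

int main() {
  // Example: conv1 of alexnet2 (C = 3, KH = KW = 3), sampled with
  // skip_every = 2 and skip_offset = 1 as in alexnet2_approxhalf.cc.
  const int num_filter_elem = 3 * 3 * 3;
  const int skip_every = 2, skip_offset = 1;

  printf("reduced_filter_elem = %d\n",
         count_reduced_filter_elem(num_filter_elem, skip_every, skip_offset));

  // Every retained index must map to a unique row in [0, reduced_filter_elem).
  for (int idx = 0; idx < num_filter_elem; ++idx)
    if (idx % skip_every != skip_offset)
      printf("kept idx %2d -> reduced row %2d\n", idx,
             reduced_index(idx, skip_every, skip_offset));
  return 0;
}

For skip_every = 2 and skip_offset = 1 this keeps the 14 even indices of the 27 filter elements and compacts them to rows 0 through 13, which matches the column numbering convToGemmHalfInput2New produces for the im2col matrix.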