Commit a532ece7 authored by Hashim Sharif

Adding New Sampling Knobs to tensorConvSampSim -- with half interpolation

parent 714bef9e
@@ -18,15 +18,24 @@ perf,135 4,1,0 1.33 tensorConvolution tensorConvApprox dev conv
perf,136 4,1,1 1.33 tensorConvolution tensorConvApprox dev conv
perf,137 4,1,2 1.33 tensorConvolution tensorConvApprox dev conv
perf,138 4,1,3 1.33 tensorConvolution tensorConvApprox dev conv
samp,231 2,0 2.0 tensorConvolution tensorConvApprox dev conv
samp,232 2,1 2.0 tensorConvolution tensorConvApprox dev conv
samp,233 3,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,234 3,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,235 3,2 1.5 tensorConvolution tensorConvApprox dev conv
samp,236 4,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,237 4,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,238 4,2 1.33 tensorConvolution tensorConvApprox dev conv
samp,239 4,3 1.33 tensorConvolution tensorConvApprox dev conv
samp,231 2,0,0 2.0 tensorConvolution tensorConvApprox dev conv
samp,232 2,1,0 2.0 tensorConvolution tensorConvApprox dev conv
samp,233 3,0,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,234 3,1,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,235 3,2,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,236 4,0,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,237 4,1,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,238 4,2,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,239 4,3,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,240 2,0,1 2.0 tensorConvolution tensorConvApprox dev conv
samp,241 2,1,1 2.0 tensorConvolution tensorConvApprox dev conv
samp,242 3,0,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,243 3,1,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,244 3,2,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,245 4,0,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,246 4,1,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,247 4,2,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,248 4,3,1 1.33 tensorConvolution tensorConvApprox dev conv
red_samp,41 1 1.5 tensorReduction tensorReduction dev red
red_samp,42 1 2.25 tensorReduction tensorReduction dev red
red_samp,43 1 1.4 tensorReduction tensorReduction dev red
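Each samp knob row above now carries three comma-separated parameters instead of two: skip_every (the sampling rate), skip_offset, and the new interpolation_id (0 selects the original full-interpolation scaling, 1 the new half-interpolation scaling), followed by the estimated speedup and the operation/approximation names. Below is a minimal C++ sketch of how one such parameter field could be decoded; it mirrors the std::getline-based parsing added further down in this commit, but the helper name, struct, and use of std::stringstream here are illustrative assumptions, not the runtime's actual API.

#include <cstdlib>
#include <sstream>
#include <string>

// Illustrative-only struct mirroring the SampParams fields added below.
struct KnobParams { int skip_every; int skip_offset; int interpolation_id; };

// Hypothetical helper: decode a knob parameter field such as "2,0,1".
KnobParams parseSampKnob(const std::string& field) {
  std::stringstream token_stream(field);
  std::string tok;
  KnobParams p;
  std::getline(token_stream, tok, ','); p.skip_every = atoi(tok.c_str());
  std::getline(token_stream, tok, ','); p.skip_offset = atoi(tok.c_str());
  std::getline(token_stream, tok, ','); p.interpolation_id = atoi(tok.c_str());
  return p;
}

// e.g. parseSampKnob("2,0,1") -> {2, 0, 1}: sample every 2nd filter element,
// offset 0, half interpolation (knob samp,240 above).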
@@ -67,7 +67,17 @@
# After fixing Yasmin's code: first batch of runs on CIFAR-10 DNNs
# NOTE: First batch with 33% sampling - 2K runs for each threshold
batch_id = "batch325"
#-- batch_id = "batch325"
# After fixing Yasmin's code: second batch of runs on CIFAR-10 DNNs
# NOTE: Second batch with 33% sampling - 5K runs for each threshold
# NOTE: First batch with dumping CPU runtime configs
#-- batch_id = "batch327"
# IMP: Increased SAMPLING Knobs ---- Adding interpolation-based Knobs
batch_id = "batch328"
@@ -19,7 +19,7 @@ class DevTimeTuner:
def __init__(self, Bench):
self.piped_execution = True
self.autotuner_runs = 100
self.autotuner_runs = 8000
self.promise_binary = Bench.promise_binary
@@ -55,7 +55,8 @@ void postInterpolateRow(int N, int n, int c, int h, int w,
__global__
void postInterpolateCol(int N, int n, int c, int h, int w, float* data, int int_col){
void postInterpolateCol(int N, int n, int c, int h, int w,
float* data, int int_col){
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
@@ -254,7 +255,6 @@ void sampleFilterElems(int N,
int ch = (i % (c * h * w)) / (h * w);
int n = i / (c * h * w);
//int local_index = row * w + col;
int local_index = (ch * (h * w)) + (row * w) + col;
if(skip_elem == 3 && h == 3 && w == 3){
@@ -284,12 +284,15 @@ void sampleFilter(Tensor* newFilter, Tensor* filter,
int numBlocks = (n * c * h * w + 127) / 128;
int N = n * c * h * w;
//float mul_factor = skip_rate / (skip_rate - 1);
float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
//float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
//mul_factor = (mul_factor + 1.0) / 2;
DEBUG ("mul_factor = %f \n", mul_factor);
//printf ("*** skip_rate = %d h = %d w = %d \n", skip_rate, h, w);
sampleFilterElems<<<numBlocks,128>>>(N,
n, c, h, w,
@@ -335,16 +338,17 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
Tensor* newFilter;
newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
filter->dims.dim_sizes[3]);
CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
filter->dims.dim_sizes[3]);
// Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
sampleFilter(newFilter, filter, skip_rate, skip_offset);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
vertical_stride, horizontal_stride);
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
@@ -453,6 +457,194 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
void sampleFilter2(Tensor* newFilter, Tensor* filter,
int skip_rate, int skip_offset, int interpolation_id){
int n = filter->dims.dim_sizes[0];
int c = filter->dims.dim_sizes[1];
int h = filter->dims.dim_sizes[2];
int w = filter->dims.dim_sizes[3];
int numBlocks = (n * c * h * w + 127) / 128;
int N = n * c * h * w;
float mul_factor;
if (interpolation_id == 0){
mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
}
else if (interpolation_id == 1){
mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
mul_factor = (mul_factor + 1.0) / 2;
}
printf ("mul_factor = %f \n", mul_factor);
DEBUG ("mul_factor = %f \n", mul_factor);
sampleFilterElems<<<numBlocks,128>>>(N,
n, c, h, w,
(float *) filter->gpu_data,
skip_rate, skip_offset, mul_factor,
(float *) newFilter->gpu_data);
}
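sampleFilter2 differs from sampleFilter only in how the compensation factor is chosen: interpolation_id == 0 keeps the original scaling of skip_rate / (skip_rate - 1), while interpolation_id == 1 ("half interpolation") averages that factor with 1.0, so the retained filter elements are scaled up only half as aggressively. The following is a small standalone sketch of the same arithmetic with the concrete values it produces; the function and variable names are illustrative, not part of the runtime.

#include <cstdio>
#include <initializer_list>

// Same scaling rule as sampleFilter2 above (illustrative standalone copy).
float computeMulFactor(int skip_rate, int interpolation_id) {
  float full = (skip_rate * 1.0f) / (skip_rate - 1);            // full interpolation
  return (interpolation_id == 0) ? full : (full + 1.0f) / 2.0f; // half interpolation
}

int main() {
  // skip_rate = 2: full = 2.00, half = 1.50
  // skip_rate = 3: full = 1.50, half = 1.25
  // skip_rate = 4: full = 1.33, half = 1.17 (approx.)
  for (int rate : {2, 3, 4})
    printf("rate=%d full=%.2f half=%.2f\n", rate,
           computeMulFactor(rate, 0), computeMulFactor(rate, 1));
  return 0;
}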
// A 'Simulation' of sampled tensor convolution (filter-element sampling)
void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride,
int conv_mode, int conv_groups,
int skip_rate, int skip_offset, int interpolation_id){
INFO("*** TensorConvolution \n");
profileEvent("tensorConv");
Tensor* input = (Tensor*) input_ptr;
Tensor* filter = (Tensor*) filter_ptr;
cudnnConvolutionDescriptor_t convDesc;
cudnnConvolutionFwdAlgo_t convAlgo;
cudnnConvolutionMode_t mode;
if(conv_mode == 0)
mode = CUDNN_CONVOLUTION;
else if(conv_mode == 1)
mode = CUDNN_CROSS_CORRELATION;
float alpha = 1.0f, beta = 0.0f;
hostToDeviceCopy(input);
hostToDeviceCopy(filter);
convertToFP32(input);
convertToFP32(filter);
Tensor* newFilter;
newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
filter->dims.dim_sizes[3]);
// Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
sampleFilter2(newFilter, filter, skip_rate, skip_offset, interpolation_id);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
vertical_stride, horizontal_stride);
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
//FIXME: Current hack to preserve backward compatibility
if(conv_groups == 0){
conv_groups = 1;
}
// NOTE: Adding support for grouped convolution
checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
int new_v = vertical_stride + 0;
int new_h = horizontal_stride + 0;
cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
vertical_pad, horizontal_pad, // conv padding
new_v, new_h, // conv strides
1, 1, // upscaling values
mode , // mode is configurable
computeType)); // defines compute precision
int n, c, h, w; // output dimensions
// Find dimension of convolution output
checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
input->tensor_desc,
filter->filter_desc,
&n, &c, &h, &w));
DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
Tensor* output;
output = (Tensor*) create4DTensor((cudnnDataType_t) float_type,
CUDNN_TENSOR_NCHW, n, c, h, w);
// NOTE: Changing output tensor placement from host to device
changeTensorPlacement(output, DEVICE);
// NOTE: Necessary to insert the above call for every output tensor
DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
output->data_type, output->data_format, output->dims.dim_sizes[0],
output->dims.dim_sizes[1],
output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
if(convDesc == NULL || input->tensor_desc == NULL ||
filter->filter_desc == NULL || output->tensor_desc == NULL)
ERROR("NULL descriptor! \n");
// NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
input->tensor_desc,
filter->filter_desc,
convDesc,
output->tensor_desc,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
//CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
0,
&convAlgo));
DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
// NOTE: Using GEMM-based Algo
convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
size_t workspace_size;
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
input->tensor_desc,
filter->filter_desc,
convDesc,
output->tensor_desc,
convAlgo,
&workspace_size));
// Allocating memory for the convolution workspace
void* workspace;
checkCudaErrors(cudaMalloc(&workspace, workspace_size));
DEBUG("workspace size = %d \n", workspace_size);
checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
input->gpu_data, filter->filter_desc, newFilter->gpu_data,
convDesc, convAlgo, workspace, workspace_size,
&beta, output->tensor_desc, output->gpu_data));
freeTensor(newFilter);
profileEvent("tensorConv_end", true);
return output;
}
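tensorConvSampSim2 is a drop-in extension of tensorConvSampSim with one extra trailing argument, interpolation_id; the autotuner wrapper further down calls it with the knob's skip_rate, skip_offset, and interpolation_id. A hedged usage sketch follows; the padding and stride values are placeholders chosen for illustration, not taken from the diff, and 'input' and 'filter' are assumed to be Tensor* handles already created by the runtime.

void* conv_out = tensorConvSampSim2(input, filter,
                                    1, 1,   // vertical_pad, horizontal_pad (placeholder values)
                                    1, 1,   // vertical_stride, horizontal_stride (placeholder values)
                                    1, 1,   // conv_mode = 1 (cross-correlation), conv_groups = 1
                                    2, 0,   // skip_rate = 2, skip_offset = 0
                                    1);     // interpolation_id = 1 -> half interpolation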
/************ NOTE: API for ApproxHPVM Wrapper runtime *******/
@@ -833,15 +1025,17 @@ class SampParams{
public:
int skip_rate;
int skip_offset;
int interpolation_id;
SampParams(){
skip_rate = 1;
skip_offset = 0;
}
SampParams(int skip_rate1, int skip_offset1){
SampParams(int skip_rate1, int skip_offset1, int interpolation_id1){
skip_rate = skip_rate1;
skip_offset = skip_offset1;
interpolation_id = interpolation_id1;
}
};
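Note that the default SampParams constructor above initializes skip_rate and skip_offset but not the new interpolation_id field. A defensive variant would zero it as well, so that a default-constructed knob falls back to full-interpolation scaling; the snippet below is a suggested sketch, not code from this commit.

// Possible defensive default constructor (suggestion only):
SampParams() {
  skip_rate = 1;
  skip_offset = 0;
  interpolation_id = 0;   // default to full-interpolation scaling
}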
@@ -892,8 +1086,11 @@ public:
std::getline(token_stream, tok, ',');
int offset = atoi(tok.c_str());
std::getline(token_stream, tok, ',');
int interpolation_id = atoi(tok.c_str());
printf ("skip_every = %d, offset = %d \n", skip_every, offset);
SampParams params(skip_every, offset);
SampParams params(skip_every, offset, interpolation_id);
samp_knob_map[knob] = params;
}
@@ -990,22 +1187,23 @@ void* Autotuner_SampConv(void* input, float i_min, float i_max,
DEBUG("params.skip_rate = %d, params.skip_offset = %d \n",
params.skip_rate, params.skip_offset);
/*
conv_out = tensorConvolutionKernelSamp(input, filter, conv_pad_h, conv_pad_w,
conv_stride_h, conv_stride_w,
1, 1,
2);
*/
void* conv_out;
if (!FP16_tuning){
conv_out = tensorConvSampSim(input, filter,
/* conv_out = tensorConvSampSim(input, filter,
conv_pad_h, conv_pad_w,
conv_stride_h, conv_stride_w, 1, 1,
params.skip_rate, params.skip_offset);
*/
conv_out = tensorConvSampSim2(input, filter,
conv_pad_h, conv_pad_w,
conv_stride_h, conv_stride_w, 1, 1,
params.skip_rate, params.skip_offset, params.interpolation_id);
}
else{
@@ -1174,9 +1372,9 @@ void* Autotuner_Pooling(void* input,
else {
pool_out = tensorHalfPooling(input, 0, pool_size, pool_size,
0, 0, pool_stride, pool_stride);
}
0, 0, pool_stride, pool_stride);
}
}
else{
@@ -1190,7 +1388,8 @@
void* Autotuner_Activation(void* input, int activation_id, int out_min, int out_max, int swing){
void* Autotuner_Activation(void* input, int activation_id,
int out_min, int out_max, int swing){
void* activation_out;
@@ -1243,7 +1442,8 @@ void* Autotuner_Activation(void* input, int activation_id, int out_min, int out_
return activation_out;
}
void *autotuner_tensorFft(void *input, bool inverse) {
void* autotuner_tensorFft(void *input, bool inverse) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1264,7 +1464,8 @@ void *autotuner_tensorFft(void *input, bool inverse) {
return NULL;
}
void *autotuner_tensorReduce(void *input, size_t axis, MathOp func) {
void* autotuner_tensorReduce(void *input, size_t axis, MathOp func) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1295,7 +1496,7 @@ void *autotuner_tensorReduce(void *input, size_t axis, MathOp func) {
return NULL;
}
void *autotuner_tensorProjectiveT(void *input, void *transformation) {
void* autotuner_tensorProjectiveT(void *input, void *transformation) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1313,7 +1514,8 @@ void *autotuner_tensorProjectiveT(void *input, void *transformation) {
return NULL;
}
void *autotuner_tensorMap1(MathOp func, void *input) {
void* autotuner_tensorMap1(MathOp func, void *input) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1335,7 +1537,7 @@ void *autotuner_tensorMap1(MathOp func, void *input) {
return NULL;
}
void *autotuner_tensorMap2(MathOp func, void *input1, void *input2) {
void* autotuner_tensorMap2(MathOp func, void *input1, void *input2) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1357,7 +1559,7 @@ void *autotuner_tensorMap2(MathOp func, void *input1, void *input2) {
return NULL;
}
void *autotuner_tensorMap3(MathOp func, void *input1, void *input2,
void* autotuner_tensorMap3(MathOp func, void *input1, void *input2,
void *input3) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");