llvm / hpvm-release · Commits

Commit c237d77d, authored 5 years ago by hsharif3

Delete #error.h#

parent 1f18d5e3
Showing 1 changed file with 0 additions and 627 deletions:

llvm/projects/hpvm-tensor-rt/tensor_runtime/include/#error.h# (deleted, 100644 → 0, +0 −627)
#ifndef ERROR_HEADER
#define ERROR_HEADER
#include <stdio.h>
#include <stdarg.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cfloat>
#include <algorithm>
#include <sstream>
#include <vector>
#include <iostream>
#include <random>
#include <string>
#include <time.h>
#include <curand.h>
#include <curand_kernel.h>
#include <math.h>
#include <assert.h>
#include "../include/debug.h"
#include "tensor.h"
#include "profiling.h"
#include "tensor_utils.cu"
#include "global_data.h"
void readOpenTunerFlags(char* file_name){
total_ops = 0;
op_counter = 0;
op_accuracies.clear();
FILE* fp = fopen(file_name, "r");
if(fp == NULL){
ERROR("File 'opentuner_flags' not found \n");
}
int op_acc;
// Record only the values fscanf successfully parses; stop at EOF instead of
// pushing a stale value after the last read.
while(fscanf(fp, "%d", &op_acc) == 1){
op_accuracies.push_back(op_acc);
//printf("op_accuracies = %d, total_ops = %d \n", op_accuracies[total_ops], total_ops);
total_ops++;
}
fclose(fp);
}
/*__device__ inline void atomicAdd(float* address, float value)
{
float old = value;
float new_old;
do{
new_old = atomicExch(address, 0.0f);
new_old += old;
}
while ((old = atomicExch(address, new_old))!=0.0f);
};
*/
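// Host-side norm computation: L1, L2, and inf norms of the difference
// between 'x' and 'x_orig'. The L1/L2 sums are averaged over the element
// count and then normalized by the mean of the original distribution.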
Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
deviceToHostCopy(x);
deviceToHostCopy(x_orig);
// NOTE: Move floats to doubles - overflow is quite possible
float l1_norm = 0.0;
float l2_norm = 0.0;
float inf_norm = -1.0;
double total = 0.0;
float* arr1 = (float*) x->host_data;
float* arr2 = (float*) x_orig->host_data;
for(unsigned int i = 0; i < x->num_elems; i++){
total = total + arr2[i];
float diff = fabs(arr1[i] - arr2[i]);
l1_norm += diff;
l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
if(inf_norm < diff)
inf_norm = diff;
}
l1_norm = l1_norm / (x->num_elems * 1.0);
l2_norm = l2_norm / (x->num_elems * 1.0);
double distribution_mean = total / (x->num_elems * 1.0);
l1_norm = l1_norm / distribution_mean;
l2_norm = l2_norm / distribution_mean;
Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
norms->l1_norm = l1_norm;
norms->l2_norm = l2_norm;
norms->inf_norm = inf_norm;
INFO("l1_norm = %f \n", l1_norm);
INFO("l2_norm = %f \n", l2_norm);
INFO("inf_norm = %f \n", inf_norm);
return norms;
}
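// Extended host-side variant: returns both mean (absolute) and relative
// L1/L2/inf norms of the difference between 'x' and 'x_orig'. Note that the
// relative inf norm divides by arr2[i] and so is undefined where the
// original value is zero.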
Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
deviceToHostCopy(x);
deviceToHostCopy(x_orig);
// NOTE: Move all floats to doubles - overflow is quite possible
double l0_norm_A = 0.0;
double l0_norm_B = 0.0;
double l1_norm_A = 0.0;
double l1_norm_B = 0.0;
double l2_norm_A = 0.0;
double l2_norm_B = 0.0;
float inf_norm = -1.0;
float orig_inf_norm = -1.0;
double total_diff = 0.0;
double total_diff_squared = 0.0;
float* arr1 = (float*) x->host_data;
float* arr2 = (float*) x_orig->host_data;
for(unsigned int i = 0; i < x->num_elems; i++){
if(arr2[i] != 0.0)
l0_norm_A = l0_norm_A + 1.0;
if(arr1[i] != 0.0)
l0_norm_B = l0_norm_B + 1.0;
l1_norm_A = l1_norm_A + fabs(arr2[i]);
l1_norm_B = l1_norm_B + fabs(arr1[i]);
l2_norm_A = l2_norm_A + (arr2[i] * arr2[i]);
l2_norm_B = l2_norm_B + (arr1[i] * arr1[i]);
float diff = fabs(arr1[i] - arr2[i]);
total_diff = total_diff + diff;
float diff_squared = diff * diff;
total_diff_squared = total_diff_squared + diff_squared;
if(orig_inf_norm < diff){
orig_inf_norm = diff;
}
// Relative difference value
float normalized_diff = diff / arr2[i];
if(inf_norm < normalized_diff){
inf_norm = normalized_diff;
}
}
// Relative L1 and Mean L1 norms of the difference Matrix
float mean_l1 = ( total_diff ) / x->num_elems;
float relative_l1 = ( total_diff ) / l1_norm_A;
// Computing Relative L2 norm - i.e., Euclidean distance
double norm_root_A = sqrt(l2_norm_A);
double diff_root = sqrt(total_diff_squared);
float mean_l2 = diff_root / x->num_elems;
float relative_l2 = diff_root / norm_root_A;
// Packing computed norms in Norm_t struct
Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
// Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
norms->mean_l1 = mean_l1;
norms->mean_l2 = mean_l2;
norms->orig_inf_norm = orig_inf_norm;
// Relative metrics (relative to distribution) - suitable for PROMISE
norms->l1_norm = relative_l1;
norms->l2_norm = relative_l2;
norms->inf_norm = inf_norm;
INFO("l1_norm = %f \n", relative_l1);
INFO("l2_norm = %f \n", relative_l2);
INFO("inf_norm = %f \n", inf_norm);
return norms;
}
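// One thread per element; the four partial sums are accumulated with
// atomicAdd on doubles, which requires compute capability 6.0 or higher.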
__global__ void normComputeKernel(float* A, float * B, double* l1_A, double* l2_A,
double* l1_diff, double* l2_diff, unsigned int n){
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < n){
double diff = fabsf(A[i] - B[i]);
double diff_squared = diff * diff;
atomicAdd( l1_A, fabsf(A[i]) );
atomicAdd( l2_A, (A[i] * A[i]) );
atomicAdd( l1_diff, diff);
atomicAdd( l2_diff, diff_squared);
}
}
// Compute Norms on the GPU
Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
hostToDeviceCopy(x);
hostToDeviceCopy(x_orig);
// FIXIT: Move all floats to doubles - overflow is possible
double l1_norm_A;
double l2_norm_A;
double l1_diff;
double l2_diff;
// Device pointers
double *l1_norm_A_d;
double *l2_norm_A_d;
double *l1_diff_d;
double *l2_diff_d;
cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
cudaMalloc( (void**) &l1_diff_d, sizeof(double));
cudaMalloc( (void**) &l2_diff_d, sizeof(double));
float* arr1 = (float*) x->gpu_data;
float* arr2 = (float*) x_orig->gpu_data;
int blockSize = 1024;
int gridSize = (int) ceil ((float) x->num_elems / blockSize);
INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
// Relative L1 and Mean L1 norms of the difference Matrix
float mean_l1 = l1_diff / x->num_elems;
float relative_l1 = l1_diff / l1_norm_A;
// Computing Relative L2 norm - i.e., Euclidean distance
double norm_root_A = sqrt(l2_norm_A);
double diff_root = sqrt(l2_diff);
float mean_l2 = diff_root / x->num_elems;
float relative_l2 = diff_root / norm_root_A;
// Packing computed norms in Norm_t struct
Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
// Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
norms->mean_l1 = mean_l1;
norms->mean_l2 = mean_l2;
norms->orig_inf_norm = 0.0;
// Relative metrics (relative to distribution) - suitable for PROMISE
norms->l1_norm = relative_l1;
norms->l2_norm = relative_l2;
norms->inf_norm = 0.0;
INFO("l1_norm = %f \n", relative_l1);
INFO("l2_norm = %f \n", relative_l2);
return norms;
}
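// Elementwise helper kernels used by the error injection routines below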
__global__ void vecConstMul(float* A, float mul_factor, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
A[id] = A[id] * mul_factor;
}
__global__ void vecRound(float* A, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
A[id] = roundf(A[id]);
}
__global__ void vecConstDiv(float* A, float div_factor, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
A[id] = A[id] / div_factor;
}
__global__ void vecMul(float* A, float* B, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
B[id] = A[id] * B[id];
}
/**** ERROR injection routines ******/
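// Fills 'bias' with Gaussian noise N(0, scaling_values[error_scale]) on the
// GPU; the cuRAND generator is re-seeded from the wall clock on every call.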
void initRandValues(Tensor* bias, int error_scale){
float scaling_values[20];
// FIXIT: Error knob 0 should be zero error
scaling_values[0] = 0.016;
scaling_values[1] = 0.018;
scaling_values[2] = 0.022;
scaling_values[3] = 0.026;
scaling_values[4] = 0.030;
scaling_values[5] = 0.035;
scaling_values[6] = 0.04;
scaling_values[7] = 0.06;
scaling_values[8] = 0.08;
scaling_values[9] = 0.1;
//scaling_values[8] = 0.15;
//scaling_values[9] = 0.2;
scaling_values[10] = 0.25;
scaling_values[11] = 0.3;
scaling_values[12] = 0.35;
scaling_values[13] = 0.4;
scaling_values[14] = 0.45;
// Values below are currently unused by Opentuner
scaling_values[15] = 0.5;
scaling_values[16] = 0.55;
scaling_values[17] = 0.6;
scaling_values[18] = 0.65;
scaling_values[19] = 0.7;
curandGenerator_t gen;
struct timespec ts;
if(timespec_get(&ts, TIME_UTC) == 0){
printf("timespec_get failed - cannot seed the random generator \n");
abort();
}
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec);
curandGenerateNormal(gen, (float*) bias->gpu_data, bias->num_elems, 0.0, 1.0 * scaling_values[error_scale]);
/*
std::random_device rd;
std::mt19937 mt(rd());
std::normal_distribution<float> distribution(0.0, 1.0);
float* data_arr = (float*) bias->host_data;
for(int i = 0; i < bias->num_elems; i++){
float rand_num = distribution(mt);
data_arr[i] = scaling_values[error_scale] * rand_num;
}
*/
}
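// Bit-error injection: flips bit 5 of a randomly chosen byte, repeated for
// (error_freq * 1%) of the tensor's bytes, and returns the resulting
// perturbation as Norm_t metrics.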
void* addBitError(void* x_ptr, int error_scale){
if(error_scale > 6 || error_scale < 0){
ERROR("Error Scale out of bounds \n");
}
INFO("*** TensorBitError \n");
profileEvent("tensorBitError");
Tensor* x = (Tensor*) x_ptr;
size_t* dim_sizes = x->dims.dim_sizes;
Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
// Copying x data into x_original - for computing Norms
tensorCopy(x, x_original);
// Quadratic Error
float freq_factors[6];
freq_factors[0] = 0.1;
freq_factors[1] = 0.2;
freq_factors[2] = 0.4;
freq_factors[3] = 0.6;
freq_factors[4] = 0.8;
freq_factors[5] = 1.0;
float error_freq = freq_factors[error_scale];
deviceToHostCopy(x);
unsigned char* data_arr = reinterpret_cast<unsigned char*>(x->host_data);
// FIXIT: Need to be careful about floating point datatype assumptions
int size_of_elem = 4;
long int total_bytes = x->size_in_bytes;
long int error_iterations = total_bytes * 0.01 * error_freq;
INFO("total_bytes = %lu, error_iterations = %lu \n", total_bytes, error_iterations);
srand(time(NULL));
for(long int i = 0; i < error_iterations; i++){
// FIXIT: The rand() is only specific to int - need long
long int index = rand() % total_bytes;
int N = 5; // The operation below flips the Nth bit of the chosen byte
unsigned char fil = 1UL << N;
unsigned char val = data_arr[index];
unsigned char flipped = val ^ fil;
data_arr[index] = flipped;
}
// Write the perturbed bytes back to the device; calculateNorms2 copies the
// device buffer to the host, which would otherwise discard the injected errors
hostToDeviceCopy(x);
Norm_t* norms = calculateNorms2(x, x_original);
profileEvent("tensorBitError_end", true);
return (void*) norms;
}
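// Applies floor or ceil to each element at random; the choice is driven by a
// truncated standard normal draw, which lands on floor more often than ceil.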
void randomCeilAndFloor(float* x, size_t num_elems){
INFO("randomCeilAndFloor\n");
std::random_device rd;
std::mt19937 mt(rd());
std::normal_distribution<float> distribution(0.0, 1.0);
for(size_t i = 0; i < num_elems; i++){
float rand_num = distribution(mt);
int val = abs(((int) rand_num) % 2);
if(val == 0)
x[i] = floor(x[i]);
else if(val == 1)
x[i] = ceil(x[i]);
}
}
// Routine for Adding RoundOff Errors
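// Scales the tensor by round_factor, randomly floors/ceils each element,
// then divides the scale back out - emulating round-off at reduced precision.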
void* addRoundError(void* x_ptr, int error_scale){
if(error_scale > 11 || error_scale < 0){
ERROR("Error Scale out of bounds \n");
}
INFO("*** TensorRoundError \n");
profileEvent("tensorRoundError");
Tensor* x = (Tensor*) x_ptr;
size_t* dim_sizes = x->dims.dim_sizes;
Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
// Copying x data into x_original - for computing Norms
tensorCopy(x, x_original);
float round_factors[12];
round_factors[0] = 1000000; // FIXIT: This should be zero error
round_factors[1] = 100;
round_factors[2] = 10;
round_factors[3] = 7; // Beyond this point, the error function is linear
round_factors[4] = 3;
round_factors[5] = 1;
round_factors[6] = 0.7;
round_factors[7] = 0.3;
round_factors[8] = 0.1;
round_factors[9] = 0.07;
round_factors[10] = 0.03;
round_factors[11] = 0.01;
// THINK: Considering using error magnitudes in this scenario
float round_factor = round_factors[error_scale];
INFO("round_factor = %f \n", round_factor);
hostToDeviceCopy(x);
int blockSize = 128;
int gridSize = (int) ceil ((float) x->num_elems / blockSize);
INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
// NOTE: Check if a large gridSize will work with really large tensors
vecConstMul<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);
//vecRound<<<gridSize, blockSize>>>((float*) x->gpu_data, x->num_elems);
deviceToHostCopy(x);
randomCeilAndFloor((float*) x->host_data, x->num_elems);
hostToDeviceCopy(x);
vecConstDiv<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);
Norm_t* norms = calculateNorms2(x, x_original);
profileEvent("tensorRoundError_end", true);
return (void*) norms;
}
// Routine for Adding Gaussian Error
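// The injected noise is multiplicative: vecMul computes bias = x * noise,
// and cudnnAddTensor then performs x += bias, i.e. x = x * (1 + noise).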
void* addGaussianError(void* x_ptr, int error_scale){
if(error_scale > 11 || error_scale < 0){
ERROR("Error Scale out of bounds \n");
}
INFO("*** TensorAddError \n");
profileEvent("tensorAddError");
Tensor* x = (Tensor*) x_ptr;
size_t* dim_sizes = x->dims.dim_sizes;
Tensor* bias = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
// Copying x data into x_original - for computing Norms
tensorCopy(x, x_original);
// NOTE: Error scale is used to generate the bias matrix
initRandValues(bias, error_scale);
hostToDeviceCopy(x);
//hostToDeviceCopy(bias);
int blockSize = 1024;
int gridSize = (int) ceil ((float) x->num_elems / blockSize);
INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
// NOTE: Check if a large gridSize will work with really large tensors
vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems);
float alpha = 1.0f;
// FIXIT: routine fails for 3D tensors
checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));
//Norm_t* norms = calculateNorms2(x, x_original);
Norm_t* norms = calculateNormsGPU(x, x_original);
profileEvent("tensorAddError_end", true);
return (void*) norms;
}
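// Entry point used by the runtime: currently dispatches to the Gaussian
// model; the round-off and bit-error variants remain as commented-out
// alternatives.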
void* tensorAddError(void* x_ptr, int error_scale){
void * new_x = addGaussianError(x_ptr, error_scale);
//void * new_x = addRoundError(x_ptr, error_scale);
//void * new_x = addBitError(x_ptr, error_scale);
return new_x;
}
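/* Usage sketch (hypothetical caller, not part of this header): a runtime op
would pass its output tensor and the knob read by readOpenTunerFlags, e.g.

Tensor* out = ...;                        // output of some tensor op
int knob = op_accuracies[op_counter++];   // set by readOpenTunerFlags
Norm_t* norms = (Norm_t*) tensorAddError(out, knob);
free(norms);                              // caller owns the malloc'd struct
*/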
#endif