Commit c237d77d authored by hsharif3

Delete #error.h#

parent 1f18d5e3
#ifndef ERROR_HEADER
#define ERROR_HEADER
#include <stdarg.h>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cfloat>
#include <algorithm>
#include <sstream>
#include <vector>
#include <iostream>
#include <random>
#include <string>
#include <curand.h>
#include <curand_kernel.h>
#include "../include/debug.h"
#include "tensor.h"
#include "profiling.h"
#include "tensor_utils.cu"
#include "global_data.h"
void readOpenTunerFlags(char* file_name){

  total_ops = 0;
  op_counter = 0;
  op_accuracies.clear();

  FILE* fp = fopen(file_name, "r");
  if(fp == NULL){
    ERROR("File 'opentuner_flags' not found \n");
  }

  // Read one integer per tensor op; only store values fscanf actually matched
  // (the previous loop pushed one extra, uninitialized value at EOF)
  int op_acc;
  while(fscanf(fp, "%d", &op_acc) == 1){
    op_accuracies.push_back(op_acc);
    //printf("op_accuracies = %d, total_ops = %d \n", op_accuracies[total_ops], total_ops);
    total_ops++;
  }

  fclose(fp);
}
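/* Usage sketch (illustrative; the file contents below are made up): the
   flags file is just whitespace-separated integers, one per tensor op.
   A file containing:

       3 1 0 2

   read via readOpenTunerFlags("opentuner_flags") yields
   op_accuracies = {3, 1, 0, 2} and total_ops = 4.
*/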
/*
__device__ inline void atomicAdd(float* address, float value){
  float old = value;
  float new_old;
  do{
    new_old = atomicExch(address, 0.0f);
    new_old += old;
  }
  while ((old = atomicExch(address, new_old)) != 0.0f);
};
*/
Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){

  deviceToHostCopy(x);
  deviceToHostCopy(x_orig);

  // NOTE: Move floats to doubles - overflow is quite possible
  float l1_norm = 0.0;
  float l2_norm = 0.0;
  float inf_norm = -1.0;
  double total = 0.0;

  float* arr1 = (float*) x->host_data;
  float* arr2 = (float*) x_orig->host_data;

  for(unsigned int i = 0; i < x->num_elems; i++){
    total = total + arr2[i];
    float diff = fabs(arr1[i] - arr2[i]);  // fabs: plain abs() truncates to int
    l1_norm += diff;
    l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
    if(inf_norm < diff)
      inf_norm = diff;
  }

  l1_norm = l1_norm / (x->num_elems * 1.0);
  l2_norm = l2_norm / (x->num_elems * 1.0);
  double distribution_mean = total / (x->num_elems * 1.0);
  l1_norm = l1_norm / distribution_mean;
  l2_norm = l2_norm / distribution_mean;

  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
  norms->l1_norm = l1_norm;
  norms->l2_norm = l2_norm;
  norms->inf_norm = inf_norm;

  INFO("l1_norm = %f \n", l1_norm);
  INFO("l2_norm = %f \n", l2_norm);
  INFO("inf_norm = %f \n", inf_norm);

  return norms;
}
Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){

  deviceToHostCopy(x);
  deviceToHostCopy(x_orig);

  // NOTE: Move all floats to doubles - overflow is quite possible
  double l0_norm_A = 0.0;
  double l0_norm_B = 0.0;
  double l1_norm_A = 0.0;
  double l1_norm_B = 0.0;
  double l2_norm_A = 0.0;
  double l2_norm_B = 0.0;
  float inf_norm = -1.0;
  float orig_inf_norm = -1.0;
  double total_diff = 0.0;
  double total_diff_squared = 0.0;

  float* arr1 = (float*) x->host_data;
  float* arr2 = (float*) x_orig->host_data;

  for(unsigned int i = 0; i < x->num_elems; i++){
    if(arr2[i] != 0.0)
      l0_norm_A = l0_norm_A + 1.0;
    if(arr1[i] != 0.0)
      l0_norm_B = l0_norm_B + 1.0;

    l1_norm_A = l1_norm_A + fabs(arr2[i]);
    l1_norm_B = l1_norm_B + fabs(arr1[i]);
    l2_norm_A = l2_norm_A + (arr2[i] * arr2[i]);
    l2_norm_B = l2_norm_B + (arr1[i] * arr1[i]);

    float diff = fabs(arr1[i] - arr2[i]);
    total_diff = total_diff + diff;
    float diff_squared = diff * diff;
    total_diff_squared = total_diff_squared + diff_squared;

    if(orig_inf_norm < diff){
      orig_inf_norm = diff;
    }

    // Relative difference value (guard against dividing by a zero reference element)
    if(arr2[i] != 0.0){
      float normalized_diff = diff / arr2[i];
      if(inf_norm < normalized_diff){
        inf_norm = normalized_diff;
      }
    }
  }

  // Relative L1 and Mean L1 norms of the difference Matrix
  float mean_l1 = total_diff / x->num_elems;
  float relative_l1 = total_diff / l1_norm_A;

  // Computing Relative L2 norm - i.e., Euclidean distance
  double norm_root_A = sqrt(l2_norm_A);
  double diff_root = sqrt(total_diff_squared);
  float mean_l2 = diff_root / x->num_elems;
  float relative_l2 = diff_root / norm_root_A;

  // Packing computed norms in Norm_t struct
  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
  // Mean metrics - not normalized for the distribution - suitable for precision-tuning hardware
  norms->mean_l1 = mean_l1;
  norms->mean_l2 = mean_l2;
  norms->orig_inf_norm = orig_inf_norm;

  // Relative metrics (relative to the distribution) - suitable for PROMISE
  norms->l1_norm = relative_l1;
  norms->l2_norm = relative_l2;
  norms->inf_norm = inf_norm;

  INFO("l1_norm = %f \n", relative_l1);
  INFO("l2_norm = %f \n", relative_l2);
  INFO("inf_norm = %f \n", inf_norm);

  return norms;
}
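/* For reference, with A = x_orig (reference data) and B = x (perturbed data),
   the metrics computed above are:

     mean_l1     = ||B - A||_1 / n
     relative_l1 = ||B - A||_1 / ||A||_1
     mean_l2     = ||B - A||_2 / n
     relative_l2 = ||B - A||_2 / ||A||_2

   where n = x->num_elems. inf_norm is the largest element-wise relative
   difference, and orig_inf_norm the largest absolute difference.
*/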
__global__ void normComputeKernel(float* A, float* B, double* l1_A, double* l2_A,
                                  double* l1_diff, double* l2_diff, unsigned int n){

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < n){
    double diff = fabsf(A[i] - B[i]);
    double diff_squared = diff * diff;

    // Each thread accumulates its element's contribution atomically
    atomicAdd( l1_A, fabsf(A[i]) );
    atomicAdd( l2_A, (A[i] * A[i]) );
    atomicAdd( l1_diff, diff);
    atomicAdd( l2_diff, diff_squared);
  }
}
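// NOTE: atomicAdd on double* is only a CUDA builtin on devices of compute
// capability 6.0 and above. For older devices, a CAS-based fallback (adapted
// from the CUDA C Programming Guide) could be provided; the name
// atomicAddDouble below is our own helper, not a CUDA builtin:
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAddDouble(double* address, double val){
  unsigned long long int* address_as_ull = (unsigned long long int*) address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the bits, add in double precision, and try to swap back in
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif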
// Compute Norms on the GPU
Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){

  hostToDeviceCopy(x);
  hostToDeviceCopy(x_orig);

  // FIXIT: Move all floats to doubles - overflow is possible
  double l1_norm_A;
  double l2_norm_A;
  double l1_diff;
  double l2_diff;

  // Device pointers
  double* l1_norm_A_d;
  double* l2_norm_A_d;
  double* l1_diff_d;
  double* l2_diff_d;

  cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
  cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
  cudaMalloc( (void**) &l1_diff_d, sizeof(double));
  cudaMalloc( (void**) &l2_diff_d, sizeof(double));

  // Zero the accumulators - cudaMalloc does not initialize memory
  cudaMemset(l1_norm_A_d, 0, sizeof(double));
  cudaMemset(l2_norm_A_d, 0, sizeof(double));
  cudaMemset(l1_diff_d, 0, sizeof(double));
  cudaMemset(l2_diff_d, 0, sizeof(double));

  float* arr1 = (float*) x->gpu_data;
  float* arr2 = (float*) x_orig->gpu_data;

  int blockSize = 1024;
  int gridSize = (int) ceil((float) x->num_elems / blockSize);
  INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);

  normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d,
                                             l1_diff_d, l2_diff_d, x->num_elems);

  cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);

  cudaFree(l1_norm_A_d);
  cudaFree(l2_norm_A_d);
  cudaFree(l1_diff_d);
  cudaFree(l2_diff_d);

  // Relative L1 and Mean L1 norms of the difference Matrix
  float mean_l1 = l1_diff / x->num_elems;
  float relative_l1 = l1_diff / l1_norm_A;

  // Computing Relative L2 norm - i.e., Euclidean distance
  double norm_root_A = sqrt(l2_norm_A);
  double diff_root = sqrt(l2_diff);
  float mean_l2 = diff_root / x->num_elems;
  float relative_l2 = diff_root / norm_root_A;

  // Packing computed norms in Norm_t struct
  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
  // Mean metrics - not normalized for the distribution - suitable for precision-tuning hardware
  norms->mean_l1 = mean_l1;
  norms->mean_l2 = mean_l2;
  norms->orig_inf_norm = 0.0;

  // Relative metrics (relative to the distribution) - suitable for PROMISE
  // NOTE: the inf norms are not computed by the GPU kernel and are left at 0.0
  norms->l1_norm = relative_l1;
  norms->l2_norm = relative_l2;
  norms->inf_norm = 0.0;

  INFO("l1_norm = %f \n", relative_l1);
  INFO("l2_norm = %f \n", relative_l2);

  return norms;
}
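/* Usage sketch (hypothetical tensors; assumes 'x' holds perturbed data and
   'x_orig' the reference copy, both created via create4DTensor):

     Norm_t* gpu_norms = calculateNormsGPU(x, x_orig);  // relative L1/L2 on-device
     Norm_t* cpu_norms = calculateNorms2(x, x_orig);    // also computes the inf norms
     free(gpu_norms);
     free(cpu_norms);  // caller owns the malloc'd structs
*/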
__global__ void vecConstMul(float* A, float mul_factor, int n){
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if(id < n)
    A[id] = A[id] * mul_factor;
}

__global__ void vecRound(float* A, int n){
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if(id < n)
    A[id] = roundf(A[id]);
}

__global__ void vecConstDiv(float* A, float div_factor, int n){
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if(id < n)
    A[id] = A[id] / div_factor;
}

__global__ void vecMul(float* A, float* B, int n){
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if(id < n)
    B[id] = A[id] * B[id];
}
/**** ERROR injection routines ******/

void initRandValues(Tensor* bias, int error_scale){

  float scaling_values[20];

  // FIXIT: Error knob 0 should introduce zero error
  scaling_values[0] = 0.016;
  scaling_values[1] = 0.018;
  scaling_values[2] = 0.022;
  scaling_values[3] = 0.026;
  scaling_values[4] = 0.030;
  scaling_values[5] = 0.035;
  scaling_values[6] = 0.04;
  scaling_values[7] = 0.06;
  scaling_values[8] = 0.08;
  scaling_values[9] = 0.1;
  //scaling_values[8] = 0.15;
  //scaling_values[9] = 0.2;
  scaling_values[10] = 0.25;
  scaling_values[11] = 0.3;
  scaling_values[12] = 0.35;
  scaling_values[13] = 0.4;
  scaling_values[14] = 0.45;
  // Values below are currently unused by OpenTuner
  scaling_values[15] = 0.5;
  scaling_values[16] = 0.55;
  scaling_values[17] = 0.6;
  scaling_values[18] = 0.65;
  scaling_values[19] = 0.7;

  // Seed cuRAND from the wall clock and fill 'bias' with N(0, scale) noise
  curandGenerator_t gen;
  struct timespec ts;
  if(timespec_get(&ts, TIME_UTC) == 0){
    printf("timespec_get() failed \n");
    abort();
  }

  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec ^ ts.tv_sec);
  curandGenerateNormal(gen, (float*) bias->gpu_data, bias->num_elems,
                       0.0, 1.0 * scaling_values[error_scale]);
  curandDestroyGenerator(gen);

  /*
  std::random_device rd;
  std::mt19937 mt(rd());
  std::normal_distribution<float> distribution(0.0, 1.0);

  float* data_arr = (float*) bias->host_data;
  for(int i = 0; i < bias->num_elems; i++){
    float rand_num = distribution(mt);
    data_arr[i] = scaling_values[error_scale] * rand_num;
  }
  */
}
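// The cuRAND calls above discard their curandStatus_t results. A minimal
// checking wrapper (a sketch; the name checkCURAND is our own helper, not
// part of the cuRAND API) could make failures visible:
#define checkCURAND(call) do {                                          \
    curandStatus_t status = (call);                                     \
    if(status != CURAND_STATUS_SUCCESS){                                \
      fprintf(stderr, "cuRAND error %d at %s:%d \n",                    \
              (int) status, __FILE__, __LINE__);                        \
      abort();                                                          \
    }                                                                   \
  } while(0)
// e.g. checkCURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));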
void* addBitError(void* x_ptr, int error_scale){

  // freq_factors below has 6 entries (indices 0-5)
  if(error_scale > 5 || error_scale < 0){
    ERROR("Error Scale out of bounds \n");
  }

  INFO("*** TensorBitError \n");
  profileEvent("tensorBitError");

  Tensor* x = (Tensor*) x_ptr;
  size_t* dim_sizes = x->dims.dim_sizes;
  Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
                                                dim_sizes[0], dim_sizes[1],
                                                dim_sizes[2], dim_sizes[3]);
  // Copying x data into x_original - for computing Norms
  tensorCopy(x, x_original);

  // Quadratic Error
  float freq_factors[6];
  freq_factors[0] = 0.1;
  freq_factors[1] = 0.2;
  freq_factors[2] = 0.4;
  freq_factors[3] = 0.6;
  freq_factors[4] = 0.8;
  freq_factors[5] = 1.0;

  float error_freq = freq_factors[error_scale];

  deviceToHostCopy(x);

  unsigned char* data_arr = reinterpret_cast<unsigned char*>(x->host_data);
  // FIXIT: Need to be careful about floating point datatype assumptions
  int size_of_elem = 4;  // currently unused - assumes 4-byte floats
  long int total_bytes = x->size_in_bytes;
  long int error_iterations = total_bytes * 0.01 * error_freq;
  INFO("total_bytes = %ld, error_iterations = %ld \n", total_bytes, error_iterations);

  srand(time(NULL));

  for(int i = 0; i < error_iterations; i++){
    // FIXIT: rand() only covers the int range - need a long variant for large tensors
    long int index = rand() % total_bytes;
    int N = 5; // The operation below flips the Nth bit
    unsigned char fil = 1UL << N;
    unsigned char val = data_arr[index];
    char flipped = val ^ fil;
    data_arr[index] = flipped;  // flip the byte that was actually sampled
  }

  // Push the corrupted data back to the GPU so the norm routines (which
  // re-copy device data to the host) see the injected errors
  hostToDeviceCopy(x);

  Norm_t* norms = calculateNorms2(x, x_original);

  profileEvent("tensorBitError_end", true);

  return (void*) norms;
}
void randomCeilAndFloor(float* x, size_t num_elems){

  INFO("randomCeilAndFloor\n");

  std::random_device rd;
  std::mt19937 mt(rd());
  std::normal_distribution<float> distribution(0.0, 1.0);

  // NOTE: truncating a N(0,1) sample to int yields 0 whenever |sample| < 1,
  // biasing the coin toward floor() (chosen roughly 73% of the time)
  for(size_t i = 0; i < num_elems; i++){
    float rand_num = distribution(mt);
    int val = abs(((int) rand_num) % 2);
    if(val == 0)
      x[i] = floor(x[i]);
    else if(val == 1)
      x[i] = ceil(x[i]);
  }
}
// Routine for Adding RoundOff Errors
void* addRoundError(void* x_ptr, int error_scale){

  if(error_scale > 11 || error_scale < 0){
    ERROR("Error Scale out of bounds \n");
  }

  INFO("*** TensorRoundError \n");
  profileEvent("tensorRoundError");

  Tensor* x = (Tensor*) x_ptr;
  size_t* dim_sizes = x->dims.dim_sizes;
  Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
                                                dim_sizes[0], dim_sizes[1],
                                                dim_sizes[2], dim_sizes[3]);
  // Copying x data into x_original - for computing Norms
  tensorCopy(x, x_original);

  float round_factors[12];
  round_factors[0] = 1000000; // FIXIT: This should be zero error
  round_factors[1] = 100;
  round_factors[2] = 10;
  round_factors[3] = 7; // Beyond this point, the error function is linear
  round_factors[4] = 3;
  round_factors[5] = 1;
  round_factors[6] = 0.7;
  round_factors[7] = 0.3;
  round_factors[8] = 0.1;
  round_factors[9] = 0.07;
  round_factors[10] = 0.03;
  round_factors[11] = 0.01;

  // THINK: Consider using error magnitudes in this scenario
  float round_factor = round_factors[error_scale];
  INFO("round_factor = %f \n", round_factor);

  hostToDeviceCopy(x);

  int blockSize = 128;
  int gridSize = (int) ceil((float) x->num_elems / blockSize);
  INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);

  // NOTE: Check if a large gridSize will work with really large tensors
  // Scale up, randomly round each element, then scale back down
  vecConstMul<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);
  //vecRound<<<gridSize, blockSize>>>((float*) x->gpu_data, x->num_elems);

  deviceToHostCopy(x);
  randomCeilAndFloor((float*) x->host_data, x->num_elems);
  hostToDeviceCopy(x);

  vecConstDiv<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);

  Norm_t* norms = calculateNorms2(x, x_original);

  profileEvent("tensorRoundError_end", true);

  return (void*) norms;
}
// Routine for Adding Gaussian Error
void* addGaussianError(void* x_ptr, int error_scale){

  if(error_scale > 11 || error_scale < 0){
    ERROR("Error Scale out of bounds \n");
  }

  INFO("*** TensorAddError \n");
  profileEvent("tensorAddError");

  Tensor* x = (Tensor*) x_ptr;
  size_t* dim_sizes = x->dims.dim_sizes;
  Tensor* bias = (Tensor*) create4DTensor(x->data_type, x->data_format,
                                          dim_sizes[0], dim_sizes[1],
                                          dim_sizes[2], dim_sizes[3]);
  Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
                                                dim_sizes[0], dim_sizes[1],
                                                dim_sizes[2], dim_sizes[3]);
  // Copying x data into x_original - for computing Norms
  tensorCopy(x, x_original);

  // NOTE: Error scale is used to generate the bias matrix
  initRandValues(bias, error_scale);

  hostToDeviceCopy(x);
  //hostToDeviceCopy(bias);

  int blockSize = 1024;
  int gridSize = (int) ceil((float) x->num_elems / blockSize);
  INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);

  // NOTE: Check if a large gridSize will work with really large tensors
  // bias := x * noise, then x := x + bias, i.e. multiplicative Gaussian noise
  vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems);

  float alpha = 1.0f, beta = 0.0f;

  // FIXIT: routine fails for 3D tensors
  checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
                            bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));

  //Norm_t* norms = calculateNorms2(x, x_original);
  Norm_t* norms = calculateNormsGPU(x, x_original);

  profileEvent("tensorAddError_end", true);

  return (void*) norms;
}
void* tensorAddError(void* x_ptr, int error_scale){
  void* new_x = addGaussianError(x_ptr, error_scale);
  //void* new_x = addRoundError(x_ptr, error_scale);
  //void* new_x = addBitError(x_ptr, error_scale);
  return new_x;
}
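/* Usage sketch (hypothetical tensor and knob value; assumes a 4D float
   tensor created via create4DTensor elsewhere in the runtime):

     Norm_t* norms = (Norm_t*) tensorAddError(my_tensor, 3);  // knob in [0, 11]
     INFO("relative l2 error = %f \n", norms->l2_norm);
     free(norms);  // caller owns the malloc'd Norm_t
*/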
#endif