llvm / hpvm-release · Commits

Commit c237d77d, authored 5 years ago by hsharif3

Delete #error.h#

parent 1f18d5e3
Showing 1 changed file with 0 additions and 627 deletions:

llvm/projects/hpvm-tensor-rt/tensor_runtime/include/#error.h# (deleted, 100644 → 0, +0 −627)
#ifndef ERROR_HEADER
#define ERROR_HEADER
#include <stdio.h>
#include <stdarg.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cfloat>
#include <algorithm>
#include <sstream>
#include <vector>
#include <iostream>
#include <random>
#include <string>
#include <time.h>
#include <curand.h>
#include <curand_kernel.h>
#include <math.h>
#include <assert.h>
#include "../include/debug.h"
#include "tensor.h"
#include "profiling.h"
#include "tensor_utils.cu"
#include "global_data.h"
void readOpenTunerFlags(char* file_name){
total_ops = 0;
op_counter = 0;
op_accuracies.clear();
FILE* fp = fopen(file_name, "r");
if(fp == NULL){
ERROR("File 'opentuner_flags' not found \n");
}
int op_acc;
// Record only the values fscanf successfully parses; stop at EOF instead of
// pushing a stale value after the last read.
while(fscanf(fp, "%d", &op_acc) == 1){
op_accuracies.push_back(op_acc);
//printf("op_accuracies = %d, total_ops = %d \n", op_accuracies[total_ops], total_ops);
total_ops++;
}
fclose(fp);
}
/*__device__ inline void atomicAdd(float* address, float value)
{
float old = value;
float new_old;
do{
new_old = atomicExch(address, 0.0f);
new_old += old;
}
while ((old = atomicExch(address, new_old))!=0.0f);
};
*/
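// Host-side norm computation: L1, L2, and inf norms of the difference
// between 'x' and 'x_orig'. The L1/L2 sums are averaged over the element
// count and then normalized by the mean of the original distribution.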
Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
deviceToHostCopy(x);
deviceToHostCopy(x_orig);
// NOTE: Move floats to doubles - overflow is quite possible
float l1_norm = 0.0;
float l2_norm = 0.0;
float inf_norm = -1.0;
double total = 0.0;
float* arr1 = (float*) x->host_data;
float* arr2 = (float*) x_orig->host_data;
for(unsigned int i = 0; i < x->num_elems; i++){
total = total + arr2[i];
float diff = fabs(arr1[i] - arr2[i]);
l1_norm += diff;
l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
if(inf_norm < diff)
inf_norm = diff;
}
l1_norm = l1_norm / (x->num_elems * 1.0);
l2_norm = l2_norm / (x->num_elems * 1.0);
double distribution_mean = total / (x->num_elems * 1.0);
l1_norm = l1_norm / distribution_mean;
l2_norm = l2_norm / distribution_mean;
Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
norms->l1_norm = l1_norm;
norms->l2_norm = l2_norm;
norms->inf_norm = inf_norm;
INFO("l1_norm = %f \n", l1_norm);
INFO("l2_norm = %f \n", l2_norm);
INFO("inf_norm = %f \n", inf_norm);
return norms;
}
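// Extended host-side variant: returns both mean (absolute) and relative
// L1/L2/inf norms of the difference between 'x' and 'x_orig'. Note that the
// relative inf norm divides by arr2[i] and so is undefined where the
// original value is zero.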
Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
deviceToHostCopy(x);
deviceToHostCopy(x_orig);
// NOTE: Move all floats to doubles - overflow is quite possible
double l0_norm_A = 0.0;
double l0_norm_B = 0.0;
double l1_norm_A = 0.0;
double l1_norm_B = 0.0;
double l2_norm_A = 0.0;
double l2_norm_B = 0.0;
float inf_norm = -1.0;
float orig_inf_norm = -1.0;
double total_diff = 0.0;
double total_diff_squared = 0.0;
float* arr1 = (float*) x->host_data;
float* arr2 = (float*) x_orig->host_data;
for(unsigned int i = 0; i < x->num_elems; i++){
if(arr2[i] != 0.0)
l0_norm_A = l0_norm_A + 1.0;
if(arr1[i] != 0.0)
l0_norm_B = l0_norm_B + 1.0;
l1_norm_A = l1_norm_A + fabs(arr2[i]);
l1_norm_B = l1_norm_B + fabs(arr1[i]);
l2_norm_A = l2_norm_A + (arr2[i] * arr2[i]);
l2_norm_B = l2_norm_B + (arr1[i] * arr1[i]);
float diff = fabs(arr1[i] - arr2[i]);
total_diff = total_diff + diff;
float diff_squared = diff * diff;
total_diff_squared = total_diff_squared + diff_squared;
if(orig_inf_norm < diff){
orig_inf_norm = diff;
}
// Relative difference value
float normalized_diff = diff / arr2[i];
if(inf_norm < normalized_diff){
inf_norm = normalized_diff;
}
}
// Relative L1 and Mean L1 norms of the difference Matrix
float mean_l1 = ( total_diff ) / x->num_elems;
float relative_l1 = ( total_diff ) / l1_norm_A;
// Computing Relative L2 norm - i.e., Euclidean distance
double norm_root_A = sqrt(l2_norm_A);
double diff_root = sqrt(total_diff_squared);
float mean_l2 = diff_root / x->num_elems;
float relative_l2 = diff_root / norm_root_A;
// Packing computed norms in Norm_t struct
Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
// Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
norms->mean_l1 = mean_l1;
norms->mean_l2 = mean_l2;
norms->orig_inf_norm = orig_inf_norm;
// Relative metrics (relative to distribution) - suitable for PROMISE
norms->l1_norm = relative_l1;
norms->l2_norm = relative_l2;
norms->inf_norm = inf_norm;
INFO("l1_norm = %f \n", relative_l1);
INFO("l2_norm = %f \n", relative_l2);
INFO("inf_norm = %f \n", inf_norm);
return norms;
}
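// One thread per element; the four partial sums are accumulated with
// atomicAdd on doubles, which requires compute capability 6.0 or higher.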
__global__ void normComputeKernel(float* A, float * B, double* l1_A, double* l2_A,
double* l1_diff, double* l2_diff, unsigned int n){
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < n){
double diff = fabsf(A[i] - B[i]);
double diff_squared = diff * diff;
atomicAdd( l1_A, fabsf(A[i]) );
atomicAdd( l2_A, (A[i] * A[i]) );
atomicAdd( l1_diff, diff);
atomicAdd( l2_diff, diff_squared);
}
}
// Compute Norms on the GPU
Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
hostToDeviceCopy(x);
hostToDeviceCopy(x_orig);
// FIXIT: Move all floats to doubles - overflow is possible
double l1_norm_A;
double l2_norm_A;
double l1_diff;
double l2_diff;
// Device pointers
double *l1_norm_A_d;
double *l2_norm_A_d;
double *l1_diff_d;
double *l2_diff_d;
cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
cudaMalloc( (void**) &l1_diff_d, sizeof(double));
cudaMalloc( (void**) &l2_diff_d, sizeof(double));
float* arr1 = (float*) x->gpu_data;
float* arr2 = (float*) x_orig->gpu_data;
int blockSize = 1024;
int gridSize = (int) ceil ((float) x->num_elems / blockSize);
INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
// Relative L1 and Mean L1 norms of the difference Matrix
float mean_l1 = l1_diff / x->num_elems;
float relative_l1 = l1_diff / l1_norm_A;
// Computing Relative L2 norm - i.e., Euclidean distance
double norm_root_A = sqrt(l2_norm_A);
double diff_root = sqrt(l2_diff);
float mean_l2 = diff_root / x->num_elems;
float relative_l2 = diff_root / norm_root_A;
// Packing computed norms in Norm_t struct
Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
// Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
norms->mean_l1 = mean_l1;
norms->mean_l2 = mean_l2;
norms->orig_inf_norm = 0.0;
// Relative metrics (relative to distribution) - suitable for PROMISE
norms->l1_norm = relative_l1;
norms->l2_norm = relative_l2;
norms->inf_norm = 0.0;
INFO("l1_norm = %f \n", relative_l1);
INFO("l2_norm = %f \n", relative_l2);
return norms;
}
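// Elementwise helper kernels used by the error injection routines below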
__global__ void vecConstMul(float* A, float mul_factor, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
A[id] = A[id] * mul_factor;
}
__global__ void vecRound(float* A, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
A[id] = roundf(A[id]);
}
__global__ void vecConstDiv(float* A, float div_factor, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
A[id] = A[id] / div_factor;
}
__global__ void vecMul(float* A, float* B, int n){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < n)
B[id] = A[id] * B[id];
}
/**** ERROR injection routines ******/
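// Fills 'bias' with Gaussian noise N(0, scaling_values[error_scale]) on the
// GPU; the cuRAND generator is re-seeded from the wall clock on every call.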
void initRandValues(Tensor* bias, int error_scale){
float scaling_values[20];
// FIXIT: Error knob 0 should be zero error
scaling_values[0] = 0.016;
scaling_values[1] = 0.018;
scaling_values[2] = 0.022;
scaling_values[3] = 0.026;
scaling_values[4] = 0.030;
scaling_values[5] = 0.035;
scaling_values[6] = 0.04;
scaling_values[7] = 0.06;
scaling_values[8] = 0.08;
scaling_values[9] = 0.1;
//scaling_values[8] = 0.15;
//scaling_values[9] = 0.2;
scaling_values[10] = 0.25;
scaling_values[11] = 0.3;
scaling_values[12] = 0.35;
scaling_values[13] = 0.4;
scaling_values[14] = 0.45;
// Values below are currently unused by Opentuner
scaling_values[15] = 0.5;
scaling_values[16] = 0.55;
scaling_values[17] = 0.6;
scaling_values[18] = 0.65;
scaling_values[19] = 0.7;
curandGenerator_t gen;
struct timespec ts;
if(timespec_get(&ts, TIME_UTC) == 0){
printf("timespec_get failed - cannot seed the random generator \n");
abort();
}
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec);
curandGenerateNormal(gen, (float*) bias->gpu_data, bias->num_elems, 0.0, 1.0 * scaling_values[error_scale]);
/*
std::random_device rd;
std::mt19937 mt(rd());
std::normal_distribution<float> distribution(0.0, 1.0);
float* data_arr = (float*) bias->host_data;
for(int i = 0; i < bias->num_elems; i++){
float rand_num = distribution(mt);
data_arr[i] = scaling_values[error_scale] * rand_num;
}
*/
}
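// Bit-error injection: flips bit 5 of a randomly chosen byte, repeated for
// (error_freq * 1%) of the tensor's bytes, and returns the resulting
// perturbation as Norm_t metrics.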
void* addBitError(void* x_ptr, int error_scale){
if(error_scale > 6 || error_scale < 0){
ERROR("Error Scale out of bounds \n");
}
INFO("*** TensorBitError \n");
profileEvent("tensorBitError");
Tensor* x = (Tensor*) x_ptr;
size_t* dim_sizes = x->dims.dim_sizes;
Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
// Copying x data into x_original - for computing Norms
tensorCopy(x, x_original);
// Quadratic Error
float freq_factors[6];
freq_factors[0] = 0.1;
freq_factors[1] = 0.2;
freq_factors[2] = 0.4;
freq_factors[3] = 0.6;
freq_factors[4] = 0.8;
freq_factors[5] = 1.0;
float error_freq = freq_factors[error_scale];
deviceToHostCopy(x);
unsigned char* data_arr = reinterpret_cast<unsigned char*>(x->host_data);
// FIXIT: Need to be careful about floating point datatype assumptions
int size_of_elem = 4;
long int total_bytes = x->size_in_bytes;
long int error_iterations = total_bytes * 0.01 * error_freq;
INFO("total_bytes = %lu, error_iterations = %lu \n", total_bytes, error_iterations);
srand(time(NULL));
for(long int i = 0; i < error_iterations; i++){
// FIXIT: The rand() is only specific to int - need long
long int index = rand() % total_bytes;
int N = 5; // The operation below flips the Nth bit of the chosen byte
unsigned char fil = 1UL << N;
unsigned char val = data_arr[index];
unsigned char flipped = val ^ fil;
data_arr[index] = flipped;
}
// Write the perturbed bytes back to the device; calculateNorms2 copies the
// device buffer to the host, which would otherwise discard the injected errors
hostToDeviceCopy(x);
Norm_t* norms = calculateNorms2(x, x_original);
profileEvent("tensorBitError_end", true);
return (void*) norms;
}
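// Applies floor or ceil to each element at random; the choice is driven by a
// truncated standard normal draw, which lands on floor more often than ceil.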
void randomCeilAndFloor(float* x, size_t num_elems){
INFO("randomCeilAndFloor\n");
std::random_device rd;
std::mt19937 mt(rd());
std::normal_distribution<float> distribution(0.0, 1.0);
for(size_t i = 0; i < num_elems; i++){
float rand_num = distribution(mt);
int val = abs(((int) rand_num) % 2);
if(val == 0)
x[i] = floor(x[i]);
else if(val == 1)
x[i] = ceil(x[i]);
}
}
// Routine for Adding RoundOff Errors
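// Scales the tensor by round_factor, randomly floors/ceils each element,
// then divides the scale back out - emulating round-off at reduced precision.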
void* addRoundError(void* x_ptr, int error_scale){
if(error_scale > 11 || error_scale < 0){
ERROR("Error Scale out of bounds \n");
}
INFO("*** TensorRoundError \n");
profileEvent("tensorRoundError");
Tensor* x = (Tensor*) x_ptr;
size_t* dim_sizes = x->dims.dim_sizes;
Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
// Copying x data into x_original - for computing Norms
tensorCopy(x, x_original);
float round_factors[12];
round_factors[0] = 1000000; // FIXIT: This should be zero error
round_factors[1] = 100;
round_factors[2] = 10;
round_factors[3] = 7; // Beyond this point, the error function is linear
round_factors[4] = 3;
round_factors[5] = 1;
round_factors[6] = 0.7;
round_factors[7] = 0.3;
round_factors[8] = 0.1;
round_factors[9] = 0.07;
round_factors[10] = 0.03;
round_factors[11] = 0.01;
// THINK: Considering using error magnitudes in this scenario
float round_factor = round_factors[error_scale];
INFO("round_factor = %f \n", round_factor);
hostToDeviceCopy(x);
int blockSize = 128;
int gridSize = (int) ceil ((float) x->num_elems / blockSize);
INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
// NOTE: Check if a large gridSize will work with really large tensors
vecConstMul<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);
//vecRound<<<gridSize, blockSize>>>((float*) x->gpu_data, x->num_elems);
deviceToHostCopy(x);
randomCeilAndFloor((float*) x->host_data, x->num_elems);
hostToDeviceCopy(x);
vecConstDiv<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);
Norm_t* norms = calculateNorms2(x, x_original);
profileEvent("tensorRoundError_end", true);
return (void*) norms;
}
// Routine for Adding Gaussian Error
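// The injected noise is multiplicative: vecMul computes bias = x * noise,
// and cudnnAddTensor then performs x += bias, i.e. x = x * (1 + noise).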
void* addGaussianError(void* x_ptr, int error_scale){
if(error_scale > 11 || error_scale < 0){
ERROR("Error Scale out of bounds \n");
}
INFO("*** TensorAddError \n");
profileEvent("tensorAddError");
Tensor* x = (Tensor*) x_ptr;
size_t* dim_sizes = x->dims.dim_sizes;
Tensor* bias = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
dim_sizes[0], dim_sizes[1],
dim_sizes[2], dim_sizes[3]);
// Copying x data into x_original - for computing Norms
tensorCopy(x, x_original);
// NOTE: Error scale is used to generate the bias matrix
initRandValues(bias, error_scale);
hostToDeviceCopy(x);
//hostToDeviceCopy(bias);
int blockSize = 1024;
int gridSize = (int) ceil ((float) x->num_elems / blockSize);
INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
// NOTE: Check if a large gridSize will work with really large tensors
vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems);
float alpha = 1.0f;
// FIXIT: routine fails for 3D tensors
checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));
//Norm_t* norms = calculateNorms2(x, x_original);
Norm_t* norms = calculateNormsGPU(x, x_original);
profileEvent("tensorAddError_end", true);
return (void*) norms;
}
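// Entry point used by the runtime: currently dispatches to the Gaussian
// model; the round-off and bit-error variants remain as commented-out
// alternatives.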
void* tensorAddError(void* x_ptr, int error_scale){
void * new_x = addGaussianError(x_ptr, error_scale);
//void * new_x = addRoundError(x_ptr, error_scale);
//void * new_x = addBitError(x_ptr, error_scale);
return new_x;
}
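/* Usage sketch (hypothetical caller, not part of this header): a runtime op
would pass its output tensor and the knob read by readOpenTunerFlags, e.g.

Tensor* out = ...;                        // output of some tensor op
int knob = op_accuracies[op_counter++];   // set by readOpenTunerFlags
Norm_t* norms = (Norm_t*) tensorAddError(out, knob);
free(norms);                              // caller owns the malloc'd struct
*/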
#endif