Commit 62e59537 authored by Hashim Sharif

Moving to efficient tree-reduction norm computation
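This change replaces the single-pass normComputeKernel with a three-level tree reduction: each thread first accumulates private partial sums over a grid-stride loop, warps then combine their lanes with shuffle intrinsics (warpReduceSum), the warps of a block combine through shared memory (blockReduceSum), and finally one atomicAdd per block folds the block results into global accumulators.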

parent 59030885
@@ -286,6 +286,156 @@ __global__ void normComputeKernel(float* A, float * B, double* l1_A, double* l2_
__inline__ __device__ double warpReduceSum(double val) {

  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);

  return val;
}
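// With warpSize = 32 the loop runs 5 shuffle steps (offsets 16, 8, 4, 2, 1);
// after the last step, lane 0 holds the sum of all 32 lanes.
// NOTE: __shfl_down is the pre-CUDA-9 form of this intrinsic; on CUDA 9+ the
// equivalent call is __shfl_down_sync(0xffffffff, val, offset).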
__inline__ __device__ double blockReduceSum(double val) {

  static __shared__ double shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;

  val = warpReduceSum(val);      // Each warp performs partial reduction

  if (lane == 0)
    shared[wid] = val;           // Write reduced value to shared memory

  __syncthreads();               // Wait for all partial reductions

  // Read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

  if (wid == 0)
    val = warpReduceSum(val);    // Final reduce within first warp

  return val;
}
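// The full block sum ends up only in thread 0 (lane 0 of warp 0); other
// threads return partial values, so callers read the result from
// threadIdx.x == 0. The pattern also assumes blockDim.x is a multiple of
// warpSize, which holds for the 512-thread launch used below.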
__global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N,
                                              double* A_l1, double* A_l2,
                                              double* diff_l1, double* diff_l2) {

  double sum_A_l1 = double(0);
  double sum_A_l2 = double(0);
  double sum_diff_l1 = double(0);
  double sum_diff_l2 = double(0);

  // Grid-stride loop: each thread accumulates private partial sums
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    sum_A_l1 += fabsf(A[i]);
    sum_A_l2 += (A[i] * A[i]);
    double diff1 = A[i] - B[i];
    sum_diff_l1 += fabs(diff1);  // diff1 is double, so use fabs (not fabsf)
    double diff2 = diff1 * diff1;
    sum_diff_l2 += diff2;
  }

  // Tree-reduce each running sum across the block
  sum_A_l1 = blockReduceSum(sum_A_l1);
  sum_A_l2 = blockReduceSum(sum_A_l2);
  sum_diff_l1 = blockReduceSum(sum_diff_l1);
  sum_diff_l2 = blockReduceSum(sum_diff_l2);

  // One atomic update per block folds the block sums into the global totals
  if (threadIdx.x == 0) {
    atomicAdd(A_l1, sum_A_l1);
    atomicAdd(A_l2, sum_A_l2);
    atomicAdd(diff_l1, sum_diff_l1);
    atomicAdd(diff_l2, sum_diff_l2);
  }
}
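// NOTE: atomicAdd() on double operands requires compute capability 6.0
// (sm_60) or newer; on older architectures it would need a
// compare-and-swap based fallback.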
void deviceReduce(float* A, float* B, int N,
                  double* A_l1, double* A_l2,
                  double* diff_l1, double* diff_l2) {

  int threads = 512;
  int blocks = min((N + threads - 1) / threads, 1024);

  deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2, diff_l1, diff_l2);

  //-- deviceReduceKernel<<<1, 1024>>>(out, out, blocks);
}
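// With threads = 512 and blocks capped at 1024, tensors larger than
// 512 * 1024 = 524288 elements are covered by the grid-stride loop in the
// kernel (each thread then reduces multiple elements locally).
//
// Illustrative usage sketch (not part of this commit): a minimal standalone
// driver for deviceReduce(). The initArrays kernel, buffer names, and sizes
// are assumptions for demonstration only; in the library, the inputs come
// from Tensor::gpu_data (see calculateNormsTreeReduction below). Assumes
// <cstdio> and the definitions above are in scope.
//
//   __global__ void initArrays(float* A, float* B, int N) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < N) { A[i] = 1.0f; B[i] = 0.5f; }
//   }
//
//   int main() {
//     const int N = 1 << 20;  // 1M elements -> exercises the grid-stride path
//     float *A, *B;
//     double *A_l1, *A_l2, *diff_l1, *diff_l2;
//     cudaMalloc(&A, N * sizeof(float));
//     cudaMalloc(&B, N * sizeof(float));
//     cudaMalloc(&A_l1, sizeof(double));    cudaMemset(A_l1, 0, sizeof(double));
//     cudaMalloc(&A_l2, sizeof(double));    cudaMemset(A_l2, 0, sizeof(double));
//     cudaMalloc(&diff_l1, sizeof(double)); cudaMemset(diff_l1, 0, sizeof(double));
//     cudaMalloc(&diff_l2, sizeof(double)); cudaMemset(diff_l2, 0, sizeof(double));
//     initArrays<<<(N + 511) / 512, 512>>>(A, B, N);
//     deviceReduce(A, B, N, A_l1, A_l2, diff_l1, diff_l2);
//     double h_diff_l1;  // expected: N * |1.0f - 0.5f| = 524288.0
//     cudaMemcpy(&h_diff_l1, diff_l1, sizeof(double), cudaMemcpyDeviceToHost);
//     printf("diff_l1 = %f\n", h_diff_l1);
//     return 0;
//   }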
// Compute Norms on the GPU using the tree-reduction kernel
Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){

  hostToDeviceCopy(x);
  hostToDeviceCopy(x_orig);

  // FIXIT: Move all floats to doubles - overflow is possible

  // Host-side results
  double l1_norm_A;
  double l2_norm_A;
  double l1_diff;
  double l2_diff;

  // Device-side accumulators
  double *l1_norm_A_d;
  double *l2_norm_A_d;
  double *l1_diff_d;
  double *l2_diff_d;

  cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
  cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
  cudaMalloc( (void**) &l1_diff_d, sizeof(double));
  cudaMalloc( (void**) &l2_diff_d, sizeof(double));

  // cudaMalloc does not zero memory, and the kernel accumulates into these
  // with atomicAdd, so the accumulators must be cleared first
  cudaMemset(l1_norm_A_d, 0, sizeof(double));
  cudaMemset(l2_norm_A_d, 0, sizeof(double));
  cudaMemset(l1_diff_d, 0, sizeof(double));
  cudaMemset(l2_diff_d, 0, sizeof(double));

  float* arr1 = (float*) x->gpu_data;
  float* arr2 = (float*) x_orig->gpu_data;

  //normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
  deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d);

  cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);

  // Release the device-side accumulators
  cudaFree(l1_norm_A_d);
  cudaFree(l2_norm_A_d);
  cudaFree(l1_diff_d);
  cudaFree(l2_diff_d);

  INFO("l1_norm_A = %f, l2_norm_A = %f, l1_diff = %f, l2_diff = %f \n",
       l1_norm_A, l2_norm_A, l1_diff, l2_diff);

  // Relative L1 and Mean L1 norms of the difference matrix
  float mean_l1 = l1_diff / x->num_elems;
  float relative_l1 = l1_diff / l1_norm_A;

  // Computing Relative L2 norm - i.e., Euclidean distance
  double norm_root_A = sqrt(l2_norm_A);
  double diff_root = sqrt(l2_diff);
  float mean_l2 = diff_root / x->num_elems;
  float relative_l2 = diff_root / norm_root_A;

  // Packing computed norms into the Norm_t struct
  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));

  // Mean metrics - not normalized for the distribution - suitable for precision-tuning hardware
  norms->mean_l1 = mean_l1;
  norms->mean_l2 = mean_l2;
  norms->orig_inf_norm = 0.0;

  // Relative metrics (relative to distribution) - suitable for PROMISE
  norms->l1_norm = relative_l1;
  norms->l2_norm = relative_l2;
  norms->inf_norm = 0.0;

  INFO("l1_norm = %f \n", relative_l1);
  INFO("l2_norm = %f \n", relative_l2);

  return norms;
}
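// In math terms, with A = x and B = x_orig, the metrics computed above are:
//   mean_l1 = ||A - B||_1 / N        relative_l1 = ||A - B||_1 / ||A||_1
//   mean_l2 = ||A - B||_2 / N        relative_l2 = ||A - B||_2 / ||A||_2
// Note that the relative metrics normalize by the perturbed tensor x,
// not by x_orig.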
// Compute Norms on the GPU
Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
@@ -407,8 +557,8 @@ void initRandValues(Tensor* bias, int error_scale){
   scaling_values[2] = 0.03;
   scaling_values[3] = 0.06;
   scaling_values[4] = 0.08;
-  scaling_values[5] = 0.1;
-  scaling_values[6] = 0.13;
+  scaling_values[5] = 0.105;
+  scaling_values[6] = 0.134;
   scaling_values[7] = 0.16;
   scaling_values[8] = 0.2;
   scaling_values[9] = 0.23;
@@ -495,6 +645,7 @@ void* addBitError(void* x_ptr, int error_scale){
   Norm_t* norms = calculateNorms2(x, x_original);
   profileEvent("tensorBitError_end", true);
@@ -634,8 +785,9 @@ void* addGaussianError(void* x_ptr, int error_scale){
   //Norm_t* norms = calculateNorms2(x, x_original);
-  Norm_t* norms = calculateNormsGPU(x, x_original);
+  //Norm_t* norms = calculateNormsGPU(x, x_original);
+  Norm_t* norms = calculateNormsTreeReduction(x, x_original);
   freeTensor(x_original);
   freeTensor(bias);