// Header guards
#ifndef UTILS_HEADER
#define UTILS_HEADER


#include <sstream>
#include <vector>
#include <bits/stdc++.h>
#include <tensor_runtime.h>
#include <tensor.h>
#include <cmath>


// Accuracy values appended by dumpFinalAccuracy() and written out, one per
// line, by dumpExecutionAccuracies().
// NOTE(review): this is a definition inside a header; including the header
// from more than one translation unit would presumably cause duplicate-symbol
// link errors -- confirm how this file is meant to be included.
std::vector<float> run_accuracies;


// Prints debugging information about a tensor: whether it has a GPU buffer,
// its number of dimensions, the first two dimension sizes (when they exist)
// and its total element count.
//
// tensor_ptr: pointer to a struct Tensor.
void printTensorInfo(void* tensor_ptr){

  struct Tensor* tensor = (struct Tensor*) tensor_ptr;

  if(tensor->gpu_data != NULL){
    printf("Successful cudaMalloc \n");
  }

  printf("tensor dims = %d \n", tensor->dims.num_dims);

  // Only read dimension entries that are covered by num_dims, so tensors with
  // fewer than two dimensions do not trigger an out-of-range read.
  if(tensor->dims.num_dims > 0){
    printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]);
  }
  if(tensor->dims.num_dims > 1){
    printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]);
  }

  printf("num_elems = %lu \n", tensor->num_elems);
}


// FIXIT: Move this to debug.h and include in all files
//
// Writes the raw bytes of a tensor's host buffer (size_in_bytes bytes) to a
// binary file.
//
// file_name:   path of the file to create/overwrite.
// weights_ptr: pointer to a struct Tensor.
//
// Aborts if the file cannot be opened; prints a warning if fewer bytes than
// expected were written.
void dumpWeightsToFile(char* file_name, void* weights_ptr){

  struct Tensor* weights = (struct Tensor*) weights_ptr;
  // Move data back to host
  hpvm_request_tensor(weights, 0);

  FILE* fp = fopen(file_name, "wb");
  if(fp == NULL){
    printf("File %s could not be created. Check if directory exists \n", file_name);
    abort();
  }

  const size_t bytes_expected = weights->size_in_bytes;
  const size_t bytes_written = fwrite(weights->host_data, 1, bytes_expected, fp);
  // A short write (e.g. disk full) would otherwise leave a truncated dump unnoticed.
  if(bytes_written != bytes_expected){
    printf("WARNING: only %zu of %zu bytes written to %s \n",
           bytes_written, bytes_expected, file_name);
  }

  fclose(fp);
}



// Sets every element of a float tensor to 1.0.
// Tensors whose data_type is not CUDNN_DATA_FLOAT are left unchanged.
//
// tensor_ptr: pointer to a struct Tensor.
void fillTensorWithOnes(void* tensor_ptr){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  hpvm_request_tensor(t, 0);

  // Only the float element type is handled.
  if(t->data_type != CUDNN_DATA_FLOAT){
    return;
  }

  float* values = (float*) t->host_data;
  unsigned int idx = 0;
  while(idx < t->num_elems){
    values[idx] = 1.0f;
    ++idx;
  }
}


// Fills the first half (num_elems / 2 elements) of a float tensor with 1.0
// and the remaining elements with 2.0.
// Tensors whose data_type is not CUDNN_DATA_FLOAT are left unchanged.
//
// tensor_ptr: pointer to a struct Tensor.
void fillWithOnesAndTwos(void* tensor_ptr){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  hpvm_request_tensor(t, 0);

  // Only the float element type is handled.
  if(t->data_type != CUDNN_DATA_FLOAT){
    return;
  }

  float* values = (float*) t->host_data;
  for(unsigned int idx = 0; idx < t->num_elems; ++idx){
    // Elements below the midpoint get 1.0, the rest get 2.0.
    values[idx] = (idx < t->num_elems / 2) ? 1.0f : 2.0f;
  }
}


// Sets every element of a float tensor to target_value.
// Tensors whose data_type is not CUDNN_DATA_FLOAT are left unchanged.
//
// tensor_ptr:   pointer to a struct Tensor.
// target_value: value stored in every element.
void fillTensorWithVal(void* tensor_ptr, float target_value){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  hpvm_request_tensor(t, 0);

  const bool is_float = (t->data_type == CUDNN_DATA_FLOAT);
  if(!is_float){
    return;  // only the float element type is handled
  }

  float* values = (float*) t->host_data;
  for(unsigned int idx = 0; idx < t->num_elems; ++idx){
    values[idx] = target_value;
  }
}


// Sets every element of a float tensor to -1.0.
// Tensors whose data_type is not CUDNN_DATA_FLOAT are left unchanged.
//
// tensor_ptr: pointer to a struct Tensor.
void fillTensorWithNegOnes(void* tensor_ptr){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  hpvm_request_tensor(t, 0);

  // Only the float element type is handled.
  if(t->data_type != CUDNN_DATA_FLOAT){
    return;
  }

  float* values = (float*) t->host_data;
  unsigned int idx = 0;
  while(idx < t->num_elems){
    values[idx] = -1.0f;
    ++idx;
  }
}


// Fills a float tensor with the sequence 1, 2, 3, ... (element i gets i + 1).
// Tensors whose data_type is not CUDNN_DATA_FLOAT are left unchanged.
//
// NOTE(review): unlike the other fill helpers in this file, this one does not
// call hpvm_request_tensor() before writing host_data -- confirm whether that
// is intentional.
//
// tensor_ptr: pointer to a struct Tensor.
void fillTensorVals(void* tensor_ptr){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  // Only the float element type is handled.
  if(t->data_type != CUDNN_DATA_FLOAT){
    return;
  }

  float* values = (float*) t->host_data;
  for(unsigned int idx = 0; idx < t->num_elems; ++idx){
    values[idx] = static_cast<float>(idx + 1);
  }
}


// Prints all elements of a float tensor as a comma-separated list followed by
// a newline. For non-float tensors only the newline is printed.
//
// tensor_ptr: pointer to a struct Tensor.
void printTensorValues(void* tensor_ptr){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  hpvm_request_tensor(t, 0);

  // Printing is only implemented for the float element type.
  const bool is_float = (t->data_type == CUDNN_DATA_FLOAT);
  if(is_float){
    const float* values = (float*) t->host_data;
    unsigned int idx = 0;
    while(idx < t->num_elems){
      printf("%f,", values[idx]);
      ++idx;
    }
  }

  printf("\n");
}


// Prints the element count of a tensor followed by the size of each of its
// dimensions, one per line.
//
// tensor_ptr: pointer to a struct Tensor.
void printTensorDims(void* tensor_ptr){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  printf("Num_elems = %lu \n", t->num_elems);

  int dim_idx = 0;
  while(dim_idx < t->dims.num_dims){
    printf("dim[%d] = %lu \n", dim_idx, t->dims.dim_sizes[dim_idx]);
    ++dim_idx;
  }
}



// Compares two tensors element by element (interpreting both host buffers as
// float arrays) and aborts on the first difference.
//
// tensor1_ptr, tensor2_ptr: pointers to struct Tensor.
//
// Aborts if the element counts differ or if any pair of elements is not
// exactly equal; returns normally otherwise.
void compareTensors(void* tensor1_ptr, void* tensor2_ptr){

  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;

  hpvm_request_tensor(tensor1, 0);
  hpvm_request_tensor(tensor2, 0);

  // Without this check the loop below would read past the end of the shorter
  // tensor's buffer.
  if(tensor1->num_elems != tensor2->num_elems){
    printf("Tensor size mismatch: %zu vs %zu elements \n",
           (size_t) tensor1->num_elems, (size_t) tensor2->num_elems);
    abort();
  }

  float* tensor_data1 = (float*) tensor1->host_data;
  float* tensor_data2 = (float*) tensor2->host_data;

  for(size_t i = 0; i < tensor1->num_elems; i++){
    if(tensor_data1[i] != tensor_data2[i]){
      printf("Tensor data mismatch at index %zu \n", i);
      abort();
    }
  }
}



// Compares the first num_elems floats of a tensor's host buffer against a
// reference array and aborts on the first element that is not exactly equal.
//
// tensor_ptr: pointer to a struct Tensor.
// data:       reference values, at least num_elems entries.
// num_elems:  number of elements to compare.
void compareValues(void* tensor_ptr, float* data, size_t num_elems){

  struct Tensor* t = static_cast<struct Tensor*>(tensor_ptr);

  hpvm_request_tensor(t, 0);

  const float* actual = (float*) t->host_data;
  unsigned int idx = 0;
  while(idx < num_elems){
    const bool same = (actual[idx] == data[idx]);
    if(!same){
      printf("Tensor data mismatch");
      abort();
    }
    ++idx;
  }
}


// Reads a tensor of 8-bit values from a binary file and returns it as a 4D
// float tensor with every byte scaled to value / 255.
//
// The first 16 bytes of the file are skipped as a header, then one byte per
// tensor element is read.
//
// file_name:      path of the input file.
// data_type:      data type forwarded to create4DTensor.
// dim1..dim4_size: tensor dimensions (passed to create4DTensor with the nchw format).
//
// Returns the tensor created by create4DTensor. Aborts if the file cannot be
// opened or the staging buffers cannot be allocated; prints a warning when
// the file holds fewer bytes than requested (missing bytes are treated as 0).
void* readInputTensor(const char* file_name, int data_type, int dim1_size, int dim2_size,
		      int dim3_size, int dim4_size){

  const size_t type_size = 4; // NOTE: Assuming floating point tensors
  // Widen before multiplying so that large tensors do not overflow int.
  const size_t num_elems = (size_t) dim1_size * dim2_size * dim3_size * dim4_size;
  const size_t size_in_bytes = type_size * num_elems;
  const long file_header_size = 16;

  FILE* file = fopen(file_name, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting... \n", file_name);
    abort();
  }

  // Zero-initialised so that a short read yields zeros rather than garbage.
  uint8_t* file_data = (uint8_t*) calloc(num_elems, sizeof(uint8_t));
  float* tensor_data = (float*) calloc(num_elems, sizeof(float));
  if(num_elems != 0 && (file_data == NULL || tensor_data == NULL)){
    printf("Could not allocate buffers for %s. Aborting... \n", file_name);
    abort();
  }

  fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
  const size_t bytes_read = fread(file_data, 1, sizeof(uint8_t) * num_elems, file);
  fclose(file);

  if(bytes_read != num_elems){
    printf("WARNING: read %zu of %zu bytes from %s \n", bytes_read, num_elems, file_name);
  }

  for (size_t i = 0; i < num_elems; ++i){
    tensor_data[i] = (float) file_data[i] / 255.0f;
  }

  // NOTE: Using NCHW format
  struct Tensor* input = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
					dim3_size, dim4_size);

  initTensorData(input, tensor_data, size_in_bytes);

  // The staging buffers are no longer needed; the other readers in this file
  // likewise free their buffer right after initTensorData.
  free(file_data);
  free(tensor_data);

  return input;
}


//*** FIXIT: Move this to CPU-only
//
// Reads raw 4-byte elements from a binary file into a newly created 4D tensor.
//
// file_name:       path of the weights file (read from offset 0).
// data_type:       data type forwarded to create4DTensor.
// dim1..dim4_size: tensor dimensions (passed to create4DTensor with the nchw format).
//
// Returns the tensor created by create4DTensor. Aborts if the file cannot be
// opened or the staging buffer cannot be allocated; prints a warning when the
// file holds fewer bytes than requested (missing bytes are treated as 0).
struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
				     int dim1_size, int dim2_size,
				     int dim3_size, int dim4_size){

  // FIXIT: Don't assume floating point types
  const long int type_size = 4; // NOTE: Assuming floating point tensors
  // Widen before multiplying: the int product overflows for large tensors.
  const long int num_elems = (long int) dim1_size * dim2_size * dim3_size * dim4_size;
  const long int size_in_bytes = type_size * num_elems;
  const long int file_header_size = 0;

  FILE* file = fopen(file_name, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting... \n", file_name);
    abort();
  }

  // Zero-initialised so that a short read yields zeros rather than garbage.
  float* tensor_data = (float*) calloc(num_elems, sizeof(float));
  if(num_elems != 0 && tensor_data == NULL){
    printf("Could not allocate %ld bytes for %s. Aborting... \n", size_in_bytes, file_name);
    abort();
  }

  fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
  const size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);

  printf("size in bytes = %lu, bytes read = %lu \n",
         (unsigned long) size_in_bytes, (unsigned long) bytes_read);
  if(bytes_read != (size_t) size_in_bytes){
    printf("WARNING: %s is shorter than the requested tensor \n", file_name);
  }

  fclose(file);

  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
					                   dim3_size, dim4_size);

  initTensorData(weights, tensor_data, size_in_bytes);
  free(tensor_data);

  return weights;
}


// Reads raw 4-byte elements from a binary file into a newly created 4D tensor.
//
// file_name:       path of the weights file (read from offset 0).
// data_type:       data type forwarded to create4DTensor.
// dim1..dim4_size: tensor dimensions (passed to create4DTensor with the nchw format).
//
// Returns the tensor created by create4DTensor. Aborts if the file cannot be
// opened or the staging buffer cannot be allocated; prints a warning when the
// file holds fewer bytes than requested (missing bytes are treated as 0).
struct Tensor* readTrainedWeights(const char* file_name, int data_type,
				  long int dim1_size, long int dim2_size,
				  long int dim3_size, long int dim4_size){

  // FIXIT: Don't assume floating point types
  const long int type_size = 4; // NOTE: Assuming floating point tensors
  const long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
  const long int size_in_bytes = type_size * num_elems;
  printf("size_in_bytes  = %lu \n", (unsigned long) size_in_bytes);

  const long int file_header_size = 0;

  FILE* file = fopen(file_name, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting... \n", file_name);
    abort();
  }

  // Zero-initialised so that a short read yields zeros rather than garbage.
  float* tensor_data = (float*) calloc(num_elems, sizeof(float));
  if(num_elems != 0 && tensor_data == NULL){
    printf("Could not allocate %ld bytes for %s. Aborting... \n", size_in_bytes, file_name);
    abort();
  }

  fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
  const size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
  fclose(file);

  if(bytes_read != (size_t) size_in_bytes){
    printf("WARNING: read %zu of %ld bytes from %s \n", bytes_read, size_in_bytes, file_name);
  }

  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
					                   dim3_size, dim4_size);

  initTensorData(weights, tensor_data, size_in_bytes);
  free(tensor_data);

  return weights;
}




// Reads the items [start, end) of a binary file of 4-byte elements into a
// newly created 4D tensor of shape (end - start, dim2, dim3, dim4).
//
// file_name:       path of the input file.
// data_type:       data type forwarded to create4DTensor.
// start, end:      index of the first item to read and one past the last.
// dim2..dim4_size: per-item dimensions.
//
// Returns the tensor created by create4DTensor. Aborts if the file cannot be
// opened or the staging buffer cannot be allocated; prints a warning when the
// file holds fewer bytes than requested (missing bytes are treated as 0).
struct Tensor* readInputBatch(const char* file_name, int data_type,
			      int start, int end,
			      int dim2_size, int dim3_size, int dim4_size){

  const int dim1_size = end - start;
  // FIXIT: Don't assume floating point types
  const long int type_size = 4; // NOTE: Assuming floating point tensors
  // All sizes are computed in long int: the int products overflow once the
  // byte offset of `start` (or the batch itself) exceeds INT_MAX.
  const long int item_bytes = type_size * dim2_size * dim3_size * dim4_size;
  const long int num_elems = (long int) dim1_size * dim2_size * dim3_size * dim4_size;
  const long int size_in_bytes = type_size * num_elems;
  const long int file_header_size = item_bytes * start;

  FILE* file = fopen(file_name, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting... \n", file_name);
    abort();
  }

  // Zero-initialised so that a short read yields zeros rather than garbage.
  float* tensor_data = (float*) calloc(num_elems, sizeof(float));
  if(num_elems != 0 && tensor_data == NULL){
    printf("Could not allocate %ld bytes for %s. Aborting... \n", size_in_bytes, file_name);
    abort();
  }

  fseek(file, file_header_size, SEEK_SET); // Skipping the items before `start`
  const size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
  fclose(file);

  if(bytes_read != (size_t) size_in_bytes){
    printf("WARNING: read %zu of %ld bytes from %s \n", bytes_read, size_in_bytes, file_name);
  }

  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
					                   dim3_size, dim4_size);

  initTensorData(weights, tensor_data, size_in_bytes);
  free(tensor_data);

  return weights;
}



// Reads the items [start, end) of a binary file of 4-byte elements and copies
// them into an existing tensor via initTensorData, then changes the tensor's
// placement to HOST.
//
// file_name:       path of the input file.
// start, end:      index of the first item to read and one past the last.
// dim2..dim4_size: per-item dimensions.
// inputTensor_ptr: pointer to the destination struct Tensor.
//
// Returns inputTensor_ptr. Aborts if the file cannot be opened or the staging
// buffer cannot be allocated; prints a warning when the file holds fewer
// bytes than requested (missing bytes are treated as 0).
void* copyInputBatch(const char* file_name,
		    int start, int end,
		    int dim2_size, int dim3_size, int dim4_size,
		    void* inputTensor_ptr){

  struct Tensor* inputTensor = (struct Tensor*) inputTensor_ptr;

  const int dim1_size = end - start;
  // FIXIT: Don't assume floating point types
  const long int type_size = 4; // NOTE: Assuming floating point tensors
  // All sizes are computed in long int: the int products overflow once the
  // byte offset of `start` (or the batch itself) exceeds INT_MAX.
  const long int item_bytes = type_size * dim2_size * dim3_size * dim4_size;
  const long int num_elems = (long int) dim1_size * dim2_size * dim3_size * dim4_size;
  const long int size_in_bytes = type_size * num_elems;
  const long int file_header_size = item_bytes * start;

  FILE* file = fopen(file_name, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting... \n", file_name);
    abort();
  }

  // Zero-initialised so that a short read yields zeros rather than garbage.
  float* tensor_data = (float*) calloc(num_elems, sizeof(float));
  if(num_elems != 0 && tensor_data == NULL){
    printf("Could not allocate %ld bytes for %s. Aborting... \n", size_in_bytes, file_name);
    abort();
  }

  fseek(file, file_header_size, SEEK_SET); // Skipping the items before `start`
  const size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
  fclose(file);

  if(bytes_read != (size_t) size_in_bytes){
    printf("WARNING: read %zu of %ld bytes from %s \n", bytes_read, size_in_bytes, file_name);
  }

  initTensorData(inputTensor, tensor_data, size_in_bytes);
  free(tensor_data);

  printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims);
  if(inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
    printf("ERROR: NULL data pointers \n");

  // Changing Tensor Placement to HOST
  changeTensorPlacement(inputTensor, HOST);

  return inputTensor;
}



// Reads num_labels one-byte labels from the start of a binary file.
//
// labels_file: path of the labels file.
// num_labels:  number of labels to read (values <= 0 read nothing).
//
// Returns a heap buffer that the caller must release with free(). Aborts if
// the file cannot be opened or the buffer cannot be allocated. If the file
// holds fewer labels than requested a warning is printed and the missing
// entries are 0.
uint8_t* readLabels(const char* labels_file, int num_labels){

  FILE* file = fopen(labels_file, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting...\n", labels_file);
    abort();
  }

  const size_t label_count = num_labels > 0 ? (size_t) num_labels : 0;
  // Zero-initialised (and never zero-sized) so that a short read yields
  // deterministic contents and a valid pointer is always returned.
  uint8_t* labels = (uint8_t*) calloc(label_count > 0 ? label_count : 1, sizeof(uint8_t));
  if(labels == NULL){
    printf("Could not allocate %zu labels for %s. Aborting...\n", label_count, labels_file);
    abort();
  }

  const size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * label_count, file);
  fclose(file);

  if(bytes_read != sizeof(uint8_t) * label_count){
    printf("WARNING: read %zu of %zu label bytes from %s \n",
           bytes_read, sizeof(uint8_t) * label_count, labels_file);
  }

  return labels;
}



// Reads num_labels 32-bit labels from the start of a binary file.
//
// labels_file: path of the labels file.
// num_labels:  number of labels to read (values <= 0 read nothing).
//
// Returns a heap buffer that the caller must release with free(). Aborts if
// the file cannot be opened or the buffer cannot be allocated. If the file
// holds fewer labels than requested a warning is printed and the missing
// bytes are 0.
uint32_t* readLabels3(const char* labels_file, int num_labels){

  FILE* file = fopen(labels_file, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting...\n", labels_file);
    abort();
  }

  const size_t label_count = num_labels > 0 ? (size_t) num_labels : 0;
  // Zero-initialised (and never zero-sized) so that a short read yields
  // deterministic contents and a valid pointer is always returned.
  uint32_t* labels = (uint32_t*) calloc(label_count > 0 ? label_count : 1, sizeof(uint32_t));
  if(labels == NULL){
    printf("Could not allocate %zu labels for %s. Aborting...\n", label_count, labels_file);
    abort();
  }

  const size_t bytes_wanted = sizeof(uint32_t) * label_count;
  const size_t bytes_read = fread(labels, 1, bytes_wanted, file);
  fclose(file);

  if(bytes_read != bytes_wanted){
    printf("WARNING: read %zu of %zu label bytes from %s \n",
           bytes_read, bytes_wanted, labels_file);
  }

  return labels;
}


// Reads the one-byte labels with indices [start, end) from a binary file.
//
// labels_file: path of the labels file.
// start, end:  index of the first label to read and one past the last.
//
// Returns a heap buffer of (end - start) labels that the caller must release
// with free(). Aborts if the file cannot be opened or the buffer cannot be
// allocated. If the file holds fewer labels than requested a warning is
// printed and the missing entries are 0.
uint8_t* readLabelsBatch(const char* labels_file, int start, int end){

  const int num_labels = end - start;
  const long file_header_size = (long) sizeof(uint8_t) * start;

  FILE* file = fopen(labels_file, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting...\n", labels_file);
    abort();
  }

  const size_t label_count = num_labels > 0 ? (size_t) num_labels : 0;
  // Zero-initialised (and never zero-sized) so that a short read yields
  // deterministic contents and a valid pointer is always returned.
  uint8_t* labels = (uint8_t*) calloc(label_count > 0 ? label_count : 1, sizeof(uint8_t));
  if(labels == NULL){
    printf("Could not allocate %zu labels for %s. Aborting...\n", label_count, labels_file);
    abort();
  }

  fseek(file, file_header_size, SEEK_SET); // Skipping the labels before `start`

  const size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * label_count, file);
  fclose(file);

  if(bytes_read != sizeof(uint8_t) * label_count){
    printf("WARNING: read %zu of %zu label bytes from %s \n",
           bytes_read, sizeof(uint8_t) * label_count, labels_file);
  }

  return labels;
}


// Reads the 32-bit labels with indices [start, end) from a binary file.
//
// labels_file: path of the labels file.
// start, end:  index of the first label to read and one past the last.
//
// Returns a heap buffer of (end - start) labels that the caller must release
// with free(). Aborts if the file cannot be opened or the buffer cannot be
// allocated. If the file holds fewer labels than requested a warning is
// printed and the missing bytes are 0.
uint32_t* readLabelsBatch3(const char* labels_file, int start, int end){

  const int num_labels = end - start;
  // Computed in long (the offset type fseek takes) instead of int.
  const long file_header_size = (long) sizeof(uint32_t) * start;

  FILE* file = fopen(labels_file, "rb");
  if(file == NULL){
    printf("Data file %s is not found. Aborting...\n", labels_file);
    abort();
  }

  const size_t label_count = num_labels > 0 ? (size_t) num_labels : 0;
  // Zero-initialised (and never zero-sized) so that a short read yields
  // deterministic contents and a valid pointer is always returned.
  uint32_t* labels = (uint32_t*) calloc(label_count > 0 ? label_count : 1, sizeof(uint32_t));
  if(labels == NULL){
    printf("Could not allocate %zu labels for %s. Aborting...\n", label_count, labels_file);
    abort();
  }

  fseek(file, file_header_size, SEEK_SET); // Skipping the labels before `start`

  const size_t bytes_wanted = sizeof(uint32_t) * label_count;
  const size_t bytes_read = fread(labels, 1, bytes_wanted, file);
  fclose(file);

  if(bytes_read != bytes_wanted){
    printf("WARNING: read %zu of %zu label bytes from %s \n",
           bytes_read, bytes_wanted, labels_file);
  }

  return labels;
}



// Computes top-1 classification accuracy of a result tensor against labels
// read from a file, prints it, and writes it to the file "final_accuracy".
//
// For each row i of the result (dim_sizes[0] rows of dim_sizes[1] scores) the
// index of the largest score is compared with labels[i].
//
// labels_file: path of a file of one-byte labels (read with readLabels).
// num_labels:  number of labels to read from labels_file.
// result_ptr:  pointer to the result struct Tensor; host_data is read as floats.
//
// NOTE(review): labels[i] is read for every row of the result, so num_labels
// is presumably expected to be at least dim_sizes[0] -- confirm with callers.
void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){

  struct Tensor* result = (struct Tensor*) result_ptr;

  uint8_t* labels = readLabels(labels_file, num_labels);
  size_t batch_dim = result->dims.dim_sizes[0];
  size_t channels = result->dims.dim_sizes[1];
  float* data = (float*) result->host_data;
  int num_errors = 0;

  for(size_t i = 0; i < batch_dim; i++){
    // Argmax over all scores of this row (previously hard-coded to 10 classes,
    // which read past the row when it held fewer than 10 scores).
    size_t chosen = 0;
    for (size_t id = 1; id < channels; ++id){
      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
    }

    if(chosen != labels[i])
      num_errors++;
  }

  // The label buffer is heap-allocated by readLabels and owned by this function.
  free(labels);

  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
  printf("****** Accuracy = %f \n\n", accuracy);

  FILE* fp = fopen("final_accuracy", "w+");
  if(fp != NULL){

    std::ostringstream ss;
    ss << std::fixed << accuracy;
    std::string print_str = ss.str();

    fwrite(print_str.c_str(), 1, print_str.length(), fp);
    fclose(fp);
  }

}




// Computes top-1 classification accuracy of a result tensor against an array
// of one-byte labels, prints it, and writes it to the file "final_accuracy".
//
// NOTE: batch_size and num_classes are Unused arguments: the number of rows
// and the number of classes are taken from the result tensor's dim_sizes[0]
// and dim_sizes[1].
//
// labels:     one label per row of the result.
// result_ptr: pointer to the result struct Tensor; host_data is read as floats.
//
// Returns the accuracy in percent.
float computeAccuracy2(uint8_t* labels, int batch_size,
		       void* result_ptr, size_t num_classes = 10){

  struct Tensor* result = (struct Tensor*) result_ptr;

  size_t batch_dim = result->dims.dim_sizes[0];
  num_classes = result->dims.dim_sizes[1];
  float* data = (float*) result->host_data;
  int num_errors = 0;

  printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes);

  for(size_t i = 0; i < batch_dim; i++){

    // Argmax over the scores of row i.
    size_t chosen = 0;
    for (size_t id = 1; id < num_classes; ++id){
      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
    }

    if(chosen != labels[i])
      num_errors++;

  }

  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
  printf("****** Accuracy = %f \n\n", accuracy);

  FILE* fp = fopen("final_accuracy", "w+");
  if(fp != NULL){

    std::ostringstream ss;
    ss << std::fixed << accuracy;
    std::string print_str = ss.str();

    fwrite(print_str.c_str(), 1, print_str.length(), fp);
    // Close only a stream that was actually opened; fclose(NULL) is undefined.
    fclose(fp);
  }

  return accuracy;
}



// Computes top-1 classification accuracy of a result tensor against an array
// of 32-bit labels, prints it, and writes it to the file "final_accuracy".
//
// The number of rows and classes are taken from the result tensor's
// dim_sizes[0] and dim_sizes[1].
//
// labels:     one label per row of the result.
// result_ptr: pointer to the result struct Tensor; host_data is read as floats.
//
// Returns the accuracy in percent.
float computeAccuracy3(uint32_t* labels, void* result_ptr){

  struct Tensor* result = (struct Tensor*) result_ptr;

  size_t batch_dim = result->dims.dim_sizes[0];
  size_t num_classes = result->dims.dim_sizes[1];
  float* data = (float*) result->host_data;
  int num_errors = 0;

  printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);

  for(size_t i = 0; i < batch_dim; i++){

    // Argmax over the scores of row i.
    size_t chosen = 0;
    for (size_t id = 1; id < num_classes; ++id){
      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
    }

    if(chosen != labels[i])
      num_errors++;
  }

  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
  printf("****** Accuracy = %f \n\n", accuracy);

  FILE* fp = fopen("final_accuracy", "w+");
  if(fp != NULL){

    std::ostringstream ss;
    ss << std::fixed << accuracy;
    std::string print_str = ss.str();

    fwrite(print_str.c_str(), 1, print_str.length(), fp);
    // Close only a stream that was actually opened; fclose(NULL) is undefined.
    fclose(fp);
  }

  return accuracy;
}



// Pairs a class score with the index of the class it belongs to, so that the
// scores can be sorted (see descendFloatComp) without losing track of the
// class. Used by computeTop5Accuracy.
struct ClassProb{
  float prob;  // score read from the result tensor for this class
  int index;   // class index the score belongs to
};


// Ordering predicate for sorting ClassProb entries by descending prob:
// returns true when obj1 has the strictly larger prob.
bool descendFloatComp(ClassProb obj1, ClassProb obj2){
  const bool first_is_larger = obj2.prob < obj1.prob;
  return first_is_larger;
}


float computeTop5Accuracy(uint8_t* labels, int num_labels,
			  void* result_ptr, unsigned num_classes = 10){
  
  struct Tensor* result = (struct Tensor*) result_ptr;
  
  size_t batch_dim = result->dims.dim_sizes[0];
  size_t channels = result->dims.dim_sizes[1];
  float* data = (float*) result->host_data;
  int num_errors = 0;

  printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
  
  for(int i = 0; i < num_labels; i++){

    std::vector<ClassProb> elem_probs;
    for (int id = 0; id < num_classes; ++id){
      ClassProb cProb;
      cProb.prob = data[i * channels + id];
      cProb.index = id;
      elem_probs.push_back(cProb);   
    }

    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
    // Check if any of top-5 predictions matches
    bool matched = false;
    for(int j = 0; j < 5; j++){
      ClassProb cProb = elem_probs[j];
      if(cProb.index == labels[i])
        matched = true;
    }

    if(!matched)
      num_errors +=1; 
  }

  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
  printf("****** Accuracy = %f \n\n", accuracy);

  FILE* fp = fopen("final_accuracy", "w+");
  if(fp != NULL){

    std::ostringstream ss;
    ss << std::fixed << accuracy;
    std::string print_str = ss.str();
  
    fwrite(print_str.c_str(), 1, print_str.length(), fp);
  }

  fclose(fp);

  return accuracy;    
}




// Prints the given accuracy, writes it (fixed notation) to the file
// "final_accuracy", and appends it to run_accuracies.
//
// If the file cannot be opened the write is skipped; the value is still
// printed and recorded.
void dumpFinalAccuracy(float accuracy){

  printf("\n\n **** Final Accuracy = %f \n", accuracy);

  FILE* fp = fopen("final_accuracy", "w+");
  if(fp != NULL){
    std::ostringstream ss;
    ss << std::fixed << accuracy;
    std::string print_str = ss.str();

    fwrite(print_str.c_str(), 1, print_str.length(), fp);
    // Close only a stream that was actually opened; fclose(NULL) is undefined.
    fclose(fp);
  }

  run_accuracies.push_back(accuracy);
}



// Writes avg_psnr (fixed notation) to the file "avg_psnr", replacing any
// previous contents. Does nothing if the file cannot be opened.
void dumpAvgPSNR(float avg_psnr){

  FILE* fp = fopen("avg_psnr", "w+");
  if(fp != NULL){
    std::ostringstream ss;
    ss << std::fixed << avg_psnr;
    std::string print_str = ss.str();
    fwrite(print_str.c_str(), 1, print_str.length(), fp);
    // Close only a stream that was actually opened; fclose(NULL) is undefined.
    fclose(fp);
  }
}


// Writes psnr_std (fixed notation) to the file "psnr_std.txt", replacing any
// previous contents. Does nothing if the file cannot be opened.
void dumpPSNRStd(float psnr_std){

  FILE* fp = fopen("psnr_std.txt", "w+");
  if(fp != NULL){
    std::ostringstream ss;
    ss << std::fixed << psnr_std;
    std::string print_str = ss.str();
    fwrite(print_str.c_str(), 1, print_str.length(), fp);
    // Close only a stream that was actually opened; fclose(NULL) is undefined.
    fclose(fp);
  }
}





// Writes every value stored in run_accuracies (fixed notation, one per line)
// to the file "run_accuracies.txt", replacing any previous contents.
// Does nothing if the file cannot be opened.
void dumpExecutionAccuracies(){

  FILE* fp = fopen("run_accuracies.txt", "w+");
  if(fp != NULL){
    for (size_t i = 0; i < run_accuracies.size(); i++){
      float accuracy = run_accuracies[i];
      std::ostringstream ss;
      ss << std::fixed << accuracy;
      std::string print_str = ss.str();
      fwrite(print_str.c_str(), 1, print_str.length(), fp);
      fwrite("\n", 1, 1, fp);
    }

    // Close only a stream that was actually opened; fclose(NULL) is undefined.
    fclose(fp);
  }
}


// Reads a single floating point PSNR value from the start of a text file,
// prints it and returns it.
//
// file_name: path of the file holding the value.
//
// Aborts if the file cannot be opened or does not start with a parsable
// number (previously an unparsable file produced an uninitialised result).
float readPSNRFromFile(const char* file_name){

  FILE* pFile = fopen(file_name, "r");
  if(pFile == NULL){
    // Report the path that was actually requested.
    printf("ERROR: %s not found! \n", file_name);
    abort();
  }

  float psnr = 0.0f;
  const int fields_read = fscanf(pFile, "%f", &psnr);
  fclose(pFile);  // the handle was previously never released

  if(fields_read != 1){
    printf("ERROR: could not read a PSNR value from %s \n", file_name);
    abort();
  }

  printf("**** PSNR read = %f \n\n", psnr);
  return psnr;
}


// Computes the per-image PSNR between a reference ("gold") tensor and an
// approximate tensor and reports how often it falls below a threshold.
//
// The tensors are read as float host buffers of dim_sizes[0] images, each of
// dim_sizes[1] * dim_sizes[2] * dim_sizes[3] values (sizes taken from the
// gold tensor). For every image, PSNR = 20 * log10(255 / sqrt(MSE)); the peak
// value is fixed at 255.
//
// Side effects:
//   - prints each PSNR and writes it to "img_psnr.txt" (skipped with a warning
//     if that file cannot be opened),
//   - writes the average PSNR via dumpAvgPSNR,
//   - records 100 - violation_rate via dumpFinalAccuracy,
//   - writes the PSNR standard deviation via dumpPSNRStd.
//
// gold_ptr, approx_ptr: pointers to struct Tensor.
// PSNR_threshold: NOTE: the passed value is ignored; it is overwritten with
//                 the value read from "psnr.txt".
//
// Returns the percentage of images whose PSNR is below the threshold.
float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){


  PSNR_threshold = readPSNRFromFile("psnr.txt");
  std::vector<float> psnr_list;

  struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
  struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;

  size_t* dim_sizes = gold_tensor->dims.dim_sizes;
  size_t batch_dim = dim_sizes[0];
  size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];

  printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);

  float* gold_data = (float*) gold_tensor->host_data;
  float* approx_data = (float*) approx_tensor->host_data;

  FILE* fp = fopen("img_psnr.txt", "w+");
  if(fp == NULL){
    // Keep computing; only the per-image dump is lost.
    printf("WARNING: img_psnr.txt could not be created; per-image PSNR values are not saved \n");
  }

  float sum_psnr = 0.0;
  int num_errors = 0;
  for(size_t i = 0; i < batch_dim; i++){
    float mse_sum = 0.0;
    size_t offset = i * image_size;

    // Accumulate the squared error over all values of image i.
    for(size_t j = 0; j < image_size; j++){
      float diff = gold_data[offset + j] - approx_data[offset + j];
      float diff_square = diff * diff;
      mse_sum += diff_square;
    }

    mse_sum = mse_sum / image_size;
    float psnr = 20 * log10(255 / sqrt(mse_sum));

    sum_psnr += psnr;
    if (psnr < PSNR_threshold)
      num_errors += 1;

    printf("PSNR value = %f \n", psnr);
    psnr_list.push_back(psnr);

    if(fp != NULL){
      std::ostringstream ss;
      ss << std::fixed << psnr;
      std::string print_str = ss.str();
      fwrite(print_str.c_str(), 1, print_str.length(), fp);
      fwrite("\n", 1, 1, fp);
    }
  }

  float violation_rate = (num_errors * 1.0) / batch_dim * 100.0;
  printf("*** violation_rate= %f \n\n", violation_rate);

  float avg_psnr = sum_psnr / batch_dim;
  printf("*** avg_psnr =  %f \n\n", avg_psnr);
  dumpAvgPSNR(avg_psnr);

  float success_rate = 100.0 - violation_rate;
  dumpFinalAccuracy(success_rate);

  if(fp != NULL){
    fclose(fp);
  }

  // Population variance of the per-image PSNR values.
  float var = 0.0;
  for(size_t i = 0; i < batch_dim; i++){
    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr);
  }

  var /= batch_dim;
  float psnr_stddev = sqrt(var);

  dumpPSNRStd(psnr_stddev);

  return violation_rate;
}


// Writes the raw bytes of a tensor's host buffer (size_in_bytes bytes) to a
// file, replacing any previous contents, and prints the buffer size.
//
// output_ptr: pointer to a struct Tensor.
// file_name:  path of the file to write.
//
// If the file cannot be opened an error is printed and nothing is written.
void dumpOutput(void* output_ptr, const char* file_name){

  struct Tensor* out_tensor = (struct Tensor*) output_ptr;
  size_t size_in_bytes = out_tensor->size_in_bytes;
  printf ("** Output size = %lu \n", size_in_bytes);

  float* host_data = (float*) out_tensor->host_data;
  FILE* fd = fopen(file_name, "w+");
  if(fd == NULL){
    // Writing through or closing a NULL stream is undefined behavior.
    printf("ERROR: output file %s could not be created \n", file_name);
    return;
  }

  fwrite(host_data, 1, size_in_bytes, fd);
  fclose(fd);
}



#endif