#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <cmath>
#include <string>
#include <vector>

#include "tensor_runtime.h"
#include "tensor_cpu_runtime.h"
#include "tensorUtils.h"
#include "tensor_custom_ops_cpu.h"

using namespace std;


class UnitTestResults {

private:
  unsigned int total_tests;
  unsigned int failed_tests;
  unsigned int passed_tests;
  std::vector<string> failed_test_ids;

public:
  UnitTestResults() {
    total_tests = 0;
    failed_tests = 0;
    passed_tests = 0;
  }

  void evalTestResult(Tensor *res, const float *expected_result,
                      size_t num_elems, float epsilon, string test_name) {

    total_tests += 1;
    if (res->num_elems != num_elems) {
      failed_tests += 1;
      failed_test_ids.push_back(test_name);
      return;
    }

    float *data_ptr = (float *)res->host_data;
    for (unsigned int i = 0; i < res->num_elems; i++) {
      if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) {
        failed_tests += 1;
        failed_test_ids.push_back(test_name);
        return;
      }
    }

    passed_tests += 1;
  }

  void compareTensors(Tensor *res, Tensor *gold_res, float epsilon,
                      string test_name) {

    const float *expected_result = (float *)gold_res->host_data;
    unsigned int num_elems = res->num_elems;

    evalTestResult(res, expected_result, num_elems, epsilon, test_name);
  }

  void printSummary() {

    printf("\n\n\n ************* Printing Results Summary ********** \n\n");
    printf("-- Total tests :=  %d \n", total_tests);
    printf("-- Tests Passed := %d \n", passed_tests);
    printf("-- Tests Failed := %d \n", failed_tests);

    printf("\n\n Tests that failed : \n\n");
    for (int i = 0; i < failed_test_ids.size(); i++) {
      printf("*** Test = %s \n", failed_test_ids[i].c_str());
    }

    if (failed_test_ids.size() > 0){
      
      printf("Some Tests Failed. Aborting");
      exit(1);
    }
    
  }
};





void testSampleFilter() {

  printf("***** Tensor Sample Filter ***** \n\n");
  Tensor *input =
      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);

  fillWithOnesAndTwos(input);

  Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
                                            3, 2, 32, 32);
  fillTensorWithVal(input2, 1);
  printTensorValues(input);

  void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1);
  printTensorValues(exact_res);

  void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0);

  printTensorValues(res);
}

void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
                          int stride_h, int stride_w, int row, int col,
                          UnitTestResults &unitTestResults) {

  float interpolation_rate = 1.0;
  for (int offset = 0; offset < 2; offset++) {

    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
           "row = %d col = %d  offset= %d \n\n",
           pad_h, pad_w, stride_h, stride_w, row, col, offset);

    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
                                        stride_w, 1, 1);

    printf("tensorConvolution Result :");
    printTensorValues(res_exact);

    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
                                        stride_w, 1, 1, 1, 1, 1, 1);

    printf("\nBaseline Result :");
    printTensorValues(res_exact2);

    void *res_exact3 = tensorConvApproxHalf2(
        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
    convertToFP32((struct Tensor *)res_exact3);

    printf("\nFP16_Baseline Result :");
    printTensorValues(res_exact3);

    void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h,
                                       stride_w, 1, 1, row, col, offset);

    printf("\nConvPerfCuda Result :");
    printTensorValues(res_sim);

    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
                                  stride_w, 1, 1, row, col, 1, offset);

    printf("\nConvApprox Result :");
    printTensorValues(res);

    hpvm_request_tensor(input, HOST);
    hpvm_request_tensor(filter, HOST);

    void *res_cpu = tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h,
                                        stride_w, 1, 1, row, col, 1, offset);

    printf("\nConvApproxCPU Result :");
    printTensorValues(res_cpu);

    void *res_half =
        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
                              1, 1, row, col, 1, offset);

    convertToFP32((struct Tensor *)res_half);

    printf("\nConvApproxHalf2 Result :");
    printTensorValues(res_half);

    std::string suffix =
        std::string(" pad_h = ") + std::to_string(pad_h) +
        std::string(" pad_w = ") + std::to_string(pad_w) +
        std::string(" stride_h = ") + std::to_string(stride_h) +
        std::string(" stride_w = ") + std::to_string(stride_w) +
        std::string(" row = ") + std::to_string(row) + std::string(" col = ") +
        std::to_string(col) + std::string(" offset = ") +
        std::to_string(offset);

    std::string test_name = std::string("PERF_FP32 ") + suffix;

    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05,
                                   test_name);

    std::string fp16_test_name = std::string("PERF_FP16 ") + suffix;
    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1,
                                    fp16_test_name);

    std::string cpu_test_name = std::string("PERF_CPU ") + suffix;
    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
                                   cpu_test_name);
  }

  printf("\n\n\n--- End of Test \n\n\n");
}

/**
 * Tests perforation for a set of different padding / stride / row / col
 * configurations.
 *
 * Uses a 1 x 3 x 4 x 4 input and a 1 x 3 x 3 x 3 filter, both filled with
 * the value 1, and forwards each configuration to testPerforationCalls().
 *
 * @param unitTestResults  collector that receives the comparison outcomes
 */
void testPerforation(UnitTestResults &unitTestResults) {

  // Banner names the perforation suite (it previously repeated the sampling
  // banner, which mislabelled this section of the log).
  printf("***** Tests Perforation for a set of different inputs ***** \n\n");

  Tensor *input = static_cast<Tensor *>(
      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4));
  fillTensorWithVal(input, 1);

  Tensor *filter = static_cast<Tensor *>(
      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3));
  fillTensorWithVal(filter, 1);

  // One entry per testPerforationCalls() invocation; pad and stride are used
  // for both the height and the width argument.
  struct PerforationConfig {
    int pad;
    int stride;
    int row;
    int col;
  };

  static const PerforationConfig configs[] = {
      {0, 1, 1, 2}, {0, 1, 2, 1},
      {1, 1, 1, 3}, {1, 1, 3, 1},
      {1, 2, 1, 4}, {1, 2, 4, 1},
  };

  for (const PerforationConfig &cfg : configs) {
    testPerforationCalls(input, filter, cfg.pad, cfg.pad, cfg.stride,
                         cfg.stride, cfg.row, cfg.col, unitTestResults);
  }
}

void testSampling() {

  printf("***** Testing Sampling ***** \n\n");
  Tensor *input =
      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
  fillTensorWithVal(input, 1);

  Tensor *filter =
      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
  fillTensorWithVal(filter, 1);

  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
  host_ptr[0] = 2;
  host_ptr[2] = 2;
  host_ptr[4] = 2;
  host_ptr[6] = 2;
  host_ptr[8] = 2;
  host_ptr[10] = 2;
  host_ptr[12] = 2;
  host_ptr[14] = 2;
  host_ptr[16] = 2;
  host_ptr[18] = 2;
  host_ptr[20] = 2;
  host_ptr[22] = 2;
  host_ptr[24] = 2;
  host_ptr[26] = 2;
  // printTensorValues(input);

  void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);

  printTensorValues(res);

  void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1);

  printTensorValues(res2);

  void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0);

  printTensorValues(res2_sim);

  void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0);

  printTensorValues(res3);

  void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);

  printTensorValues(res4);

  void *res4_half =
      tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);

  convertToFP32((struct Tensor *)res4_half);

  printTensorValues(res4_half);
}

void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
                       int stride_h, int stride_w, int skip_every,
                       std::string filter_string,
                       UnitTestResults &unitTestResults) {

  float interpolation_rate = 1.0;
  for (int offset = 0; offset < 2; offset++) {

    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
           "skip_every = %d offset= %d interpolation_rate = %f \n\n",
           pad_h, pad_w, stride_h, stride_w, skip_every, offset,
           interpolation_rate);

    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
                                        stride_w, 1, 1);

    printf("tensorConvolution Result :");
    printTensorValues(res_exact);

    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
                                        stride_w, 1, 1, 1, 1, 1, 1);

    printf("\nBaseline Result :");
    printTensorValues(res_exact2);

    void *res_exact3 = tensorConvApproxHalf2(
        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
    convertToFP32((struct Tensor *)res_exact3);

    printf("\nFP16_Baseline Result :");
    printTensorValues(res_exact3);

    void *res_sim =
        tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
                           1, skip_every, offset, interpolation_rate);

    printf("\nConvSampSim Result :");
    printTensorValues(res_sim);

    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
                                 stride_w, 1, 1, 1, 1, skip_every, offset);

    printf("\nConvApprox Result :");
    printTensorValues(res);

    hpvm_request_tensor(input, HOST);
    hpvm_request_tensor(filter, HOST);

    void *res_cpu =
        tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
                            1, 1, 1, skip_every, offset);

    printf("\nConvApproxCPU Result :");
    printTensorValues(res_cpu);

    void *res_half =
        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
                              1, 1, 1, 1, skip_every, offset);

    convertToFP32((struct Tensor *)res_half);

    printf("\nConvApproxHalf2 Result :");
    printTensorValues(res_half);

    std::string suffix =
        "filter = " + std::string(filter_string) + std::string(" pad_h = ") +
        std::to_string(pad_h) + std::string(" pad_w = ") +
        std::to_string(pad_w) + std::string(" stride_h = ") +
        std::to_string(stride_h) + std::string(" stride_w = ") +
        std::to_string(stride_w) + std::string(" skip_every = ") +
        std::to_string(skip_every) + std::string(" offset = ") +
        std::to_string(offset);

    std::string test_name = std::string("SAMP_FP32 ") + suffix;

    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05,
                                   test_name);

    std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix;
    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1,
                                   fp16_test_name);

    std::string cpu_test_name = std::string("SAMP_CPU ") + suffix;
    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
                                   cpu_test_name);
  }

  printf("\n\n\n --- End of Test \n\n\n");
}

/**
 * Sampling tests for a 3 x 3 filter.
 *
 * Builds a 1 x 3 x 4 x 4 input filled with 1 and a 1 x 3 x 3 x 3 filter
 * whose even flat indices are overwritten (10 at indices 0 and 26, 2 at the
 * even indices in between), then runs testSamplingCalls() for skip_every
 * 2, 3 and 4 under three padding/stride settings.
 *
 * @param unitTestResults  collector that receives the comparison outcomes
 */
void testSampling_3_3(UnitTestResults &unitTestResults) {

  printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");

  Tensor *input = static_cast<Tensor *>(
      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4));
  fillTensorWithVal(input, 1);

  Tensor *filter = static_cast<Tensor *>(
      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3));
  fillTensorWithVal(filter, 1);

  // Overwrite every even flat index; the first and last get 10, the rest 2.
  float *filter_vals = static_cast<float *>(filter->host_data);
  const int last_idx = 26;
  for (int idx = 0; idx <= last_idx; idx += 2) {
    const bool is_end = (idx == 0) || (idx == last_idx);
    filter_vals[idx] = is_end ? 10.0f : 2.0f;
  }

  // {padding, stride} pairs, each applied to both height and width:
  // padding = 0 stride = 1, padding = 1 stride = 1, padding = 1 stride = 2.
  const int pad_stride[3][2] = {{0, 1}, {1, 1}, {1, 2}};

  for (int cfg = 0; cfg < 3; cfg++) {
    const int pad = pad_stride[cfg][0];
    const int stride = pad_stride[cfg][1];

    for (int skip_every = 2; skip_every <= 4; skip_every++) {
      testSamplingCalls(input, filter, pad, pad, stride, stride, skip_every,
                        "3_3", unitTestResults);
    }
  }
}

/**
 * Sampling tests for a 1 x 1 filter.
 *
 * Builds a 1 x 9 x 2 x 2 input and a 4 x 9 x 1 x 1 filter, both filled with
 * the value 2, then runs testSamplingCalls() with stride 1 for padding 0 and
 * padding 1, each with skip_every 2, 3 and 4.
 *
 * @param unitTestResults  collector that receives the comparison outcomes
 */
void testSampling_1_1(UnitTestResults &unitTestResults) {

  Tensor *input = static_cast<Tensor *>(
      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2));
  fillTensorWithVal(input, 2);

  Tensor *filter = static_cast<Tensor *>(
      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1));
  fillTensorWithVal(filter, 2);

  // Outer loop: padding = 0 then padding = 1 (stride fixed at 1).
  for (int pad = 0; pad <= 1; pad++) {
    for (int skip_every = 2; skip_every <= 4; skip_every++) {
      testSamplingCalls(input, filter, pad, pad, 1, 1, skip_every, "1_1",
                        unitTestResults);
    }
  }
}



/**
 * Runs the full sampling suite: the 3 x 3 filter tests first, then the
 * 1 x 1 filter tests, all reporting into the same collector.
 *
 * @param unitTestResults  collector that receives the comparison outcomes
 */
void testSampling(UnitTestResults &unitTestResults) {
  testSampling_3_3(unitTestResults);  // 3 x 3 filter configurations
  testSampling_1_1(unitTestResults);  // 1 x 1 filter configurations
}