diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
index cda77b905298bd1e518423bc6a667053c5843777..e7b6c1e9214236c5735df61d30933dec863efd02 100644
--- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -36,6 +36,9 @@ link_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64)
 cuda_add_library(tensor_runtime tensor_runtime/src/tensor_runtime.cu)
 cuda_add_cublas_to_target(tensor_runtime)
 
+# Adding new rule for building a cuDNN runtime library
+cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc)
+
 
 if(USE_GFLAGS)
   target_link_libraries(tensor_runtime gflags cudnn -lcurand)
@@ -43,12 +46,17 @@ else()
   target_link_libraries(tensor_runtime cudnn -lcurand)
 endif()
 
-
+target_link_libraries(tensor_cpu_runtime)
 
 # Adding rule for the debugging source
 add_executable(test_ops  dnn_sources/src/test_ops.cc)
 target_link_libraries(test_ops  tensor_runtime)
 
+#**** CPU sources
+add_executable(fc2_cpu  dnn_sources/src/fc2_cpu.cc)
+target_link_libraries(fc2_cpu  tensor_cpu_runtime)
+
+
 
 # Full-Precision versions
 add_executable(fc2_clipped  dnn_sources/src/fc2_clipped.cc)
@@ -70,6 +78,9 @@ target_link_libraries(lenet_keras  tensor_runtime)
 add_executable(alexnet_cifar10  dnn_sources/src/alexnet_cifar10_front.cc)
 target_link_libraries(alexnet_cifar10  tensor_runtime)
 
+add_executable(alexnet_cifar10_test  dnn_sources/src/alexnet_cifar10_test.cc)
+target_link_libraries(alexnet_cifar10_test  tensor_runtime)
+
 add_executable(alexnet_cifar10_tuner  dnn_sources/src/alexnet_cifar10_tuner.cc)
 target_link_libraries(alexnet_cifar10_tuner  tensor_runtime)
 
@@ -100,11 +111,28 @@ target_link_libraries(vgg16_cifar100_tuner  tensor_runtime)
 add_executable(vgg16_cifar100_top5  dnn_sources/src/vgg16_cifar100_5.cc)
 target_link_libraries(vgg16_cifar100_top5  tensor_runtime)
 
-#add_executable(cifar_keras  dnn_sources/src/cifar_keras.cc)
-#target_link_libraries(cifar_keras  tensor_runtime)
+#### Image Pipeline Tuning sources
+
+add_executable(pipeline_GEMO  dnn_sources/src/pipeline_GEMO.cc)
+target_link_libraries(pipeline_GEMO  tensor_runtime)
+
+add_executable(pipeline_GEO  dnn_sources/src/pipeline_GEO.cc)
+target_link_libraries(pipeline_GEO  tensor_runtime)
+
+add_executable(pipeline_GEOM  dnn_sources/src/pipeline_GEOM.cc)
+target_link_libraries(pipeline_GEOM  tensor_runtime)
 
+add_executable(pipeline_GSM  dnn_sources/src/pipeline_GSM.cc)
+target_link_libraries(pipeline_GSM  tensor_runtime)
+
+add_executable(pipeline_GSME  dnn_sources/src/pipeline_GSME.cc)
+target_link_libraries(pipeline_GSME  tensor_runtime)
+
+
+
+
+#*** Half precision networks
 
-# Half precision networks
 add_executable(fc2_half  dnn_sources/src/half/fc2_half.cc)
 target_link_libraries(fc2_half  tensor_runtime)
 
@@ -125,6 +153,9 @@ target_link_libraries(lenet_keras_half  tensor_runtime)
 add_executable(lenet_keras_promise  dnn_sources/src/promise/lenet_keras_promise.cc)
 target_link_libraries(lenet_keras_promise  tensor_runtime)
 
+add_executable(lenet_promise_relu  dnn_sources/src/promise/lenet_promise_relu.cc)
+target_link_libraries(lenet_promise_relu  tensor_runtime)
+
 add_executable(fc4_clipped_promise  dnn_sources/src/promise/fc4_clipped_promise.cc)
 target_link_libraries(fc4_clipped_promise  tensor_runtime)
 
@@ -132,31 +163,16 @@ add_executable(alexnet_promise  dnn_sources/src/promise/alexnet_promise.cc)
 target_link_libraries(alexnet_promise  tensor_runtime)
 
 
-#add_executable(resnet18_promise_relu  dnn_sources/src/promise/resnet18_promise_relu.cc)
-#target_link_libraries(resnet18_promise_relu  tensor_runtime)
-
-#add_executable(resnet18_promise_quant  dnn_sources/src/promise/resnet18_promise_quant.cc)
-#target_link_libraries(resnet18_promise_quant  tensor_runtime)
-
-#add_executable(resnet18_promise_quant2  dnn_sources/src/promise/resnet18_promise_quant2.cc)
-#target_link_libraries(resnet18_promise_quant2  tensor_runtime)
-
 # Quantized PROMISE sources
 add_executable(alexnet_promise_quant  dnn_sources/src/promise/alexnet_promise_quant.cc)
 target_link_libraries(alexnet_promise_quant  tensor_runtime)
 
-#add_executable(alexnet2_promise  dnn_sources/src/promise/alexnet2_promise.cc)
-#target_link_libraries(alexnet2_promise  tensor_runtime)
-
 add_executable(alexnet2_promise_quant  dnn_sources/src/promise/alexnet2_promise_quant.cc)
 target_link_libraries(alexnet2_promise_quant  tensor_runtime)
 
 add_executable(resnet18_promise_quant  dnn_sources/src/promise/resnet18_promise_quant2.cc)
 target_link_libraries(resnet18_promise_quant  tensor_runtime)
 
-#add_executable(vgg16_cifar100_promise  dnn_sources/src/promise/vgg16_cifar100_promise.cc)
-#target_link_libraries(vgg16_cifar100_promise  tensor_runtime)
-
 add_executable(vgg16_cifar100_promise_quant  dnn_sources/src/promise/vgg16_cifar100_promise_quant.cc)
 target_link_libraries(vgg16_cifar100_promise_quant  tensor_runtime)
 
@@ -165,6 +181,24 @@ target_link_libraries(vgg16_cifar10_promise_quant  tensor_runtime)
 
 
 
+#### Image Pipeline PROMISE sources
+add_executable(pipeline_GEMO_promise  dnn_sources/src/promise/pipeline_GEMO_promise.cc)
+target_link_libraries(pipeline_GEMO_promise  tensor_runtime)
+
+add_executable(pipeline_GEO_promise  dnn_sources/src/promise/pipeline_GEO_promise.cc)
+target_link_libraries(pipeline_GEO_promise  tensor_runtime)
+
+add_executable(pipeline_GEOM_promise  dnn_sources/src/promise/pipeline_GEOM_promise.cc)
+target_link_libraries(pipeline_GEOM_promise  tensor_runtime)
+
+add_executable(pipeline_GSM_promise  dnn_sources/src/promise/pipeline_GSM_promise.cc)
+target_link_libraries(pipeline_GSM_promise  tensor_runtime)
+
+add_executable(pipeline_GSME_promise  dnn_sources/src/promise/pipeline_GSME_promise.cc)
+target_link_libraries(pipeline_GSME_promise  tensor_runtime)
+
+
+
 #############  Promise Validation Sources #############
 
 add_executable(alexnet_valid  dnn_sources/src/promise/alexnet_valid.cc)
@@ -181,3 +215,20 @@ target_link_libraries(vgg16_cifar100_valid  tensor_runtime)
 
 add_executable(vgg16_cifar10_valid  dnn_sources/src/promise/vgg16_cifar10_valid.cc)
 target_link_libraries(vgg16_cifar10_valid  tensor_runtime)
+
+##### Image pipeline validation sources
+add_executable(pipeline_GEMO_valid  dnn_sources/src/promise/pipeline_GEMO_valid.cc)
+target_link_libraries(pipeline_GEMO_valid  tensor_runtime)
+
+add_executable(pipeline_GEO_valid  dnn_sources/src/promise/pipeline_GEO_valid.cc)
+target_link_libraries(pipeline_GEO_valid  tensor_runtime)
+
+add_executable(pipeline_GEOM_valid  dnn_sources/src/promise/pipeline_GEOM_valid.cc)
+target_link_libraries(pipeline_GEOM_valid  tensor_runtime)
+
+add_executable(pipeline_GSM_valid  dnn_sources/src/promise/pipeline_GSM_valid.cc)
+target_link_libraries(pipeline_GSM_valid  tensor_runtime)
+
+add_executable(pipeline_GSME_valid  dnn_sources/src/promise/pipeline_GSME_valid.cc)
+target_link_libraries(pipeline_GSME_valid  tensor_runtime)
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0358556022d81e914af79f4080f6594aa61c924
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h
@@ -0,0 +1,482 @@
+
+// Header guards
+#ifndef UTILS_HEADER
+#define UTILS_HEADER
+
+
+#include <sstream>
+#include <vector>
+#include <bits/stdc++.h>
+#include "../../tensor_runtime/include/tensor.h"
+#include "types.h"
+#include <cmath>
+
+
+std::vector<float> run_accuracies;
+
+
+void printTensorInfo(void* tensor_ptr){
+
+  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+
+  if(tensor->gpu_data != NULL){
+    printf("Successful cudaMalloc \n");
+  }
+
+  printf("tensor dims = %d \n", tensor->dims.num_dims);
+  printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]);
+  printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]);
+  printf("num_elems = %lu \n", tensor->num_elems);
+}
+
+
+
+void printTensorDims(void* tensor_ptr){
+
+  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+
+  printf("Num_elems = %lu \n", tensor->num_elems);
+  for (int i = 0; i < tensor->dims.num_dims; i++){
+    printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
+  }
+}
+
+
+
+void compareTensors(void* tensor1_ptr, void* tensor2_ptr){
+
+  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
+  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;
+
+  hpvm_request_tensor(tensor1, 0);
+  hpvm_request_tensor(tensor2, 0);
+
+  float* tensor_data1 = (float*) tensor1->host_data;
+  float* tensor_data2 = (float*) tensor2->host_data;
+  
+  for(unsigned int i = 0; i < tensor1->num_elems; i++){
+    if(tensor_data1[i] != tensor_data2[i]){
+      printf("Tensor data mismatch at index %d \n", i);
+      abort();
+    }
+  }
+}
+
+
+
+//*** FIXIT: Move this to CPU-only
+struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
+				     int dim1_size, int dim2_size,
+				     int dim3_size, int dim4_size){
+
+  // FIXIT: Don't assume floating point types
+  int type_size = 4; // NOTE: Assuming floating point tensors
+  long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  int file_header_size = 0;
+  
+  FILE* file = fopen(file_name, "rb");
+  if(file == NULL){
+    printf("Data file %s is not found. Aborting... \n", file_name);
+    abort();
+  }
+    
+  fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
+  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+
+  printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
+
+  fclose(file);
+  
+  
+  struct Tensor* weights = (struct Tensor*) create4DTensorCPU(data_type, nchw, dim1_size, dim2_size,
+					                   dim3_size, dim4_size);
+  
+  initTensorData(weights, tensor_data, size_in_bytes);
+  //compareValues(weights, tensor_data, num_elems);
+  free(tensor_data);
+
+  return weights;
+}
+
+
+
+uint8_t* readLabels(const char* labels_file, int num_labels){
+
+  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
+  FILE* file = fopen(labels_file, "rb");
+  if(file == NULL){
+    printf("Data file %s is not found. Aborting...\n", labels_file);
+    abort();
+  }
+
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
+
+  fclose(file);
+  
+  return labels;
+}
+
+
+uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+
+  int num_labels = end - start;
+  int file_header_size = sizeof(uint8_t) * start;
+  
+  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
+  FILE* file = fopen(labels_file, "rb");
+  if(file == NULL){
+    printf("Data file %s is not found. Aborting...\n", labels_file);
+    abort();
+  }
+  
+  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
+    
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
+
+
+  fclose(file);
+  
+  // printf("--labels bytes_read = %lu \n", bytes_read);
+  return labels;
+}
+
+
+
+void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){
+
+  struct Tensor* result = (struct Tensor*) result_ptr;
+  
+  uint8_t* labels = readLabels(labels_file, num_labels);
+  size_t batch_dim = result->dims.dim_sizes[0];
+  size_t channels = result->dims.dim_sizes[1];
+  float* data = (float*) result->host_data;
+  int num_errors = 0;
+  
+  for(int i = 0; i < batch_dim; i++){
+    int chosen = 0;
+    for (int id = 1; id < 10; ++id){
+      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
+    }
+    
+    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
+    if(chosen != labels[i])
+      num_errors++;
+  }
+
+  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
+  printf("****** Accuracy = %f \n\n", accuracy);
+
+
+  FILE* fp = fopen("final_accuracy", "w+");
+  if(fp != NULL){
+
+    std::ostringstream ss;
+    ss << std::fixed << accuracy;
+    std::string print_str = ss.str();
+  
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+    fclose(fp);
+  }
+  
+}
+
+
+
+float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){
+
+  unsigned num_zeros = 0;
+  
+  struct Tensor* result = (struct Tensor*) result_ptr;
+  
+  size_t batch_dim = result->dims.dim_sizes[0];
+  size_t channels = result->dims.dim_sizes[1];
+  float* data = (float*) result->host_data;
+  int num_errors = 0;
+
+  printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
+  
+  for(int i = 0; i < num_labels; i++){
+  
+    int chosen = 0;
+    for (int id = 1; id < num_classes; ++id){
+      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
+    }
+    
+    if(labels[i] == 0)
+      num_zeros++;
+      
+    if(chosen != labels[i])
+      num_errors++;
+
+    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
+  }
+
+  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
+  printf("****** Accuracy = %f \n\n", accuracy);
+  //printf("****** Zero class labels %d \n", num_zeros);
+
+  FILE* fp = fopen("final_accuracy", "w+");
+  if(fp != NULL){
+
+    std::ostringstream ss;
+    ss << std::fixed << accuracy;
+    std::string print_str = ss.str();
+  
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+  }
+
+  fclose(fp);
+
+  return accuracy;    
+}
+
+
+struct ClassProb{
+  float prob;
+  int index;
+};
+
+
+bool descendFloatComp(ClassProb obj1, ClassProb obj2){
+  return obj1.prob > obj2.prob;
+}
+
+
+float computeTop5Accuracy(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){
+  
+  struct Tensor* result = (struct Tensor*) result_ptr;
+  
+  size_t batch_dim = result->dims.dim_sizes[0];
+  size_t channels = result->dims.dim_sizes[1];
+  float* data = (float*) result->host_data;
+  int num_errors = 0;
+
+  printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
+  
+  for(int i = 0; i < num_labels; i++){
+
+    std::vector<ClassProb> elem_probs;
+    for (int id = 0; id < num_classes; ++id){
+      ClassProb cProb;
+      cProb.prob = data[i * channels + id];
+      cProb.index = id;
+      elem_probs.push_back(cProb);   
+    }
+
+    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
+    // Check if any of top-5 predictions matches
+    bool matched = false;
+    for(int j = 0; j < 5; j++){
+      ClassProb cProb = elem_probs[j];
+      if(cProb.index == labels[i])
+        matched = true;
+    }
+
+    if(!matched)
+      num_errors +=1; 
+  }
+
+  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
+  printf("****** Accuracy = %f \n\n", accuracy);
+
+  FILE* fp = fopen("final_accuracy", "w+");
+  if(fp != NULL){
+
+    std::ostringstream ss;
+    ss << std::fixed << accuracy;
+    std::string print_str = ss.str();
+  
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+  }
+
+  fclose(fp);
+
+  return accuracy;    
+}
+
+
+
+
+void dumpFinalAccuracy(float accuracy){
+
+  printf("\n\n **** Final Accuracy = %f \n", accuracy);
+  
+  FILE* fp = fopen("final_accuracy", "w+");
+  if(fp != NULL){
+    std::ostringstream ss;
+    ss << std::fixed << accuracy;
+    std::string print_str = ss.str();
+  
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+  }
+
+  fclose(fp);
+
+  run_accuracies.push_back(accuracy);
+}
+
+
+
+void dumpAvgPSNR(float avg_psnr){
+
+  FILE* fp = fopen("avg_psnr", "w+");
+  if(fp != NULL){
+    std::ostringstream ss;
+    ss << std::fixed << avg_psnr;
+    std::string print_str = ss.str(); 
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+  }
+
+  fclose(fp);
+}
+
+
+void dumpPSNRStd(float psnr_std){
+
+  FILE* fp = fopen("psnr_std.txt", "w+");
+  if(fp != NULL){
+    std::ostringstream ss;
+    ss << std::fixed << psnr_std;
+    std::string print_str = ss.str(); 
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+  }
+
+  fclose(fp);
+}
+
+
+
+
+
+void dumpExecutionAccuracies(){
+
+  FILE* fp = fopen("run_accuracies.txt", "w+");
+  if(fp != NULL){  
+    for (int i = 0; i < run_accuracies.size(); i++){
+      float accuracy = run_accuracies[i];
+      std::ostringstream ss;
+      ss << std::fixed << accuracy;
+      std::string print_str = ss.str();
+      fwrite(print_str.c_str(), 1, print_str.length(), fp);
+      fwrite("\n", 1, 1, fp);
+    }
+
+  }
+
+  fclose(fp);
+}
+
+
+float readPSNRFromFile(const char* file_name){
+
+  float psnr;
+  FILE* pFile = fopen(file_name, "r");
+  if(pFile == NULL){
+    printf("ERROR: psnr.txt not found! \n");
+    abort();
+  }
+  
+  fscanf(pFile, "%f", &psnr);
+  printf("**** PSNR read = %f \n\n", psnr);
+  return psnr; 
+}
+
+
+float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){
+
+  
+  PSNR_threshold = readPSNRFromFile("psnr.txt");
+  std::vector<float> psnr_list;
+  
+  struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
+  struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;
+
+  size_t* dim_sizes = gold_tensor->dims.dim_sizes;
+  size_t batch_dim = dim_sizes[0];
+  size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
+  
+  printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
+	 
+  float* gold_data = (float*) gold_tensor->host_data;
+  float* approx_data = (float*) approx_tensor->host_data;
+
+  FILE* fp = fopen("img_psnr.txt", "w+");
+
+  float sum_psnr = 0.0;
+  int num_errors = 0;  
+  for(size_t i = 0; i < batch_dim; i++){
+    float mse_sum = 0.0;
+    float max_val = -999999;     
+    size_t offset = i * image_size;
+    
+    for(size_t j = 0; j < image_size; j++){
+      float diff = gold_data[offset + j] - approx_data[offset + j];
+      float diff_square = diff * diff;
+      mse_sum += diff_square;
+
+      if(max_val < gold_data[offset + j]){
+	max_val = gold_data[offset + j];
+      }   
+    }
+
+    mse_sum = mse_sum / image_size;
+    float psnr = 20 * log10(255 / sqrt(mse_sum));
+
+    sum_psnr += psnr;
+    if (psnr < PSNR_threshold)
+      num_errors += 1;    
+
+    printf("PSNR value = %f \n", psnr);
+    psnr_list.push_back(psnr);
+
+    std::ostringstream ss;
+    ss << std::fixed << psnr;
+    std::string print_str = ss.str();
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+    fwrite("\n", 1, 1, fp);
+  }
+
+  float violation_rate = (num_errors * 1.0) / batch_dim * 100.0;
+  printf("*** violation_rate= %f \n\n", violation_rate);
+
+  float avg_psnr = sum_psnr / batch_dim;
+  printf("*** avg_psnr =  %f \n\n", avg_psnr);
+  dumpAvgPSNR(avg_psnr);
+ 
+  float success_rate = 100.0 - violation_rate;
+  dumpFinalAccuracy(success_rate);
+
+  fclose(fp);
+
+
+  float var = 0.0;
+  for(size_t i = 0; i < batch_dim; i++){
+    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); 
+  }
+
+  var /= batch_dim;
+  float std = sqrt(var);
+
+  dumpPSNRStd(std);
+  
+  return violation_rate;  
+}
+
+
+void dumpOutput(void* output_ptr, const char* file_name){
+
+  struct Tensor* out_tensor = (struct Tensor*) output_ptr;  
+  size_t size_in_bytes = out_tensor->size_in_bytes;
+  printf ("** Output size = %lu \n", size_in_bytes);
+  
+  float* host_data = (float*) out_tensor->host_data; 
+  FILE* fd = fopen(file_name, "w+");
+  fwrite(host_data, 1, size_in_bytes, fd);
+  fclose(fd);
+}
+
+
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/fc2_cpu.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/fc2_cpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..476160ac90420a6957b0e8918a3bdde48f42db01
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/fc2_cpu.cc
@@ -0,0 +1,66 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+#include "../../tensor_runtime/include/tensor_cpu_runtime.h"
+#include "../include/utils_cpu.h"
+#include "../include/types.h"
+
+
+void FC2(){
+
+  printf("********* 2-Layer FC with clipped activations and weights ********* \n");
+
+  int test_batch_size = 100;
+
+  uint8_t* labels = readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size);
+
+  void* input = readTrainedWeightsCPU("../model_params/FC_network2/mnist_float_input.bin",
+				      float_type, test_batch_size, 1, 28, 28);
+
+  void* fc1_weights = readTrainedWeightsCPU("../model_params/fc2_clipped/fc1.bin",
+					 float_type, 1, 1, 784, 128);  
+  void* fc1_bias = readTrainedWeightsCPU("../model_params/fc2_clipped/fc1_bias.bin",
+				      float_type, 1, 128, 1, 1);  
+  void* fc2_weights = readTrainedWeightsCPU("../model_params/fc2_clipped/fc2.bin",
+					 float_type, 1, 1, 128, 10);  
+  void* fc2_bias = readTrainedWeightsCPU("../model_params/fc2_clipped/fc2_bias.bin",
+				      float_type, 1, 10, 1, 1);
+
+
+  // Layer-1
+  void* fc1out = tensorGemmCPU(input, fc1_weights);  
+  
+  void* fc1_bias_out = tensorAddCPU(fc1out, fc1_bias);
+  
+  //-- void* fc1_relu = tensorRelu2(fc1_bias_out, 0, 2);
+  
+  // Layer-2
+  //-- void* fc2out = tensorGemmCPU(fc1_relu, fc2_weights);  
+  
+  //-- void* fc2_bias_out = tensorAddCPU(fc2out, fc2_bias);
+  
+  //--- void* fc2_relu = tensorRelu2(fc2_bias_out, 0, 2);
+  
+  //--- void* result = tensorSoftmax(fc2_relu);
+
+  //-- computeAccuracy2(labels, test_batch_size, result);
+    
+}
+
+
+// If an argument is passed - the run goes into OpenTuner mode - waiting on a pipe
+int main(int argc, char* argv[]){
+
+  llvm_hpvm_initTensorRt(0);
+
+  FC2();
+  
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfa9511055f04d703696d65f5fd24094bc202f3e
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
@@ -0,0 +1,94 @@
+
+#include <stdio.h>
+#include <cstdlib>
+#include <cmath>
+#include <memory>
+#include <string>
+//#include "runtime_types.h"
+
+
+#ifndef CUDNN_HEADER
+#define CUDNN_HEADER
+
+
+extern "C"{
+  /****  Initialization Routine - Must be inserted at program start (in the backend)  ****/
+  void llvm_hpvm_initTensorRt(int gpuid = 0);
+  void llvm_hpvm_cleanupTensorRt();
+
+  // Routine to moving tensor data (from and to GPU,CPU)
+  void hpvm_request_tensor(void* tensor, int destination);
+
+
+  // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
+  // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width)
+  void* create4DTensorCPU(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
+			  size_t dim3_size, size_t dim4_size);
+  void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);
+
+  /********** Tensor Operation API ******/
+
+  // NOTE: For conv_mode, only value '1' is supported
+  void* tensorConvolutionCPU(void* input, void* filter,
+			     int vertical_pad, int horizontal_pad,
+			     int vertical_stride, int horizontal_stride,
+			     int conv_mode, int compute_precision);
+
+  void* tensorPoolingCPU(void* input,
+			 int poolFunction,
+			 int window_height, int window_width,
+			 int vertical_pad, int horizontal_pad,
+			 int vertical_stride, int horizontal_stride);
+
+  void* tensorGemmCPU(void* lhs, void* rhs);
+  // NOTE: In place operation
+  void* tensorAddCPU(void* x, void* bias);
+  // NOTE: In-place operation
+  void* tensorReluCPU(void* input);
+  
+  void* tensorTanhCPU(void* input);
+  
+  void* tensorSoftmaxCPU(void* input);
+    
+}
+
+
+/*
+void dummyFunction(){
+
+  void* initRT = (void*) &llvm_hpvm_initTensorRt;
+  void* cleanRT = (void*) &llvm_hpvm_cleanupTensorRt;
+  void* request_tensorPtr = (void*) &hpvm_request_tensor;
+  void* startProf = (void*) &startProfiling;
+  void* stopProf = (void*) &stopProfiling;
+  void* create2Dptr = (void*) &create2DTensor;
+  void* create3Dptr = (void*) &create3DTensor;
+  void* create4Dptr = (void*) &create4DTensor;
+  void* initTensorPtr = (void*) &initTensorData;
+  void* tensorSplitPtr = (void*) &tensorSplit;
+  void* tensorConcatPtr = (void*) &tensorConcat;
+  void* tensorConvPtr = (void*) &tensorConvolution;
+  void* tensorHConvPtr = (void*) &tensorHalfConvolution;
+  void* tensorPoolPtr = (void*) &tensorPooling;
+  void* tensorHalfPoolPtr = (void*) &tensorHalfPooling;
+  void* tensorLRNPtr = (void*) &tensorLRN;
+  void* tensorGemmPr = (void*) &tensorGemm;
+  void* tensorGemmCPUPtr = (void*) &tensorGemmCPU;
+  void* tensorGemmGPUPtr = (void*) &tensorGemmGPU;
+  void* tensorHgemmPtr = (void*) &tensorHalfGemm;
+  void* tensorGemmBiasPtr = (void*) &tensorGemmBias;
+  void* tensorAddPtr = (void*) &tensorAdd;
+  void* tensorHalfAddPtr = (void*) &tensorHalfAdd;
+  void* tensorReluPtr = (void*) &tensorRelu;
+  //FIXME: --void* tensorHalfReluPtr = (void*) &tensorHalfRelu;
+  void* tensorRelu2Ptr = (void*) &tensorRelu2;
+  void* tensorHalfRelu2Ptr = (void*) &tensorHalfRelu2;
+  void* tensorTanhPtr = (void*) &tensorTanh;
+  void* tensorHalfTanhPtr = (void*) &tensorHalfTanh;
+  void* tensorSoftmaxPtr = (void*) &tensorSoftmax;
+  void* tensorAddErrorPtr = (void*) &tensorAddError;    
+}
+*/
+
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
new file mode 100644
index 0000000000000000000000000000000000000000..336c98b767d416e98de6e848d395e80f148d118a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -0,0 +1,158 @@
+/* This file includes the API implementation of the HPVM tensor runtime built on cublas, cudnn
+**
+**  Author: Hashim Sharif
+**  Email: hsharif3@illinois.edu
+*/
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <ctime>
+#include <cfloat>
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <stdlib.h>
+#include <cstring>
+
+// Tensor runtime header files
+#include "../include/tensor_cpu_runtime.h"
+#include "../include/tensor.h"
+
+
+void llvm_hpvm_initTensorRt(int gpuid){
+  // NOTE: Do Nothing
+}
+
+void llvm_hpvm_cleanupTensorRt(){
+  // NOTE: Do Nothing
+}
+
+void hpvm_request_tensor(void* tensor, int destination){
+  // NOTE: Do Nothing
+}
+
+
+// Returns the size of the target cudnn datatype
+int getTypeSize(int data_type){
+  // Float/Int data type - Full Precision
+  if(data_type == 0)
+    return 4;
+  // Half data type
+  if(data_type == 1)
+    return 2;
+
+  return 1;
+}
+
+
+void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){
+  int type_size = getTypeSize(data_type);
+  size_t size_in_bytes = type_size * num_elems;
+  tensor->size_in_bytes = size_in_bytes;
+}
+
+
+void allocateMemCPU(struct Tensor* tensor, int data_type, size_t num_elems){
+  setSizeInBytes(tensor, data_type, num_elems);
+  tensor->data_type = data_type;
+  tensor->num_elems = num_elems;
+  tensor->host_data = (void*) malloc(tensor->size_in_bytes); // Allocate memory on the host
+}
+
+
+void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){
+
+  Tensor* tensor = (Tensor*) tensor_ptr;  
+  if(tensor->size_in_bytes != size_in_bytes){
+    printf("The destination and source sizes don't match");
+  }  
+  memcpy(tensor->host_data, data_ptr, size_in_bytes);
+}
+
+
+
+void* create4DTensorCPU(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
+		        size_t dim3_size, size_t dim4_size){
+  
+  struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+
+  allocateMemCPU(tensor, data_type, num_elems);  
+  // Setting the tensor dimensions  
+  size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 4);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  dim_sizes[3] = dim4_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 4;
+
+  return tensor;
+}
+
+
+
+void* tensorAddCPU(void* x_ptr, void* bias_ptr){
+
+  Tensor* x = (Tensor*) x_ptr;
+  Tensor* bias = (Tensor*) bias_ptr;
+  
+  float* x_data = (float*) x->host_data;
+  float* bias_data = (float*) bias->host_data;
+
+  size_t num_elems = x->num_elems; 
+  for(size_t i = 0; i < num_elems; i++){
+    x_data[i] += bias_data[i];
+  }  
+  
+  return x;
+}
+
+
+void* tensorGemmCPU(void* lhs_ptr, void* rhs_ptr){
+
+  Tensor* lhs = (Tensor*) lhs_ptr;
+  Tensor* rhs = (Tensor*) rhs_ptr;
+  
+  // 'm' holds the batch dimension - assuming NCHW format Tensors
+  int m = lhs->dims.dim_sizes[0];
+  // The rhs must be a 2D tensor
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
+  int k = 1;
+  // Flattening the dimensions after the batch dimension
+  // NOTE: Allowing any number of dimensions > 2 for lhs
+  for (int j = 1 ; j < lhs->dims.num_dims; j++){
+    k = k * lhs->dims.dim_sizes[j]; // input neurons
+  }
+
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
+
+  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines
+  Tensor* output = (Tensor*) create4DTensorCPU(0, 0, m, n, 1, 1);
+ 
+  float* lhs_arr = (float*) lhs->host_data;
+  float* rhs_arr = (float*) rhs->host_data;
+  float* output_arr = (float*) output->host_data;
+  
+  for(int i = 0; i < m; i++){
+    for(int j = 0; j < n; j++){
+      float sum = 0.0;
+      for(int l = 0; l < k; l++){
+	float mul = lhs_arr[i*k+l] * rhs_arr[l*n+j];
+	sum = sum + mul;
+      }
+      output_arr[i*n+j] = sum;
+    }
+  }
+   
+  return output;
+}
+
+