From d353e1e771421f5718fff3fb136955a6b2b392c3 Mon Sep 17 00:00:00 2001 From: Akash Kothari <akashk4@miranda.cs.illinois.edu> Date: Sun, 23 May 2021 17:03:05 -0500 Subject: [PATCH] Add NVDLA backend pass --- hpvm/lib/Transforms/CMakeLists.txt | 1 + hpvm/lib/Transforms/HPVM2NVDLA/CMakeLists.txt | 34 + .../Transforms/HPVM2NVDLA/HPVM2NVDLA.exports | 0 .../Transforms/HPVM2NVDLA/HPVM2NVDLAPass.cpp | 1652 +++++++++++++++++ hpvm/lib/Transforms/HPVM2NVDLA/LLVMBuild.txt | 21 + 5 files changed, 1708 insertions(+) create mode 100644 hpvm/lib/Transforms/HPVM2NVDLA/CMakeLists.txt create mode 100644 hpvm/lib/Transforms/HPVM2NVDLA/HPVM2NVDLA.exports create mode 100644 hpvm/lib/Transforms/HPVM2NVDLA/HPVM2NVDLAPass.cpp create mode 100644 hpvm/lib/Transforms/HPVM2NVDLA/LLVMBuild.txt diff --git a/hpvm/lib/Transforms/CMakeLists.txt b/hpvm/lib/Transforms/CMakeLists.txt index b18cd4551b..296e3f87cd 100644 --- a/hpvm/lib/Transforms/CMakeLists.txt +++ b/hpvm/lib/Transforms/CMakeLists.txt @@ -6,5 +6,6 @@ add_subdirectory(GenHPVM) add_subdirectory(LocalMem) add_subdirectory(DFG2LLVM_WrapperAPI) add_subdirectory(DFG2LLVM_CUDNN) +add_subdirectory(HPVM2NVDLA) add_subdirectory(FuseHPVMTensorNodes) add_subdirectory(InPlaceDFG) diff --git a/hpvm/lib/Transforms/HPVM2NVDLA/CMakeLists.txt b/hpvm/lib/Transforms/HPVM2NVDLA/CMakeLists.txt new file mode 100644 index 0000000000..2e82ec555a --- /dev/null +++ b/hpvm/lib/Transforms/HPVM2NVDLA/CMakeLists.txt @@ -0,0 +1,34 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}") + +add_definitions(-DNVDLA_UTILS_ERROR_TAG="DLA") + +include_directories(../../../sw/umd/external/include) +include_directories(../../../sw/umd/core/include) +include_directories(../../../sw/umd/core/src/common/include) +include_directories(../../../sw/umd/core/src/compiler/include) + +add_llvm_library( LLVMHPVM2NVDLAPass + MODULE + HPVM2NVDLAPass.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) + +find_library(NVDLA_COMPILER + NAMES nvdla_compiler + HINTS ../../../sw/lib +) + +find_library(PROTOBUF + NAMES protobuf + HINTS ../../../sw/lib +) + +target_link_libraries(LLVMHPVM2NVDLAPass ${NVDLA_COMPILER} ${PROTOBUF}) diff --git a/hpvm/lib/Transforms/HPVM2NVDLA/HPVM2NVDLA.exports b/hpvm/lib/Transforms/HPVM2NVDLA/HPVM2NVDLA.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hpvm/lib/Transforms/HPVM2NVDLA/HPVM2NVDLAPass.cpp b/hpvm/lib/Transforms/HPVM2NVDLA/HPVM2NVDLAPass.cpp new file mode 100644 index 0000000000..e3b8f5dfc9 --- /dev/null +++ b/hpvm/lib/Transforms/HPVM2NVDLA/HPVM2NVDLAPass.cpp @@ -0,0 +1,1652 @@ +#define ENABLE_ASSERTS + +#define DEBUG_TYPE "DFG2NVDLA" + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm/ADT/STLExtras.h" + +#include "SupportHPVM/DFG2LLVM.h" + +#include <sstream> +#include <fstream> +#include <vector> +#include <map> + +#include "dlaerror.h" +#include "dlatypes.h" + +#include "nvdla/IRuntime.h" +#include "DlaImageUtils.h" + +#include "ErrorMacros.h" +#include "nvdla_inf.h" +#include "nvdla_os_inf.h" +#include "nvdla/IType.h" +#include "nvdla/ITensor.h" +#include "nvdla/INetwork.h" +#include "nvdla/ILayer.h" +#include "nvdla/IProfiler.h" +#include "nvdla/IProfile.h" 
+#include "nvdla/ICompiler.h" +#include "nvdla/ILoadable.h" +#include "nvdla/IWisdom.h" + +#include "rapidjson/document.h" +#include "rapidjson/filereadstream.h" +#include "rapidjson/error/en.h" +#include "half.h" + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +using namespace nvdla; + +typedef half_float::half float16; + +static cl::opt<std::string> ComputePrecision("cprecision", + cl::desc("Compute precision (int8 or fp16)."), cl::init("float16")); + +static cl::opt<std::string> CalibTablePath("calib-table", + cl::desc("Path to tensor scales file"), + cl::value_desc("filename"), cl::Required); + + +#define DEFAULT_BATCH_SIZE 0 +#define DEFAULT_DATA_FMT nvdla::DataFormat::NCHW +#define DEFAULT_QUANT_MODE nvdla::QuantizationMode::NONE +#define TARGET_CONFIG_NAME "nv_full" +#define TEST_PARAM_FILE_MAX_SIZE 65536 + +struct HPVM2NVDLA : public ModulePass { + static char ID; // Pass identification, replacement for typeid + HPVM2NVDLA() : ModulePass(ID) {} + +public: + // Functions + virtual bool runOnModule(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); + } + +private: + //bool transformHPVM2NVDLA(Module &M); + + //void codeGenHPVM2NVDLA(CGT_NVDLA *, DFNode *); +}; + +struct TestAppArgs +{ + std::string project; + std::string inputPath; + std::string inputName; + std::string outputPath; + std::string testname; + std::string testArgs; + std::string prototxt; // This should be folded into testArgs + std::string caffemodel; // This should be folded into testArgs + std::string cachemodel; // This should be folded into testArgs + + std::string profileName; // ok here? + std::string profileFile; + std::string configtarget; + std::string calibTable; + nvdla::QuantizationMode quantizationMode; + + Module *M; + std::vector<DFInternalNode *> *Roots; + + NvU16 numBatches; + nvdla::DataFormat inDataFormat; + nvdla::DataType computePrecision; + + std::map<std::string, NvF32> tensorScales; +}; + +struct TestInfo +{ + // common + nvdla::IWisdom* wisdom; + std::string wisdomPath; + + // parse + std::string modelsPath; + std::string profilesPath; + std::string calibTablesPath; + + // runtime + // nvdla::IRuntime* runtime; + nvdla::ILoadable* compiledLoadable; + NvU8 *pData; + //std::string inputImagesPath; + //std::string inputLoadablePath; + // std::map<std::string, NvDlaImage*> inputImages; + // std::map<std::string, void *> inputBuffers; + // std::map<std::string, NvDlaImage*> outputImages; + // std::map<std::string, void *> outputBuffers; + // std::vector<SubmitContext*> submits; + NvU32 timeout; + NvU16 numBatches; // runtime's point-of-view + NvU32 numSubmits; +}; + +static TestAppArgs defaultTestAppArgs = +{ + /* .project = */ "OpenDLA", + /* .inputPath = */ "./", + /* .inputName = */ "", + /* .outputPath = */ "./", + /* .testname = */ "", + /* .testArgs = */ "", + /* .prototxt = */ "", + /* .caffemodel = */ "", + /* .cachemodel = */ "", + /* .profileName = */ "fast-math", + /* .profileFile = */ "", + /* .configtarget = */ TARGET_CONFIG_NAME, + /* .calibtable = */ "", + /* .quantizationMode = */ DEFAULT_QUANT_MODE, + nullptr, nullptr, + /* .numBatches = */ DEFAULT_BATCH_SIZE, + /* .inDataFormat = */ DEFAULT_DATA_FMT, + /* .computePrecision = */ nvdla::DataType::INT8 +}; + +char HPVM2NVDLA::ID = 0; +static RegisterPass<HPVM2NVDLA> X("hpvm-nvdla", + "Dataflow Graph to NVDLA IR Pass", + false, false); + + +// Visitor for Code generation traversal of HPVM IR +class CGT_NVDLA : public 
CodeGenTraversal { +private: + // Data information + //DataFormat InDataFormat; + //DataType ComputePrecision; + //QuantizationMode Quantization; + //NvU16 NumBatches; + + // Wisdom and network information + IWisdom *Wisdom; + INetwork *Network; + + std::map<std::string, int> LayerNameMap; + + // Maps dataflow edges in HPVM IR to Tensors in NVDLA IR + DenseMap<const DFEdge *, ITensor *> EdgeToTensorMap; + + // Virtual Functions + void init(); + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + + // Codegen functions for all supported layers + void generateConvolutionLayer(DFLeafNode *, const IntrinsicInst *); + void generatePoolingLayer(DFLeafNode *, const IntrinsicInst *); + void generateBatchNormLayer(DFLeafNode *, const IntrinsicInst *); + void generateReluLayer(DFLeafNode *, const IntrinsicInst *); + void generateGemmLayer(DFLeafNode *, const IntrinsicInst *); + void generateSoftMaxLayer(DFLeafNode *, const IntrinsicInst *); + void generateTanhLayer(DFLeafNode *, const IntrinsicInst *); + + // Map edges to output tensors + void mapOutputTensor(DFNode *N, ITensor *Tensor); + + // Get input tensors to nodes + ITensor *getIntermediateInputTensor(DFNode *N); + + // Get binding tensors to nodes + User *getBindingTensor(DFLeafNode* N, unsigned index); + + // Get the input NVDLA tensors to nodes + ITensor *getNVDLAInputTensor(DFLeafNode* N, const User *InputTensor); + + // Get index for an input tensor + unsigned getInputIndex(DFLeafNode* N, const IntrinsicInst *II); + + // Gets nodes with add ops meant to be combined with convolution and gemm + void getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes, + SmallVector<IntrinsicInst *, 4> &AddInsts); + + // Getting weights + Weights readTrainedWeights(User *TensorPtr, + int dim1_size, int dim2_size, + int dim3_size, int dim4_size); + + // Identify outputs + unsigned identifyOutputs(); + + // Generate profile based on data parameters + //void generateProfile(std::string &, std::string &); + +std::string getLayerName(std::string Name); + +public: + + CGT_NVDLA(Module &_M, BuildDFG &_DFG) + : CodeGenTraversal(_M, _DFG) {// : Network(nullptr) { + //initRuntimeAPI(); + init(); + } + + //void destroySetUp(); + + //void setUpWisdom(); + + //void compileProfile(); + + //void transformHPVM2NVDLA(DFNode *); + + NvDlaError generateTensorScales(const TestAppArgs*, TestInfo*, nvdla::INetwork*); + + NvDlaError updateProfileWithCmdLineArgs(const TestAppArgs*, TestInfo*, const char*, nvdla::DataFormat); + + NvDlaError beginWithNamedProfile(const TestAppArgs*, TestInfo*); + + NvDlaError generateProfile(const TestAppArgs*, std::string*, TestInfo*); + + NvDlaError compileProfile(const TestAppArgs*, TestInfo*); + + NvDlaError launchTest(const TestAppArgs*); + + NvDlaError testSetup(const TestAppArgs*, TestInfo*); + + NvDlaError parseAndCompile(const TestAppArgs*, TestInfo*); + + NvDlaError transformHPVM2NVDLA(const TestAppArgs*, TestInfo*); + + NvDlaError parseSetup(const TestAppArgs*, TestInfo*); + + NvDlaError readTensorScales(const TestAppArgs* appArgs, TestInfo *i, nvdla::INetwork* network); +}; + +void CGT_NVDLA::init() { + // Default paramters + //InDataFormat = DataFormat::NCHW; + //ComputePrecision = DataType::FLOAT; + //Quantization = QuantizationMode::NONE; + //NumBatches = 0; +} + +void CGT_NVDLA::initRuntimeAPI() { + // Nothing to do here! 
+}
+
+Weights CGT_NVDLA::readTrainedWeights(User *TensorPtr,
+                                      int dim1_size, int dim2_size,
+                                      int dim3_size, int dim4_size) {
+  DEBUG(errs() << "READ TRAINED WEIGHTS\n");
+  // Get weights file name
+  User *MemcpyPtr = dyn_cast<User>(TensorPtr->getOperand(0));
+  DEBUG(MemcpyPtr->print(errs()));
+  DEBUG(errs() << "\n");
+  while(!dyn_cast<AllocaInst>(MemcpyPtr)) {
+    MemcpyPtr = dyn_cast<User>(MemcpyPtr->getOperand(0));
+  }
+  User *MemcpyArg = nullptr;
+  for(User *U: MemcpyPtr->users()) {
+    DEBUG(U->print(errs()));
+    DEBUG(errs() << "\n");
+    if(auto *BCO = dyn_cast<BitCastOperator>(U)) {
+      for(User *CU: BCO->users()) {
+        if(auto *CI = dyn_cast<CallInst>(CU)) {
+          if(CI->getCalledFunction()->getName().contains(StringRef("memcpy"))) {
+            MemcpyArg = dyn_cast<User>(CI->getOperand(1));
+            break;
+          }
+        }
+      }
+      if(MemcpyArg)
+        break;
+    }
+  }
+  assert(MemcpyArg && "File name not found.");
+  auto *WeightFileName = dyn_cast<GlobalVariable>(MemcpyArg->getOperand(0));
+  assert(WeightFileName && "Weight file name must be a global variable.");
+  auto* CDA = dyn_cast<ConstantDataArray>(WeightFileName->getInitializer());
+  assert(CDA && "Weight file name must be a constant array.");
+  const auto &file_name = std::string(CDA->getAsString());
+
+  // Read the weights file
+  int num_elem = dim1_size * dim2_size * dim3_size * dim4_size;
+  int size_in_bytes = sizeof(float16) * num_elem;
+  //DEBUG(errs() << "float16 size: " << sizeof(float16) << "\n");
+  DEBUG(errs() << "size in bytes: " << size_in_bytes << "\n");
+  void *tensor_data = (void *) malloc(size_in_bytes);
+  int file_header_size = 0;
+  DEBUG(errs() << "FILE NAME: " << file_name << "\n");
+  FILE *file = fopen(file_name.c_str(), "rb");
+  if(!file) {
+    DEBUG(errs() << "Data file is not found. Aborting.\n");
+    abort();
+  }
+  fseek(file, file_header_size, SEEK_CUR);
+  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+  DEBUG(errs() << "BYTES READ: " << bytes_read << "\n");
+  fclose(file);
+
+  // Create weight tensors
+  auto Weight = Weights(DataType::HALF, tensor_data, NvS64(num_elem));
+  //FILE *try_file = fopen("temp.bin", "wb");
+  //fwrite(Weight.values, sizeof(float), num_elem, try_file);
+  //fclose(try_file);
+  //exit(-1);
+  return Weight;
+}
+
+// For a tensor to be an input weight tensor, it has to come from the root node
+User *CGT_NVDLA::getBindingTensor(DFLeafNode* N, unsigned index) {
+  // HPVM internal API needs fixing. Remove this lambda function when bug is fixed.
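+  // NodeIsRoot: a node is treated as the DFG root if its function is used,
+  // directly or through a bitcast, by an llvm.hpvm.launch intrinsic.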
+ auto NodeIsRoot = [](DFNode &InternalNode) { + auto *RootFunction = InternalNode.getFuncPointer(); + for(User *U: RootFunction->users()) { + DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); + DEBUG(U->print(errs())); + DEBUG(errs() << "\n"); + auto *II = dyn_cast<IntrinsicInst>(U); + if(!II) { + auto *BCI = dyn_cast<BitCastOperator>(U); + assert(BCI && "Not a bitcast instruction."); + for(User *BCU : BCI->users()) { + DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); + DEBUG(BCU->print(errs())); + DEBUG(errs() << "\n"); + II = dyn_cast<IntrinsicInst>(BCU); + if(II) + break; + } + } + if(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch)) { + DEBUG(errs() << "LAUNCH FUNCTION: "); + DEBUG(II->print(errs())); + DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n"); + return true; + } + } + DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n"); + return false; + }; + + auto NodeIsLeaf = [](DFNode &Node) { + auto *NodeFunction = Node.getFuncPointer(); + for(User *U: NodeFunction->users()) { + DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); + DEBUG(U->print(errs())); + DEBUG(errs() << "\n"); + auto *II = dyn_cast<IntrinsicInst>(U); + if(!II) { + auto *BCI = dyn_cast<BitCastOperator>(U); + assert(BCI && "Not a bitcast instruction."); + for(User *BCU : BCI->users()) { + DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); + DEBUG(BCU->print(errs())); + DEBUG(errs() << "\n"); + II = dyn_cast<IntrinsicInst>(BCU); + if(II) + break; + } + } + if(II + && (II->getIntrinsicID() == Intrinsic::hpvm_createNode + || II->getIntrinsicID() == Intrinsic::hpvm_createNode1D + || II->getIntrinsicID() == Intrinsic::hpvm_createNode2D + || II->getIntrinsicID() == Intrinsic::hpvm_createNode3D)) { + DEBUG(errs() << "CREATE NODE FUNCTION: "); + DEBUG(II->print(errs())); + DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n"); + + // Ensure that the node function does not have these create node intrinsics + for(inst_iterator i = inst_begin(NodeFunction), + e = inst_end(NodeFunction); i != e; ++i) { + Instruction *I = &(*i); + if(auto *II = dyn_cast<IntrinsicInst>(I)) { + if(II->getIntrinsicID() == Intrinsic::hpvm_createNode + || II->getIntrinsicID() == Intrinsic::hpvm_createNode1D + || II->getIntrinsicID() == Intrinsic::hpvm_createNode2D + || II->getIntrinsicID() == Intrinsic::hpvm_createNode3D) { + DEBUG(errs() << "--LAMBDA FUNCTION RETURN FALSE\n"); + return false; + } + } + + } + return true; + } + } + DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n"); + return false; + }; + + DEBUG(errs() << "GET BINDING TENSOR\n"); + DEBUG(errs() << "GIVEN INDEX: " << index << "\n"); + DFEdge *DE = N->getInDFEdgeAt(index); + assert(DE && "Data edge does not exist at given index"); + DEBUG(errs() << "LEAF NODE FUNCTION: " << N->getFuncPointer()->getName() << "\n"); + // Get the argument position in the root node. 
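+  // Walk the bind edges through internal nodes up to the root, updating the
+  // source argument position at each hop. If an edge originates from a leaf
+  // node, the value is an intermediate tensor rather than a bind, so return nullptr.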
+ DEBUG(errs() << "GET TO THE ROOT FIRST\n"); + auto *InternalNode = DE->getSourceDF(); + DEBUG(errs() << "INTERNAL NODE FUNCTION: " << InternalNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "INTERNAL NDOE POINTER: " << InternalNode << "\n"); + if(NodeIsLeaf(*InternalNode)) { + DEBUG(errs() << "BIND NONE: EDGE FROM LEAF NODE\n"); + return nullptr; + } + unsigned argPos = DE->getSourcePosition(); + DEBUG(errs() << "ARG POSITION BEFORE LOOP: " << argPos << "\n"); + while(!NodeIsRoot(*InternalNode)) { + DEBUG(errs() << "IN LOOP\n"); + if(NodeIsLeaf(*InternalNode)) { + DEBUG(errs() << "IN LOOP BIND NONE: EDGE FROM LEAF NODE\n"); + return nullptr; + } + argPos = DE->getSourcePosition(); + DE = InternalNode->getInDFEdgeAt(argPos); + if(!DE) { + DEBUG(errs() << "NO BINDING EDGE IN LOOP\n"); + // No binding edge. + return nullptr; + } + InternalNode = DE->getSourceDF(); + DEBUG(errs() << "INTERNAL NODE FUNCTION IN LOOP: " << InternalNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "IN LOOP DATA EDGE: " << DE << "\n"); + DEBUG(errs() << "IN LOOP ARG POSITION: " << argPos << "\n"); + } + DEBUG(errs() << "ARG POSITION: " << argPos << "\n"); + + DEBUG(errs() << "GET THE LAUNCH FUNCTION\n"); + // Now we have the root node. We need to get the launch functions for it. + auto *RootFunction = InternalNode->getFuncPointer(); + for(User *U: RootFunction->users()) { + DEBUG(errs() << "User for root: "); + DEBUG(U->print(errs())); + IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); + if(!II) { + auto *BCI = dyn_cast<BitCastOperator>(U); + assert(BCI && "Not a bitcast instruction."); + for(User *BCU : BCI->users()) { + II = dyn_cast<IntrinsicInst>(BCU); + if(II) + break; + } + } + assert(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch) + && "Use of a root node must be in launch function call instrinsic."); + DEBUG(errs() << "LAUNCH FUNCTION: "); + DEBUG(II->print(errs())); + + // Now, get the the arguments to the root and get element pointer to argument structure. + auto *ArgObj = dyn_cast<Instruction>(II->getOperand(1)); + if(auto *BCO = dyn_cast<BitCastOperator>(ArgObj)) { + ArgObj = dyn_cast<Instruction>(BCO->getOperand(0)); + } else if (auto *CI = dyn_cast<CallInst>(ArgObj)) { + for(User *CIU : CI->users()) { + auto *BCO = dyn_cast<BitCastOperator>(CIU); + if(BCO) { + ArgObj = dyn_cast<Instruction>(BCO->getOperand(0)); + break; + } + } + } else if (auto *AI = dyn_cast<AllocaInst>(ArgObj)) { + for(User *AIU : AI->users()) { + auto *BCO = dyn_cast<BitCastOperator>(AIU); + if(BCO) { + ArgObj = dyn_cast<Instruction>(BCO->getOperand(0)); + break; + } + } + } + auto *ArgObjPtrType = dyn_cast<PointerType>(ArgObj->getType()); + auto *ArgObjType = dyn_cast<StructType>(ArgObjPtrType->getElementType()); + assert(ArgObjType && "Arguments to launch is a structure."); + DEBUG(errs() << "ARG OBJ: "); + DEBUG(ArgObj->print(errs())); + DEBUG(errs() << "\n"); + + // Use the offset into the structure to get the source tensor. 
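+    // Locate the getelementptr into the launch-argument struct whose index matches
+    // argPos, then follow the store(s) to that field (possibly through bitcasts)
+    // to recover the tensor pointer bound to this input.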
+ const auto &DL = ArgObj->getParent()->getParent()->getParent()->getDataLayout(); + const auto *SL = DL.getStructLayout(ArgObjType); + uint64_t ElementOffset = SL->getElementOffset(argPos); + DEBUG(errs() << "ELEMENT OFFSET: " << ElementOffset << "\n"); + Instruction *StructElemPtr = nullptr; + for(User *U: ArgObj->users()) { + if(auto *GI = dyn_cast<GetElementPtrInst>(U)) { + auto *Offset = dyn_cast<ConstantInt>(GI->getOperand(2)); + assert(Offset && "Offset is not constant."); + if(Offset->getZExtValue() == argPos) {//ElementOffset) { + StructElemPtr = GI; + break; + } + } + } + assert(StructElemPtr && "No getelementptr found with given offset."); + DEBUG(StructElemPtr->print(errs())); + DEBUG(errs() << "\n"); + DEBUG(errs() << "USE THE STORES TO GET THE BIND TENSOR\n"); + // Get store to the element of argument structure to get the pointer to tensor. + for(User *GIU: StructElemPtr->users()) { + DEBUG(GIU->print(errs())); + DEBUG(errs() << "\n"); + if(auto *BCO = dyn_cast<BitCastOperator>(GIU)) { + DEBUG(BCO->print(errs())); + DEBUG(errs() << "\n"); + for(User *BCU : BCO->users()) { + if(auto *SI = dyn_cast<StoreInst>(BCU)) { + // Get the tensor pointer + DEBUG(SI->print(errs())); + DEBUG(errs() << "\n"); + auto *Val = SI->getValueOperand(); + if(auto *BCO = dyn_cast<BitCastOperator>(Val)) { + return dyn_cast<User>(BCO->getOperand(0)); + } + return dyn_cast<User>(Val); + } + } + } + if(auto *SI = dyn_cast<StoreInst>(GIU)) { + // Get the tensor pointer + DEBUG(SI->print(errs())); + auto *Val = SI->getValueOperand(); + if(auto *BCO = dyn_cast<BitCastOperator>(Val)) { + return dyn_cast<User>(BCO->getOperand(0)); + } + return dyn_cast<User>(Val); + } + } + } + return nullptr; +} + + +void CGT_NVDLA::mapOutputTensor(DFNode *N, ITensor *Tensor) { + for(int i = 0; i < N->outdfedge_size(); i++) + EdgeToTensorMap[N->getOutDFEdgeAt(i)] = Tensor; +} + +ITensor *CGT_NVDLA::getIntermediateInputTensor(DFNode *N) { + return EdgeToTensorMap[N->getInDFEdgeAt(0)]; +} + +void CGT_NVDLA::getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes, + SmallVector<IntrinsicInst *, 4> &AddInsts) { + bool AddOpNodes = false; + for(int i = 0; i < N->outdfedge_size(); i++) { + auto *DestNode = N->getOutDFEdgeAt(i)->getDestDF(); + auto *F = DestNode->getFuncPointer(); + + // If the node is already cached in the list, no need to visit it + auto *Node = dyn_cast<DFLeafNode>(DestNode); + if(find(AddNodes, Node) != AddNodes.end()) + continue; + + // Add node to list if it contains add operation + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + auto *II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == Intrinsic::hpvm_tensor_add) { + AddNodes.push_back(Node); + AddInsts.push_back(II); + AddOpNodes = true; + break; + } + } + assert(((AddNodes.size() > 0) == AddOpNodes) + && "All destination nodes are adds or all of them are not."); + } +} + +ITensor *CGT_NVDLA::getNVDLAInputTensor(DFLeafNode* N, const User *InputBindingTensor) { + if(InputBindingTensor) { + auto *BatchesConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(2)); + auto *ChannelsConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(3)); + auto *HeightConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(4)); + auto *WidthConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(5)); + assert(HeightConst && WidthConst && ChannelsConst && BatchesConst + && "Number of input dimensions must be constants."); + + // Input dimensions + int InputW = 
WidthConst->getZExtValue(); + int InputH = HeightConst->getZExtValue(); + int InputC = ChannelsConst->getZExtValue(); + int InputN = BatchesConst->getZExtValue(); + + // Create a new input tensor + Dims4 dims(InputN, InputC, InputH, InputW); + return Network->addInput("", dims); + } + return getIntermediateInputTensor(N); +} + +unsigned CGT_NVDLA::getInputIndex(DFLeafNode* N, const IntrinsicInst *II) { + DEBUG(errs() << "GET INPUT INDEX\n"); + auto *F = N->getFuncPointer(); + DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); + unsigned inputIndex = 0; + for(auto &Arg : F->args()) { + DEBUG(errs() << "ARGUMENT: "); + DEBUG((&Arg)->print(errs())); + DEBUG(errs() << "\n"); + if(II->getOperand(0) == &Arg) { + DEBUG(errs() << "INPUT: "); + DEBUG(II->getOperand(0)->print(errs())); + DEBUG(errs() << "\n"); + DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n"); + return inputIndex; + } + inputIndex++; + } + assert(false && "Illegal intrinsic or Node."); + return -1; // Keep compiler happy +} + +std::string CGT_NVDLA::getLayerName(std::string Name) { + DEBUG(errs() << "GET LAYER NAME\n"); + if(LayerNameMap.find(Name) == LayerNameMap.end()) { + LayerNameMap[Name] = 1; + } else { + LayerNameMap[Name]++; + } + return std::to_string(LayerNameMap[Name]); +} + +void CGT_NVDLA::generateConvolutionLayer(DFLeafNode* N, const IntrinsicInst *II) { + DEBUG(errs() << "*****CONVOLUTION LAYER*****\n"); + // FIXME: What is number of "groups". Setting it to 1 for now. + int numGroups = 1; + + // If the input tensor is not a binding tensor, it must be coming + // from an edge from a visted node, so use that to get number of outputs. + unsigned inputIndex = getInputIndex(N, II); + DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n"); + DEBUG(errs() << "GET INPUT TENSOR\n"); + auto *InputTensor = getBindingTensor(N, inputIndex); + DEBUG(errs() << "INPUT TENSOR: "); + ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); + + // Get the index for kernel tensor + auto *F = N->getFuncPointer(); + DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); + unsigned kernelIndex = 0; + bool ArgFound = false; + for(auto &Arg : F->args()) { + if(II->getOperand(1) == &Arg) { + ArgFound = true; + break; + } + kernelIndex++; + } + assert(ArgFound && "Illegal intrinsic or Node."); + DEBUG(errs() << "KERNEL INDEX: " << kernelIndex << "\n"); + // Get the kernel tensor + DEBUG(errs() << "GET KERNEL TENSOR\n"); + auto *KernelTensor = getBindingTensor(N, kernelIndex); + assert(KernelTensor && "Kernel tensors are always binds."); + + // Get kernel constants + auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5)); + auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4)); + auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3)); + auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2)); + assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst + && "Kernel dimensions must be constants."); + int kernelW = KernelWConst->getZExtValue(); + int kernelH = KernelHConst->getZExtValue(); + int kernelC = KernelCHConst->getZExtValue(); + int kernelN = KernelNConst->getZExtValue(); + DEBUG(errs() << "\nKERNEL H: " << kernelH << "\n"); + DEBUG(errs() << "KERNEL W: " << kernelW << "\n"); + DEBUG(errs() << "KERNEL C: " << kernelC << "\n"); + DEBUG(errs() << "KERNEL N: " << kernelN << "\n"); + + int numOutputs; + if(!InputTensor) { + DEBUG(errs() << "INPUT FROM EDGE\n"); + numOutputs = (InputNVDLATensor->getDimensions()).n * kernelN; + // 
(InputNVDLATensor->getDimensions()).c; + } else { + DEBUG(errs() << "INPUT FROM WEIGHT TENSOR\n"); + auto *BatchesConst = dyn_cast<ConstantInt>(InputTensor->getOperand(2)); + auto *ChannelsConst = dyn_cast<ConstantInt>(InputTensor->getOperand(3)); + numOutputs = BatchesConst->getZExtValue() * kernelN; + // ChannelsConst->getZExtValue(); + DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n"); + } + + // Get Strides + ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(5)); + ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(4)); + assert((StrideWConst && StrideHConst) && "Strides must be constants."); + int strideW = StrideWConst->getZExtValue(); + int strideH = StrideHConst->getZExtValue(); + DEBUG(errs() << "STRIDE H: " << strideH << "\n"); + DEBUG(errs() << "STRIDE W: " << strideW << "\n"); + + // Get pads + ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(3)); + ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(2)); + assert((PadWConst && PadHConst) && "Pads must be constants."); + int padW = PadWConst->getZExtValue(); + int padH = PadHConst->getZExtValue(); + DEBUG(errs() << "PAD H: " << padH << "\n"); + DEBUG(errs() << "PAD W: " << padW << "\n"); + + // FIXME: Support dilations. Set dilations to 1 since we do not have dilation support yet. + int dilationW = 1; + int dilationH = 1; + + // Get the nodes with Add operations + SmallVector<DFLeafNode *, 4> AddOpNodes; + SmallVector<IntrinsicInst *, 4> AddInsts; + getaddOpSucceedsNode(N, AddOpNodes, AddInsts); + assert((!(AddOpNodes.size() > 1)) + && "Number of nodes with Add ops must not be more than 1"); + + // Get bias parameters + int BiasW, BiasH, BiasC, BiasN; + User *BiasTensor = nullptr; + BiasMode biasMode = BiasMode::bNONE; + if(AddOpNodes.size()) { + // Get the index for bias tensor + auto *AddNode = AddOpNodes[0]; + auto *AddInst = AddInsts[0]; + DEBUG(AddInst->print(errs())); + auto *F = AddNode->getFuncPointer(); + unsigned BiasIndex = 0; + ArgFound = false; + for(auto &Arg : F->args()) { + if(AddInst->getOperand(1) == &Arg) { + ArgFound = true; + break; + } + BiasIndex++; + } + assert(ArgFound && "Illegal intrinsic or Node."); + + // Get the bias tensor + DEBUG(errs() << "BIAS INDEX: " << BiasIndex << "\n"); + DEBUG(errs() << "BIAS TENSOR\n"); + BiasTensor = getBindingTensor(AddNode, BiasIndex); + assert(BiasTensor && "Bias tensors are always binds."); + + // Get Bias constants + auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5)); + auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4)); + auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3)); + auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2)); + assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst + && "Bias dimensions must be constants."); + BiasW = BiasWConst->getZExtValue(); + BiasH = BiasHConst->getZExtValue(); + BiasC = BiasCHConst->getZExtValue(); + BiasN = BiasNConst->getZExtValue(); + DEBUG(errs() << "BIAS H: " << BiasH << "\n"); + DEBUG(errs() << "BIAS W: " << BiasW << "\n"); + DEBUG(errs() << "BIAS C: " << BiasC << "\n"); + DEBUG(errs() << "BIAS N: " << BiasN << "\n"); + + // Get bias mode + //if(kernelN == numOutputs) + biasMode = BiasMode::bCHANNEL; + //else + // biasMode = BiasMode::bUNIFORM; + } + + // Get weights + Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW); + Weights biasWeights = AddOpNodes.size() == 1 ? 
+ readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW) + : Weights(DataType::HALF, nullptr, 0); + + Dims2 tlPadding = Dims2(padH, padW); + Dims2 brPadding = Dims2(padH, padW); + Dims2 stride = Dims2(strideH, strideW); + Dims2 dilation = Dims2(dilationH, dilationW); + Dims2 kernelSize = Dims2(kernelH, kernelW); + + auto *Layer = Network->addConvolution(InputNVDLATensor, numOutputs, 0, + kernelSize, tlPadding, brPadding, stride, dilation, + kernelWeights, biasWeights, biasMode, numGroups); + if(AddOpNodes.size()) { + auto *Node = AddOpNodes[0]; + mapOutputTensor(Node, Layer->getOutput(0)); + } else { + mapOutputTensor(N, Layer->getOutput(0)); + } + Layer->setName((std::string("conv") + getLayerName(std::string("conv"))).c_str()); + DEBUG(errs() << Layer->getName() << "\n"); +} + +void CGT_NVDLA::generatePoolingLayer(DFLeafNode* N, const IntrinsicInst *II) { + DEBUG(errs() << "*****POOLING LAYER*****\n"); + // Get input tensor + unsigned inputIndex = getInputIndex(N, II); + auto *InputTensor = getBindingTensor(N, inputIndex); + ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); + + // Get window dimensions + ConstantInt *KernelWConst = dyn_cast<ConstantInt>(II->getOperand(2)); + ConstantInt *KernelHConst = dyn_cast<ConstantInt>(II->getOperand(1)); + assert((KernelWConst && KernelHConst) && "Kernel dimensions must be constants."); + int kernelH = KernelHConst->getZExtValue(); + int kernelW = KernelWConst->getZExtValue(); + DEBUG(errs() << "KERNEL H: " << kernelH << "\n"); + DEBUG(errs() << "KERNEL W: " << kernelW << "\n"); + + // Get Strides + ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(6)); + ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(5)); + assert((StrideWConst && StrideHConst) && "Strides must be constants."); + int strideH = StrideHConst->getZExtValue(); + int strideW = StrideWConst->getZExtValue(); + DEBUG(errs() << "STRIDE H: " << strideH << "\n"); + DEBUG(errs() << "STRIDE W: " << strideW << "\n"); + + // Get pads + ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(4)); + ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(3)); + assert((PadWConst && PadHConst) && "Pads must be constants."); + int padH = PadHConst->getZExtValue(); + int padW = PadWConst->getZExtValue(); + DEBUG(errs() << "PAD H: " << padH << "\n"); + DEBUG(errs() << "PAD W: " << padW << "\n"); + + Dims2 windowSize = Dims2(kernelH, kernelW); + Dims2 stride = Dims2(strideH, strideW); + Dims2 tlPadding = Dims2(padH, padW); + Dims2 brPadding = Dims2(padH, padW); + + PoolingType type = (II->getIntrinsicID() == Intrinsic::hpvm_tensor_pool_mean) ? 
+ PoolingType::kAVERAGE : PoolingType::kMAX; + + auto *Layer = Network->addPooling(InputNVDLATensor, type, + windowSize, stride, tlPadding, brPadding); + mapOutputTensor(N, Layer->getOutput(0)); + Layer->setName((std::string("pool") + getLayerName(std::string("pool"))).c_str()); + DEBUG(errs() << Layer->getName() << "\n"); +} + +void CGT_NVDLA::generateGemmLayer(DFLeafNode* N, const IntrinsicInst *II) { + DEBUG(errs() << "****GEMM LAYER****\n"); + // Get input tensor and compute number of outputs + unsigned inputIndex = getInputIndex(N, II); + auto *InputTensor = getBindingTensor(N, inputIndex); + ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); + + // Get the index for kernel tensor + auto *F = N->getFuncPointer(); + DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); + unsigned kernelIndex = 0; + bool ArgFound = false; + for(auto &Arg : F->args()) { + if(II->getOperand(1) == &Arg) { + ArgFound = true; + break; + } + kernelIndex++; + } + assert(ArgFound && "Illegal intrinsic or Node."); + + // Get the kernel tensor + auto *KernelTensor = getBindingTensor(N, kernelIndex); + assert(KernelTensor && "Kernel tensors are always binds."); + + // Get kernel constants + auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5)); + auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4)); + auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3)); + auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2)); + assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst + && "Kernel dimensions must be constants."); + int kernelW = KernelWConst->getZExtValue(); + int kernelH = KernelHConst->getZExtValue(); + int kernelC = KernelCHConst->getZExtValue(); + int kernelN = KernelNConst->getZExtValue(); + DEBUG(errs() << "KERNEL H: " << kernelH << "\n"); + DEBUG(errs() << "KERNEL W: " << kernelW << "\n"); + DEBUG(errs() << "KERNEL C: " << kernelC << "\n"); + DEBUG(errs() << "KERNEL N: " << kernelN << "\n"); + + int numOutputs = kernelW; + DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n"); + + // Get the nodes with Add operations + SmallVector<DFLeafNode *, 4> AddOpNodes; + SmallVector<IntrinsicInst *, 4> AddInsts; + getaddOpSucceedsNode(N, AddOpNodes, AddInsts); + assert((!(AddOpNodes.size() > 1)) + && "Number of nodes with Add ops must not be more than 1"); + + // Get bias parameters + int BiasW, BiasH, BiasC, BiasN; + User *BiasTensor = nullptr; + BiasMode biasMode = BiasMode::bNONE; + if(AddOpNodes.size()) { + // Get the index for bias tensor + auto *AddNode = AddOpNodes[0]; + auto *AddInst = AddInsts[0]; + auto *F = AddNode->getFuncPointer(); + unsigned BiasIndex = 0; + ArgFound = false; + for(auto &Arg : F->args()) { + if(AddInst->getOperand(1) == &Arg) { + ArgFound = true; + break; + } + BiasIndex++; + } + assert(ArgFound && "Illegal intrinsic or Node."); + + // Get the bias tensor + BiasTensor = getBindingTensor(AddNode, BiasIndex); + assert(BiasTensor && "Bias tensors are always binds."); + + // Get Bias constants + auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5)); + auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4)); + auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3)); + auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2)); + assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst + && "Bias dimensions must be constants."); + BiasW = BiasWConst->getZExtValue(); + BiasH = BiasHConst->getZExtValue(); + BiasC = 
BiasCHConst->getZExtValue(); + BiasN = BiasNConst->getZExtValue(); + DEBUG(errs() << "BIAS H: " << BiasH << "\n"); + DEBUG(errs() << "BIAS W: " << BiasW << "\n"); + DEBUG(errs() << "BIAS C: " << BiasC << "\n"); + DEBUG(errs() << "BIAS N: " << BiasN << "\n"); + + // Get bias mode + //if(KernelCHConst->getZExtValue() == numOutputs) + biasMode = BiasMode::bCHANNEL; + //else + // biasMode = BiasMode::bUNIFORM; + } + + // Get weights + Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW); + Weights biasWeights = (AddOpNodes.size() == 1) ? + readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW) + : Weights(DataType::HALF, nullptr, 0); + + auto *Layer = Network->addFullyConnected(InputNVDLATensor, numOutputs, + kernelWeights, biasWeights, biasMode); + if(AddOpNodes.size()) { + auto *Node = AddOpNodes[0]; + mapOutputTensor(Node, Layer->getOutput(0)); + } else { + mapOutputTensor(N, Layer->getOutput(0)); + } + Layer->setName((std::string("gemm") + getLayerName(std::string("gemm"))).c_str()); + DEBUG(errs() << Layer->getName() << "\n"); +} + +void CGT_NVDLA::generateReluLayer(DFLeafNode* N, const IntrinsicInst *II) { + DEBUG(errs() << "******RELU LAYER******\n"); + // Get input tensor + unsigned inputIndex = getInputIndex(N, II); + auto *InputTensor = getBindingTensor(N, inputIndex); + ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); + + auto *Layer = Network->addActivation(InputNVDLATensor, kRELU); + mapOutputTensor(N, Layer->getOutput(0)); + Layer->setName((std::string("relu") + getLayerName(std::string("relu"))).c_str()); + DEBUG(errs() << Layer->getName() << "\n"); +} + +void CGT_NVDLA::generateSoftMaxLayer(DFLeafNode* N, const IntrinsicInst *II) { + DEBUG(errs() << "******SOFTMAX LAYER*******\n"); + // Get input tensor + unsigned inputIndex = getInputIndex(N, II); + auto *InputTensor = getBindingTensor(N, inputIndex); + ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); + + auto *Layer = Network->addSoftMax(InputNVDLATensor); + mapOutputTensor(N, Layer->getOutput(0)); + Layer->setName((std::string("softmax") + getLayerName(std::string("softmax"))).c_str()); + DEBUG(errs() << Layer->getName() << "\n"); +} + +void CGT_NVDLA::generateTanhLayer(DFLeafNode* N, const IntrinsicInst *II) { + DEBUG(errs() << "*******TANH LAYER*******\n"); + // Get input tensor + unsigned inputIndex = getInputIndex(N, II); + auto *InputTensor = getBindingTensor(N, inputIndex); + ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); + + auto *Layer = Network->addActivation(InputNVDLATensor, kTANH); + mapOutputTensor(N, Layer->getOutput(0)); + Layer->setName((std::string("tanh") + getLayerName(std::string("tanh"))).c_str()); + DEBUG(errs() << Layer->getName() << "\n"); +} + +/* +void CGT_NVDLA::generateBatchNormLayer(DFLeafNode* N, const IntrinsicInst *II) { + const dc::BatchNormParameter& p = msg.batch_norm_param(); + Weights mean = weightFactory(msg.name(), kMEAN); + Weights variance = weightFactory(msg.name(), kVARIANCE); + Weights movingAverage = weightFactory(msg.name(), kMOVING_AVERAGE); + float eps = p.eps(); + float scaleFactor = 1.0f; + float average = 0.0f; + int i; + + average = *(static_cast<const float*>(movingAverage.values)); + if ( average == 0.0f ) + { + gLogError << "Batch Normalization moving average is zero " << std::endl; + return 0; + } + scaleFactor /= average; + + if (mean.count != variance.count) + { + gLogError << "Mean and variance have differing number of elements " + << mean.count << " & " << variance.count << 
std::endl; + return 0; + } + + float *meanBlob = (float *)mean.values; + float *varianceBlob = (float *)variance.values; + + Dims4 inputDims = getIntermediateInputTensor(N)->getDimensions(); + BatchNormMode mode; + + if (mean.count == 1) + { + mode = BatchNormMode::bnUNIFORM; + meanBlob[0] = meanBlob[0] * scaleFactor; + varianceBlob[0] = varianceBlob[0] * scaleFactor; + } + else if (mean.count == inputDims.c) + { + mode = BatchNormMode::bnm_CHANNEL; + for (i = 0; i < mean.count; i++) + { + meanBlob[i] = meanBlob[i] * scaleFactor; + varianceBlob[i] = varianceBlob[i] * scaleFactor; + } + } + else + { + gLogError << "Unknown batch norm mode" << std::endl; + return 0; + } + + // Get input tensor + unsigned inputIndex = getInputIndex(N, II); + Value *InputTensor = getBindingTensor(inputIndex); + ITensor *InputNVDLATensor = getNVDLAInputTensor(InputTensor); + + auto *Layer = Network->addBatchNorm(InputNVDLATensor, mode, mean, variance, eps); + mapOutputTensor(N, Layer->getOutput(0)); +} +*/ + +unsigned CGT_NVDLA::identifyOutputs() { + std::set< ITensor* > outputTensors; + std::set< ITensor* > InputTensors; + + for (int l = 0; l < Network->getNumLayers(); ++l) { + ILayer* layer = Network->getLayer(l); + assert(layer && "Illegal NVDLA compiler IR!"); + for (int ii = 0; ii < layer->getNumInputs(); ++ii) { + InputTensors.insert(layer->getInput(ii)); + } + for (int oo = 0; oo < layer->getNumOutputs(); ++oo) { + outputTensors.insert(layer->getOutput(oo)); + } + } + + for (std::set<ITensor*>::iterator oi = outputTensors.begin(); oi != outputTensors.end(); ++oi) { + // An output tensor which is not an input to any other layers is a Network output tensor + if (InputTensors.find(*oi) == InputTensors.end()) + Network->markOutput(*oi); + } + return Network->getNumOutputs(); +} + +void CGT_NVDLA::codeGen(DFLeafNode *N) { + // No allocation nodes allowed. + assert(!N->isAllocationNode() && "Allocation Node not expected in ApproxHPVM"); + + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Generate code only if it has the right hint + //if (!checkPreferredTarget(N, hpvm::NVDLA_TARGET)) { + // DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"); + // return; + // } + + // Get the function associated with the dataflow node + auto *F = N->getFuncPointer(); + DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); + + // Generate code for every instruction in this node + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + + if (BuildDFG::isViscIntrinsic(I)) { + auto *II = dyn_cast<IntrinsicInst>(I); + assert((II->getCalledFunction()->getName()).startswith("llvm.hpvm.tensor") + && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + + switch (II->getIntrinsicID()) { + case Intrinsic::hpvm_tensor_convolution: + case Intrinsic::hpvm_tensor_group_convolution: + generateConvolutionLayer(N, II); + break; + + case Intrinsic::hpvm_tensor_batchnorm: + generateBatchNormLayer(N, II); + break; + + case Intrinsic::hpvm_tensor_mul: + generateGemmLayer(N, II); + break; + + case Intrinsic::hpvm_tensor_add: + // Add not explicitly supported by NVDLA compiler! + break; + + case Intrinsic::hpvm_tensor_pool_max: + case Intrinsic::hpvm_tensor_pool_mean: + generatePoolingLayer(N, II); + break; + + case Intrinsic::hpvm_tensor_relu: + generateReluLayer(N, II); + break; + + case Intrinsic::hpvm_tensor_clipped_relu: + // No need to generate NVDLA IR for this? 
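+        // Clipped ReLU is currently dropped: no NVDLA layer is emitted, since only
+        // plain ReLU and tanh activations are generated via addActivation().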
+ break; + + case Intrinsic::hpvm_tensor_tanh: + generateTanhLayer(N, II); + break; + + case Intrinsic::hpvm_tensor_softmax: + generateSoftMaxLayer(N, II); + break; + + default: + llvm_unreachable("Unknown HPVM Intrinsic!"); + break; + } + } + } +} + +void CGT_NVDLA::codeGen(DFInternalNode* N) { + DEBUG(errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs () << "Skipping internal node\n"); +} + +NvDlaError CGT_NVDLA::parseSetup(const TestAppArgs* appArgs, TestInfo* i) { + return NvDlaSuccess; +} + +NvDlaError CGT_NVDLA::transformHPVM2NVDLA(const TestAppArgs* appArgs, TestInfo* i) { + NVDLA_UNUSED(appArgs); + NvDlaError e = NvDlaSuccess; + + Network = nullptr; + Network = nvdla::createNetwork(); + if (!Network) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createNetwork() failed"); + + // Iterate over all the DFGs and produce code for each one of them + for(auto &RootNode: *(appArgs->Roots)) + visit(RootNode); + + // if the application has so far not marked the network's outputs, allow the parser to do so now + if (Network->getNumOutputs() <= 0) { + int outs = identifyOutputs(); + DEBUG(NvDlaDebugPrintf("Marking total %d outputs\n", outs)); + if (outs <= 0) + ORIGINATE_ERROR_FAIL(NvDlaError_BadValue, "Unable to identify outputs for the network: %d", outs); + } + + if (appArgs->computePrecision == nvdla::DataType::INT8) { + if (appArgs->calibTable != "") { + DEBUG(NvDlaDebugPrintf("parsing calibration table...\n")); + PROPAGATE_ERROR_FAIL(readTensorScales(appArgs, i, Network)); + } else { + DEBUG(NvDlaDebugPrintf("initialize all tensors with const scaling factors of 127...\n")); + PROPAGATE_ERROR_FAIL(generateTensorScales(appArgs, i, Network)); + } + } + + DEBUG(NvDlaDebugPrintf("attaching parsed network to the wisdom...\n")); + if (!i->wisdom->setNetworkTransient(Network)) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->setNetworkTransient() failed"); + + return NvDlaSuccess; + +fail: + return e; +} + +NvDlaError CGT_NVDLA::parseAndCompile(const TestAppArgs* appArgs, TestInfo* i) { + NvDlaError e = NvDlaSuccess; + bool isCaffe = appArgs->caffemodel != ""; + + PROPAGATE_ERROR_FAIL(parseSetup(appArgs, i)); + + DEBUG(NvDlaDebugPrintf("creating new wisdom context...\n")); + i->wisdom = nvdla::createWisdom(); + if (!i->wisdom) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createWisdom() failed"); + + DEBUG(NvDlaDebugPrintf("opening wisdom context...\n")); + if (!i->wisdom->open(i->wisdomPath)) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->open() failed to open: \"%s\"", i->wisdomPath.c_str()); + + // Parse + PROPAGATE_ERROR_FAIL(transformHPVM2NVDLA(appArgs, i)); + + // Compile + PROPAGATE_ERROR_FAIL(compileProfile(appArgs, i)); + + /* Destroy network before closing wisdom context */ + nvdla::destroyNetwork(i->wisdom->getNetwork()); + + DEBUG(NvDlaDebugPrintf("closing wisdom context...\n")); + i->wisdom->close(); + +fail: + if (i->wisdom != NULL) { + nvdla::destroyWisdom(i->wisdom); + i->wisdom = NULL; + } + return e; +} + +NvDlaError CGT_NVDLA::testSetup(const TestAppArgs* appArgs, TestInfo* i) { + NvDlaError e = NvDlaSuccess; + + std::string wisdomPath = appArgs->outputPath + "wisdom.dir/"; + std::string removeCmd = ""; + std::string imagePath = ""; + NvDlaStatType stat; + int ii = 0; + + // Do input paths exist? + e = NvDlaStat(appArgs->inputPath.c_str(), &stat); + if (e != NvDlaSuccess) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Input path does not exist: \"%s\"", appArgs->inputPath.c_str()); + + // Do output paths exist? 
+ e = NvDlaStat(appArgs->outputPath.c_str(), &stat); + if (e != NvDlaSuccess) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Output path does not exist: \"%s\"", appArgs->outputPath.c_str()); + + // Clear wisdomPath if any exist + removeCmd += "rm -rf " + wisdomPath; + ii = std::system(removeCmd.c_str()); // This is pretty awful + if (ii != 0) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "system command failed: \"%s\"", removeCmd.c_str()); + + PROPAGATE_ERROR_FAIL(NvDlaMkdir(const_cast<char *>(wisdomPath.c_str()))); + + // Initialize TestInfo + i->wisdom = NULL; + i->wisdomPath = wisdomPath; + i->pData = NULL; + + return NvDlaSuccess; + +fail: + return e; +} + +NvDlaError CGT_NVDLA::launchTest(const TestAppArgs* appArgs) { + NvDlaError e = NvDlaSuccess; + TestInfo testInfo; + + PROPAGATE_ERROR_FAIL(testSetup(appArgs, &testInfo)); + + PROPAGATE_ERROR_FAIL(parseAndCompile(appArgs, &testInfo)); + + return NvDlaSuccess; + +fail: + return e; +} + +bool HPVM2NVDLA::runOnModule(Module &M) { + DEBUG(errs() << "**************HPVM2NVDLA PASS****************\n"); + + NvDlaError e = NvDlaError_TestApplicationFailed; + TestAppArgs testAppArgs = defaultTestAppArgs; + + // Get the HPVM IR graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + std::vector<DFInternalNode *> Roots = DFG.getRoots(); + + // Visitor for Code Generation Graph Traversal + CGT_NVDLA *CGTVisitor = new CGT_NVDLA(M, DFG); + + if(ComputePrecision == "INT8" || ComputePrecision == "int8") { + testAppArgs.computePrecision = nvdla::DataType::INT8; + testAppArgs.quantizationMode = nvdla::QuantizationMode::PER_KERNEL; + testAppArgs.configtarget = std::string("nv_small"); + } else { + testAppArgs.computePrecision = nvdla::DataType::HALF; + testAppArgs.quantizationMode = nvdla::QuantizationMode::NONE; + testAppArgs.configtarget = std::string("nv_full"); + } + testAppArgs.profileName = std::string("hpvm-mod"); + testAppArgs.calibTable = CalibTablePath;//std::string("output_scales.txt"); + testAppArgs.outputPath = std::string("."); + testAppArgs.inDataFormat = nvdla::DataFormat::NCHW; + + testAppArgs.Roots = &Roots; + + e = CGTVisitor->launchTest(&testAppArgs); + if (e != NvDlaSuccess) + DEBUG(errs() << "ERROR\n"); + else + DEBUG(errs() << "SUCESS\n"); + + delete CGTVisitor; + + return false; +} + +NvDlaError CGT_NVDLA::compileProfile(const TestAppArgs* appArgs, TestInfo* i) { + NvDlaError e = NvDlaSuccess; + std::string profileName = ""; + std::string targetConfigName = ""; + + NvDlaFileHandle file = 0; + std::string fileName = ""; + NvU8 *buffer = 0; + NvU64 size = 0; + + nvdla::ICompiler* compiler = i->wisdom->getCompiler(); + if (!compiler) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getCompiler() failed"); + + if (!(appArgs->configtarget != "")) + ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No target config found to load"); + + targetConfigName = appArgs->configtarget; + + // Determine profile + PROPAGATE_ERROR_FAIL(generateProfile(appArgs, &profileName, i)); + + // Compile + DEBUG(NvDlaDebugPrintf("compiling profile \"%s\"... 
config \"%s\"...\n", profileName.c_str(), targetConfigName.c_str())); + PROPAGATE_ERROR_FAIL(compiler->compile(profileName.c_str(), targetConfigName.c_str(), &i->compiledLoadable)); + + // Get loadable buffer and dump it into a file + PROPAGATE_ERROR_FAIL(compiler->getLoadableImageSize(profileName.c_str(), + &size)); + if (size == 0) { + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, + "Invalid size for a loadable"); + } + + buffer = (NvU8 *) NvDlaAlloc(size); + if (buffer == NULL) { + ORIGINATE_ERROR_FAIL(NvDlaError_InsufficientMemory, + "Failed to allocate buffer for loadable"); + } + PROPAGATE_ERROR_FAIL(compiler->getLoadableImage(profileName.c_str(), + buffer)); + fileName = profileName + ".nvdla"; + errs() << "Writing NVDLA module '" << fileName << "' ..."; + PROPAGATE_ERROR_FAIL(NvDlaFopen(fileName.c_str(), NVDLA_OPEN_WRITE, &file)); + PROPAGATE_ERROR_FAIL(NvDlaFwrite(file, buffer, size)); + errs() << " done.\n"; + +fail: + NvDlaFclose(file); + if (buffer != NULL) + NvDlaFree(buffer); + return e; +} + +NvDlaError CGT_NVDLA::generateProfile(const TestAppArgs* appArgs, std::string* profileName, TestInfo* i) { + NvDlaError e = NvDlaSuccess; + nvdla::DataFormat inDataFormat = nvdla::DataFormat::UNKNOWN; + + if (appArgs->profileName != "") { + // init named profile (basic/default/performance) with default params in its constructor and exit + DEBUG(errs() << "PROFILE NAME PROVIDED\n"); + PROPAGATE_ERROR_FAIL(beginWithNamedProfile(appArgs, i)); + *profileName = appArgs->profileName; + } else { + ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No profile supplied to load"); + } + + // capture profile params from command line (override the existing ones as necessary) + inDataFormat = inDataFormat == nvdla::DataFormat::UNKNOWN ? appArgs->inDataFormat : inDataFormat; + PROPAGATE_ERROR_FAIL(updateProfileWithCmdLineArgs(appArgs, i, profileName->c_str(), inDataFormat)); + +fail: + return e; +} + +NvDlaError CGT_NVDLA::beginWithNamedProfile(const TestAppArgs* appArgs, TestInfo* i) { + NvDlaError e = NvDlaSuccess; + nvdla::IProfiler* profiler; + nvdla::IProfile* profile; + + profiler = i->wisdom->getProfiler(); + if ( !profiler ) { + ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profiler not initialized"); + } + + profile = profiler->getProfile(appArgs->profileName.c_str()); + if ( !profile ) { + ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profile %s not initialized", appArgs->profileName.c_str()); + } + +fail: + return e; +} + +NvDlaError CGT_NVDLA::updateProfileWithCmdLineArgs +( + const TestAppArgs* appArgs, TestInfo* i, const char* profileName, nvdla::DataFormat inDataFormat +) { + NvDlaError e = NvDlaSuccess; + nvdla::IProfiler* profiler; + nvdla::IProfile* profile; + + profiler = i->wisdom->getProfiler(); + if (!profiler) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getProfiler() failed"); + profile = profiler->getProfile(profileName); + if (!profile) + ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "profiler->getProfile() failed"); + + PROPAGATE_ERROR_FAIL(profile->setComputePrecision(appArgs->computePrecision)); + PROPAGATE_ERROR_FAIL(profile->setNetworkInputDataFormat(inDataFormat)); + + // determine input surface format + switch(inDataFormat) { + case nvdla::DataFormat::NHWC: + + if (appArgs->computePrecision == nvdla::DataType::HALF) { + PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A16B16G16R16_F)); + } else if (appArgs->computePrecision == nvdla::DataType::INT8) { + 
PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A8B8G8R8)); + } else { + ORIGINATE_ERROR_FAIL(NvDlaError_NotSupported, "NHWC and compute precision %u is not yet supported", + appArgs->computePrecision.v()); + } + break; + case nvdla::DataFormat::NCxHWx: + case nvdla::DataFormat::NCHW: + case nvdla::DataFormat::UNKNOWN: // atleast start the test with feature data format + default: + if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0) + PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8)); + else + PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE)); + } + + // determine int8 cfgs + if (appArgs->computePrecision == nvdla::DataType::INT8) { + PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::PER_TENSOR)); + switch(appArgs->quantizationMode) { + case nvdla::QuantizationMode::PER_FILTER: + PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_FILTER)); + break; + case nvdla::QuantizationMode::PER_KERNEL: + case nvdla::QuantizationMode::NONE: // default to per-kernel; find a way to run int8 tests w/ NONE qtzMode cleanly + default: + PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_KERNEL)); + } + } else { + PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::NONE)); + PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::NONE)); + } + + PROPAGATE_ERROR_FAIL(profile->setNetworkOutputDataFormat(nvdla::DataFormat::NCxHWx)); + + if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0) + PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8)); + else + PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE)); + + if (appArgs->numBatches > 0) + PROPAGATE_ERROR_FAIL(profile->setMultiBatchSize(appArgs->numBatches)); + +fail: + return e; +} + +NvDlaError CGT_NVDLA::generateTensorScales(const TestAppArgs* appArgs, TestInfo* i, nvdla::INetwork* network) { + NvDlaError e = NvDlaSuccess; + + std::vector<nvdla::ILayer*> networkLayers = network->getLayers(); + std::vector<nvdla::ITensor*> networkInputs = network->getInputs(); + + std::vector<nvdla::ILayer*>::iterator li = networkLayers.begin(); + std::vector<nvdla::ITensor*>::iterator nii = networkInputs.begin(); + + // set scaling factor for the network input tensors + for (; nii != networkInputs.end(); ++nii) { + NvF32 scale = 1; + NvF32 min = scale * -127.0f; + NvF32 max = scale * 127.0f; + std::string tName = (*nii)->getName(); + DEBUG(errs() << "INPUT NAME: " << tName << "\n"); + // set same dynamic range for all channels of the tensor (cIndex = -1) + PROPAGATE_ERROR_FAIL( (*nii)->setChannelDynamicRange(-1, min, max) ); + const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale)); + if (0) + NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", tName.c_str(), scale); + } + + for (; li != networkLayers.end(); ++li) { + NvF32 scale = 127; + NvF32 min = scale * -127.0f; + NvF32 max = scale * 127.0f; + std::string lName = (*li)->getName(); + nvdla::ITensor* outTensor = (*li)->getOutput(0); + DEBUG(errs() << "LAYER NAME: " << lName << "\n"); + // set same dynamic range for all channels of the tensor (cIndex = -1) + PROPAGATE_ERROR_FAIL( outTensor->setChannelDynamicRange(-1, min, max) ); + const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(lName, scale)); 
+ if (0) + NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", lName.c_str(), scale); + } + +fail: + return e; +} + +NvDlaError CGT_NVDLA::readTensorScales(const TestAppArgs* appArgs, TestInfo *i, nvdla::INetwork* network) { + NvDlaError e = NvDlaSuccess; + NvDlaStatType stat; + std::string calibTableFile = /*i->calibTablesPath + "/" + */appArgs->calibTable; + + //PROPAGATE_ERROR_FAIL(NvDlaStat(calibTableFile.c_str(), &stat)); + DEBUG(errs() << "***********READING TENSOR SCALESi*************\n"); + std::ifstream infile(calibTableFile.c_str()); + std::string line; + std::map<std::string, float> LayerNameToScaleMap; + while (std::getline(infile, line)) { + DEBUG(errs() << "READ LINE: " << line << "\n"); + line.erase(remove(line.begin(), line.end(), ' '), line.end()); + DEBUG(errs() << "READ LINE WITHOUT WHITE SPACES: " << line << "\n"); + std::string delimiter = ":"; + std::string layer_name = line.substr(0, line.find(delimiter)); + std::string Scale = line.substr(line.find(delimiter) + 1); + DEBUG(errs() << "LAYER NAME: " << layer_name << "\n"); + DEBUG(errs() << "SCALE: " << Scale << "\n"); + size_t size; + LayerNameToScaleMap[layer_name] = std::stof(Scale, &size); + } + infile.close(); + DEBUG(errs() << "GOT TENSOR SCALES FROM CALIB TABLE\n"); + + std::vector<nvdla::ILayer*> networkLayers = network->getLayers(); + std::vector<nvdla::ITensor*> networkInputs = network->getInputs(); + for (auto *Input : networkInputs) { + NvF32 scale = 0.0f; + NvF32 min = 0.0f; + NvF32 max = 0.0f; + DEBUG(errs() << "SET SCALE FOR INPUT\n"); + scale = LayerNameToScaleMap["input"]; + DEBUG(errs() << "INPUT SCALE: " << scale << "\n"); + min = scale * -127.0f; + max = scale * 127.0; + PROPAGATE_ERROR_FAIL(Input->setChannelDynamicRange(-1, min, max) ); + const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>("data", scale)); + } + DEBUG(errs() << "PER LAYER CALIB\n"); + for (auto *Layer : networkLayers) { + NvF32 scale = 0.0f; + NvF32 min = 0.0f; + NvF32 max = 0.0f; + std::string tName = Layer->getName(); + DEBUG(errs() << "SETTING SCALE FOR LAYER NAME: " << tName << "\n"); + nvdla::ITensor* outTensor = Layer->getOutput(0); + auto it = LayerNameToScaleMap.find(tName); + if (it != LayerNameToScaleMap.end()) { + DEBUG(errs() << "SET SCALE FOR NAME: " << tName << "\n"); + DEBUG(errs() << "SCALE: " << it->second << "\n"); + scale = it->second; + min = scale * -127.0f; + max = scale * 127.0f; + } else { + DEBUG(errs() << "SET DEFAULT SCALE FOR NAME: " << tName << "\n"); + DEBUG(errs() << "SCALE: 1\n"); + scale = 1; + min = scale * -127.0f; + max = scale * 127.0f; + } + //else { + // ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Atleast 1 of scale or min-max should be specified for %s\n", tName.c_str()); + //} + PROPAGATE_ERROR_FAIL( outTensor->setChannelDynamicRange(-1, min, max) ); + const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale)); + } + + DEBUG(errs() << "DONE PARSING CALIBRATION TABLE\n"); + fail: + return e; +} + diff --git a/hpvm/lib/Transforms/HPVM2NVDLA/LLVMBuild.txt b/hpvm/lib/Transforms/HPVM2NVDLA/LLVMBuild.txt new file mode 100644 index 0000000000..44e63f3c71 --- /dev/null +++ b/hpvm/lib/Transforms/HPVM2NVDLA/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = HPVM2NVDLA +parent = Transforms -- GitLab