#define ENABLE_ASSERTS

#define DEBUG_TYPE "DFG2NVDLA"

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/IR/Attributes.h"
#include "llvm/ADT/STLExtras.h"

#include "SupportHPVM/DFG2LLVM.h"

#include <sstream>
#include <fstream>
#include <vector>
#include <map>
#include <set>

#include "dlaerror.h"
#include "dlatypes.h"

#include "nvdla/IRuntime.h"
#include "DlaImageUtils.h"

#include "ErrorMacros.h"
#include "nvdla_inf.h"
#include "nvdla_os_inf.h"
#include "nvdla/IType.h"
#include "nvdla/ITensor.h"
#include "nvdla/INetwork.h"
#include "nvdla/ILayer.h"
#include "nvdla/IProfiler.h"
#include "nvdla/IProfile.h"
#include "nvdla/ICompiler.h"
#include "nvdla/ILoadable.h"
#include "nvdla/IWisdom.h"

#include "rapidjson/document.h"
#include "rapidjson/filereadstream.h"
#include "rapidjson/error/en.h"
#include "half.h"

using namespace llvm;
using namespace builddfg;
using namespace dfg2llvm;

using namespace nvdla;

typedef half_float::half float16;

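// Command-line options: the compute precision selects between an fp16 ("nv_full")
// and an int8 ("nv_small") flow in runOnModule(); the calibration table supplies
// per-layer tensor scales for the int8 flow.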
static cl::opt<std::string> ComputePrecision("cprecision",
                    cl::desc("Compute precision (int8 or fp16; default fp16)."), cl::init("float16"));

static cl::opt<std::string> CalibTablePath("calib-table",
                    cl::desc("Path to tensor scales file"),
                    cl::value_desc("filename"), cl::Required);


#define DEFAULT_BATCH_SIZE 0
#define DEFAULT_DATA_FMT nvdla::DataFormat::NCHW
#define DEFAULT_QUANT_MODE nvdla::QuantizationMode::NONE
#define TARGET_CONFIG_NAME "nv_full"
#define TEST_PARAM_FILE_MAX_SIZE    65536
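
// HPVM2NVDLA: a ModulePass that walks the HPVM dataflow graph (BuildDFG) and
// re-emits each supported tensor intrinsic as an NVDLA network layer, then
// drives the NVDLA compiler to produce a loadable ("hpvm-mod.nvdla").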
struct HPVM2NVDLA : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  HPVM2NVDLA() : ModulePass(ID) {}

public:
  // Functions
  virtual bool runOnModule(Module &M);
  
  void getAnalysisUsage(AnalysisUsage &AU) const {
	  AU.addRequired<BuildDFG>();
	  AU.addPreserved<BuildDFG>();
  }

private:
  //bool transformHPVM2NVDLA(Module &M);
  
  //void codeGenHPVM2NVDLA(CGT_NVDLA *, DFNode *);
};

struct TestAppArgs
{
    std::string project;
    std::string inputPath;
    std::string inputName;
    std::string outputPath;
    std::string testname;
    std::string testArgs;
    std::string prototxt; // This should be folded into testArgs
    std::string caffemodel; // This should be folded into testArgs
    std::string cachemodel; // This should be folded into testArgs

    std::string profileName; // ok here?
    std::string profileFile;
    std::string configtarget;
    std::string calibTable;
    nvdla::QuantizationMode quantizationMode;
    
    Module *M;
    std::vector<DFInternalNode *> *Roots;

    NvU16 numBatches;
    nvdla::DataFormat inDataFormat;
    nvdla::DataType computePrecision;

    std::map<std::string, NvF32> tensorScales;
};

struct TestInfo
{
    // common
    nvdla::IWisdom* wisdom;
    std::string wisdomPath;

    // parse
    std::string modelsPath;
    std::string profilesPath;
    std::string calibTablesPath;

    // runtime
   // nvdla::IRuntime* runtime;
    nvdla::ILoadable* compiledLoadable;
    NvU8 *pData;
    //std::string inputImagesPath;
    //std::string inputLoadablePath;
   // std::map<std::string, NvDlaImage*> inputImages;
   // std::map<std::string, void *> inputBuffers;
   // std::map<std::string, NvDlaImage*> outputImages;
   // std::map<std::string, void *> outputBuffers;
   // std::vector<SubmitContext*> submits;
    NvU32 timeout;
    NvU16 numBatches; // runtime's point-of-view
    NvU32 numSubmits;
};

static TestAppArgs defaultTestAppArgs =
{
    /* .project = */ "OpenDLA",
    /* .inputPath = */ "./",
    /* .inputName = */ "",
    /* .outputPath = */ "./",
    /* .testname = */ "",
    /* .testArgs = */ "",
    /* .prototxt = */ "",
    /* .caffemodel = */ "",
    /* .cachemodel = */ "",
    /* .profileName = */ "fast-math",
    /* .profileFile = */ "",
    /* .configtarget = */ TARGET_CONFIG_NAME,
    /* .calibTable = */ "",
    /* .quantizationMode = */ DEFAULT_QUANT_MODE,
    /* .M = */ nullptr,
    /* .Roots = */ nullptr,
    /* .numBatches = */ DEFAULT_BATCH_SIZE,
    /* .inDataFormat = */ DEFAULT_DATA_FMT,
    /* .computePrecision = */ nvdla::DataType::INT8
};

char HPVM2NVDLA::ID = 0;
static RegisterPass<HPVM2NVDLA> X("hpvm-nvdla",
				 "Dataflow Graph to NVDLA IR Pass",
				 false, false);


// Visitor for Code generation traversal of HPVM IR
class CGT_NVDLA : public CodeGenTraversal {
private:
  // Data information
  //DataFormat InDataFormat;
  //DataType ComputePrecision;
  //QuantizationMode Quantization;
  //NvU16 NumBatches;
	
  // Wisdom and network information
  IWisdom *Wisdom;
  INetwork *Network;

  std::map<std::string, int> LayerNameMap;
  
  // Maps dataflow edges in HPVM IR to Tensors in NVDLA IR
  DenseMap<const DFEdge *, ITensor *> EdgeToTensorMap;
	
  // Virtual Functions
  void init();
  void initRuntimeAPI();
  void codeGen(DFInternalNode* N);
  void codeGen(DFLeafNode* N);
  
  // Codegen functions for all supported layers
  void generateConvolutionLayer(DFLeafNode *, const IntrinsicInst *);
  void generatePoolingLayer(DFLeafNode *, const IntrinsicInst *);
  void generateBatchNormLayer(DFLeafNode *, const IntrinsicInst *);
  void generateReluLayer(DFLeafNode *, const IntrinsicInst *);
  void generateGemmLayer(DFLeafNode *, const IntrinsicInst *);
  void generateSoftMaxLayer(DFLeafNode *, const IntrinsicInst *);
  void generateTanhLayer(DFLeafNode *, const IntrinsicInst *);
  
  // Map edges to output tensors
  void mapOutputTensor(DFNode *N, ITensor *Tensor);
  
  // Get input tensors to nodes
  ITensor *getIntermediateInputTensor(DFNode *N);
  
  // Get binding tensors to nodes
  User *getBindingTensor(DFLeafNode* N, unsigned index);
  
  // Get the input NVDLA tensors to nodes
   ITensor *getNVDLAInputTensor(DFLeafNode* N, const User *InputTensor);
  
  // Get index for an input tensor
  unsigned getInputIndex(DFLeafNode* N, const IntrinsicInst *II);
  
  // Gets nodes with add ops meant to be combined with convolution and gemm
  void getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes,
  			SmallVector<IntrinsicInst *, 4> &AddInsts);
  
  // Getting weights
   Weights readTrainedWeights(User *TensorPtr,
  					int dim1_size, int dim2_size,
  					int dim3_size, int dim4_size);
   
  // Identify outputs
  unsigned identifyOutputs();
   
  // Generate profile based on data parameters
  //void generateProfile(std::string &, std::string &);

  std::string getLayerName(std::string Name);
  
public:

  CGT_NVDLA(Module &_M, BuildDFG &_DFG)
      : CodeGenTraversal(_M, _DFG), Wisdom(nullptr), Network(nullptr) {
    init();
  }
  
  //void destroySetUp();
  
  //void setUpWisdom();
  
  //void compileProfile();
  
  //void transformHPVM2NVDLA(DFNode *);
  
  NvDlaError generateTensorScales(const TestAppArgs*, TestInfo*, nvdla::INetwork*);
  
  NvDlaError updateProfileWithCmdLineArgs(const TestAppArgs*, TestInfo*, const char*, nvdla::DataFormat);
  
  NvDlaError beginWithNamedProfile(const TestAppArgs*, TestInfo*);
  
  NvDlaError generateProfile(const TestAppArgs*, std::string*, TestInfo*);
  
  NvDlaError compileProfile(const TestAppArgs*, TestInfo*);
  
  NvDlaError launchTest(const TestAppArgs*);
  
  NvDlaError testSetup(const TestAppArgs*, TestInfo*);
  
  NvDlaError parseAndCompile(const TestAppArgs*, TestInfo*);
  
  NvDlaError transformHPVM2NVDLA(const TestAppArgs*, TestInfo*);
  
  NvDlaError parseSetup(const TestAppArgs*, TestInfo*);

  NvDlaError readTensorScales(const TestAppArgs* appArgs, TestInfo *i, nvdla::INetwork* network);
};

void CGT_NVDLA::init() {
	// Default parameters
	//InDataFormat = DataFormat::NCHW;
	//ComputePrecision = DataType::FLOAT;
	//Quantization = QuantizationMode::NONE;
	//NumBatches = 0;
}

void CGT_NVDLA::initRuntimeAPI() {
	// Nothing to do here!
}

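// Recover the weights for a bind tensor: walk back from the tensor pointer to
// the alloca holding the file-name buffer, find the memcpy that fills it with
// a global string, and read dim1*dim2*dim3*dim4 fp16 values from that file.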
Weights CGT_NVDLA::readTrainedWeights(User *TensorPtr,
					int dim1_size, int dim2_size,
					int dim3_size, int dim4_size) {
	DEBUG(errs() << "READ TRAINED WEIGHTS\n");
	// Get weights file name
	User *MemcpyPtr = dyn_cast<User>(TensorPtr->getOperand(0));
	DEBUG(MemcpyPtr->print(errs()));
	DEBUG(errs() << "\n");
	while (!isa<AllocaInst>(MemcpyPtr)) {
		MemcpyPtr = dyn_cast<User>(MemcpyPtr->getOperand(0));
	}
	User *MemcpyArg = nullptr;
	for(User *U: MemcpyPtr->users()) {
		DEBUG(U->print(errs()));
                DEBUG(errs() << "\n");
		if(auto *BCO = dyn_cast<BitCastOperator>(U)) { 
			for(User *CU: BCO->users()) {
				if(auto *CI = dyn_cast<CallInst>(CU)) {
					// Only the memcpy that fills the file-name buffer matters here.
					if(CI->getCalledFunction()->getName().contains("memcpy")) {
						MemcpyArg = dyn_cast<User>(CI->getOperand(1));
						break;
					}
				}
			}
			if(MemcpyArg)
				break;
		}
	}
	assert(MemcpyArg && "File name not found.");
	auto *WeightFileName = dyn_cast<GlobalVariable>(MemcpyArg->getOperand(0));
	assert(WeightFileName && "Weight file name must be a global variable.");
	auto* CDA = dyn_cast<ConstantDataArray>(WeightFileName->getInitializer());
	assert(CDA && "Weight file name must be a constant array.");
	const auto &file_name = std::string(CDA->getAsString());
			
	// Read the weights file
	int num_elem = dim1_size * dim2_size * dim3_size * dim4_size;
	int size_in_bytes = sizeof(float16) * num_elem;
	//DEBUG(errs() << "float16 size: " << sizeof(float16) << "\n");
	DEBUG(errs() << "size in bytes: " << size_in_bytes << "\n");
	void *tensor_data = (void *) malloc(size_in_bytes);
	int file_header_size = 0;
	DEBUG(errs() << "FILE NAME: " << file_name << "\n");
	FILE *file = fopen(file_name.c_str(), "rb");
	if(!file) {
		DEBUG(errs() << "Data file is not found. Aborting.\n");
		abort();
	}
	fseek(file, file_header_size, SEEK_CUR);
	size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
	DEBUG(errs() << "BYTES READ: " << bytes_read << "\n");
	fclose(file);
	
	// Create weight tensors
	auto Weight = Weights(DataType::HALF, tensor_data, NvS64(num_elem));
	//FILE *try_file = fopen("temp.bin", "wb");
	//fwrite(Weight.values, sizeof(float), num_elem, try_file);
	//fclose(try_file);
	//exit(-1);	
	return Weight;
}

// For a tensor to be an input weight tensor, it has to come from the root node.
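// Starting from the leaf's incoming edge at `index`, follow bind edges up
// through internal nodes toward the root; if the edge instead originates at a
// leaf node (an intermediate tensor), return nullptr. At the root, locate the
// hpvm_launch argument struct and chase the store into the field at the bound
// position to recover the original tensor pointer.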
User *CGT_NVDLA::getBindingTensor(DFLeafNode* N, unsigned index) {
	// HPVM internal API needs fixing. Remove this lambda function when bug is fixed.
	auto NodeIsRoot = [](DFNode &InternalNode) {
		auto *RootFunction = InternalNode.getFuncPointer();
		for(User *U: RootFunction->users()) {
                	DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
			DEBUG(U->print(errs()));
			DEBUG(errs() << "\n");
                	auto *II = dyn_cast<IntrinsicInst>(U);
                	if(!II) {
                        	auto *BCI = dyn_cast<BitCastOperator>(U);
                        	assert(BCI && "Not a bitcast instruction.");
                        	for(User *BCU : BCI->users()) {
                                	DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
					DEBUG(BCU->print(errs()));
					DEBUG(errs() << "\n");
					II = dyn_cast<IntrinsicInst>(BCU);
                                	if(II)
                                        	break;
                        	}
                	}
                	if(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch)) {
                		DEBUG(errs() << "LAUNCH FUNCTION: ");
                		DEBUG(II->print(errs()));
		        	DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n");	
				return true;
			}	
		}
		DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n");
		return false;
	};

	auto NodeIsLeaf = [](DFNode &Node) {
                auto *NodeFunction = Node.getFuncPointer();
                for(User *U: NodeFunction->users()) {
                        DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
                        DEBUG(U->print(errs()));
                        DEBUG(errs() << "\n");
                        auto *II = dyn_cast<IntrinsicInst>(U);
                        if(!II) {
                                auto *BCI = dyn_cast<BitCastOperator>(U);
                                assert(BCI && "Not a bitcast instruction.");
                                for(User *BCU : BCI->users()) {
                                        DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
                                        DEBUG(BCU->print(errs()));
                                        DEBUG(errs() << "\n");
                                        II = dyn_cast<IntrinsicInst>(BCU);
                                        if(II)
                                                break;
                                }
                        }
                        if(II 
			&& (II->getIntrinsicID() == Intrinsic::hpvm_createNode
			|| II->getIntrinsicID() == Intrinsic::hpvm_createNode1D
			|| II->getIntrinsicID() == Intrinsic::hpvm_createNode2D
			|| II->getIntrinsicID() == Intrinsic::hpvm_createNode3D)) {
                                DEBUG(errs() << "CREATE NODE FUNCTION: ");
                                DEBUG(II->print(errs()));
                                DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n");

			// Ensure that the node function does not have these create node intrinsics
				for(inst_iterator i = inst_begin(NodeFunction), 
						  e = inst_end(NodeFunction); i != e; ++i) {
					Instruction *I = &(*i);
					if(auto *II = dyn_cast<IntrinsicInst>(I)) {
						if(II->getIntrinsicID() == Intrinsic::hpvm_createNode
                        			|| II->getIntrinsicID() == Intrinsic::hpvm_createNode1D
                        			|| II->getIntrinsicID() == Intrinsic::hpvm_createNode2D
                        			|| II->getIntrinsicID() == Intrinsic::hpvm_createNode3D) {
							DEBUG(errs() << "--LAMBDA FUNCTION RETURN FALSE\n");
							return false;
						}
					}
					
				}
                                return true;
                        }
                }
                DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n");
                return false;
        };

	DEBUG(errs() << "GET BINDING TENSOR\n");
	DEBUG(errs() << "GIVEN INDEX: " << index << "\n");
	DFEdge *DE = N->getInDFEdgeAt(index);
	assert(DE && "Data edge does not exist at given index");
         DEBUG(errs() << "LEAF NODE FUNCTION: " << N->getFuncPointer()->getName() << "\n");
        // Get the argument position in the root node.
	DEBUG(errs() << "GET TO THE ROOT FIRST\n");	
	auto *InternalNode = DE->getSourceDF();
	  DEBUG(errs() << "INTERNAL NODE FUNCTION: " << InternalNode->getFuncPointer()->getName() << "\n");
        DEBUG(errs() << "INTERNAL NDOE POINTER: " << InternalNode << "\n");
	if(NodeIsLeaf(*InternalNode)) {
		DEBUG(errs() << "BIND NONE: EDGE FROM LEAF NODE\n");
		return nullptr;
	}
	unsigned argPos = DE->getSourcePosition();
        DEBUG(errs() << "ARG POSITION BEFORE LOOP: " << argPos << "\n");
	while(!NodeIsRoot(*InternalNode)) {
		DEBUG(errs() << "IN LOOP\n");
		if(NodeIsLeaf(*InternalNode)) {
                	DEBUG(errs() << "IN LOOP BIND NONE: EDGE FROM LEAF NODE\n");
                	return nullptr;
        	}
		argPos = DE->getSourcePosition();
		DE = InternalNode->getInDFEdgeAt(argPos);
		if(!DE) {
			DEBUG(errs() << "NO BINDING EDGE IN LOOP\n");
			// No binding edge.
			return nullptr;
		}
		InternalNode = DE->getSourceDF();	
		DEBUG(errs() << "INTERNAL NODE FUNCTION IN LOOP: " << InternalNode->getFuncPointer()->getName() << "\n");
		DEBUG(errs() << "IN LOOP DATA EDGE: " << DE << "\n");
                DEBUG(errs() << "IN LOOP ARG POSITION: " << argPos << "\n");
	}
	DEBUG(errs() << "ARG POSITION: " << argPos << "\n");
	
	DEBUG(errs() << "GET THE LAUNCH FUNCTION\n");   	
	// Now we have the root node. We need to get the launch functions for it.
	auto *RootFunction = InternalNode->getFuncPointer();
	for(User *U: RootFunction->users()) {
		DEBUG(errs() << "User for root: ");
		DEBUG(U->print(errs()));
		IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
		if(!II) {
			auto *BCI = dyn_cast<BitCastOperator>(U);
			assert(BCI && "Not a bitcast instruction.");
			for(User *BCU : BCI->users()) {
				II = dyn_cast<IntrinsicInst>(BCU);
				if(II)
					break;
			}
		}
		assert(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch)
				&& "Use of a root node must be in a launch intrinsic call.");
		DEBUG(errs() << "LAUNCH FUNCTION: ");
		DEBUG(II->print(errs()));		

		// Now, get the arguments to the root and get the element pointer into the argument structure.
		auto *ArgObj = dyn_cast<Instruction>(II->getOperand(1));
		if(auto *BCO = dyn_cast<BitCastOperator>(ArgObj)) {
			ArgObj = dyn_cast<Instruction>(BCO->getOperand(0));
		} else if (auto *CI = dyn_cast<CallInst>(ArgObj)) {
			for(User *CIU : CI->users()) {
				auto *BCO = dyn_cast<BitCastOperator>(CIU);
				if(BCO) {
					ArgObj = dyn_cast<Instruction>(BCO->getOperand(0));
					break;
				}
			}
		} else if (auto *AI = dyn_cast<AllocaInst>(ArgObj)) {
			for(User *AIU : AI->users()) {
                                auto *BCO = dyn_cast<BitCastOperator>(AIU);
                                if(BCO) {
                                        ArgObj = dyn_cast<Instruction>(BCO->getOperand(0));
                                        break;
                                }
                        }
		}
		auto *ArgObjPtrType = dyn_cast<PointerType>(ArgObj->getType());
		auto *ArgObjType = dyn_cast<StructType>(ArgObjPtrType->getElementType());
		assert(ArgObjType && "Arguments to launch is a structure.");
		DEBUG(errs() << "ARG OBJ: ");
		DEBUG(ArgObj->print(errs()));
		DEBUG(errs() << "\n");
		
		// Use the offset into the structure to get the source tensor.
		const auto &DL = ArgObj->getParent()->getParent()->getParent()->getDataLayout();
		const auto *SL = DL.getStructLayout(ArgObjType);
		uint64_t ElementOffset = SL->getElementOffset(argPos);
		DEBUG(errs() << "ELEMENT OFFSET: " << ElementOffset << "\n");
		Instruction *StructElemPtr = nullptr;
		for(User *U: ArgObj->users()) {
			if(auto *GI = dyn_cast<GetElementPtrInst>(U)) {
				auto *Offset = dyn_cast<ConstantInt>(GI->getOperand(2));
				assert(Offset && "Offset is not constant.");
				if(Offset->getZExtValue() == argPos) {
					StructElemPtr = GI;
					break;
				}
			}
		}
		assert(StructElemPtr && "No getelementptr found with given offset.");
		DEBUG(StructElemPtr->print(errs()));
		DEBUG(errs() << "\n");	
		DEBUG(errs() << "USE THE STORES TO GET THE BIND TENSOR\n");	
		// Get store to the element of argument structure to get the pointer to tensor.
		for(User *GIU: StructElemPtr->users()) {
			DEBUG(GIU->print(errs()));
			DEBUG(errs() << "\n");
			if(auto *BCO = dyn_cast<BitCastOperator>(GIU)) {
				DEBUG(BCO->print(errs()));
				DEBUG(errs() << "\n");
				for(User *BCU : BCO->users()) {
					if(auto *SI = dyn_cast<StoreInst>(BCU)) {
						// Get the tensor pointer
						DEBUG(SI->print(errs()));
						DEBUG(errs() << "\n");
						auto *Val = SI->getValueOperand();
						if(auto *BCO = dyn_cast<BitCastOperator>(Val)) {
							return dyn_cast<User>(BCO->getOperand(0));
						}
						return dyn_cast<User>(Val);
					}
				}
			}
			if(auto *SI = dyn_cast<StoreInst>(GIU)) {
				// Get the tensor pointer
				DEBUG(SI->print(errs()));
				auto *Val = SI->getValueOperand();
				if(auto *BCO = dyn_cast<BitCastOperator>(Val)) {
					return dyn_cast<User>(BCO->getOperand(0));
				}
				return dyn_cast<User>(Val);
			}
		}
	}
	return nullptr;
}


void CGT_NVDLA::mapOutputTensor(DFNode *N, ITensor *Tensor) {
	for(int i = 0; i < N->outdfedge_size(); i++)
		EdgeToTensorMap[N->getOutDFEdgeAt(i)] = Tensor;
}

ITensor *CGT_NVDLA::getIntermediateInputTensor(DFNode *N) {
	return EdgeToTensorMap[N->getInDFEdgeAt(0)];
}

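// Collect successor leaf nodes that contain an hpvm_tensor_add: NVDLA has no
// standalone add layer, so such adds are folded into the preceding
// convolution/fully-connected layer as a bias.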
void CGT_NVDLA::getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes,
					SmallVector<IntrinsicInst *, 4> &AddInsts) {
	bool AddOpNodes = false;
	for(int i = 0; i < N->outdfedge_size(); i++) {
		auto *DestNode = N->getOutDFEdgeAt(i)->getDestDF();
		auto *F = DestNode->getFuncPointer();

		// If the node is already cached in the list, no need to visit it
		auto *Node = dyn_cast<DFLeafNode>(DestNode);
		if(find(AddNodes, Node) != AddNodes.end())
			continue;

		// Add the node to the list if it contains an add operation
		bool NodeHasAdd = false;
		for (inst_iterator it = inst_begin(F), e = inst_end(F); it != e; ++it) {
			auto *II = dyn_cast<IntrinsicInst>(&*it);
			if (II && II->getIntrinsicID() == Intrinsic::hpvm_tensor_add) {
				AddNodes.push_back(Node);
				AddInsts.push_back(II);
				NodeHasAdd = true;
				break;
			}
		}
		// The destinations must agree: either every successor performs an add
		// (to be fused as a bias) or none of them does.
		if (i == 0)
			AddOpNodes = NodeHasAdd;
		assert((NodeHasAdd == AddOpNodes)
				&& "Either all destination nodes contain add ops or none do.");
	}
}

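// If the input is a bind tensor, register a fresh network input with its
// N/C/H/W dimensions (operands 2..5 of the tensor call); otherwise reuse the
// ITensor produced by the predecessor node's layer.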
ITensor *CGT_NVDLA::getNVDLAInputTensor(DFLeafNode* N, const User *InputBindingTensor) {
	if(InputBindingTensor) {
		auto *BatchesConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(2));
		auto *ChannelsConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(3));
		auto *HeightConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(4));
		auto *WidthConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(5));
		assert(HeightConst && WidthConst && ChannelsConst && BatchesConst 
				&& "Number of input dimensions must be constants.");
		
		// Input dimensions
		int InputW = WidthConst->getZExtValue();
		int InputH = HeightConst->getZExtValue();
		int InputC = ChannelsConst->getZExtValue();
		int InputN = BatchesConst->getZExtValue();
	
		// Create a new input tensor
		Dims4 dims(InputN, InputC, InputH, InputW);
		return Network->addInput("", dims);
	}
	return getIntermediateInputTensor(N);
}

unsigned CGT_NVDLA::getInputIndex(DFLeafNode* N, const IntrinsicInst *II) {
	DEBUG(errs() << "GET INPUT INDEX\n");
        auto *F = N->getFuncPointer();
	DEBUG(errs()<<"function name = "<< F->getName()<<"\n");
	unsigned inputIndex = 0;
	for(auto &Arg : F->args()) {
		DEBUG(errs() << "ARGUMENT: ");
		DEBUG((&Arg)->print(errs()));
		DEBUG(errs() << "\n");
		if(II->getOperand(0) == &Arg) {
			DEBUG(errs() << "INPUT: ");
			DEBUG(II->getOperand(0)->print(errs()));
			DEBUG(errs() << "\n");
			DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n");
			return inputIndex;
		}
		inputIndex++;
	}
	assert(false && "Illegal intrinsic or Node.");
	return -1;  // Keep compiler happy
}

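// Return a running per-prefix counter ("conv" -> "1", "2", ...) used to give
// layers unique names like conv1, pool2, gemm1.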
std::string CGT_NVDLA::getLayerName(std::string Name) {
	DEBUG(errs() << "GET LAYER NAME\n");
	if(LayerNameMap.find(Name) == LayerNameMap.end()) {
		LayerNameMap[Name] = 1;
	} else {
		LayerNameMap[Name]++;
	}
	return std::to_string(LayerNameMap[Name]);
}

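// Lower hpvm_tensor_convolution to INetwork::addConvolution. Operand layout
// used here: II operands 2..5 are padH, padW, strideH, strideW, and bind
// tensor operands 2..5 are N, C, H, W.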
void CGT_NVDLA::generateConvolutionLayer(DFLeafNode* N, const IntrinsicInst *II) {
	DEBUG(errs() << "*****CONVOLUTION LAYER*****\n");
	// FIXME: What is the number of "groups"? Setting it to 1 for now.
	int numGroups = 1;

	// If the input tensor is not a binding tensor, it must be coming
	// from an edge out of a visited node, so use that to get the number of outputs.
	unsigned inputIndex = getInputIndex(N, II);
	DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n");
	DEBUG(errs() << "GET INPUT TENSOR\n");
	auto *InputTensor = getBindingTensor(N, inputIndex);
	DEBUG(errs() << "INPUT TENSOR: ");
	ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
	
	// Get the index for kernel tensor
	auto *F = N->getFuncPointer();
	DEBUG(errs()<<"function name = "<< F->getName()<<"\n");
	unsigned kernelIndex = 0;
	bool ArgFound = false;
	for(auto &Arg : F->args()) {
		if(II->getOperand(1) == &Arg) {
			ArgFound = true;
			break;
		}
		kernelIndex++;
	}
	assert(ArgFound && "Illegal intrinsic or Node.");
	DEBUG(errs() << "KERNEL INDEX: " << kernelIndex << "\n");
	// Get the kernel tensor
	DEBUG(errs() << "GET KERNEL TENSOR\n");
	auto *KernelTensor = getBindingTensor(N, kernelIndex);
	assert(KernelTensor && "Kernel tensors are always binds.");
	
	// Get kernel constants
	auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5));
	auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4));
	auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3));
	auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2));
	assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst
			&& "Kernel dimensions must be constants.");
	int kernelW = KernelWConst->getZExtValue();
	int kernelH = KernelHConst->getZExtValue();
	int kernelC = KernelCHConst->getZExtValue();
	int kernelN = KernelNConst->getZExtValue();
	DEBUG(errs() << "\nKERNEL H: " << kernelH << "\n");
        DEBUG(errs() << "KERNEL W: " << kernelW << "\n");
	 DEBUG(errs() << "KERNEL C: " << kernelC << "\n");
        DEBUG(errs() << "KERNEL N: " << kernelN << "\n");
	
	int numOutputs;
	if(!InputTensor) {
		DEBUG(errs() << "INPUT FROM EDGE\n");
		numOutputs = (InputNVDLATensor->getDimensions()).n * kernelN;
	} else {
		DEBUG(errs() << "INPUT FROM WEIGHT TENSOR\n");
		auto *BatchesConst = dyn_cast<ConstantInt>(InputTensor->getOperand(2));
		numOutputs = BatchesConst->getZExtValue() * kernelN;
		DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n");
	}
	
	// Get Strides
	ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(5));
	ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(4));
	assert((StrideWConst && StrideHConst) && "Strides must be constants.");
	int strideW = StrideWConst->getZExtValue();
	int strideH = StrideHConst->getZExtValue();
	 DEBUG(errs() << "STRIDE H: " << strideH << "\n");
        DEBUG(errs() << "STRIDE W: " << strideW << "\n");
	
	// Get pads
	ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(3));
	ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(2));
	assert((PadWConst && PadHConst) && "Pads must be constants.");
	int padW = PadWConst->getZExtValue();
	int padH = PadHConst->getZExtValue();
	DEBUG(errs() << "PAD H: " << padH << "\n");
        DEBUG(errs() << "PAD W: " << padW << "\n");
	
	// FIXME: Support dilations. Set dilations to 1 since we do not have dilation support yet.
	int dilationW = 1;
	int dilationH = 1;
	
	// Get the nodes with Add operations
	SmallVector<DFLeafNode *, 4> AddOpNodes;
	SmallVector<IntrinsicInst *, 4> AddInsts;
	getaddOpSucceedsNode(N, AddOpNodes, AddInsts);
	assert((AddOpNodes.size() <= 1)
			&& "At most one successor node with add ops is allowed.");
	
	// Get bias parameters
	int BiasW, BiasH, BiasC, BiasN;
	User *BiasTensor = nullptr;
	BiasMode biasMode = BiasMode::bNONE;
	if(AddOpNodes.size()) {
		// Get the index for bias tensor
		auto *AddNode = AddOpNodes[0];
		auto *AddInst = AddInsts[0];
		DEBUG(AddInst->print(errs()));
		auto *F = AddNode->getFuncPointer();
		unsigned BiasIndex = 0;
		ArgFound = false;
		for(auto &Arg : F->args()) {
			if(AddInst->getOperand(1) == &Arg) {
				ArgFound = true;
				break;
			}
			BiasIndex++;
		}
		assert(ArgFound && "Illegal intrinsic or Node.");
		
		// Get the bias tensor
		DEBUG(errs() << "BIAS INDEX: " << BiasIndex << "\n");
		DEBUG(errs() << "BIAS TENSOR\n");
		BiasTensor = getBindingTensor(AddNode, BiasIndex);
		assert(BiasTensor && "Bias tensors are always binds.");
		
		// Get Bias constants
		auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5));
		auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4));
		auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3));
		auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2));
		assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst 
				&& "Bias dimensions must be constants.");
		BiasW = BiasWConst->getZExtValue();
		BiasH = BiasHConst->getZExtValue();
		BiasC = BiasCHConst->getZExtValue();
		BiasN = BiasNConst->getZExtValue();
		DEBUG(errs() << "BIAS H: " << BiasH << "\n");
        	DEBUG(errs() << "BIAS W: " << BiasW << "\n");
         	DEBUG(errs() << "BIAS C: " << BiasC << "\n");
        	DEBUG(errs() << "BIAS N: " << BiasN << "\n");
		
		// Get bias mode. FIXME: always use channel bias for now;
		// uniform bias (BiasMode::bUNIFORM) is never selected.
		biasMode = BiasMode::bCHANNEL;
	}
	
	// Get weights
	Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW);
	Weights biasWeights = AddOpNodes.size() == 1 ?  
					readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW)
					: Weights(DataType::HALF, nullptr, 0);

	Dims2 tlPadding  = Dims2(padH, padW);
	Dims2 brPadding  = Dims2(padH, padW);
	Dims2 stride     = Dims2(strideH, strideW);
	Dims2 dilation   = Dims2(dilationH, dilationW);
	Dims2 kernelSize = Dims2(kernelH, kernelW);

	auto *Layer = Network->addConvolution(InputNVDLATensor, numOutputs, 0,
									kernelSize, tlPadding, brPadding, stride, dilation,
									kernelWeights, biasWeights, biasMode, numGroups);
	if(AddOpNodes.size()) {
		auto *Node = AddOpNodes[0];
		mapOutputTensor(Node, Layer->getOutput(0));
	} else {
		mapOutputTensor(N, Layer->getOutput(0));
	}
	Layer->setName((std::string("conv") + getLayerName(std::string("conv"))).c_str());
	DEBUG(errs() << Layer->getName() << "\n");
}

void CGT_NVDLA::generatePoolingLayer(DFLeafNode* N, const IntrinsicInst *II) {
	DEBUG(errs() << "*****POOLING LAYER*****\n");
	// Get input tensor
	unsigned inputIndex = getInputIndex(N, II);
	auto *InputTensor = getBindingTensor(N, inputIndex);
	ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);

	// Get window dimensions
	ConstantInt *KernelWConst = dyn_cast<ConstantInt>(II->getOperand(2));
	ConstantInt *KernelHConst = dyn_cast<ConstantInt>(II->getOperand(1));
	assert((KernelWConst && KernelHConst) && "Kernel dimensions must be constants.");
	int kernelH = KernelHConst->getZExtValue();
	int kernelW = KernelWConst->getZExtValue();
	DEBUG(errs() << "KERNEL H: " << kernelH << "\n");
	DEBUG(errs() << "KERNEL W: " << kernelW << "\n");

	// Get Strides
	ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(6));
	ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(5));
	assert((StrideWConst && StrideHConst) && "Strides must be constants.");
	int strideH = StrideHConst->getZExtValue();
	int strideW = StrideWConst->getZExtValue();
	DEBUG(errs() << "STRIDE H: " << strideH << "\n");
	DEBUG(errs() << "STRIDE W: " << strideW << "\n");
	
	// Get pads
	ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(4));
	ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(3));
	assert((PadWConst && PadHConst) && "Pads must be constants.");
	int padH = PadHConst->getZExtValue();
	int padW = PadWConst->getZExtValue();
 	DEBUG(errs() << "PAD H: " << padH << "\n");
        DEBUG(errs() << "PAD W: " << padW << "\n");
	
	Dims2 windowSize = Dims2(kernelH, kernelW);
	Dims2 stride     = Dims2(strideH, strideW);
	Dims2 tlPadding  = Dims2(padH, padW);
	Dims2 brPadding  = Dims2(padH, padW);
	
	PoolingType type = (II->getIntrinsicID() == Intrinsic::hpvm_tensor_pool_mean) ? 
						PoolingType::kAVERAGE : PoolingType::kMAX;
			
	auto *Layer = Network->addPooling(InputNVDLATensor, type,
					windowSize, stride, tlPadding, brPadding);
	mapOutputTensor(N, Layer->getOutput(0));
	Layer->setName((std::string("pool") + getLayerName(std::string("pool"))).c_str());
	DEBUG(errs() << Layer->getName() << "\n");
}

void CGT_NVDLA::generateGemmLayer(DFLeafNode* N, const IntrinsicInst *II) {
	DEBUG(errs() << "****GEMM LAYER****\n");
	// Get input tensor and compute number of outputs
	unsigned inputIndex = getInputIndex(N, II);
	auto *InputTensor = getBindingTensor(N, inputIndex);
	ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
	
	// Get the index for kernel tensor
	auto *F = N->getFuncPointer();
	DEBUG(errs() << "function name = " << F->getName() << "\n");
	unsigned kernelIndex = 0;
	bool ArgFound = false;
	for(auto &Arg : F->args()) {
		if(II->getOperand(1) == &Arg) {
			ArgFound = true;
			break;
		}
		kernelIndex++;
	}
	assert(ArgFound && "Illegal intrinsic or Node.");
	
	// Get the kernel tensor
	auto *KernelTensor = getBindingTensor(N, kernelIndex);
	assert(KernelTensor && "Kernel tensors are always binds.");

	// Get kernel constants
	auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5));
	auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4));
	auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3));
	auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2));
	assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst 
			&& "Kernel dimensions must be constants.");
	int kernelW = KernelWConst->getZExtValue();
	int kernelH = KernelHConst->getZExtValue();
	int kernelC = KernelCHConst->getZExtValue();
	int kernelN = KernelNConst->getZExtValue();
	 DEBUG(errs() << "KERNEL H: " << kernelH << "\n");
        DEBUG(errs() << "KERNEL W: " << kernelW << "\n");
         DEBUG(errs() << "KERNEL C: " << kernelC << "\n");
        DEBUG(errs() << "KERNEL N: " << kernelN << "\n");
	
	int numOutputs = kernelW;
	DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n");

	// Get the nodes with Add operations
	SmallVector<DFLeafNode *, 4> AddOpNodes;
	SmallVector<IntrinsicInst *, 4> AddInsts;
	getaddOpSucceedsNode(N, AddOpNodes, AddInsts);
	assert((AddOpNodes.size() <= 1)
			&& "At most one successor node with add ops is allowed.");
	
	// Get bias parameters
	int BiasW, BiasH, BiasC, BiasN;
	User *BiasTensor = nullptr;
	BiasMode biasMode = BiasMode::bNONE;
	if(AddOpNodes.size()) {
		// Get the index for bias tensor
		auto *AddNode = AddOpNodes[0];
		auto *AddInst = AddInsts[0];
		auto *F = AddNode->getFuncPointer();
		unsigned BiasIndex = 0;
		ArgFound = false;
		for(auto &Arg : F->args()) {
			if(AddInst->getOperand(1) == &Arg) {
				ArgFound = true;
				break;
			}
			BiasIndex++;
		}
		assert(ArgFound && "Illegal intrinsic or Node.");
	
		// Get the bias tensor
		BiasTensor = getBindingTensor(AddNode, BiasIndex);
		assert(BiasTensor && "Bias tensors are always binds.");
		
		// Get Bias constants
		auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5));
		auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4));
		auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3));
		auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2));
		assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst 
				&& "Bias dimensions must be constants.");
		BiasW = BiasWConst->getZExtValue();
		BiasH = BiasHConst->getZExtValue();
		BiasC = BiasCHConst->getZExtValue();
		BiasN = BiasNConst->getZExtValue();
		 DEBUG(errs() << "BIAS H: " << BiasH << "\n");
                DEBUG(errs() << "BIAS W: " << BiasW << "\n");
                DEBUG(errs() << "BIAS C: " << BiasC << "\n");
                DEBUG(errs() << "BIAS N: " << BiasN << "\n");

		// Get bias mode. FIXME: always use channel bias for now;
		// uniform bias (BiasMode::bUNIFORM) is never selected.
		biasMode = BiasMode::bCHANNEL;
	}

	// Get weights
	Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW);
	Weights biasWeights = (AddOpNodes.size() == 1) ?  
					readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW)
					: Weights(DataType::HALF, nullptr, 0);

	auto *Layer = Network->addFullyConnected(InputNVDLATensor, numOutputs,
						  kernelWeights, biasWeights, biasMode);
	if(AddOpNodes.size()) {
		auto *Node = AddOpNodes[0];
		mapOutputTensor(Node, Layer->getOutput(0));
	} else {
		mapOutputTensor(N, Layer->getOutput(0));
	}
	Layer->setName((std::string("gemm") + getLayerName(std::string("gemm"))).c_str());
	DEBUG(errs() << Layer->getName() << "\n");
}

void CGT_NVDLA::generateReluLayer(DFLeafNode* N, const IntrinsicInst *II) {
	DEBUG(errs() << "******RELU LAYER******\n");
	// Get input tensor
	unsigned inputIndex = getInputIndex(N, II);
	auto *InputTensor = getBindingTensor(N, inputIndex);
	ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
	
	auto *Layer = Network->addActivation(InputNVDLATensor, kRELU);
	mapOutputTensor(N, Layer->getOutput(0));
	Layer->setName((std::string("relu") + getLayerName(std::string("relu"))).c_str());
	DEBUG(errs() << Layer->getName() << "\n");
}

void CGT_NVDLA::generateSoftMaxLayer(DFLeafNode* N, const IntrinsicInst *II) {
	DEBUG(errs() << "******SOFTMAX LAYER*******\n");
	// Get input tensor
	unsigned inputIndex = getInputIndex(N, II);
	auto *InputTensor = getBindingTensor(N, inputIndex);
	ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
	
	auto *Layer = Network->addSoftMax(InputNVDLATensor);
	mapOutputTensor(N, Layer->getOutput(0));
	Layer->setName((std::string("softmax") + getLayerName(std::string("softmax"))).c_str());
	DEBUG(errs() << Layer->getName() << "\n");
}

void CGT_NVDLA::generateTanhLayer(DFLeafNode* N, const IntrinsicInst *II) {
	DEBUG(errs() << "*******TANH LAYER*******\n");
	// Get input tensor
	unsigned inputIndex = getInputIndex(N, II);
	auto *InputTensor = getBindingTensor(N, inputIndex);
	ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
	
	auto *Layer = Network->addActivation(InputNVDLATensor, kTANH);
	mapOutputTensor(N, Layer->getOutput(0));
	Layer->setName((std::string("tanh") + getLayerName(std::string("tanh"))).c_str());
	DEBUG(errs() << Layer->getName() << "\n");
}

/*
void CGT_NVDLA::generateBatchNormLayer(DFLeafNode* N, const IntrinsicInst *II) {
	const dc::BatchNormParameter& p = msg.batch_norm_param();
	Weights mean = weightFactory(msg.name(), kMEAN);
	Weights variance = weightFactory(msg.name(), kVARIANCE);
	Weights movingAverage = weightFactory(msg.name(), kMOVING_AVERAGE);
	float eps = p.eps();
	float scaleFactor = 1.0f;
	float average = 0.0f;
	int i;
	
	average = *(static_cast<const float*>(movingAverage.values));
	if ( average == 0.0f )
	{
		gLogError << "Batch Normalization moving average is zero " << std::endl;
		return 0;
	}
	scaleFactor /= average;
	
	if (mean.count != variance.count)
	{
		gLogError << "Mean and variance have differing number of elements " 
				  << mean.count << " & " << variance.count << std::endl;
		return 0;
	}
	
	float *meanBlob = (float *)mean.values;
	float *varianceBlob = (float *)variance.values;
	
	Dims4 inputDims = getIntermediateInputTensor(N)->getDimensions();
	BatchNormMode mode;
	
	if (mean.count == 1)
	{
		mode = BatchNormMode::bnUNIFORM;
		meanBlob[0] = meanBlob[0] * scaleFactor;
		varianceBlob[0] = varianceBlob[0] * scaleFactor;
	}
	else if (mean.count == inputDims.c)
	{
		mode = BatchNormMode::bnm_CHANNEL;
		for (i = 0; i < mean.count; i++)
		{
			meanBlob[i] = meanBlob[i] * scaleFactor;
			varianceBlob[i] = varianceBlob[i] * scaleFactor;
		}
	}
	else
	{
		gLogError << "Unknown batch norm mode" << std::endl;
		return 0;
	}
	
	// Get input tensor
	unsigned inputIndex = getInputIndex(N, II);
	Value *InputTensor = getBindingTensor(inputIndex);
	ITensor *InputNVDLATensor = getNVDLAInputTensor(InputTensor);
	
	auto *Layer = Network->addBatchNorm(InputNVDLATensor, mode, mean, variance, eps);
	mapOutputTensor(N, Layer->getOutput(0));
}
*/
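
// NOTE: generateBatchNormLayer is declared above and called from codeGen()
// below, but its implementation (sketched in the commented-out block above)
// still depends on the Caffe parser utilities and has not been ported. This
// minimal stub lets the pass link; it fails loudly if a batchnorm node is
// ever reached.
void CGT_NVDLA::generateBatchNormLayer(DFLeafNode *, const IntrinsicInst *) {
	llvm_unreachable("hpvm_tensor_batchnorm lowering is not implemented yet.");
}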

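// A tensor produced by some layer but consumed by none is a network output;
// mark all such tensors and return how many outputs the network now has.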
unsigned CGT_NVDLA::identifyOutputs() {
  
    std::set< ITensor* > outputTensors;
    std::set< ITensor* > InputTensors;

    for (int l = 0; l < Network->getNumLayers(); ++l) {
        ILayer* layer = Network->getLayer(l);
        assert(layer && "Illegal NVDLA compiler IR!");
        for (int ii = 0; ii < layer->getNumInputs(); ++ii) {
            InputTensors.insert(layer->getInput(ii));
        }
        for (int oo = 0; oo < layer->getNumOutputs(); ++oo) {
            outputTensors.insert(layer->getOutput(oo));
        }
    }

    for (std::set<ITensor*>::iterator oi = outputTensors.begin(); oi != outputTensors.end(); ++oi) {
        // An output tensor which is not an input to any other layers is a Network output tensor
        if (InputTensors.find(*oi) == InputTensors.end())
            Network->markOutput(*oi);
    }
    return Network->getNumOutputs();
}

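// Per-leaf code generation: every HPVM tensor intrinsic in the node's function
// is dispatched to the matching layer builder above. hpvm_tensor_add is
// deliberately a no-op here because it is fused into conv/gemm as a bias.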
void CGT_NVDLA::codeGen(DFLeafNode *N) {
   // No allocation nodes allowed.
   assert(!N->isAllocationNode() && "Allocation Node not expected in ApproxHPVM");
	
  // Skip code generation if it is a dummy node
  if(N->isDummyNode()) {
    DEBUG(errs() << "Skipping dummy node\n");
    return;
  }

  // Generate code only if it has the right hint
  //if (!checkPreferredTarget(N, hpvm::NVDLA_TARGET)) {
   // DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n");
  //  return;
 // }

  // Get the function associated with the dataflow node
  auto *F = N->getFuncPointer();
  DEBUG(errs()<<"function name = "<< F->getName()<<"\n");

  // Generate code for every instruction in this node
  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
    Instruction *I = &(*i);

    if (BuildDFG::isViscIntrinsic(I)) {
      auto *II = dyn_cast<IntrinsicInst>(I);
      assert((II->getCalledFunction()->getName()).startswith("llvm.hpvm.tensor")
        && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");

      switch (II->getIntrinsicID()) {
		  case Intrinsic::hpvm_tensor_convolution:
		  case Intrinsic::hpvm_tensor_group_convolution:
			  generateConvolutionLayer(N, II);
			  break;
	
		  case Intrinsic::hpvm_tensor_batchnorm:
			  generateBatchNormLayer(N, II);
			  break;
		  
		  case Intrinsic::hpvm_tensor_mul:
			  generateGemmLayer(N, II);
			  break;
			  
		  case Intrinsic::hpvm_tensor_add:
			  // Add not explicitly supported by NVDLA compiler!
			  break;
			  
		  case Intrinsic::hpvm_tensor_pool_max:
		  case Intrinsic::hpvm_tensor_pool_mean:
			  generatePoolingLayer(N, II);
			  break;
			  
		  case Intrinsic::hpvm_tensor_relu:
			  generateReluLayer(N, II);
			  break;
		  
		  case Intrinsic::hpvm_tensor_clipped_relu:
			  // No need to generate NVDLA IR for this?
			  break;
		   
		  case Intrinsic::hpvm_tensor_tanh:
			  generateTanhLayer(N, II);
			  break;
			  
		  case Intrinsic::hpvm_tensor_softmax:
			  generateSoftMaxLayer(N, II);
			  break;
			  
		  default:
			llvm_unreachable("Unknown HPVM Intrinsic!");
			break;
      }
    }
  }
}

void CGT_NVDLA::codeGen(DFInternalNode* N) {
  DEBUG(errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n");
  DEBUG(errs () << "Skipping internal node\n");
}

NvDlaError CGT_NVDLA::parseSetup(const TestAppArgs* appArgs, TestInfo* i) {
    return NvDlaSuccess;
}

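// Build the NVDLA INetwork from the HPVM DFGs: visit every root graph, mark
// network outputs if the traversal did not, attach int8 tensor scales when
// needed, and hand the finished network to the wisdom context.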
NvDlaError CGT_NVDLA::transformHPVM2NVDLA(const TestAppArgs* appArgs, TestInfo* i) {
    NVDLA_UNUSED(appArgs);
    NvDlaError e = NvDlaSuccess;

    Network = nvdla::createNetwork();
    if (!Network)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createNetwork() failed");

    // Iterate over all the DFGs and produce code for each one of them
    for (auto &RootNode : *(appArgs->Roots))
        visit(RootNode);

    // if the application has so far not marked the network's outputs, allow the parser to do so now
    if (Network->getNumOutputs() <= 0) {
        int outs = identifyOutputs();
        DEBUG(NvDlaDebugPrintf("Marking total %d outputs\n", outs));
        if (outs <= 0)
            ORIGINATE_ERROR_FAIL(NvDlaError_BadValue, "Unable to identify outputs for the network: %d", outs);
    }

    if (appArgs->computePrecision == nvdla::DataType::INT8) {
        if (appArgs->calibTable != "") {
            DEBUG(NvDlaDebugPrintf("parsing calibration table...\n"));
            PROPAGATE_ERROR_FAIL(readTensorScales(appArgs, i, Network));
        } else {
            DEBUG(NvDlaDebugPrintf("initialize all tensors with const scaling factors of 127...\n"));
            PROPAGATE_ERROR_FAIL(generateTensorScales(appArgs, i, Network));
        }
    }

    DEBUG(NvDlaDebugPrintf("attaching parsed network to the wisdom...\n"));
    if (!i->wisdom->setNetworkTransient(Network))
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->setNetworkTransient() failed");

    return NvDlaSuccess;

fail:
    return e;
}

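// Full flow for one module: create and open a wisdom context, translate the
// HPVM graphs into it, compile the selected profile to a loadable, and tear
// the context down again.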
NvDlaError CGT_NVDLA::parseAndCompile(const TestAppArgs* appArgs, TestInfo* i) {
    NvDlaError e = NvDlaSuccess;

    PROPAGATE_ERROR_FAIL(parseSetup(appArgs, i));

    DEBUG(NvDlaDebugPrintf("creating new wisdom context...\n"));
    i->wisdom = nvdla::createWisdom();
    if (!i->wisdom)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createWisdom() failed");

    DEBUG(NvDlaDebugPrintf("opening wisdom context...\n"));
    if (!i->wisdom->open(i->wisdomPath))
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->open() failed to open: \"%s\"", i->wisdomPath.c_str());

    // Parse
    PROPAGATE_ERROR_FAIL(transformHPVM2NVDLA(appArgs, i));

    // Compile
    PROPAGATE_ERROR_FAIL(compileProfile(appArgs, i));

    /* Destroy network before closing wisdom context */
    nvdla::destroyNetwork(i->wisdom->getNetwork());

    DEBUG(NvDlaDebugPrintf("closing wisdom context...\n"));
    i->wisdom->close();

fail:
    if (i->wisdom != NULL) {
        nvdla::destroyWisdom(i->wisdom);
        i->wisdom = NULL;
    }
    return e;
}

NvDlaError CGT_NVDLA::testSetup(const TestAppArgs* appArgs, TestInfo* i) {
    NvDlaError e = NvDlaSuccess;

    std::string wisdomPath = appArgs->outputPath + "wisdom.dir/";
    std::string removeCmd = "";
    std::string imagePath = "";
    NvDlaStatType stat;
    int ii = 0;

    // Do input paths exist?
    e = NvDlaStat(appArgs->inputPath.c_str(), &stat);
    if (e != NvDlaSuccess)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Input path does not exist: \"%s\"", appArgs->inputPath.c_str());

    // Do output paths exist?
    e = NvDlaStat(appArgs->outputPath.c_str(), &stat);
    if (e != NvDlaSuccess)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Output path does not exist: \"%s\"", appArgs->outputPath.c_str());

    // Clear wisdomPath if any exist
    removeCmd += "rm -rf " + wisdomPath;
    ii = std::system(removeCmd.c_str()); // This is pretty awful
    if (ii != 0)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "system command failed: \"%s\"", removeCmd.c_str());

    PROPAGATE_ERROR_FAIL(NvDlaMkdir(const_cast<char *>(wisdomPath.c_str())));

    // Initialize TestInfo
    i->wisdom = NULL;
    i->wisdomPath = wisdomPath;
    i->pData = NULL;

    return NvDlaSuccess;

fail:
    return e;
}

NvDlaError CGT_NVDLA::launchTest(const TestAppArgs* appArgs) {
    NvDlaError e = NvDlaSuccess;
    TestInfo testInfo;

    PROPAGATE_ERROR_FAIL(testSetup(appArgs, &testInfo));

    PROPAGATE_ERROR_FAIL(parseAndCompile(appArgs, &testInfo));

    return NvDlaSuccess;
fail:
    return e;
}

bool HPVM2NVDLA::runOnModule(Module &M) {
  DEBUG(errs() << "**************HPVM2NVDLA PASS****************\n");

  NvDlaError e = NvDlaError_TestApplicationFailed;
  TestAppArgs testAppArgs = defaultTestAppArgs;

  // Get the HPVM IR graph
  BuildDFG &DFG = getAnalysis<BuildDFG>();
  std::vector<DFInternalNode *> Roots = DFG.getRoots();

  // Visitor for Code Generation Graph Traversal
  CGT_NVDLA *CGTVisitor = new CGT_NVDLA(M, DFG);

  if (ComputePrecision == "INT8" || ComputePrecision == "int8") {
    testAppArgs.computePrecision = nvdla::DataType::INT8;
    testAppArgs.quantizationMode = nvdla::QuantizationMode::PER_KERNEL;
    testAppArgs.configtarget = std::string("nv_small");
  } else {
    testAppArgs.computePrecision = nvdla::DataType::HALF;
    testAppArgs.quantizationMode = nvdla::QuantizationMode::NONE;
    testAppArgs.configtarget = std::string("nv_full");
  }
  testAppArgs.profileName = std::string("hpvm-mod");
  testAppArgs.calibTable = CalibTablePath;
  testAppArgs.outputPath = std::string(".");
  testAppArgs.inDataFormat = nvdla::DataFormat::NCHW;
  testAppArgs.Roots = &Roots;

  e = CGTVisitor->launchTest(&testAppArgs);
  if (e != NvDlaSuccess)
    DEBUG(errs() << "ERROR\n");
  else
    DEBUG(errs() << "SUCCESS\n");

  delete CGTVisitor;
  return false;
}

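// Compile the configured profile against the target config and dump the
// resulting loadable image to "<profileName>.nvdla" in the working directory.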
NvDlaError CGT_NVDLA::compileProfile(const TestAppArgs* appArgs, TestInfo* i) {
    NvDlaError e = NvDlaSuccess;
    std::string profileName = "";
    std::string targetConfigName = "";

    NvDlaFileHandle file = 0;
    std::string fileName = "";
    NvU8 *buffer = 0;
    NvU64 size = 0;

    nvdla::ICompiler* compiler = i->wisdom->getCompiler();
    if (!compiler)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getCompiler() failed");

    if (appArgs->configtarget == "")
        ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No target config found to load");

    targetConfigName = appArgs->configtarget;

    // Determine profile
    PROPAGATE_ERROR_FAIL(generateProfile(appArgs, &profileName, i));

    // Compile
    DEBUG(NvDlaDebugPrintf("compiling profile \"%s\"... config \"%s\"...\n", profileName.c_str(), targetConfigName.c_str()));
    PROPAGATE_ERROR_FAIL(compiler->compile(profileName.c_str(), targetConfigName.c_str(), &i->compiledLoadable));
    // Get loadable buffer and dump it into a file
    PROPAGATE_ERROR_FAIL(compiler->getLoadableImageSize(profileName.c_str(),
                                                    &size));
    if (size == 0) {
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter,
                            "Invalid size for a loadable");
    }

    buffer = (NvU8 *) NvDlaAlloc(size);
    if (buffer == NULL) {
        ORIGINATE_ERROR_FAIL(NvDlaError_InsufficientMemory,
                            "Failed to allocate buffer for loadable");
    }
    PROPAGATE_ERROR_FAIL(compiler->getLoadableImage(profileName.c_str(),
                                                    buffer));
    fileName = profileName + ".nvdla";
    errs() << "Writing NVDLA module '" << fileName << "' ...";
    PROPAGATE_ERROR_FAIL(NvDlaFopen(fileName.c_str(), NVDLA_OPEN_WRITE, &file));
    PROPAGATE_ERROR_FAIL(NvDlaFwrite(file, buffer, size));
    errs() << " done.\n";

fail:
    NvDlaFclose(file);
    if (buffer != NULL)
        NvDlaFree(buffer);
    return e;
}

NvDlaError CGT_NVDLA::generateProfile(const TestAppArgs* appArgs, std::string* profileName, TestInfo* i) {
    NvDlaError e = NvDlaSuccess;
    nvdla::DataFormat inDataFormat = nvdla::DataFormat::UNKNOWN;

    if (appArgs->profileName != "") {
        // init named profile (basic/default/performance) with default params in its constructor and exit
        DEBUG(errs() << "PROFILE NAME PROVIDED\n");
        PROPAGATE_ERROR_FAIL(beginWithNamedProfile(appArgs, i));
        *profileName = appArgs->profileName;
    } else {
        ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No profile supplied to load");
    }

    // capture profile params from command line (override the existing ones as necessary)
    inDataFormat = inDataFormat == nvdla::DataFormat::UNKNOWN ? appArgs->inDataFormat : inDataFormat;
    PROPAGATE_ERROR_FAIL(updateProfileWithCmdLineArgs(appArgs, i, profileName->c_str(), inDataFormat));

fail:
    return e;
}

NvDlaError CGT_NVDLA::beginWithNamedProfile(const TestAppArgs* appArgs, TestInfo* i) {
    NvDlaError e = NvDlaSuccess;
    nvdla::IProfiler* profiler;
    nvdla::IProfile* profile;

    profiler = i->wisdom->getProfiler();
    if ( !profiler ) {
        ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profiler not initialized");
    }

    profile = profiler->getProfile(appArgs->profileName.c_str());
    if ( !profile ) {
        ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profile %s not initialized", appArgs->profileName.c_str());
    }

fail:
    return e;
}

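// Push the command-line/test parameters into the profile: compute precision,
// input/output data and surface formats, tensor scaling and quantization modes
// (per-tensor scaling with per-kernel/per-filter quantization for int8), and
// the multi-batch size.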
NvDlaError CGT_NVDLA::updateProfileWithCmdLineArgs
(
    const TestAppArgs* appArgs, TestInfo* i, const char* profileName, nvdla::DataFormat inDataFormat
) {
    NvDlaError e = NvDlaSuccess;
    nvdla::IProfiler* profiler;
    nvdla::IProfile* profile;

    profiler = i->wisdom->getProfiler();
    if (!profiler)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getProfiler() failed");
    profile   = profiler->getProfile(profileName);
    if (!profile)
        ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "profiler->getProfile() failed");

    PROPAGATE_ERROR_FAIL(profile->setComputePrecision(appArgs->computePrecision));
    PROPAGATE_ERROR_FAIL(profile->setNetworkInputDataFormat(inDataFormat));

    // determine input surface format
    switch(inDataFormat) {
        case nvdla::DataFormat::NHWC:

            if (appArgs->computePrecision == nvdla::DataType::HALF) {
                PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A16B16G16R16_F));
            } else if (appArgs->computePrecision == nvdla::DataType::INT8) {
                PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A8B8G8R8));
            } else {
                ORIGINATE_ERROR_FAIL(NvDlaError_NotSupported, "NHWC and compute precision %u is not yet supported",
                                     appArgs->computePrecision.v());
            }
            break;
        case nvdla::DataFormat::NCxHWx:
        case nvdla::DataFormat::NCHW:
        case nvdla::DataFormat::UNKNOWN:    // at least start the test with feature data format
        default:
            if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0)
                PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8));
            else
                PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE));
    }

    // determine int8 cfgs
    if (appArgs->computePrecision == nvdla::DataType::INT8) {
        PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::PER_TENSOR));
        switch(appArgs->quantizationMode) {
            case nvdla::QuantizationMode::PER_FILTER:
                PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_FILTER));
                break;
            case nvdla::QuantizationMode::PER_KERNEL:
            case nvdla::QuantizationMode::NONE: // default to per-kernel; find a way to run int8 tests w/ NONE qtzMode cleanly
            default:
                PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_KERNEL));
        }
    } else {
        PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::NONE));
        PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::NONE));
    }

    PROPAGATE_ERROR_FAIL(profile->setNetworkOutputDataFormat(nvdla::DataFormat::NCxHWx));

    if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0)
        PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8));
    else
        PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE));

    if (appArgs->numBatches > 0)
        PROPAGATE_ERROR_FAIL(profile->setMultiBatchSize(appArgs->numBatches));

fail:
    return e;
}
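
// Fallback for int8 runs without a calibration table: give every network
// input a dynamic range of [-127, 127] (scale 1) and every layer output a
// range of [-16129, 16129] (scale 127).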
NvDlaError CGT_NVDLA::generateTensorScales(const TestAppArgs* appArgs, TestInfo* i, nvdla::INetwork* network) {
    NvDlaError e = NvDlaSuccess;

    std::vector<nvdla::ILayer*> networkLayers = network->getLayers();
    std::vector<nvdla::ITensor*> networkInputs = network->getInputs();

    std::vector<nvdla::ILayer*>::iterator li = networkLayers.begin();
    std::vector<nvdla::ITensor*>::iterator nii = networkInputs.begin();

    // set scaling factor for the network input tensors
    for (; nii != networkInputs.end(); ++nii) {
        NvF32 scale = 1;
        NvF32 min = scale * -127.0f;
        NvF32 max = scale * 127.0f;
        std::string tName = (*nii)->getName();
        DEBUG(errs() << "INPUT NAME: " << tName << "\n");
        // set same dynamic range for all channels of the tensor (cIndex = -1)
        PROPAGATE_ERROR_FAIL( (*nii)->setChannelDynamicRange(-1, min, max) );
        const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale));
        if (0)
            NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", tName.c_str(), scale);
    }

    for (; li != networkLayers.end(); ++li) {
        NvF32 scale = 127;
        NvF32 min = scale * -127.0f;
        NvF32 max = scale * 127.0f;
        std::string lName = (*li)->getName();
        nvdla::ITensor* outTensor = (*li)->getOutput(0);
        DEBUG(errs() << "LAYER NAME: " << lName << "\n");
        // set same dynamic range for all channels of the tensor (cIndex = -1)
        PROPAGATE_ERROR_FAIL( outTensor->setChannelDynamicRange(-1, min, max) );
        const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(lName, scale));
        if (0)
            NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", lName.c_str(), scale);
    }

fail:
    return e;
}

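// Parse the calibration table and set per-tensor dynamic ranges. Each line is
// expected to look like "layer_name : scale" (whitespace is stripped); the
// entry named "input" seeds the network inputs, and layers without an entry
// fall back to a scale of 1.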
NvDlaError CGT_NVDLA::readTensorScales(const TestAppArgs* appArgs, TestInfo *i, nvdla::INetwork* network) {
    NvDlaError e = NvDlaSuccess;
    NvDlaStatType stat;
    std::string calibTableFile = /*i->calibTablesPath + "/" + */appArgs->calibTable;

    //PROPAGATE_ERROR_FAIL(NvDlaStat(calibTableFile.c_str(), &stat));
    DEBUG(errs() << "***********READING TENSOR SCALESi*************\n");
    std::ifstream infile(calibTableFile.c_str());
    std::string line;
    std::map<std::string, float> LayerNameToScaleMap;
    while (std::getline(infile, line)) {
        DEBUG(errs() << "READ LINE:  " << line << "\n");
        line.erase(remove(line.begin(), line.end(), ' '), line.end());
        DEBUG(errs() << "READ LINE WITHOUT WHITE SPACES:  " << line << "\n");
        std::string delimiter = ":";
        std::string layer_name = line.substr(0, line.find(delimiter));
        std::string Scale = line.substr(line.find(delimiter) + 1);
        DEBUG(errs() << "LAYER NAME: " << layer_name << "\n");
        DEBUG(errs() << "SCALE: " << Scale << "\n");
        size_t size;
        LayerNameToScaleMap[layer_name] = std::stof(Scale, &size);
    }
    infile.close();
    DEBUG(errs() << "GOT TENSOR SCALES FROM CALIB TABLE\n");

    std::vector<nvdla::ILayer*> networkLayers = network->getLayers();
    std::vector<nvdla::ITensor*> networkInputs = network->getInputs();
    for (auto *Input : networkInputs) {
        NvF32 scale = 0.0f;
        NvF32 min = 0.0f;
        NvF32 max = 0.0f;
        DEBUG(errs() << "SET SCALE FOR INPUT\n");
        scale = LayerNameToScaleMap["input"];
        DEBUG(errs() << "INPUT SCALE: " << scale << "\n");
        min = scale * -127.0f;
        max = scale * 127.0f;
        PROPAGATE_ERROR_FAIL(Input->setChannelDynamicRange(-1, min, max));
        const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>("data", scale));
    }
    DEBUG(errs() << "PER LAYER CALIB\n");
    for (auto *Layer : networkLayers) {
        NvF32 scale = 0.0f;
        NvF32 min = 0.0f;
        NvF32 max = 0.0f;
        std::string tName = Layer->getName();
        DEBUG(errs() << "SETTING SCALE FOR LAYER NAME: " << tName << "\n");
        nvdla::ITensor* outTensor = Layer->getOutput(0);
        auto it = LayerNameToScaleMap.find(tName);
        if (it != LayerNameToScaleMap.end()) {
            DEBUG(errs() << "SET SCALE FOR NAME: " << tName << "\n");
            DEBUG(errs() << "SCALE: " << it->second << "\n");
            scale = it->second;
        } else {
            // Layers absent from the calibration table fall back to a scale of 1.
            DEBUG(errs() << "SET DEFAULT SCALE FOR NAME: " << tName << "\n");
            DEBUG(errs() << "SCALE: 1\n");
            scale = 1;
        }
        min = scale * -127.0f;
        max = scale * 127.0f;
        PROPAGATE_ERROR_FAIL(outTensor->setChannelDynamicRange(-1, min, max));
        const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale));
    }

    DEBUG(errs() << "DONE PARSING CALIBRATION TABLE\n");
fail:
    return e;
}