#define ENABLE_ASSERTS
#define DEBUG_TYPE "DFG2NVDLA"

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/IR/Attributes.h"
#include "llvm/ADT/STLExtras.h"
#include "SupportHPVM/DFG2LLVM.h"

#include <sstream>
#include <fstream>
#include <vector>
#include <map>
#include <set>

#include "dlaerror.h"
#include "dlatypes.h"
#include "nvdla/IRuntime.h"
#include "DlaImageUtils.h"
#include "ErrorMacros.h"
#include "nvdla_inf.h"
#include "nvdla_os_inf.h"
#include "nvdla/IType.h"
#include "nvdla/ITensor.h"
#include "nvdla/INetwork.h"
#include "nvdla/ILayer.h"
#include "nvdla/IProfiler.h"
#include "nvdla/IProfile.h"
#include "nvdla/ICompiler.h"
#include "nvdla/ILoadable.h"
#include "nvdla/IWisdom.h"

#include "rapidjson/document.h"
#include "rapidjson/filereadstream.h"
#include "rapidjson/error/en.h"

#include "half.h"

using namespace llvm;
using namespace builddfg;
using namespace dfg2llvm;
using namespace nvdla;

typedef half_float::half float16;

static cl::opt<std::string>
    ComputePrecision("cprecision",
                     cl::desc("Compute precision (int8 or fp16)."),
                     cl::init("float16"));

static cl::opt<std::string>
    CalibTablePath("calib-table", cl::desc("Path to tensor scales file"),
                   cl::value_desc("filename"), cl::Required);

#define DEFAULT_BATCH_SIZE 0
#define DEFAULT_DATA_FMT nvdla::DataFormat::NCHW
#define DEFAULT_QUANT_MODE nvdla::QuantizationMode::NONE
#define TARGET_CONFIG_NAME "nv_full"
#define TEST_PARAM_FILE_MAX_SIZE 65536

struct HPVM2NVDLA : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  HPVM2NVDLA() : ModulePass(ID) {}

public:
  // Functions
  virtual bool runOnModule(Module &M);

  void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<BuildDFG>();
    AU.addPreserved<BuildDFG>();
  }

private:
  //bool transformHPVM2NVDLA(Module &M);
  //void codeGenHPVM2NVDLA(CGT_NVDLA *, DFNode *);
};

struct TestAppArgs {
  std::string project;
  std::string inputPath;
  std::string inputName;
  std::string outputPath;
  std::string testname;
  std::string testArgs;
  std::string prototxt;   // This should be folded into testArgs
  std::string caffemodel; // This should be folded into testArgs
  std::string cachemodel; // This should be folded into testArgs
  std::string profileName; // ok here?
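  // Remaining fields: profile/target-config selection, calibration table path,
  // the module and DFG roots being compiled, and per-tensor scales collected
  // during calibration.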
  std::string profileFile;
  std::string configtarget;
  std::string calibTable;
  nvdla::QuantizationMode quantizationMode;
  Module *M;
  std::vector<DFInternalNode *> *Roots;
  NvU16 numBatches;
  nvdla::DataFormat inDataFormat;
  nvdla::DataType computePrecision;
  std::map<std::string, NvF32> tensorScales;
};

struct TestInfo {
  // common
  nvdla::IWisdom* wisdom;
  std::string wisdomPath;
  // parse
  std::string modelsPath;
  std::string profilesPath;
  std::string calibTablesPath;
  // runtime
  // nvdla::IRuntime* runtime;
  nvdla::ILoadable* compiledLoadable;
  NvU8 *pData;
  //std::string inputImagesPath;
  //std::string inputLoadablePath;
  // std::map<std::string, NvDlaImage*> inputImages;
  // std::map<std::string, void *> inputBuffers;
  // std::map<std::string, NvDlaImage*> outputImages;
  // std::map<std::string, void *> outputBuffers;
  // std::vector<SubmitContext*> submits;
  NvU32 timeout;
  NvU16 numBatches; // runtime's point-of-view
  NvU32 numSubmits;
};

static TestAppArgs defaultTestAppArgs = {
    /* .project = */ "OpenDLA",
    /* .inputPath = */ "./",
    /* .inputName = */ "",
    /* .outputPath = */ "./",
    /* .testname = */ "",
    /* .testArgs = */ "",
    /* .prototxt = */ "",
    /* .caffemodel = */ "",
    /* .cachemodel = */ "",
    /* .profileName = */ "fast-math",
    /* .profileFile = */ "",
    /* .configtarget = */ TARGET_CONFIG_NAME,
    /* .calibTable = */ "",
    /* .quantizationMode = */ DEFAULT_QUANT_MODE,
    /* .M = */ nullptr,
    /* .Roots = */ nullptr,
    /* .numBatches = */ DEFAULT_BATCH_SIZE,
    /* .inDataFormat = */ DEFAULT_DATA_FMT,
    /* .computePrecision = */ nvdla::DataType::INT8
};

char HPVM2NVDLA::ID = 0;
static RegisterPass<HPVM2NVDLA> X("hpvm-nvdla",
                                  "Dataflow Graph to NVDLA IR Pass",
                                  false, false);

// Visitor for Code generation traversal of HPVM IR
class CGT_NVDLA : public CodeGenTraversal {
private:
  // Data information
  //DataFormat InDataFormat;
  //DataType ComputePrecision;
  //QuantizationMode Quantization;
  //NvU16 NumBatches;

  // Wisdom and network information
  IWisdom *Wisdom;
  INetwork *Network;
  std::map<std::string, int> LayerNameMap;

  // Maps dataflow edges in HPVM IR to Tensors in NVDLA IR
  DenseMap<const DFEdge *, ITensor *> EdgeToTensorMap;

  // Virtual Functions
  void init();
  void initRuntimeAPI();
  void codeGen(DFInternalNode* N);
  void codeGen(DFLeafNode* N);

  // Codegen functions for all supported layers
  void generateConvolutionLayer(DFLeafNode *, const IntrinsicInst *);
  void generatePoolingLayer(DFLeafNode *, const IntrinsicInst *);
  void generateBatchNormLayer(DFLeafNode *, const IntrinsicInst *);
  void generateReluLayer(DFLeafNode *, const IntrinsicInst *);
  void generateGemmLayer(DFLeafNode *, const IntrinsicInst *);
  void generateSoftMaxLayer(DFLeafNode *, const IntrinsicInst *);
  void generateTanhLayer(DFLeafNode *, const IntrinsicInst *);

  // Map edges to output tensors
  void mapOutputTensor(DFNode *N, ITensor *Tensor);
  // Get input tensors to nodes
  ITensor *getIntermediateInputTensor(DFNode *N);
  // Get binding tensors to nodes
  User *getBindingTensor(DFLeafNode* N, unsigned index);
  // Get the input NVDLA tensors to nodes
  ITensor *getNVDLAInputTensor(DFLeafNode* N, const User *InputTensor);
  // Get index for an input tensor
  unsigned getInputIndex(DFLeafNode* N, const IntrinsicInst *II);
  // Gets nodes with add ops meant to be combined with convolution and gemm
  void getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes,
                            SmallVector<IntrinsicInst *, 4> &AddInsts);
  // Getting weights
  Weights readTrainedWeights(User *TensorPtr, int dim1_size, int dim2_size,
                             int dim3_size, int dim4_size);
  // Identify outputs
  unsigned identifyOutputs();
  // Generate profile based on data parameters
  //void generateProfile(std::string &, std::string &);
  std::string getLayerName(std::string Name);

public:
  CGT_NVDLA(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {// : Network(nullptr) {
    //initRuntimeAPI();
    init();
  }

  //void destroySetUp();
  //void setUpWisdom();
  //void compileProfile();
  //void transformHPVM2NVDLA(DFNode *);
  NvDlaError generateTensorScales(const TestAppArgs*, TestInfo*, nvdla::INetwork*);
  NvDlaError updateProfileWithCmdLineArgs(const TestAppArgs*, TestInfo*,
                                          const char*, nvdla::DataFormat);
  NvDlaError beginWithNamedProfile(const TestAppArgs*, TestInfo*);
  NvDlaError generateProfile(const TestAppArgs*, std::string*, TestInfo*);
  NvDlaError compileProfile(const TestAppArgs*, TestInfo*);
  NvDlaError launchTest(const TestAppArgs*);
  NvDlaError testSetup(const TestAppArgs*, TestInfo*);
  NvDlaError parseAndCompile(const TestAppArgs*, TestInfo*);
  NvDlaError transformHPVM2NVDLA(const TestAppArgs*, TestInfo*);
  NvDlaError parseSetup(const TestAppArgs*, TestInfo*);
  NvDlaError readTensorScales(const TestAppArgs* appArgs, TestInfo *i,
                              nvdla::INetwork* network);
};

void CGT_NVDLA::init() {
  // Default parameters
  //InDataFormat = DataFormat::NCHW;
  //ComputePrecision = DataType::FLOAT;
  //Quantization = QuantizationMode::NONE;
  //NumBatches = 0;
}

void CGT_NVDLA::initRuntimeAPI() {
  // Nothing to do here!
}

Weights CGT_NVDLA::readTrainedWeights(User *TensorPtr, int dim1_size,
                                      int dim2_size, int dim3_size,
                                      int dim4_size) {
  DEBUG(errs() << "READ TRAINED WEIGHTS\n");

  // Get weights file name
  User *MemcpyPtr = dyn_cast<User>(TensorPtr->getOperand(0));
  DEBUG(MemcpyPtr->print(errs()));
  DEBUG(errs() << "\n");
  while(!dyn_cast<AllocaInst>(MemcpyPtr)) {
    MemcpyPtr = dyn_cast<User>(MemcpyPtr->getOperand(0));
  }
  User *MemcpyArg = nullptr;
  for(User *U: MemcpyPtr->users()) {
    DEBUG(U->print(errs()));
    DEBUG(errs() << "\n");
    if(auto *BCO = dyn_cast<BitCastOperator>(U)) {
      for(User *CU: BCO->users()) {
        if(auto *CI = dyn_cast<CallInst>(CU)) {
          if(CI->getCalledFunction()->getName().contains(StringRef("memcpy"))) {
            MemcpyArg = dyn_cast<User>(CI->getOperand(1));
            break;
          }
        }
      }
      if(MemcpyArg)
        break;
    }
  }
  assert(MemcpyArg && "File name not found.");
  auto *WeightFileName = dyn_cast<GlobalVariable>(MemcpyArg->getOperand(0));
  assert(WeightFileName && "Weight file name must be a global variable.");
  auto* CDA = dyn_cast<ConstantDataArray>(WeightFileName->getInitializer());
  assert(CDA && "Weight file name must be a constant array.");
  const auto &file_name = std::string(CDA->getAsString());

  // Read the weights file
  int num_elem = dim1_size * dim2_size * dim3_size * dim4_size;
  int size_in_bytes = sizeof(float16) * num_elem;
  //DEBUG(errs() << "float16 size: " << sizeof(float16) << "\n");
  DEBUG(errs() << "size in bytes: " << size_in_bytes << "\n");
  void *tensor_data = (void *) malloc(size_in_bytes);
  int file_header_size = 0;
  DEBUG(errs() << "FILE NAME: " << file_name << "\n");
  FILE *file = fopen(file_name.c_str(), "rb");
  if(!file) {
    DEBUG(errs() << "Data file is not found.
Aborting.\n"); abort(); } fseek(file, file_header_size, SEEK_CUR); size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); DEBUG(errs() << "BYTES READ: " << bytes_read << "\n"); fclose(file); // Create weight tensors auto Weight = Weights(DataType::HALF, tensor_data, NvS64(num_elem)); //FILE *try_file = fopen("temp.bin", "wb"); //fwrite(Weight.values, sizeof(float), num_elem, try_file); //fclose(try_file); //exit(-1); return Weight; } // For a tensor to be a input weight tensor, it has to come from the root node User *CGT_NVDLA::getBindingTensor(DFLeafNode* N, unsigned index) { // HPVM internal API needs fixing. Remove this lambda function when bug is fixed. auto NodeIsRoot = [](DFNode &InternalNode) { auto *RootFunction = InternalNode.getFuncPointer(); for(User *U: RootFunction->users()) { DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); DEBUG(U->print(errs())); DEBUG(errs() << "\n"); auto *II = dyn_cast<IntrinsicInst>(U); if(!II) { auto *BCI = dyn_cast<BitCastOperator>(U); assert(BCI && "Not a bitcast instruction."); for(User *BCU : BCI->users()) { DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); DEBUG(BCU->print(errs())); DEBUG(errs() << "\n"); II = dyn_cast<IntrinsicInst>(BCU); if(II) break; } } if(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch)) { DEBUG(errs() << "LAUNCH FUNCTION: "); DEBUG(II->print(errs())); DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n"); return true; } } DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n"); return false; }; auto NodeIsLeaf = [](DFNode &Node) { auto *NodeFunction = Node.getFuncPointer(); for(User *U: NodeFunction->users()) { DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); DEBUG(U->print(errs())); DEBUG(errs() << "\n"); auto *II = dyn_cast<IntrinsicInst>(U); if(!II) { auto *BCI = dyn_cast<BitCastOperator>(U); assert(BCI && "Not a bitcast instruction."); for(User *BCU : BCI->users()) { DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: "); DEBUG(BCU->print(errs())); DEBUG(errs() << "\n"); II = dyn_cast<IntrinsicInst>(BCU); if(II) break; } } if(II && (II->getIntrinsicID() == Intrinsic::hpvm_createNode || II->getIntrinsicID() == Intrinsic::hpvm_createNode1D || II->getIntrinsicID() == Intrinsic::hpvm_createNode2D || II->getIntrinsicID() == Intrinsic::hpvm_createNode3D)) { DEBUG(errs() << "CREATE NODE FUNCTION: "); DEBUG(II->print(errs())); DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n"); // Ensure that the node function does not have these create node intrinsics for(inst_iterator i = inst_begin(NodeFunction), e = inst_end(NodeFunction); i != e; ++i) { Instruction *I = &(*i); if(auto *II = dyn_cast<IntrinsicInst>(I)) { if(II->getIntrinsicID() == Intrinsic::hpvm_createNode || II->getIntrinsicID() == Intrinsic::hpvm_createNode1D || II->getIntrinsicID() == Intrinsic::hpvm_createNode2D || II->getIntrinsicID() == Intrinsic::hpvm_createNode3D) { DEBUG(errs() << "--LAMBDA FUNCTION RETURN FALSE\n"); return false; } } } return true; } } DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n"); return false; }; DEBUG(errs() << "GET BINDING TENSOR\n"); DEBUG(errs() << "GIVEN INDEX: " << index << "\n"); DFEdge *DE = N->getInDFEdgeAt(index); assert(DE && "Data edge does not exist at given index"); DEBUG(errs() << "LEAF NODE FUNCTION: " << N->getFuncPointer()->getName() << "\n"); // Get the argument position in the root node. 
DEBUG(errs() << "GET TO THE ROOT FIRST\n"); auto *InternalNode = DE->getSourceDF(); DEBUG(errs() << "INTERNAL NODE FUNCTION: " << InternalNode->getFuncPointer()->getName() << "\n"); DEBUG(errs() << "INTERNAL NDOE POINTER: " << InternalNode << "\n"); if(NodeIsLeaf(*InternalNode)) { DEBUG(errs() << "BIND NONE: EDGE FROM LEAF NODE\n"); return nullptr; } unsigned argPos = DE->getSourcePosition(); DEBUG(errs() << "ARG POSITION BEFORE LOOP: " << argPos << "\n"); while(!NodeIsRoot(*InternalNode)) { DEBUG(errs() << "IN LOOP\n"); if(NodeIsLeaf(*InternalNode)) { DEBUG(errs() << "IN LOOP BIND NONE: EDGE FROM LEAF NODE\n"); return nullptr; } argPos = DE->getSourcePosition(); DE = InternalNode->getInDFEdgeAt(argPos); if(!DE) { DEBUG(errs() << "NO BINDING EDGE IN LOOP\n"); // No binding edge. return nullptr; } InternalNode = DE->getSourceDF(); DEBUG(errs() << "INTERNAL NODE FUNCTION IN LOOP: " << InternalNode->getFuncPointer()->getName() << "\n"); DEBUG(errs() << "IN LOOP DATA EDGE: " << DE << "\n"); DEBUG(errs() << "IN LOOP ARG POSITION: " << argPos << "\n"); } DEBUG(errs() << "ARG POSITION: " << argPos << "\n"); DEBUG(errs() << "GET THE LAUNCH FUNCTION\n"); // Now we have the root node. We need to get the launch functions for it. auto *RootFunction = InternalNode->getFuncPointer(); for(User *U: RootFunction->users()) { DEBUG(errs() << "User for root: "); DEBUG(U->print(errs())); IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); if(!II) { auto *BCI = dyn_cast<BitCastOperator>(U); assert(BCI && "Not a bitcast instruction."); for(User *BCU : BCI->users()) { II = dyn_cast<IntrinsicInst>(BCU); if(II) break; } } assert(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch) && "Use of a root node must be in launch function call instrinsic."); DEBUG(errs() << "LAUNCH FUNCTION: "); DEBUG(II->print(errs())); // Now, get the the arguments to the root and get element pointer to argument structure. auto *ArgObj = dyn_cast<Instruction>(II->getOperand(1)); if(auto *BCO = dyn_cast<BitCastOperator>(ArgObj)) { ArgObj = dyn_cast<Instruction>(BCO->getOperand(0)); } else if (auto *CI = dyn_cast<CallInst>(ArgObj)) { for(User *CIU : CI->users()) { auto *BCO = dyn_cast<BitCastOperator>(CIU); if(BCO) { ArgObj = dyn_cast<Instruction>(BCO->getOperand(0)); break; } } } else if (auto *AI = dyn_cast<AllocaInst>(ArgObj)) { for(User *AIU : AI->users()) { auto *BCO = dyn_cast<BitCastOperator>(AIU); if(BCO) { ArgObj = dyn_cast<Instruction>(BCO->getOperand(0)); break; } } } auto *ArgObjPtrType = dyn_cast<PointerType>(ArgObj->getType()); auto *ArgObjType = dyn_cast<StructType>(ArgObjPtrType->getElementType()); assert(ArgObjType && "Arguments to launch is a structure."); DEBUG(errs() << "ARG OBJ: "); DEBUG(ArgObj->print(errs())); DEBUG(errs() << "\n"); // Use the offset into the structure to get the source tensor. 
const auto &DL = ArgObj->getParent()->getParent()->getParent()->getDataLayout(); const auto *SL = DL.getStructLayout(ArgObjType); uint64_t ElementOffset = SL->getElementOffset(argPos); DEBUG(errs() << "ELEMENT OFFSET: " << ElementOffset << "\n"); Instruction *StructElemPtr = nullptr; for(User *U: ArgObj->users()) { if(auto *GI = dyn_cast<GetElementPtrInst>(U)) { auto *Offset = dyn_cast<ConstantInt>(GI->getOperand(2)); assert(Offset && "Offset is not constant."); if(Offset->getZExtValue() == argPos) {//ElementOffset) { StructElemPtr = GI; break; } } } assert(StructElemPtr && "No getelementptr found with given offset."); DEBUG(StructElemPtr->print(errs())); DEBUG(errs() << "\n"); DEBUG(errs() << "USE THE STORES TO GET THE BIND TENSOR\n"); // Get store to the element of argument structure to get the pointer to tensor. for(User *GIU: StructElemPtr->users()) { DEBUG(GIU->print(errs())); DEBUG(errs() << "\n"); if(auto *BCO = dyn_cast<BitCastOperator>(GIU)) { DEBUG(BCO->print(errs())); DEBUG(errs() << "\n"); for(User *BCU : BCO->users()) { if(auto *SI = dyn_cast<StoreInst>(BCU)) { // Get the tensor pointer DEBUG(SI->print(errs())); DEBUG(errs() << "\n"); auto *Val = SI->getValueOperand(); if(auto *BCO = dyn_cast<BitCastOperator>(Val)) { return dyn_cast<User>(BCO->getOperand(0)); } return dyn_cast<User>(Val); } } } if(auto *SI = dyn_cast<StoreInst>(GIU)) { // Get the tensor pointer DEBUG(SI->print(errs())); auto *Val = SI->getValueOperand(); if(auto *BCO = dyn_cast<BitCastOperator>(Val)) { return dyn_cast<User>(BCO->getOperand(0)); } return dyn_cast<User>(Val); } } } return nullptr; } void CGT_NVDLA::mapOutputTensor(DFNode *N, ITensor *Tensor) { for(int i = 0; i < N->outdfedge_size(); i++) EdgeToTensorMap[N->getOutDFEdgeAt(i)] = Tensor; } ITensor *CGT_NVDLA::getIntermediateInputTensor(DFNode *N) { return EdgeToTensorMap[N->getInDFEdgeAt(0)]; } void CGT_NVDLA::getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes, SmallVector<IntrinsicInst *, 4> &AddInsts) { bool AddOpNodes = false; for(int i = 0; i < N->outdfedge_size(); i++) { auto *DestNode = N->getOutDFEdgeAt(i)->getDestDF(); auto *F = DestNode->getFuncPointer(); // If the node is already cached in the list, no need to visit it auto *Node = dyn_cast<DFLeafNode>(DestNode); if(find(AddNodes, Node) != AddNodes.end()) continue; // Add node to list if it contains add operation for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &(*i); auto *II = dyn_cast<IntrinsicInst>(I); if (II && II->getIntrinsicID() == Intrinsic::hpvm_tensor_add) { AddNodes.push_back(Node); AddInsts.push_back(II); AddOpNodes = true; break; } } assert(((AddNodes.size() > 0) == AddOpNodes) && "All destination nodes are adds or all of them are not."); } } ITensor *CGT_NVDLA::getNVDLAInputTensor(DFLeafNode* N, const User *InputBindingTensor) { if(InputBindingTensor) { auto *BatchesConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(2)); auto *ChannelsConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(3)); auto *HeightConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(4)); auto *WidthConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(5)); assert(HeightConst && WidthConst && ChannelsConst && BatchesConst && "Number of input dimensions must be constants."); // Input dimensions int InputW = WidthConst->getZExtValue(); int InputH = HeightConst->getZExtValue(); int InputC = ChannelsConst->getZExtValue(); int InputN = BatchesConst->getZExtValue(); // Create a new input tensor Dims4 dims(InputN, 
InputC, InputH, InputW); return Network->addInput("", dims); } return getIntermediateInputTensor(N); } unsigned CGT_NVDLA::getInputIndex(DFLeafNode* N, const IntrinsicInst *II) { DEBUG(errs() << "GET INPUT INDEX\n"); auto *F = N->getFuncPointer(); DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); unsigned inputIndex = 0; for(auto &Arg : F->args()) { DEBUG(errs() << "ARGUMENT: "); DEBUG((&Arg)->print(errs())); DEBUG(errs() << "\n"); if(II->getOperand(0) == &Arg) { DEBUG(errs() << "INPUT: "); DEBUG(II->getOperand(0)->print(errs())); DEBUG(errs() << "\n"); DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n"); return inputIndex; } inputIndex++; } assert(false && "Illegal intrinsic or Node."); return -1; // Keep compiler happy } std::string CGT_NVDLA::getLayerName(std::string Name) { DEBUG(errs() << "GET LAYER NAME\n"); if(LayerNameMap.find(Name) == LayerNameMap.end()) { LayerNameMap[Name] = 1; } else { LayerNameMap[Name]++; } return std::to_string(LayerNameMap[Name]); } void CGT_NVDLA::generateConvolutionLayer(DFLeafNode* N, const IntrinsicInst *II) { DEBUG(errs() << "*****CONVOLUTION LAYER*****\n"); // FIXME: What is number of "groups". Setting it to 1 for now. int numGroups = 1; // If the input tensor is not a binding tensor, it must be coming // from an edge from a visted node, so use that to get number of outputs. unsigned inputIndex = getInputIndex(N, II); DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n"); DEBUG(errs() << "GET INPUT TENSOR\n"); auto *InputTensor = getBindingTensor(N, inputIndex); DEBUG(errs() << "INPUT TENSOR: "); ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); // Get the index for kernel tensor auto *F = N->getFuncPointer(); DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); unsigned kernelIndex = 0; bool ArgFound = false; for(auto &Arg : F->args()) { if(II->getOperand(1) == &Arg) { ArgFound = true; break; } kernelIndex++; } assert(ArgFound && "Illegal intrinsic or Node."); DEBUG(errs() << "KERNEL INDEX: " << kernelIndex << "\n"); // Get the kernel tensor DEBUG(errs() << "GET KERNEL TENSOR\n"); auto *KernelTensor = getBindingTensor(N, kernelIndex); assert(KernelTensor && "Kernel tensors are always binds."); // Get kernel constants auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5)); auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4)); auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3)); auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2)); assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst && "Kernel dimensions must be constants."); int kernelW = KernelWConst->getZExtValue(); int kernelH = KernelHConst->getZExtValue(); int kernelC = KernelCHConst->getZExtValue(); int kernelN = KernelNConst->getZExtValue(); DEBUG(errs() << "\nKERNEL H: " << kernelH << "\n"); DEBUG(errs() << "KERNEL W: " << kernelW << "\n"); DEBUG(errs() << "KERNEL C: " << kernelC << "\n"); DEBUG(errs() << "KERNEL N: " << kernelN << "\n"); int numOutputs; if(!InputTensor) { DEBUG(errs() << "INPUT FROM EDGE\n"); numOutputs = (InputNVDLATensor->getDimensions()).n * kernelN; // (InputNVDLATensor->getDimensions()).c; } else { DEBUG(errs() << "INPUT FROM WEIGHT TENSOR\n"); auto *BatchesConst = dyn_cast<ConstantInt>(InputTensor->getOperand(2)); auto *ChannelsConst = dyn_cast<ConstantInt>(InputTensor->getOperand(3)); numOutputs = BatchesConst->getZExtValue() * kernelN; // ChannelsConst->getZExtValue(); DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n"); } // Get Strides 
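  // Operand layout assumed here for llvm.hpvm.tensor.convolution: 0 = input,
  // 1 = kernel, 2 = vertical pad, 3 = horizontal pad, 4 = vertical stride,
  // 5 = horizontal stride.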
ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(5)); ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(4)); assert((StrideWConst && StrideHConst) && "Strides must be constants."); int strideW = StrideWConst->getZExtValue(); int strideH = StrideHConst->getZExtValue(); DEBUG(errs() << "STRIDE H: " << strideH << "\n"); DEBUG(errs() << "STRIDE W: " << strideW << "\n"); // Get pads ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(3)); ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(2)); assert((PadWConst && PadHConst) && "Pads must be constants."); int padW = PadWConst->getZExtValue(); int padH = PadHConst->getZExtValue(); DEBUG(errs() << "PAD H: " << padH << "\n"); DEBUG(errs() << "PAD W: " << padW << "\n"); // FIXME: Support dilations. Set dilations to 1 since we do not have dilation support yet. int dilationW = 1; int dilationH = 1; // Get the nodes with Add operations SmallVector<DFLeafNode *, 4> AddOpNodes; SmallVector<IntrinsicInst *, 4> AddInsts; getaddOpSucceedsNode(N, AddOpNodes, AddInsts); assert((!(AddOpNodes.size() > 1)) && "Number of nodes with Add ops must not be more than 1"); // Get bias parameters int BiasW, BiasH, BiasC, BiasN; User *BiasTensor = nullptr; BiasMode biasMode = BiasMode::bNONE; if(AddOpNodes.size()) { // Get the index for bias tensor auto *AddNode = AddOpNodes[0]; auto *AddInst = AddInsts[0]; DEBUG(AddInst->print(errs())); auto *F = AddNode->getFuncPointer(); unsigned BiasIndex = 0; ArgFound = false; for(auto &Arg : F->args()) { if(AddInst->getOperand(1) == &Arg) { ArgFound = true; break; } BiasIndex++; } assert(ArgFound && "Illegal intrinsic or Node."); // Get the bias tensor DEBUG(errs() << "BIAS INDEX: " << BiasIndex << "\n"); DEBUG(errs() << "BIAS TENSOR\n"); BiasTensor = getBindingTensor(AddNode, BiasIndex); assert(BiasTensor && "Bias tensors are always binds."); // Get Bias constants auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5)); auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4)); auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3)); auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2)); assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst && "Bias dimensions must be constants."); BiasW = BiasWConst->getZExtValue(); BiasH = BiasHConst->getZExtValue(); BiasC = BiasCHConst->getZExtValue(); BiasN = BiasNConst->getZExtValue(); DEBUG(errs() << "BIAS H: " << BiasH << "\n"); DEBUG(errs() << "BIAS W: " << BiasW << "\n"); DEBUG(errs() << "BIAS C: " << BiasC << "\n"); DEBUG(errs() << "BIAS N: " << BiasN << "\n"); // Get bias mode //if(kernelN == numOutputs) biasMode = BiasMode::bCHANNEL; //else // biasMode = BiasMode::bUNIFORM; } // Get weights Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW); Weights biasWeights = AddOpNodes.size() == 1 ? 
readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW) : Weights(DataType::HALF, nullptr, 0); Dims2 tlPadding = Dims2(padH, padW); Dims2 brPadding = Dims2(padH, padW); Dims2 stride = Dims2(strideH, strideW); Dims2 dilation = Dims2(dilationH, dilationW); Dims2 kernelSize = Dims2(kernelH, kernelW); auto *Layer = Network->addConvolution(InputNVDLATensor, numOutputs, 0, kernelSize, tlPadding, brPadding, stride, dilation, kernelWeights, biasWeights, biasMode, numGroups); if(AddOpNodes.size()) { auto *Node = AddOpNodes[0]; mapOutputTensor(Node, Layer->getOutput(0)); } else { mapOutputTensor(N, Layer->getOutput(0)); } Layer->setName((std::string("conv") + getLayerName(std::string("conv"))).c_str()); DEBUG(errs() << Layer->getName() << "\n"); } void CGT_NVDLA::generatePoolingLayer(DFLeafNode* N, const IntrinsicInst *II) { DEBUG(errs() << "*****POOLING LAYER*****\n"); // Get input tensor unsigned inputIndex = getInputIndex(N, II); auto *InputTensor = getBindingTensor(N, inputIndex); ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); // Get window dimensions ConstantInt *KernelWConst = dyn_cast<ConstantInt>(II->getOperand(2)); ConstantInt *KernelHConst = dyn_cast<ConstantInt>(II->getOperand(1)); assert((KernelWConst && KernelHConst) && "Kernel dimensions must be constants."); int kernelH = KernelHConst->getZExtValue(); int kernelW = KernelWConst->getZExtValue(); DEBUG(errs() << "KERNEL H: " << kernelH << "\n"); DEBUG(errs() << "KERNEL W: " << kernelW << "\n"); // Get Strides ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(6)); ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(5)); assert((StrideWConst && StrideHConst) && "Strides must be constants."); int strideH = StrideHConst->getZExtValue(); int strideW = StrideWConst->getZExtValue(); DEBUG(errs() << "STRIDE H: " << strideH << "\n"); DEBUG(errs() << "STRIDE W: " << strideW << "\n"); // Get pads ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(4)); ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(3)); assert((PadWConst && PadHConst) && "Pads must be constants."); int padH = PadHConst->getZExtValue(); int padW = PadWConst->getZExtValue(); DEBUG(errs() << "PAD H: " << padH << "\n"); DEBUG(errs() << "PAD W: " << padW << "\n"); Dims2 windowSize = Dims2(kernelH, kernelW); Dims2 stride = Dims2(strideH, strideW); Dims2 tlPadding = Dims2(padH, padW); Dims2 brPadding = Dims2(padH, padW); PoolingType type = (II->getIntrinsicID() == Intrinsic::hpvm_tensor_pool_mean) ? 
PoolingType::kAVERAGE : PoolingType::kMAX; auto *Layer = Network->addPooling(InputNVDLATensor, type, windowSize, stride, tlPadding, brPadding); mapOutputTensor(N, Layer->getOutput(0)); Layer->setName((std::string("pool") + getLayerName(std::string("pool"))).c_str()); DEBUG(errs() << Layer->getName() << "\n"); } void CGT_NVDLA::generateGemmLayer(DFLeafNode* N, const IntrinsicInst *II) { DEBUG(errs() << "****GEMM LAYER****\n"); // Get input tensor and compute number of outputs unsigned inputIndex = getInputIndex(N, II); auto *InputTensor = getBindingTensor(N, inputIndex); ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); // Get the index for kernel tensor auto *F = N->getFuncPointer(); DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); unsigned kernelIndex = 0; bool ArgFound = false; for(auto &Arg : F->args()) { if(II->getOperand(1) == &Arg) { ArgFound = true; break; } kernelIndex++; } assert(ArgFound && "Illegal intrinsic or Node."); // Get the kernel tensor auto *KernelTensor = getBindingTensor(N, kernelIndex); assert(KernelTensor && "Kernel tensors are always binds."); // Get kernel constants auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5)); auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4)); auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3)); auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2)); assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst && "Kernel dimensions must be constants."); int kernelW = KernelWConst->getZExtValue(); int kernelH = KernelHConst->getZExtValue(); int kernelC = KernelCHConst->getZExtValue(); int kernelN = KernelNConst->getZExtValue(); DEBUG(errs() << "KERNEL H: " << kernelH << "\n"); DEBUG(errs() << "KERNEL W: " << kernelW << "\n"); DEBUG(errs() << "KERNEL C: " << kernelC << "\n"); DEBUG(errs() << "KERNEL N: " << kernelN << "\n"); int numOutputs = kernelW; DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n"); // Get the nodes with Add operations SmallVector<DFLeafNode *, 4> AddOpNodes; SmallVector<IntrinsicInst *, 4> AddInsts; getaddOpSucceedsNode(N, AddOpNodes, AddInsts); assert((!(AddOpNodes.size() > 1)) && "Number of nodes with Add ops must not be more than 1"); // Get bias parameters int BiasW, BiasH, BiasC, BiasN; User *BiasTensor = nullptr; BiasMode biasMode = BiasMode::bNONE; if(AddOpNodes.size()) { // Get the index for bias tensor auto *AddNode = AddOpNodes[0]; auto *AddInst = AddInsts[0]; auto *F = AddNode->getFuncPointer(); unsigned BiasIndex = 0; ArgFound = false; for(auto &Arg : F->args()) { if(AddInst->getOperand(1) == &Arg) { ArgFound = true; break; } BiasIndex++; } assert(ArgFound && "Illegal intrinsic or Node."); // Get the bias tensor BiasTensor = getBindingTensor(AddNode, BiasIndex); assert(BiasTensor && "Bias tensors are always binds."); // Get Bias constants auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5)); auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4)); auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3)); auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2)); assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst && "Bias dimensions must be constants."); BiasW = BiasWConst->getZExtValue(); BiasH = BiasHConst->getZExtValue(); BiasC = BiasCHConst->getZExtValue(); BiasN = BiasNConst->getZExtValue(); DEBUG(errs() << "BIAS H: " << BiasH << "\n"); DEBUG(errs() << "BIAS W: " << BiasW << "\n"); DEBUG(errs() << "BIAS C: " << BiasC << "\n"); 
DEBUG(errs() << "BIAS N: " << BiasN << "\n"); // Get bias mode //if(KernelCHConst->getZExtValue() == numOutputs) biasMode = BiasMode::bCHANNEL; //else // biasMode = BiasMode::bUNIFORM; } // Get weights Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW); Weights biasWeights = (AddOpNodes.size() == 1) ? readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW) : Weights(DataType::HALF, nullptr, 0); auto *Layer = Network->addFullyConnected(InputNVDLATensor, numOutputs, kernelWeights, biasWeights, biasMode); if(AddOpNodes.size()) { auto *Node = AddOpNodes[0]; mapOutputTensor(Node, Layer->getOutput(0)); } else { mapOutputTensor(N, Layer->getOutput(0)); } Layer->setName((std::string("gemm") + getLayerName(std::string("gemm"))).c_str()); DEBUG(errs() << Layer->getName() << "\n"); } void CGT_NVDLA::generateReluLayer(DFLeafNode* N, const IntrinsicInst *II) { DEBUG(errs() << "******RELU LAYER******\n"); // Get input tensor unsigned inputIndex = getInputIndex(N, II); auto *InputTensor = getBindingTensor(N, inputIndex); ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); auto *Layer = Network->addActivation(InputNVDLATensor, kRELU); mapOutputTensor(N, Layer->getOutput(0)); Layer->setName((std::string("relu") + getLayerName(std::string("relu"))).c_str()); DEBUG(errs() << Layer->getName() << "\n"); } void CGT_NVDLA::generateSoftMaxLayer(DFLeafNode* N, const IntrinsicInst *II) { DEBUG(errs() << "******SOFTMAX LAYER*******\n"); // Get input tensor unsigned inputIndex = getInputIndex(N, II); auto *InputTensor = getBindingTensor(N, inputIndex); ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); auto *Layer = Network->addSoftMax(InputNVDLATensor); mapOutputTensor(N, Layer->getOutput(0)); Layer->setName((std::string("softmax") + getLayerName(std::string("softmax"))).c_str()); DEBUG(errs() << Layer->getName() << "\n"); } void CGT_NVDLA::generateTanhLayer(DFLeafNode* N, const IntrinsicInst *II) { DEBUG(errs() << "*******TANH LAYER*******\n"); // Get input tensor unsigned inputIndex = getInputIndex(N, II); auto *InputTensor = getBindingTensor(N, inputIndex); ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor); auto *Layer = Network->addActivation(InputNVDLATensor, kTANH); mapOutputTensor(N, Layer->getOutput(0)); Layer->setName((std::string("tanh") + getLayerName(std::string("tanh"))).c_str()); DEBUG(errs() << Layer->getName() << "\n"); } /* void CGT_NVDLA::generateBatchNormLayer(DFLeafNode* N, const IntrinsicInst *II) { const dc::BatchNormParameter& p = msg.batch_norm_param(); Weights mean = weightFactory(msg.name(), kMEAN); Weights variance = weightFactory(msg.name(), kVARIANCE); Weights movingAverage = weightFactory(msg.name(), kMOVING_AVERAGE); float eps = p.eps(); float scaleFactor = 1.0f; float average = 0.0f; int i; average = *(static_cast<const float*>(movingAverage.values)); if ( average == 0.0f ) { gLogError << "Batch Normalization moving average is zero " << std::endl; return 0; } scaleFactor /= average; if (mean.count != variance.count) { gLogError << "Mean and variance have differing number of elements " << mean.count << " & " << variance.count << std::endl; return 0; } float *meanBlob = (float *)mean.values; float *varianceBlob = (float *)variance.values; Dims4 inputDims = getIntermediateInputTensor(N)->getDimensions(); BatchNormMode mode; if (mean.count == 1) { mode = BatchNormMode::bnUNIFORM; meanBlob[0] = meanBlob[0] * scaleFactor; varianceBlob[0] = varianceBlob[0] * scaleFactor; } else if (mean.count == 
inputDims.c) { mode = BatchNormMode::bnm_CHANNEL; for (i = 0; i < mean.count; i++) { meanBlob[i] = meanBlob[i] * scaleFactor; varianceBlob[i] = varianceBlob[i] * scaleFactor; } } else { gLogError << "Unknown batch norm mode" << std::endl; return 0; } // Get input tensor unsigned inputIndex = getInputIndex(N, II); Value *InputTensor = getBindingTensor(inputIndex); ITensor *InputNVDLATensor = getNVDLAInputTensor(InputTensor); auto *Layer = Network->addBatchNorm(InputNVDLATensor, mode, mean, variance, eps); mapOutputTensor(N, Layer->getOutput(0)); } */ unsigned CGT_NVDLA::identifyOutputs() { std::set< ITensor* > outputTensors; std::set< ITensor* > InputTensors; for (int l = 0; l < Network->getNumLayers(); ++l) { ILayer* layer = Network->getLayer(l); assert(layer && "Illegal NVDLA compiler IR!"); for (int ii = 0; ii < layer->getNumInputs(); ++ii) { InputTensors.insert(layer->getInput(ii)); } for (int oo = 0; oo < layer->getNumOutputs(); ++oo) { outputTensors.insert(layer->getOutput(oo)); } } for (std::set<ITensor*>::iterator oi = outputTensors.begin(); oi != outputTensors.end(); ++oi) { // An output tensor which is not an input to any other layers is a Network output tensor if (InputTensors.find(*oi) == InputTensors.end()) Network->markOutput(*oi); } return Network->getNumOutputs(); } void CGT_NVDLA::codeGen(DFLeafNode *N) { // No allocation nodes allowed. assert(!N->isAllocationNode() && "Allocation Node not expected in ApproxHPVM"); // Skip code generation if it is a dummy node if(N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); return; } // Generate code only if it has the right hint //if (!checkPreferredTarget(N, hpvm::NVDLA_TARGET)) { // DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"); // return; // } // Get the function associated with the dataflow node auto *F = N->getFuncPointer(); DEBUG(errs()<<"function name = "<< F->getName()<<"\n"); // Generate code for every instruction in this node for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &(*i); if (BuildDFG::isViscIntrinsic(I)) { auto *II = dyn_cast<IntrinsicInst>(I); assert((II->getCalledFunction()->getName()).startswith("llvm.hpvm.tensor") && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); switch (II->getIntrinsicID()) { case Intrinsic::hpvm_tensor_convolution: case Intrinsic::hpvm_tensor_group_convolution: generateConvolutionLayer(N, II); break; case Intrinsic::hpvm_tensor_batchnorm: generateBatchNormLayer(N, II); break; case Intrinsic::hpvm_tensor_mul: generateGemmLayer(N, II); break; case Intrinsic::hpvm_tensor_add: // Add not explicitly supported by NVDLA compiler! break; case Intrinsic::hpvm_tensor_pool_max: case Intrinsic::hpvm_tensor_pool_mean: generatePoolingLayer(N, II); break; case Intrinsic::hpvm_tensor_relu: generateReluLayer(N, II); break; case Intrinsic::hpvm_tensor_clipped_relu: // No need to generate NVDLA IR for this? 
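      // Note: clipped ReLU is currently dropped (no NVDLA layer is emitted for it);
      // only the plain activations used elsewhere in this backend (kRELU, kTANH)
      // are lowered.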
break; case Intrinsic::hpvm_tensor_tanh: generateTanhLayer(N, II); break; case Intrinsic::hpvm_tensor_softmax: generateSoftMaxLayer(N, II); break; default: llvm_unreachable("Unknown HPVM Intrinsic!"); break; } } } } void CGT_NVDLA::codeGen(DFInternalNode* N) { DEBUG(errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"); DEBUG(errs () << "Skipping internal node\n"); } NvDlaError CGT_NVDLA::parseSetup(const TestAppArgs* appArgs, TestInfo* i) { return NvDlaSuccess; } NvDlaError CGT_NVDLA::transformHPVM2NVDLA(const TestAppArgs* appArgs, TestInfo* i) { NVDLA_UNUSED(appArgs); NvDlaError e = NvDlaSuccess; Network = nullptr; Network = nvdla::createNetwork(); if (!Network) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createNetwork() failed"); // Iterate over all the DFGs and produce code for each one of them for(auto &RootNode: *(appArgs->Roots)) visit(RootNode); // if the application has so far not marked the network's outputs, allow the parser to do so now if (Network->getNumOutputs() <= 0) { int outs = identifyOutputs(); DEBUG(NvDlaDebugPrintf("Marking total %d outputs\n", outs)); if (outs <= 0) ORIGINATE_ERROR_FAIL(NvDlaError_BadValue, "Unable to identify outputs for the network: %d", outs); } if (appArgs->computePrecision == nvdla::DataType::INT8) { if (appArgs->calibTable != "") { DEBUG(NvDlaDebugPrintf("parsing calibration table...\n")); PROPAGATE_ERROR_FAIL(readTensorScales(appArgs, i, Network)); } else { DEBUG(NvDlaDebugPrintf("initialize all tensors with const scaling factors of 127...\n")); PROPAGATE_ERROR_FAIL(generateTensorScales(appArgs, i, Network)); } } DEBUG(NvDlaDebugPrintf("attaching parsed network to the wisdom...\n")); if (!i->wisdom->setNetworkTransient(Network)) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->setNetworkTransient() failed"); return NvDlaSuccess; fail: return e; } NvDlaError CGT_NVDLA::parseAndCompile(const TestAppArgs* appArgs, TestInfo* i) { NvDlaError e = NvDlaSuccess; bool isCaffe = appArgs->caffemodel != ""; PROPAGATE_ERROR_FAIL(parseSetup(appArgs, i)); DEBUG(NvDlaDebugPrintf("creating new wisdom context...\n")); i->wisdom = nvdla::createWisdom(); if (!i->wisdom) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createWisdom() failed"); DEBUG(NvDlaDebugPrintf("opening wisdom context...\n")); if (!i->wisdom->open(i->wisdomPath)) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->open() failed to open: \"%s\"", i->wisdomPath.c_str()); // Parse PROPAGATE_ERROR_FAIL(transformHPVM2NVDLA(appArgs, i)); // Compile PROPAGATE_ERROR_FAIL(compileProfile(appArgs, i)); /* Destroy network before closing wisdom context */ nvdla::destroyNetwork(i->wisdom->getNetwork()); DEBUG(NvDlaDebugPrintf("closing wisdom context...\n")); i->wisdom->close(); fail: if (i->wisdom != NULL) { nvdla::destroyWisdom(i->wisdom); i->wisdom = NULL; } return e; } NvDlaError CGT_NVDLA::testSetup(const TestAppArgs* appArgs, TestInfo* i) { NvDlaError e = NvDlaSuccess; std::string wisdomPath = appArgs->outputPath + "wisdom.dir/"; std::string removeCmd = ""; std::string imagePath = ""; NvDlaStatType stat; int ii = 0; // Do input paths exist? e = NvDlaStat(appArgs->inputPath.c_str(), &stat); if (e != NvDlaSuccess) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Input path does not exist: \"%s\"", appArgs->inputPath.c_str()); // Do output paths exist? 
e = NvDlaStat(appArgs->outputPath.c_str(), &stat); if (e != NvDlaSuccess) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Output path does not exist: \"%s\"", appArgs->outputPath.c_str()); // Clear wisdomPath if any exist removeCmd += "rm -rf " + wisdomPath; ii = std::system(removeCmd.c_str()); // This is pretty awful if (ii != 0) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "system command failed: \"%s\"", removeCmd.c_str()); PROPAGATE_ERROR_FAIL(NvDlaMkdir(const_cast<char *>(wisdomPath.c_str()))); // Initialize TestInfo i->wisdom = NULL; i->wisdomPath = wisdomPath; i->pData = NULL; return NvDlaSuccess; fail: return e; } NvDlaError CGT_NVDLA::launchTest(const TestAppArgs* appArgs) { NvDlaError e = NvDlaSuccess; TestInfo testInfo; PROPAGATE_ERROR_FAIL(testSetup(appArgs, &testInfo)); PROPAGATE_ERROR_FAIL(parseAndCompile(appArgs, &testInfo)); return NvDlaSuccess; fail: return e; } bool HPVM2NVDLA::runOnModule(Module &M) { DEBUG(errs() << "**************HPVM2NVDLA PASS****************\n"); NvDlaError e = NvDlaError_TestApplicationFailed; TestAppArgs testAppArgs = defaultTestAppArgs; // Get the HPVM IR graph BuildDFG &DFG = getAnalysis<BuildDFG>(); std::vector<DFInternalNode *> Roots = DFG.getRoots(); // Visitor for Code Generation Graph Traversal CGT_NVDLA *CGTVisitor = new CGT_NVDLA(M, DFG); if(ComputePrecision == "INT8" || ComputePrecision == "int8") { testAppArgs.computePrecision = nvdla::DataType::INT8; testAppArgs.quantizationMode = nvdla::QuantizationMode::PER_KERNEL; testAppArgs.configtarget = std::string("nv_small"); } else { testAppArgs.computePrecision = nvdla::DataType::HALF; testAppArgs.quantizationMode = nvdla::QuantizationMode::NONE; testAppArgs.configtarget = std::string("nv_full"); } testAppArgs.profileName = std::string("hpvm-mod"); testAppArgs.calibTable = CalibTablePath;//std::string("output_scales.txt"); testAppArgs.outputPath = std::string("."); testAppArgs.inDataFormat = nvdla::DataFormat::NCHW; testAppArgs.Roots = &Roots; e = CGTVisitor->launchTest(&testAppArgs); if (e != NvDlaSuccess) DEBUG(errs() << "ERROR\n"); else DEBUG(errs() << "SUCESS\n"); delete CGTVisitor; return false; } NvDlaError CGT_NVDLA::compileProfile(const TestAppArgs* appArgs, TestInfo* i) { NvDlaError e = NvDlaSuccess; std::string profileName = ""; std::string targetConfigName = ""; NvDlaFileHandle file = 0; std::string fileName = ""; NvU8 *buffer = 0; NvU64 size = 0; nvdla::ICompiler* compiler = i->wisdom->getCompiler(); if (!compiler) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getCompiler() failed"); if (!(appArgs->configtarget != "")) ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No target config found to load"); targetConfigName = appArgs->configtarget; // Determine profile PROPAGATE_ERROR_FAIL(generateProfile(appArgs, &profileName, i)); // Compile DEBUG(NvDlaDebugPrintf("compiling profile \"%s\"... 
config \"%s\"...\n", profileName.c_str(), targetConfigName.c_str())); PROPAGATE_ERROR_FAIL(compiler->compile(profileName.c_str(), targetConfigName.c_str(), &i->compiledLoadable)); // Get loadable buffer and dump it into a file PROPAGATE_ERROR_FAIL(compiler->getLoadableImageSize(profileName.c_str(), &size)); if (size == 0) { ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Invalid size for a loadable"); } buffer = (NvU8 *) NvDlaAlloc(size); if (buffer == NULL) { ORIGINATE_ERROR_FAIL(NvDlaError_InsufficientMemory, "Failed to allocate buffer for loadable"); } PROPAGATE_ERROR_FAIL(compiler->getLoadableImage(profileName.c_str(), buffer)); fileName = profileName + ".nvdla"; errs() << "Writing NVDLA module '" << fileName << "' ..."; PROPAGATE_ERROR_FAIL(NvDlaFopen(fileName.c_str(), NVDLA_OPEN_WRITE, &file)); PROPAGATE_ERROR_FAIL(NvDlaFwrite(file, buffer, size)); errs() << " done.\n"; fail: NvDlaFclose(file); if (buffer != NULL) NvDlaFree(buffer); return e; } NvDlaError CGT_NVDLA::generateProfile(const TestAppArgs* appArgs, std::string* profileName, TestInfo* i) { NvDlaError e = NvDlaSuccess; nvdla::DataFormat inDataFormat = nvdla::DataFormat::UNKNOWN; if (appArgs->profileName != "") { // init named profile (basic/default/performance) with default params in its constructor and exit DEBUG(errs() << "PROFILE NAME PROVIDED\n"); PROPAGATE_ERROR_FAIL(beginWithNamedProfile(appArgs, i)); *profileName = appArgs->profileName; } else { ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No profile supplied to load"); } // capture profile params from command line (override the existing ones as necessary) inDataFormat = inDataFormat == nvdla::DataFormat::UNKNOWN ? appArgs->inDataFormat : inDataFormat; PROPAGATE_ERROR_FAIL(updateProfileWithCmdLineArgs(appArgs, i, profileName->c_str(), inDataFormat)); fail: return e; } NvDlaError CGT_NVDLA::beginWithNamedProfile(const TestAppArgs* appArgs, TestInfo* i) { NvDlaError e = NvDlaSuccess; nvdla::IProfiler* profiler; nvdla::IProfile* profile; profiler = i->wisdom->getProfiler(); if ( !profiler ) { ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profiler not initialized"); } profile = profiler->getProfile(appArgs->profileName.c_str()); if ( !profile ) { ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profile %s not initialized", appArgs->profileName.c_str()); } fail: return e; } NvDlaError CGT_NVDLA::updateProfileWithCmdLineArgs ( const TestAppArgs* appArgs, TestInfo* i, const char* profileName, nvdla::DataFormat inDataFormat ) { NvDlaError e = NvDlaSuccess; nvdla::IProfiler* profiler; nvdla::IProfile* profile; profiler = i->wisdom->getProfiler(); if (!profiler) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getProfiler() failed"); profile = profiler->getProfile(profileName); if (!profile) ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "profiler->getProfile() failed"); PROPAGATE_ERROR_FAIL(profile->setComputePrecision(appArgs->computePrecision)); PROPAGATE_ERROR_FAIL(profile->setNetworkInputDataFormat(inDataFormat)); // determine input surface format switch(inDataFormat) { case nvdla::DataFormat::NHWC: if (appArgs->computePrecision == nvdla::DataType::HALF) { PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A16B16G16R16_F)); } else if (appArgs->computePrecision == nvdla::DataType::INT8) { PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A8B8G8R8)); } else { ORIGINATE_ERROR_FAIL(NvDlaError_NotSupported, "NHWC and compute precision %u is not yet supported", appArgs->computePrecision.v()); } break; case 
nvdla::DataFormat::NCxHWx: case nvdla::DataFormat::NCHW: case nvdla::DataFormat::UNKNOWN: // atleast start the test with feature data format default: if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0) PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8)); else PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE)); } // determine int8 cfgs if (appArgs->computePrecision == nvdla::DataType::INT8) { PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::PER_TENSOR)); switch(appArgs->quantizationMode) { case nvdla::QuantizationMode::PER_FILTER: PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_FILTER)); break; case nvdla::QuantizationMode::PER_KERNEL: case nvdla::QuantizationMode::NONE: // default to per-kernel; find a way to run int8 tests w/ NONE qtzMode cleanly default: PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_KERNEL)); } } else { PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::NONE)); PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::NONE)); } PROPAGATE_ERROR_FAIL(profile->setNetworkOutputDataFormat(nvdla::DataFormat::NCxHWx)); if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0) PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8)); else PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE)); if (appArgs->numBatches > 0) PROPAGATE_ERROR_FAIL(profile->setMultiBatchSize(appArgs->numBatches)); fail: return e; } NvDlaError CGT_NVDLA::generateTensorScales(const TestAppArgs* appArgs, TestInfo* i, nvdla::INetwork* network) { NvDlaError e = NvDlaSuccess; std::vector<nvdla::ILayer*> networkLayers = network->getLayers(); std::vector<nvdla::ITensor*> networkInputs = network->getInputs(); std::vector<nvdla::ILayer*>::iterator li = networkLayers.begin(); std::vector<nvdla::ITensor*>::iterator nii = networkInputs.begin(); // set scaling factor for the network input tensors for (; nii != networkInputs.end(); ++nii) { NvF32 scale = 1; NvF32 min = scale * -127.0f; NvF32 max = scale * 127.0f; std::string tName = (*nii)->getName(); DEBUG(errs() << "INPUT NAME: " << tName << "\n"); // set same dynamic range for all channels of the tensor (cIndex = -1) PROPAGATE_ERROR_FAIL( (*nii)->setChannelDynamicRange(-1, min, max) ); const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale)); if (0) NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", tName.c_str(), scale); } for (; li != networkLayers.end(); ++li) { NvF32 scale = 127; NvF32 min = scale * -127.0f; NvF32 max = scale * 127.0f; std::string lName = (*li)->getName(); nvdla::ITensor* outTensor = (*li)->getOutput(0); DEBUG(errs() << "LAYER NAME: " << lName << "\n"); // set same dynamic range for all channels of the tensor (cIndex = -1) PROPAGATE_ERROR_FAIL( outTensor->setChannelDynamicRange(-1, min, max) ); const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(lName, scale)); if (0) NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", lName.c_str(), scale); } fail: return e; } NvDlaError CGT_NVDLA::readTensorScales(const TestAppArgs* appArgs, TestInfo *i, nvdla::INetwork* network) { NvDlaError e = NvDlaSuccess; NvDlaStatType stat; std::string calibTableFile = /*i->calibTablesPath + "/" + */appArgs->calibTable; 
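    // The calibration table is parsed below as plain text, one "name:scale" entry
    // per line with whitespace ignored, e.g. (hypothetical values):
    //   input:0.0078
    //   conv1:0.125
    // The "input" entry seeds the network input range; layers missing from the
    // table fall back to a scale of 1.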
//PROPAGATE_ERROR_FAIL(NvDlaStat(calibTableFile.c_str(), &stat)); DEBUG(errs() << "***********READING TENSOR SCALESi*************\n"); std::ifstream infile(calibTableFile.c_str()); std::string line; std::map<std::string, float> LayerNameToScaleMap; while (std::getline(infile, line)) { DEBUG(errs() << "READ LINE: " << line << "\n"); line.erase(remove(line.begin(), line.end(), ' '), line.end()); DEBUG(errs() << "READ LINE WITHOUT WHITE SPACES: " << line << "\n"); std::string delimiter = ":"; std::string layer_name = line.substr(0, line.find(delimiter)); std::string Scale = line.substr(line.find(delimiter) + 1); DEBUG(errs() << "LAYER NAME: " << layer_name << "\n"); DEBUG(errs() << "SCALE: " << Scale << "\n"); size_t size; LayerNameToScaleMap[layer_name] = std::stof(Scale, &size); } infile.close(); DEBUG(errs() << "GOT TENSOR SCALES FROM CALIB TABLE\n"); std::vector<nvdla::ILayer*> networkLayers = network->getLayers(); std::vector<nvdla::ITensor*> networkInputs = network->getInputs(); for (auto *Input : networkInputs) { NvF32 scale = 0.0f; NvF32 min = 0.0f; NvF32 max = 0.0f; DEBUG(errs() << "SET SCALE FOR INPUT\n"); scale = LayerNameToScaleMap["input"]; DEBUG(errs() << "INPUT SCALE: " << scale << "\n"); min = scale * -127.0f; max = scale * 127.0; PROPAGATE_ERROR_FAIL(Input->setChannelDynamicRange(-1, min, max) ); const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>("data", scale)); } DEBUG(errs() << "PER LAYER CALIB\n"); for (auto *Layer : networkLayers) { NvF32 scale = 0.0f; NvF32 min = 0.0f; NvF32 max = 0.0f; std::string tName = Layer->getName(); DEBUG(errs() << "SETTING SCALE FOR LAYER NAME: " << tName << "\n"); nvdla::ITensor* outTensor = Layer->getOutput(0); auto it = LayerNameToScaleMap.find(tName); if (it != LayerNameToScaleMap.end()) { DEBUG(errs() << "SET SCALE FOR NAME: " << tName << "\n"); DEBUG(errs() << "SCALE: " << it->second << "\n"); scale = it->second; min = scale * -127.0f; max = scale * 127.0f; } else { DEBUG(errs() << "SET DEFAULT SCALE FOR NAME: " << tName << "\n"); DEBUG(errs() << "SCALE: 1\n"); scale = 1; min = scale * -127.0f; max = scale * 127.0f; } //else { // ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Atleast 1 of scale or min-max should be specified for %s\n", tName.c_str()); //} PROPAGATE_ERROR_FAIL( outTensor->setChannelDynamicRange(-1, min, max) ); const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale)); } DEBUG(errs() << "DONE PARSING CALIBRATION TABLE\n"); fail: return e; }
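// Usage sketch (hypothetical invocation; the exact plugin library name and any
// additional HPVM driver flags depend on how this pass is built and registered):
//   opt -load LLVMHPVM2NVDLA.so -hpvm-nvdla \
//       -cprecision=int8 -calib-table=scales.txt input.hpvm.ll -o /dev/null
// "hpvm-nvdla" matches the RegisterPass name above; -cprecision and -calib-table
// map to the cl::opt flags declared at the top of this file (-calib-table is
// required).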