#define ENABLE_ASSERTS
#define DEBUG_TYPE "DFG2NVDLA"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/IR/Attributes.h"
#include "llvm/ADT/STLExtras.h"
#include "SupportHPVM/DFG2LLVM.h"
#include <sstream>
#include <fstream>
#include <vector>
#include <map>
#include <set>
#include <algorithm>
#include "dlaerror.h"
#include "dlatypes.h"
#include "nvdla/IRuntime.h"
#include "DlaImageUtils.h"
#include "ErrorMacros.h"
#include "nvdla_inf.h"
#include "nvdla_os_inf.h"
#include "nvdla/IType.h"
#include "nvdla/ITensor.h"
#include "nvdla/INetwork.h"
#include "nvdla/ILayer.h"
#include "nvdla/IProfiler.h"
#include "nvdla/IProfile.h"
#include "nvdla/ICompiler.h"
#include "nvdla/ILoadable.h"
#include "nvdla/IWisdom.h"
#include "rapidjson/document.h"
#include "rapidjson/filereadstream.h"
#include "rapidjson/error/en.h"
#include "half.h"
using namespace llvm;
using namespace builddfg;
using namespace dfg2llvm;
using namespace nvdla;
typedef half_float::half float16;
static cl::opt<std::string> ComputePrecision("cprecision",
cl::desc("Compute precision (int8 or fp16)."), cl::init("float16"));
static cl::opt<std::string> CalibTablePath("calib-table",
cl::desc("Path to tensor scales file"),
cl::value_desc("filename"), cl::Required);
#define DEFAULT_BATCH_SIZE 0
#define DEFAULT_DATA_FMT nvdla::DataFormat::NCHW
#define DEFAULT_QUANT_MODE nvdla::QuantizationMode::NONE
#define TARGET_CONFIG_NAME "nv_full"
#define TEST_PARAM_FILE_MAX_SIZE 65536
struct HPVM2NVDLA : public ModulePass {
static char ID; // Pass identification, replacement for typeid
HPVM2NVDLA() : ModulePass(ID) {}
public:
// Functions
virtual bool runOnModule(Module &M);
void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<BuildDFG>();
AU.addPreserved<BuildDFG>();
}
private:
//bool transformHPVM2NVDLA(Module &M);
//void codeGenHPVM2NVDLA(CGT_NVDLA *, DFNode *);
};
struct TestAppArgs
{
std::string project;
std::string inputPath;
std::string inputName;
std::string outputPath;
std::string testname;
std::string testArgs;
std::string prototxt; // This should be folded into testArgs
std::string caffemodel; // This should be folded into testArgs
std::string cachemodel; // This should be folded into testArgs
std::string profileName; // ok here?
std::string profileFile;
std::string configtarget;
std::string calibTable;
nvdla::QuantizationMode quantizationMode;
Module *M;
std::vector<DFInternalNode *> *Roots;
NvU16 numBatches;
nvdla::DataFormat inDataFormat;
nvdla::DataType computePrecision;
std::map<std::string, NvF32> tensorScales;
};
struct TestInfo
{
// common
nvdla::IWisdom* wisdom;
std::string wisdomPath;
// parse
std::string modelsPath;
std::string profilesPath;
std::string calibTablesPath;
// runtime
// nvdla::IRuntime* runtime;
nvdla::ILoadable* compiledLoadable;
NvU8 *pData;
//std::string inputImagesPath;
//std::string inputLoadablePath;
// std::map<std::string, NvDlaImage*> inputImages;
// std::map<std::string, void *> inputBuffers;
// std::map<std::string, NvDlaImage*> outputImages;
// std::map<std::string, void *> outputBuffers;
// std::vector<SubmitContext*> submits;
NvU32 timeout;
NvU16 numBatches; // runtime's point-of-view
NvU32 numSubmits;
};
static TestAppArgs defaultTestAppArgs =
{
/* .project = */ "OpenDLA",
/* .inputPath = */ "./",
/* .inputName = */ "",
/* .outputPath = */ "./",
/* .testname = */ "",
/* .testArgs = */ "",
/* .prototxt = */ "",
/* .caffemodel = */ "",
/* .cachemodel = */ "",
/* .profileName = */ "fast-math",
/* .profileFile = */ "",
/* .configtarget = */ TARGET_CONFIG_NAME,
/* .calibTable = */ "",
/* .quantizationMode = */ DEFAULT_QUANT_MODE,
nullptr, nullptr,
/* .numBatches = */ DEFAULT_BATCH_SIZE,
/* .inDataFormat = */ DEFAULT_DATA_FMT,
/* .computePrecision = */ nvdla::DataType::INT8
};
char HPVM2NVDLA::ID = 0;
static RegisterPass<HPVM2NVDLA> X("hpvm-nvdla",
"Dataflow Graph to NVDLA IR Pass",
false, false);
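// Example invocation (a sketch, not the project's documented driver): assuming this
// pass is linked into an opt-loadable plugin (the library name LLVMHPVM2NVDLA.so
// below is hypothetical), it can be driven with the flags defined above:
//   opt -load LLVMHPVM2NVDLA.so -hpvm-nvdla \
//       -cprecision=int8 -calib-table=tensor_scales.txt \
//       module.ll -disable-output
// -calib-table is cl::Required, so a scales file path must always be supplied.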
// Visitor for Code generation traversal of HPVM IR
class CGT_NVDLA : public CodeGenTraversal {
private:
// Data information
//DataFormat InDataFormat;
//DataType ComputePrecision;
//QuantizationMode Quantization;
//NvU16 NumBatches;
// Wisdom and network information
IWisdom *Wisdom;
INetwork *Network;
std::map<std::string, int> LayerNameMap;
// Maps dataflow edges in HPVM IR to Tensors in NVDLA IR
DenseMap<const DFEdge *, ITensor *> EdgeToTensorMap;
// Virtual Functions
void init();
void initRuntimeAPI();
void codeGen(DFInternalNode* N);
void codeGen(DFLeafNode* N);
// Codegen functions for all supported layers
void generateConvolutionLayer(DFLeafNode *, const IntrinsicInst *);
void generatePoolingLayer(DFLeafNode *, const IntrinsicInst *);
void generateBatchNormLayer(DFLeafNode *, const IntrinsicInst *);
void generateReluLayer(DFLeafNode *, const IntrinsicInst *);
void generateGemmLayer(DFLeafNode *, const IntrinsicInst *);
void generateSoftMaxLayer(DFLeafNode *, const IntrinsicInst *);
void generateTanhLayer(DFLeafNode *, const IntrinsicInst *);
// Map edges to output tensors
void mapOutputTensor(DFNode *N, ITensor *Tensor);
// Get input tensors to nodes
ITensor *getIntermediateInputTensor(DFNode *N);
// Get binding tensors to nodes
User *getBindingTensor(DFLeafNode* N, unsigned index);
// Get the input NVDLA tensors to nodes
ITensor *getNVDLAInputTensor(DFLeafNode* N, const User *InputTensor);
// Get index for an input tensor
unsigned getInputIndex(DFLeafNode* N, const IntrinsicInst *II);
// Gets nodes with add ops meant to be combined with convolution and gemm
void getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes,
SmallVector<IntrinsicInst *, 4> &AddInsts);
// Getting weights
Weights readTrainedWeights(User *TensorPtr,
int dim1_size, int dim2_size,
int dim3_size, int dim4_size);
// Identify outputs
unsigned identifyOutputs();
// Generate profile based on data parameters
//void generateProfile(std::string &, std::string &);
std::string getLayerName(std::string Name);
public:
CGT_NVDLA(Module &_M, BuildDFG &_DFG)
: CodeGenTraversal(_M, _DFG) {// : Network(nullptr) {
//initRuntimeAPI();
init();
}
//void destroySetUp();
//void setUpWisdom();
//void compileProfile();
//void transformHPVM2NVDLA(DFNode *);
NvDlaError generateTensorScales(const TestAppArgs*, TestInfo*, nvdla::INetwork*);
NvDlaError updateProfileWithCmdLineArgs(const TestAppArgs*, TestInfo*, const char*, nvdla::DataFormat);
NvDlaError beginWithNamedProfile(const TestAppArgs*, TestInfo*);
NvDlaError generateProfile(const TestAppArgs*, std::string*, TestInfo*);
NvDlaError compileProfile(const TestAppArgs*, TestInfo*);
NvDlaError launchTest(const TestAppArgs*);
NvDlaError testSetup(const TestAppArgs*, TestInfo*);
NvDlaError parseAndCompile(const TestAppArgs*, TestInfo*);
NvDlaError transformHPVM2NVDLA(const TestAppArgs*, TestInfo*);
NvDlaError parseSetup(const TestAppArgs*, TestInfo*);
NvDlaError readTensorScales(const TestAppArgs* appArgs, TestInfo *i, nvdla::INetwork* network);
};
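// Overall flow of this backend, as implemented below: HPVM2NVDLA::runOnModule()
// fills in a TestAppArgs and calls launchTest(), which runs testSetup() (recreates
// wisdom.dir/) and parseAndCompile(). parseAndCompile() opens a wisdom context,
// calls transformHPVM2NVDLA() -- the DFG visitor that builds the NVDLA INetwork
// through codeGen() and the generate*Layer() methods -- and then compileProfile(),
// which compiles the network and writes the loadable image to disk.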
void CGT_NVDLA::init() {
// Default parameters
//InDataFormat = DataFormat::NCHW;
//ComputePrecision = DataType::FLOAT;
//Quantization = QuantizationMode::NONE;
//NumBatches = 0;
}
void CGT_NVDLA::initRuntimeAPI() {
// Nothing to do here!
}
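// readTrainedWeights() below expects the file named by the global string to be a
// flat, headerless dump (file_header_size is 0) of dim1*dim2*dim3*dim4 half-precision
// values, laid out in the same order the dimensions are passed in. A minimal sketch
// of producing such a file (hypothetical names n/c/h/w and output path, shown for
// illustration only):
//   std::vector<float16> w(n * c * h * w);
//   // ... fill w with the trained kernel values ...
//   std::ofstream out("conv1_w.bin", std::ios::binary);
//   out.write(reinterpret_cast<const char *>(w.data()), w.size() * sizeof(float16));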
Weights CGT_NVDLA::readTrainedWeights(User *TensorPtr,
int dim1_size, int dim2_size,
int dim3_size, int dim4_size) {
DEBUG(errs() << "READ TRAINED WEIGHTS\n");
// Get weights file name
User *MemcpyPtr = dyn_cast<User>(TensorPtr->getOperand(0));
DEBUG(MemcpyPtr->print(errs()));
DEBUG(errs() << "\n");
while(!dyn_cast<AllocaInst>(MemcpyPtr)) {
MemcpyPtr = dyn_cast<User>(MemcpyPtr->getOperand(0));
}
User *MemcpyArg = nullptr;
for(User *U: MemcpyPtr->users()) {
DEBUG(U->print(errs()));
DEBUG(errs() << "\n");
if(auto *BCO = dyn_cast<BitCastOperator>(U)) {
for(User *CU: BCO->users()) {
if(auto *CI = dyn_cast<CallInst>(CU)) {
// Only follow calls to memcpy; their second operand carries the file name string.
if(CI->getCalledFunction()->getName().contains(StringRef("memcpy"))) {
MemcpyArg = dyn_cast<User>(CI->getOperand(1));
break;
}
}
}
if(MemcpyArg)
break;
}
}
assert(MemcpyArg && "File name not found.");
auto *WeightFileName = dyn_cast<GlobalVariable>(MemcpyArg->getOperand(0));
assert(WeightFileName && "Weight file name must be a global variable.");
auto* CDA = dyn_cast<ConstantDataArray>(WeightFileName->getInitializer());
assert(CDA && "Weight file name must be a constant array.");
const auto &file_name = std::string(CDA->getAsString());
// Read the weights file
int num_elem = dim1_size * dim2_size * dim3_size * dim4_size;
int size_in_bytes = sizeof(float16) * num_elem;
//DEBUG(errs() << "float16 size: " << sizeof(float16) << "\n");
DEBUG(errs() << "size in bytes: " << size_in_bytes << "\n");
void *tensor_data = (void *) malloc(size_in_bytes);
int file_header_size = 0;
DEBUG(errs() << "FILE NAME: " << file_name << "\n");
FILE *file = fopen(file_name.c_str(), "rb");
if(!file) {
DEBUG(errs() << "Data file is not found. Aborting.\n");
abort();
}
fseek(file, file_header_size, SEEK_CUR);
size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
DEBUG(errs() << "BYTES READ: " << bytes_read << "\n");
fclose(file);
// Create weight tensors
auto Weight = Weights(DataType::HALF, tensor_data, NvS64(num_elem));
//FILE *try_file = fopen("temp.bin", "wb");
//fwrite(Weight.values, sizeof(float), num_elem, try_file);
//fclose(try_file);
//exit(-1);
return Weight;
}
// For a tensor to be an input weight tensor, it has to come from the root node
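// The lookup below walks the incoming dataflow edge of the leaf node upwards: as
// long as the source is an internal (non-leaf) node, it follows the bind edge at
// the same argument position until it reaches the root. It then finds the
// hpvm_launch call for the root, locates the getelementptr into the launch
// argument struct at that position, and returns the tensor pointer stored there.
// If the edge instead originates from another leaf node, the input is an
// intermediate tensor and nullptr is returned.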
User *CGT_NVDLA::getBindingTensor(DFLeafNode* N, unsigned index) {
// HPVM internal API needs fixing. Remove this lambda function when bug is fixed.
auto NodeIsRoot = [](DFNode &InternalNode) {
auto *RootFunction = InternalNode.getFuncPointer();
for(User *U: RootFunction->users()) {
DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
DEBUG(U->print(errs()));
DEBUG(errs() << "\n");
auto *II = dyn_cast<IntrinsicInst>(U);
if(!II) {
auto *BCI = dyn_cast<BitCastOperator>(U);
assert(BCI && "Not a bitcast instruction.");
for(User *BCU : BCI->users()) {
DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
DEBUG(BCU->print(errs()));
DEBUG(errs() << "\n");
II = dyn_cast<IntrinsicInst>(BCU);
if(II)
break;
}
}
if(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch)) {
DEBUG(errs() << "LAUNCH FUNCTION: ");
DEBUG(II->print(errs()));
DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n");
return true;
}
}
DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n");
return false;
};
auto NodeIsLeaf = [](DFNode &Node) {
auto *NodeFunction = Node.getFuncPointer();
for(User *U: NodeFunction->users()) {
DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
DEBUG(U->print(errs()));
DEBUG(errs() << "\n");
auto *II = dyn_cast<IntrinsicInst>(U);
if(!II) {
auto *BCI = dyn_cast<BitCastOperator>(U);
assert(BCI && "Not a bitcast instruction.");
for(User *BCU : BCI->users()) {
DEBUG(errs() << "USER FOR INTERNAL NODE IN LAMBDA FUNCTION: ");
DEBUG(BCU->print(errs()));
DEBUG(errs() << "\n");
II = dyn_cast<IntrinsicInst>(BCU);
if(II)
break;
}
}
if(II
&& (II->getIntrinsicID() == Intrinsic::hpvm_createNode
|| II->getIntrinsicID() == Intrinsic::hpvm_createNode1D
|| II->getIntrinsicID() == Intrinsic::hpvm_createNode2D
|| II->getIntrinsicID() == Intrinsic::hpvm_createNode3D)) {
DEBUG(errs() << "CREATE NODE FUNCTION: ");
DEBUG(II->print(errs()));
DEBUG(errs() << "LAMBDA FUNCTION RETURN TRUE\n");
// Ensure that the node function does not have these create node intrinsics
for(inst_iterator i = inst_begin(NodeFunction),
e = inst_end(NodeFunction); i != e; ++i) {
Instruction *I = &(*i);
if(auto *II = dyn_cast<IntrinsicInst>(I)) {
if(II->getIntrinsicID() == Intrinsic::hpvm_createNode
|| II->getIntrinsicID() == Intrinsic::hpvm_createNode1D
|| II->getIntrinsicID() == Intrinsic::hpvm_createNode2D
|| II->getIntrinsicID() == Intrinsic::hpvm_createNode3D) {
DEBUG(errs() << "--LAMBDA FUNCTION RETURN FALSE\n");
return false;
}
}
}
return true;
}
}
DEBUG(errs() << "LAMBDA FUNCTION RETURN FALSE\n");
return false;
};
DEBUG(errs() << "GET BINDING TENSOR\n");
DEBUG(errs() << "GIVEN INDEX: " << index << "\n");
DFEdge *DE = N->getInDFEdgeAt(index);
assert(DE && "Data edge does not exist at given index");
DEBUG(errs() << "LEAF NODE FUNCTION: " << N->getFuncPointer()->getName() << "\n");
// Get the argument position in the root node.
DEBUG(errs() << "GET TO THE ROOT FIRST\n");
auto *InternalNode = DE->getSourceDF();
DEBUG(errs() << "INTERNAL NODE FUNCTION: " << InternalNode->getFuncPointer()->getName() << "\n");
DEBUG(errs() << "INTERNAL NDOE POINTER: " << InternalNode << "\n");
if(NodeIsLeaf(*InternalNode)) {
DEBUG(errs() << "BIND NONE: EDGE FROM LEAF NODE\n");
return nullptr;
}
unsigned argPos = DE->getSourcePosition();
DEBUG(errs() << "ARG POSITION BEFORE LOOP: " << argPos << "\n");
while(!NodeIsRoot(*InternalNode)) {
DEBUG(errs() << "IN LOOP\n");
if(NodeIsLeaf(*InternalNode)) {
DEBUG(errs() << "IN LOOP BIND NONE: EDGE FROM LEAF NODE\n");
return nullptr;
}
argPos = DE->getSourcePosition();
DE = InternalNode->getInDFEdgeAt(argPos);
if(!DE) {
DEBUG(errs() << "NO BINDING EDGE IN LOOP\n");
// No binding edge.
return nullptr;
}
InternalNode = DE->getSourceDF();
DEBUG(errs() << "INTERNAL NODE FUNCTION IN LOOP: " << InternalNode->getFuncPointer()->getName() << "\n");
DEBUG(errs() << "IN LOOP DATA EDGE: " << DE << "\n");
DEBUG(errs() << "IN LOOP ARG POSITION: " << argPos << "\n");
}
DEBUG(errs() << "ARG POSITION: " << argPos << "\n");
DEBUG(errs() << "GET THE LAUNCH FUNCTION\n");
// Now we have the root node. We need to get the launch functions for it.
auto *RootFunction = InternalNode->getFuncPointer();
for(User *U: RootFunction->users()) {
DEBUG(errs() << "User for root: ");
DEBUG(U->print(errs()));
IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
if(!II) {
auto *BCI = dyn_cast<BitCastOperator>(U);
assert(BCI && "Not a bitcast instruction.");
for(User *BCU : BCI->users()) {
II = dyn_cast<IntrinsicInst>(BCU);
if(II)
break;
}
}
assert(II && (II->getIntrinsicID() == Intrinsic::hpvm_launch)
&& "Use of a root node must be in launch function call instrinsic.");
DEBUG(errs() << "LAUNCH FUNCTION: ");
DEBUG(II->print(errs()));
// Now, get the arguments to the root and the element pointer into the argument structure.
auto *ArgObj = dyn_cast<Instruction>(II->getOperand(1));
if(auto *BCO = dyn_cast<BitCastOperator>(ArgObj)) {
ArgObj = dyn_cast<Instruction>(BCO->getOperand(0));
} else if (auto *CI = dyn_cast<CallInst>(ArgObj)) {
for(User *CIU : CI->users()) {
auto *BCO = dyn_cast<BitCastOperator>(CIU);
if(BCO) {
ArgObj = dyn_cast<Instruction>(BCO->getOperand(0));
break;
}
}
} else if (auto *AI = dyn_cast<AllocaInst>(ArgObj)) {
for(User *AIU : AI->users()) {
auto *BCO = dyn_cast<BitCastOperator>(AIU);
if(BCO) {
ArgObj = dyn_cast<Instruction>(BCO->getOperand(0));
break;
}
}
}
auto *ArgObjPtrType = dyn_cast<PointerType>(ArgObj->getType());
auto *ArgObjType = dyn_cast<StructType>(ArgObjPtrType->getElementType());
assert(ArgObjType && "Argument to launch must be a structure.");
DEBUG(errs() << "ARG OBJ: ");
DEBUG(ArgObj->print(errs()));
DEBUG(errs() << "\n");
// Use the offset into the structure to get the source tensor.
const auto &DL = ArgObj->getParent()->getParent()->getParent()->getDataLayout();
const auto *SL = DL.getStructLayout(ArgObjType);
uint64_t ElementOffset = SL->getElementOffset(argPos);
DEBUG(errs() << "ELEMENT OFFSET: " << ElementOffset << "\n");
Instruction *StructElemPtr = nullptr;
for(User *U: ArgObj->users()) {
if(auto *GI = dyn_cast<GetElementPtrInst>(U)) {
auto *Offset = dyn_cast<ConstantInt>(GI->getOperand(2));
assert(Offset && "Offset is not constant.");
if(Offset->getZExtValue() == argPos) {//ElementOffset) {
StructElemPtr = GI;
break;
}
}
}
assert(StructElemPtr && "No getelementptr found with given offset.");
DEBUG(StructElemPtr->print(errs()));
DEBUG(errs() << "\n");
DEBUG(errs() << "USE THE STORES TO GET THE BIND TENSOR\n");
// Get store to the element of argument structure to get the pointer to tensor.
for(User *GIU: StructElemPtr->users()) {
DEBUG(GIU->print(errs()));
DEBUG(errs() << "\n");
if(auto *BCO = dyn_cast<BitCastOperator>(GIU)) {
DEBUG(BCO->print(errs()));
DEBUG(errs() << "\n");
for(User *BCU : BCO->users()) {
if(auto *SI = dyn_cast<StoreInst>(BCU)) {
// Get the tensor pointer
DEBUG(SI->print(errs()));
DEBUG(errs() << "\n");
auto *Val = SI->getValueOperand();
if(auto *BCO = dyn_cast<BitCastOperator>(Val)) {
return dyn_cast<User>(BCO->getOperand(0));
}
return dyn_cast<User>(Val);
}
}
}
if(auto *SI = dyn_cast<StoreInst>(GIU)) {
// Get the tensor pointer
DEBUG(SI->print(errs()));
auto *Val = SI->getValueOperand();
if(auto *BCO = dyn_cast<BitCastOperator>(Val)) {
return dyn_cast<User>(BCO->getOperand(0));
}
return dyn_cast<User>(Val);
}
}
}
return nullptr;
}
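// Intermediate tensors are tracked per dataflow edge: when a layer is emitted for a
// node, mapOutputTensor() records the layer's output ITensor on every outgoing edge
// of that node, and getIntermediateInputTensor() reads it back from the successor's
// incoming edge 0. This is how non-bind inputs are wired between layers.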
void CGT_NVDLA::mapOutputTensor(DFNode *N, ITensor *Tensor) {
for(int i = 0; i < N->outdfedge_size(); i++)
EdgeToTensorMap[N->getOutDFEdgeAt(i)] = Tensor;
}
ITensor *CGT_NVDLA::getIntermediateInputTensor(DFNode *N) {
return EdgeToTensorMap[N->getInDFEdgeAt(0)];
}
void CGT_NVDLA::getaddOpSucceedsNode(DFNode *N, SmallVector<DFLeafNode *, 4> &AddNodes,
SmallVector<IntrinsicInst *, 4> &AddInsts) {
bool AddOpNodes = false;
for(int i = 0; i < N->outdfedge_size(); i++) {
auto *DestNode = N->getOutDFEdgeAt(i)->getDestDF();
auto *F = DestNode->getFuncPointer();
// If the node is already cached in the list, no need to visit it
auto *Node = dyn_cast<DFLeafNode>(DestNode);
if(find(AddNodes, Node) != AddNodes.end())
continue;
// Add node to list if it contains add operation
for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
Instruction *I = &(*i);
auto *II = dyn_cast<IntrinsicInst>(I);
if (II && II->getIntrinsicID() == Intrinsic::hpvm_tensor_add) {
AddNodes.push_back(Node);
AddInsts.push_back(II);
AddOpNodes = true;
break;
}
}
assert(((AddNodes.size() > 0) == AddOpNodes)
&& "Either all destination nodes contain add ops or none do.");
}
}
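// getNVDLAInputTensor() handles the two possible sources of a layer's input: if the
// HPVM input is a binding tensor reaching back to the root (InputBindingTensor is
// non-null), a fresh network input is created from its N/C/H/W operands; otherwise
// the input is the ITensor produced by the predecessor node, looked up through
// EdgeToTensorMap.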
ITensor *CGT_NVDLA::getNVDLAInputTensor(DFLeafNode* N, const User *InputBindingTensor) {
if(InputBindingTensor) {
auto *BatchesConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(2));
auto *ChannelsConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(3));
auto *HeightConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(4));
auto *WidthConst = dyn_cast<ConstantInt>(InputBindingTensor->getOperand(5));
assert(HeightConst && WidthConst && ChannelsConst && BatchesConst
&& "Number of input dimensions must be constants.");
// Input dimensions
int InputW = WidthConst->getZExtValue();
int InputH = HeightConst->getZExtValue();
int InputC = ChannelsConst->getZExtValue();
int InputN = BatchesConst->getZExtValue();
// Create a new input tensor
Dims4 dims(InputN, InputC, InputH, InputW);
return Network->addInput("", dims);
}
return getIntermediateInputTensor(N);
}
unsigned CGT_NVDLA::getInputIndex(DFLeafNode* N, const IntrinsicInst *II) {
DEBUG(errs() << "GET INPUT INDEX\n");
auto *F = N->getFuncPointer();
DEBUG(errs()<<"function name = "<< F->getName()<<"\n");
unsigned inputIndex = 0;
for(auto &Arg : F->args()) {
DEBUG(errs() << "ARGUMENT: ");
DEBUG((&Arg)->print(errs()));
DEBUG(errs() << "\n");
if(II->getOperand(0) == &Arg) {
DEBUG(errs() << "INPUT: ");
DEBUG(II->getOperand(0)->print(errs()));
DEBUG(errs() << "\n");
DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n");
return inputIndex;
}
inputIndex++;
}
assert(false && "Illegal intrinsic or Node.");
return -1; // Keep compiler happy
}
std::string CGT_NVDLA::getLayerName(std::string Name) {
DEBUG(errs() << "GET LAYER NAME\n");
if(LayerNameMap.find(Name) == LayerNameMap.end()) {
LayerNameMap[Name] = 1;
} else {
LayerNameMap[Name]++;
}
return std::to_string(LayerNameMap[Name]);
}
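// Operand layout assumed by generateConvolutionLayer() for the HPVM convolution
// intrinsic, as read below: operand 0 is the input tensor argument, operand 1 the
// kernel argument, operands 2/3 the vertical/horizontal padding, and operands 4/5
// the vertical/horizontal strides. Kernel and bias dimensions come from operands
// 2..5 of the bound tensor creation call (batches, channels, height, width). A bias
// is folded in only when a successor node contains an hpvm_tensor_add.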
void CGT_NVDLA::generateConvolutionLayer(DFLeafNode* N, const IntrinsicInst *II) {
DEBUG(errs() << "*****CONVOLUTION LAYER*****\n");
// FIXME: What is the number of "groups"? Setting it to 1 for now.
int numGroups = 1;
// If the input tensor is not a binding tensor, it must be coming
// from an edge out of a visited node, so use that to get the number of outputs.
unsigned inputIndex = getInputIndex(N, II);
DEBUG(errs() << "INPUT INDEX: " << inputIndex << "\n");
DEBUG(errs() << "GET INPUT TENSOR\n");
auto *InputTensor = getBindingTensor(N, inputIndex);
DEBUG(errs() << "INPUT TENSOR: ");
ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
// Get the index for kernel tensor
auto *F = N->getFuncPointer();
DEBUG(errs()<<"function name = "<< F->getName()<<"\n");
unsigned kernelIndex = 0;
bool ArgFound = false;
for(auto &Arg : F->args()) {
if(II->getOperand(1) == &Arg) {
ArgFound = true;
break;
}
kernelIndex++;
}
assert(ArgFound && "Illegal intrinsic or Node.");
DEBUG(errs() << "KERNEL INDEX: " << kernelIndex << "\n");
// Get the kernel tensor
DEBUG(errs() << "GET KERNEL TENSOR\n");
auto *KernelTensor = getBindingTensor(N, kernelIndex);
assert(KernelTensor && "Kernel tensors are always binds.");
// Get kernel constants
auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5));
auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4));
auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3));
auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2));
assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst
&& "Kernel dimensions must be constants.");
int kernelW = KernelWConst->getZExtValue();
int kernelH = KernelHConst->getZExtValue();
int kernelC = KernelCHConst->getZExtValue();
int kernelN = KernelNConst->getZExtValue();
DEBUG(errs() << "\nKERNEL H: " << kernelH << "\n");
DEBUG(errs() << "KERNEL W: " << kernelW << "\n");
DEBUG(errs() << "KERNEL C: " << kernelC << "\n");
DEBUG(errs() << "KERNEL N: " << kernelN << "\n");
int numOutputs;
if(!InputTensor) {
DEBUG(errs() << "INPUT FROM EDGE\n");
numOutputs = (InputNVDLATensor->getDimensions()).n * kernelN;
// (InputNVDLATensor->getDimensions()).c;
} else {
DEBUG(errs() << "INPUT FROM WEIGHT TENSOR\n");
auto *BatchesConst = dyn_cast<ConstantInt>(InputTensor->getOperand(2));
auto *ChannelsConst = dyn_cast<ConstantInt>(InputTensor->getOperand(3));
numOutputs = BatchesConst->getZExtValue() * kernelN;
// ChannelsConst->getZExtValue();
DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n");
}
// Get Strides
ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(5));
ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(4));
assert((StrideWConst && StrideHConst) && "Strides must be constants.");
int strideW = StrideWConst->getZExtValue();
int strideH = StrideHConst->getZExtValue();
DEBUG(errs() << "STRIDE H: " << strideH << "\n");
DEBUG(errs() << "STRIDE W: " << strideW << "\n");
// Get pads
ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(3));
ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(2));
assert((PadWConst && PadHConst) && "Pads must be constants.");
int padW = PadWConst->getZExtValue();
int padH = PadHConst->getZExtValue();
DEBUG(errs() << "PAD H: " << padH << "\n");
DEBUG(errs() << "PAD W: " << padW << "\n");
// FIXME: Support dilations. Set dilations to 1 since we do not have dilation support yet.
int dilationW = 1;
int dilationH = 1;
// Get the nodes with Add operations
SmallVector<DFLeafNode *, 4> AddOpNodes;
SmallVector<IntrinsicInst *, 4> AddInsts;
getaddOpSucceedsNode(N, AddOpNodes, AddInsts);
assert((!(AddOpNodes.size() > 1))
&& "Number of nodes with Add ops must not be more than 1");
// Get bias parameters
int BiasW, BiasH, BiasC, BiasN;
User *BiasTensor = nullptr;
BiasMode biasMode = BiasMode::bNONE;
if(AddOpNodes.size()) {
// Get the index for bias tensor
auto *AddNode = AddOpNodes[0];
auto *AddInst = AddInsts[0];
DEBUG(AddInst->print(errs()));
auto *F = AddNode->getFuncPointer();
unsigned BiasIndex = 0;
ArgFound = false;
for(auto &Arg : F->args()) {
if(AddInst->getOperand(1) == &Arg) {
ArgFound = true;
break;
}
BiasIndex++;
}
assert(ArgFound && "Illegal intrinsic or Node.");
// Get the bias tensor
DEBUG(errs() << "BIAS INDEX: " << BiasIndex << "\n");
DEBUG(errs() << "BIAS TENSOR\n");
BiasTensor = getBindingTensor(AddNode, BiasIndex);
assert(BiasTensor && "Bias tensors are always binds.");
// Get Bias constants
auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5));
auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4));
auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3));
auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2));
assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst
&& "Bias dimensions must be constants.");
BiasW = BiasWConst->getZExtValue();
BiasH = BiasHConst->getZExtValue();
BiasC = BiasCHConst->getZExtValue();
BiasN = BiasNConst->getZExtValue();
DEBUG(errs() << "BIAS H: " << BiasH << "\n");
DEBUG(errs() << "BIAS W: " << BiasW << "\n");
DEBUG(errs() << "BIAS C: " << BiasC << "\n");
DEBUG(errs() << "BIAS N: " << BiasN << "\n");
// Get bias mode
//if(kernelN == numOutputs)
biasMode = BiasMode::bCHANNEL;
//else
// biasMode = BiasMode::bUNIFORM;
}
// Get weights
Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW);
Weights biasWeights = AddOpNodes.size() == 1 ?
readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW)
: Weights(DataType::HALF, nullptr, 0);
Dims2 tlPadding = Dims2(padH, padW);
Dims2 brPadding = Dims2(padH, padW);
Dims2 stride = Dims2(strideH, strideW);
Dims2 dilation = Dims2(dilationH, dilationW);
Dims2 kernelSize = Dims2(kernelH, kernelW);
auto *Layer = Network->addConvolution(InputNVDLATensor, numOutputs, 0,
kernelSize, tlPadding, brPadding, stride, dilation,
kernelWeights, biasWeights, biasMode, numGroups);
if(AddOpNodes.size()) {
auto *Node = AddOpNodes[0];
mapOutputTensor(Node, Layer->getOutput(0));
} else {
mapOutputTensor(N, Layer->getOutput(0));
}
Layer->setName((std::string("conv") + getLayerName(std::string("conv"))).c_str());
DEBUG(errs() << Layer->getName() << "\n");
}
void CGT_NVDLA::generatePoolingLayer(DFLeafNode* N, const IntrinsicInst *II) {
DEBUG(errs() << "*****POOLING LAYER*****\n");
// Get input tensor
unsigned inputIndex = getInputIndex(N, II);
auto *InputTensor = getBindingTensor(N, inputIndex);
ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
// Get window dimensions
ConstantInt *KernelWConst = dyn_cast<ConstantInt>(II->getOperand(2));
ConstantInt *KernelHConst = dyn_cast<ConstantInt>(II->getOperand(1));
assert((KernelWConst && KernelHConst) && "Kernel dimensions must be constants.");
int kernelH = KernelHConst->getZExtValue();
int kernelW = KernelWConst->getZExtValue();
DEBUG(errs() << "KERNEL H: " << kernelH << "\n");
DEBUG(errs() << "KERNEL W: " << kernelW << "\n");
// Get Strides
ConstantInt *StrideWConst = dyn_cast<ConstantInt>(II->getOperand(6));
ConstantInt *StrideHConst = dyn_cast<ConstantInt>(II->getOperand(5));
assert((StrideWConst && StrideHConst) && "Strides must be constants.");
int strideH = StrideHConst->getZExtValue();
int strideW = StrideWConst->getZExtValue();
DEBUG(errs() << "STRIDE H: " << strideH << "\n");
DEBUG(errs() << "STRIDE W: " << strideW << "\n");
// Get pads
ConstantInt *PadWConst = dyn_cast<ConstantInt>(II->getOperand(4));
ConstantInt *PadHConst = dyn_cast<ConstantInt>(II->getOperand(3));
assert((PadWConst && PadHConst) && "Pads must be constants.");
int padH = PadHConst->getZExtValue();
int padW = PadWConst->getZExtValue();
DEBUG(errs() << "PAD H: " << padH << "\n");
DEBUG(errs() << "PAD W: " << padW << "\n");
Dims2 windowSize = Dims2(kernelH, kernelW);
Dims2 stride = Dims2(strideH, strideW);
Dims2 tlPadding = Dims2(padH, padW);
Dims2 brPadding = Dims2(padH, padW);
PoolingType type = (II->getIntrinsicID() == Intrinsic::hpvm_tensor_pool_mean) ?
PoolingType::kAVERAGE : PoolingType::kMAX;
auto *Layer = Network->addPooling(InputNVDLATensor, type,
windowSize, stride, tlPadding, brPadding);
mapOutputTensor(N, Layer->getOutput(0));
Layer->setName((std::string("pool") + getLayerName(std::string("pool"))).c_str());
DEBUG(errs() << Layer->getName() << "\n");
}
void CGT_NVDLA::generateGemmLayer(DFLeafNode* N, const IntrinsicInst *II) {
DEBUG(errs() << "****GEMM LAYER****\n");
// Get input tensor and compute number of outputs
unsigned inputIndex = getInputIndex(N, II);
auto *InputTensor = getBindingTensor(N, inputIndex);
ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
// Get the index for kernel tensor
auto *F = N->getFuncPointer();
DEBUG(errs()<<"function name = "<< F->getName()<<"\n");
unsigned kernelIndex = 0;
bool ArgFound = false;
for(auto &Arg : F->args()) {
if(II->getOperand(1) == &Arg) {
ArgFound = true;
break;
}
kernelIndex++;
}
assert(ArgFound && "Illegal intrinsic or Node.");
// Get the kernel tensor
auto *KernelTensor = getBindingTensor(N, kernelIndex);
assert(KernelTensor && "Kernel tensors are always binds.");
// Get kernel constants
auto *KernelWConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(5));
auto *KernelHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(4));
auto *KernelCHConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(3));
auto *KernelNConst = dyn_cast<ConstantInt>(KernelTensor->getOperand(2));
assert(KernelWConst && KernelHConst && KernelCHConst && KernelNConst
&& "Kernel dimensions must be constants.");
int kernelW = KernelWConst->getZExtValue();
int kernelH = KernelHConst->getZExtValue();
int kernelC = KernelCHConst->getZExtValue();
int kernelN = KernelNConst->getZExtValue();
DEBUG(errs() << "KERNEL H: " << kernelH << "\n");
DEBUG(errs() << "KERNEL W: " << kernelW << "\n");
DEBUG(errs() << "KERNEL C: " << kernelC << "\n");
DEBUG(errs() << "KERNEL N: " << kernelN << "\n");
int numOutputs = kernelW;
DEBUG(errs() << "NUM OUTPUTS: " << numOutputs << "\n");
// Get the nodes with Add operations
SmallVector<DFLeafNode *, 4> AddOpNodes;
SmallVector<IntrinsicInst *, 4> AddInsts;
getaddOpSucceedsNode(N, AddOpNodes, AddInsts);
assert((!(AddOpNodes.size() > 1))
&& "Number of nodes with Add ops must not be more than 1");
// Get bias parameters
int BiasW, BiasH, BiasC, BiasN;
User *BiasTensor = nullptr;
BiasMode biasMode = BiasMode::bNONE;
if(AddOpNodes.size()) {
// Get the index for bias tensor
auto *AddNode = AddOpNodes[0];
auto *AddInst = AddInsts[0];
auto *F = AddNode->getFuncPointer();
unsigned BiasIndex = 0;
ArgFound = false;
for(auto &Arg : F->args()) {
if(AddInst->getOperand(1) == &Arg) {
ArgFound = true;
break;
}
BiasIndex++;
}
assert(ArgFound && "Illegal intrinsic or Node.");
// Get the bias tensor
BiasTensor = getBindingTensor(AddNode, BiasIndex);
assert(BiasTensor && "Bias tensors are always binds.");
// Get Bias constants
auto *BiasWConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(5));
auto *BiasHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(4));
auto *BiasCHConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(3));
auto *BiasNConst = dyn_cast<ConstantInt>(BiasTensor->getOperand(2));
assert(BiasWConst && BiasHConst && BiasCHConst && BiasNConst
&& "Bias dimensions must be constants.");
BiasW = BiasWConst->getZExtValue();
BiasH = BiasHConst->getZExtValue();
BiasC = BiasCHConst->getZExtValue();
BiasN = BiasNConst->getZExtValue();
DEBUG(errs() << "BIAS H: " << BiasH << "\n");
DEBUG(errs() << "BIAS W: " << BiasW << "\n");
DEBUG(errs() << "BIAS C: " << BiasC << "\n");
DEBUG(errs() << "BIAS N: " << BiasN << "\n");
// Get bias mode
//if(KernelCHConst->getZExtValue() == numOutputs)
biasMode = BiasMode::bCHANNEL;
//else
// biasMode = BiasMode::bUNIFORM;
}
// Get weights
Weights kernelWeights = readTrainedWeights(KernelTensor, kernelN, kernelC, kernelH, kernelW);
Weights biasWeights = (AddOpNodes.size() == 1) ?
readTrainedWeights(BiasTensor, BiasN, BiasC, BiasH, BiasW)
: Weights(DataType::HALF, nullptr, 0);
auto *Layer = Network->addFullyConnected(InputNVDLATensor, numOutputs,
kernelWeights, biasWeights, biasMode);
if(AddOpNodes.size()) {
auto *Node = AddOpNodes[0];
mapOutputTensor(Node, Layer->getOutput(0));
} else {
mapOutputTensor(N, Layer->getOutput(0));
}
Layer->setName((std::string("gemm") + getLayerName(std::string("gemm"))).c_str());
DEBUG(errs() << Layer->getName() << "\n");
}
void CGT_NVDLA::generateReluLayer(DFLeafNode* N, const IntrinsicInst *II) {
DEBUG(errs() << "******RELU LAYER******\n");
// Get input tensor
unsigned inputIndex = getInputIndex(N, II);
auto *InputTensor = getBindingTensor(N, inputIndex);
ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
auto *Layer = Network->addActivation(InputNVDLATensor, kRELU);
mapOutputTensor(N, Layer->getOutput(0));
Layer->setName((std::string("relu") + getLayerName(std::string("relu"))).c_str());
DEBUG(errs() << Layer->getName() << "\n");
}
void CGT_NVDLA::generateSoftMaxLayer(DFLeafNode* N, const IntrinsicInst *II) {
DEBUG(errs() << "******SOFTMAX LAYER*******\n");
// Get input tensor
unsigned inputIndex = getInputIndex(N, II);
auto *InputTensor = getBindingTensor(N, inputIndex);
ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
auto *Layer = Network->addSoftMax(InputNVDLATensor);
mapOutputTensor(N, Layer->getOutput(0));
Layer->setName((std::string("softmax") + getLayerName(std::string("softmax"))).c_str());
DEBUG(errs() << Layer->getName() << "\n");
}
void CGT_NVDLA::generateTanhLayer(DFLeafNode* N, const IntrinsicInst *II) {
DEBUG(errs() << "*******TANH LAYER*******\n");
// Get input tensor
unsigned inputIndex = getInputIndex(N, II);
auto *InputTensor = getBindingTensor(N, inputIndex);
ITensor *InputNVDLATensor = getNVDLAInputTensor(N, InputTensor);
auto *Layer = Network->addActivation(InputNVDLATensor, kTANH);
mapOutputTensor(N, Layer->getOutput(0));
Layer->setName((std::string("tanh") + getLayerName(std::string("tanh"))).c_str());
DEBUG(errs() << Layer->getName() << "\n");
}
/*
void CGT_NVDLA::generateBatchNormLayer(DFLeafNode* N, const IntrinsicInst *II) {
const dc::BatchNormParameter& p = msg.batch_norm_param();
Weights mean = weightFactory(msg.name(), kMEAN);
Weights variance = weightFactory(msg.name(), kVARIANCE);
Weights movingAverage = weightFactory(msg.name(), kMOVING_AVERAGE);
float eps = p.eps();
float scaleFactor = 1.0f;
float average = 0.0f;
int i;
average = *(static_cast<const float*>(movingAverage.values));
if ( average == 0.0f )
{
gLogError << "Batch Normalization moving average is zero " << std::endl;
return 0;
}
scaleFactor /= average;
if (mean.count != variance.count)
{
gLogError << "Mean and variance have differing number of elements "
<< mean.count << " & " << variance.count << std::endl;
return 0;
}
float *meanBlob = (float *)mean.values;
float *varianceBlob = (float *)variance.values;
Dims4 inputDims = getIntermediateInputTensor(N)->getDimensions();
BatchNormMode mode;
if (mean.count == 1)
{
mode = BatchNormMode::bnUNIFORM;
meanBlob[0] = meanBlob[0] * scaleFactor;
varianceBlob[0] = varianceBlob[0] * scaleFactor;
}
else if (mean.count == inputDims.c)
{
mode = BatchNormMode::bnm_CHANNEL;
for (i = 0; i < mean.count; i++)
{
meanBlob[i] = meanBlob[i] * scaleFactor;
varianceBlob[i] = varianceBlob[i] * scaleFactor;
}
}
else
{
gLogError << "Unknown batch norm mode" << std::endl;
return 0;
}
// Get input tensor
unsigned inputIndex = getInputIndex(N, II);
Value *InputTensor = getBindingTensor(inputIndex);
ITensor *InputNVDLATensor = getNVDLAInputTensor(InputTensor);
auto *Layer = Network->addBatchNorm(InputNVDLATensor, mode, mean, variance, eps);
mapOutputTensor(N, Layer->getOutput(0));
}
*/
unsigned CGT_NVDLA::identifyOutputs() {
std::set< ITensor* > outputTensors;
std::set< ITensor* > InputTensors;
for (int l = 0; l < Network->getNumLayers(); ++l) {
ILayer* layer = Network->getLayer(l);
assert(layer && "Illegal NVDLA compiler IR!");
for (int ii = 0; ii < layer->getNumInputs(); ++ii) {
InputTensors.insert(layer->getInput(ii));
}
for (int oo = 0; oo < layer->getNumOutputs(); ++oo) {
outputTensors.insert(layer->getOutput(oo));
}
}
for (std::set<ITensor*>::iterator oi = outputTensors.begin(); oi != outputTensors.end(); ++oi) {
// An output tensor which is not an input to any other layers is a Network output tensor
if (InputTensors.find(*oi) == InputTensors.end())
Network->markOutput(*oi);
}
return Network->getNumOutputs();
}
void CGT_NVDLA::codeGen(DFLeafNode *N) {
// No allocation nodes allowed.
assert(!N->isAllocationNode() && "Allocation Node not expected in ApproxHPVM");
// Skip code generation if it is a dummy node
if(N->isDummyNode()) {
DEBUG(errs() << "Skipping dummy node\n");
return;
}
// Generate code only if it has the right hint
//if (!checkPreferredTarget(N, hpvm::NVDLA_TARGET)) {
// DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n");
// return;
// }
// Get the function associated with the dataflow node
auto *F = N->getFuncPointer();
DEBUG(errs()<<"function name = "<< F->getName()<<"\n");
// Generate code for every instruction in this node
for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
Instruction *I = &(*i);
if (BuildDFG::isViscIntrinsic(I)) {
auto *II = dyn_cast<IntrinsicInst>(I);
assert((II->getCalledFunction()->getName()).startswith("llvm.hpvm.tensor")
&& "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
switch (II->getIntrinsicID()) {
case Intrinsic::hpvm_tensor_convolution:
case Intrinsic::hpvm_tensor_group_convolution:
generateConvolutionLayer(N, II);
break;
case Intrinsic::hpvm_tensor_batchnorm:
generateBatchNormLayer(N, II);
break;
case Intrinsic::hpvm_tensor_mul:
generateGemmLayer(N, II);
break;
case Intrinsic::hpvm_tensor_add:
// Add not explicitly supported by NVDLA compiler!
break;
case Intrinsic::hpvm_tensor_pool_max:
case Intrinsic::hpvm_tensor_pool_mean:
generatePoolingLayer(N, II);
break;
case Intrinsic::hpvm_tensor_relu:
generateReluLayer(N, II);
break;
case Intrinsic::hpvm_tensor_clipped_relu:
// No need to generate NVDLA IR for this?
break;
case Intrinsic::hpvm_tensor_tanh:
generateTanhLayer(N, II);
break;
case Intrinsic::hpvm_tensor_softmax:
generateSoftMaxLayer(N, II);
break;
default:
llvm_unreachable("Unknown HPVM Intrinsic!");
break;
}
}
}
}
void CGT_NVDLA::codeGen(DFInternalNode* N) {
DEBUG(errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n");
DEBUG(errs () << "Skipping internal node\n");
}
NvDlaError CGT_NVDLA::parseSetup(const TestAppArgs* appArgs, TestInfo* i) {
return NvDlaSuccess;
}
NvDlaError CGT_NVDLA::transformHPVM2NVDLA(const TestAppArgs* appArgs, TestInfo* i) {
NVDLA_UNUSED(appArgs);
NvDlaError e = NvDlaSuccess;
Network = nvdla::createNetwork();
if (!Network)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createNetwork() failed");
// Iterate over all the DFGs and produce code for each one of them
for(auto &RootNode: *(appArgs->Roots))
visit(RootNode);
// if the application has so far not marked the network's outputs, allow the parser to do so now
if (Network->getNumOutputs() <= 0) {
int outs = identifyOutputs();
DEBUG(NvDlaDebugPrintf("Marking total %d outputs\n", outs));
if (outs <= 0)
ORIGINATE_ERROR_FAIL(NvDlaError_BadValue, "Unable to identify outputs for the network: %d", outs);
}
if (appArgs->computePrecision == nvdla::DataType::INT8) {
if (appArgs->calibTable != "") {
DEBUG(NvDlaDebugPrintf("parsing calibration table...\n"));
PROPAGATE_ERROR_FAIL(readTensorScales(appArgs, i, Network));
} else {
DEBUG(NvDlaDebugPrintf("initialize all tensors with const scaling factors of 127...\n"));
PROPAGATE_ERROR_FAIL(generateTensorScales(appArgs, i, Network));
}
}
DEBUG(NvDlaDebugPrintf("attaching parsed network to the wisdom...\n"));
if (!i->wisdom->setNetworkTransient(Network))
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->setNetworkTransient() failed");
return NvDlaSuccess;
fail:
return e;
}
NvDlaError CGT_NVDLA::parseAndCompile(const TestAppArgs* appArgs, TestInfo* i) {
NvDlaError e = NvDlaSuccess;
bool isCaffe = appArgs->caffemodel != "";
PROPAGATE_ERROR_FAIL(parseSetup(appArgs, i));
DEBUG(NvDlaDebugPrintf("creating new wisdom context...\n"));
i->wisdom = nvdla::createWisdom();
if (!i->wisdom)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "createWisdom() failed");
DEBUG(NvDlaDebugPrintf("opening wisdom context...\n"));
if (!i->wisdom->open(i->wisdomPath))
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->open() failed to open: \"%s\"", i->wisdomPath.c_str());
// Parse
PROPAGATE_ERROR_FAIL(transformHPVM2NVDLA(appArgs, i));
// Compile
PROPAGATE_ERROR_FAIL(compileProfile(appArgs, i));
/* Destroy network before closing wisdom context */
nvdla::destroyNetwork(i->wisdom->getNetwork());
DEBUG(NvDlaDebugPrintf("closing wisdom context...\n"));
i->wisdom->close();
fail:
if (i->wisdom != NULL) {
nvdla::destroyWisdom(i->wisdom);
i->wisdom = NULL;
}
return e;
}
NvDlaError CGT_NVDLA::testSetup(const TestAppArgs* appArgs, TestInfo* i) {
NvDlaError e = NvDlaSuccess;
std::string wisdomPath = appArgs->outputPath + "wisdom.dir/";
std::string removeCmd = "";
std::string imagePath = "";
NvDlaStatType stat;
int ii = 0;
// Do input paths exist?
e = NvDlaStat(appArgs->inputPath.c_str(), &stat);
if (e != NvDlaSuccess)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Input path does not exist: \"%s\"", appArgs->inputPath.c_str());
// Do output paths exist?
e = NvDlaStat(appArgs->outputPath.c_str(), &stat);
if (e != NvDlaSuccess)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Output path does not exist: \"%s\"", appArgs->outputPath.c_str());
// Clear wisdomPath if any exist
removeCmd += "rm -rf " + wisdomPath;
ii = std::system(removeCmd.c_str()); // This is pretty awful
if (ii != 0)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "system command failed: \"%s\"", removeCmd.c_str());
PROPAGATE_ERROR_FAIL(NvDlaMkdir(const_cast<char *>(wisdomPath.c_str())));
// Initialize TestInfo
i->wisdom = NULL;
i->wisdomPath = wisdomPath;
i->pData = NULL;
return NvDlaSuccess;
fail:
return e;
}
NvDlaError CGT_NVDLA::launchTest(const TestAppArgs* appArgs) {
NvDlaError e = NvDlaSuccess;
TestInfo testInfo;
PROPAGATE_ERROR_FAIL(testSetup(appArgs, &testInfo));
PROPAGATE_ERROR_FAIL(parseAndCompile(appArgs, &testInfo));
return NvDlaSuccess;
fail:
return e;
}
bool HPVM2NVDLA::runOnModule(Module &M) {
DEBUG(errs() << "**************HPVM2NVDLA PASS****************\n");
NvDlaError e = NvDlaError_TestApplicationFailed;
TestAppArgs testAppArgs = defaultTestAppArgs;
// Get the HPVM IR graph
BuildDFG &DFG = getAnalysis<BuildDFG>();
std::vector<DFInternalNode *> Roots = DFG.getRoots();
// Visitor for Code Generation Graph Traversal
CGT_NVDLA *CGTVisitor = new CGT_NVDLA(M, DFG);
if(ComputePrecision == "INT8" || ComputePrecision == "int8") {
testAppArgs.computePrecision = nvdla::DataType::INT8;
testAppArgs.quantizationMode = nvdla::QuantizationMode::PER_KERNEL;
testAppArgs.configtarget = std::string("nv_small");
} else {
testAppArgs.computePrecision = nvdla::DataType::HALF;
testAppArgs.quantizationMode = nvdla::QuantizationMode::NONE;
testAppArgs.configtarget = std::string("nv_full");
}
testAppArgs.profileName = std::string("hpvm-mod");
testAppArgs.calibTable = CalibTablePath;//std::string("output_scales.txt");
testAppArgs.outputPath = std::string(".");
testAppArgs.inDataFormat = nvdla::DataFormat::NCHW;
testAppArgs.Roots = &Roots;
e = CGTVisitor->launchTest(&testAppArgs);
if (e != NvDlaSuccess)
DEBUG(errs() << "ERROR\n");
else
DEBUG(errs() << "SUCESS\n");
delete CGTVisitor;
return false;
}
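// compileProfile() below resolves the profile and target config, invokes the NVDLA
// compiler, and dumps the resulting loadable image to "<profileName>.nvdla" in the
// working directory; with the settings made in runOnModule() above, that file is
// hpvm-mod.nvdla.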
NvDlaError CGT_NVDLA::compileProfile(const TestAppArgs* appArgs, TestInfo* i) {
NvDlaError e = NvDlaSuccess;
std::string profileName = "";
std::string targetConfigName = "";
NvDlaFileHandle file = 0;
std::string fileName = "";
NvU8 *buffer = 0;
NvU64 size = 0;
nvdla::ICompiler* compiler = i->wisdom->getCompiler();
if (!compiler)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getCompiler() failed");
if (appArgs->configtarget == "")
ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No target config found to load");
targetConfigName = appArgs->configtarget;
// Determine profile
PROPAGATE_ERROR_FAIL(generateProfile(appArgs, &profileName, i));
// Compile
DEBUG(NvDlaDebugPrintf("compiling profile \"%s\"... config \"%s\"...\n", profileName.c_str(), targetConfigName.c_str()));
PROPAGATE_ERROR_FAIL(compiler->compile(profileName.c_str(), targetConfigName.c_str(), &i->compiledLoadable));
// Get loadable buffer and dump it into a file
PROPAGATE_ERROR_FAIL(compiler->getLoadableImageSize(profileName.c_str(),
&size));
if (size == 0) {
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter,
"Invalid size for a loadable");
}
buffer = (NvU8 *) NvDlaAlloc(size);
if (buffer == NULL) {
ORIGINATE_ERROR_FAIL(NvDlaError_InsufficientMemory,
"Failed to allocate buffer for loadable");
}
PROPAGATE_ERROR_FAIL(compiler->getLoadableImage(profileName.c_str(),
buffer));
fileName = profileName + ".nvdla";
errs() << "Writing NVDLA module '" << fileName << "' ...";
PROPAGATE_ERROR_FAIL(NvDlaFopen(fileName.c_str(), NVDLA_OPEN_WRITE, &file));
PROPAGATE_ERROR_FAIL(NvDlaFwrite(file, buffer, size));
errs() << " done.\n";
fail:
NvDlaFclose(file);
if (buffer != NULL)
NvDlaFree(buffer);
return e;
}
NvDlaError CGT_NVDLA::generateProfile(const TestAppArgs* appArgs, std::string* profileName, TestInfo* i) {
NvDlaError e = NvDlaSuccess;
nvdla::DataFormat inDataFormat = nvdla::DataFormat::UNKNOWN;
if (appArgs->profileName != "") {
// init named profile (basic/default/performance) with default params in its constructor and exit
DEBUG(errs() << "PROFILE NAME PROVIDED\n");
PROPAGATE_ERROR_FAIL(beginWithNamedProfile(appArgs, i));
*profileName = appArgs->profileName;
} else {
ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "No profile supplied to load");
}
// capture profile params from command line (override the existing ones as necessary)
inDataFormat = inDataFormat == nvdla::DataFormat::UNKNOWN ? appArgs->inDataFormat : inDataFormat;
PROPAGATE_ERROR_FAIL(updateProfileWithCmdLineArgs(appArgs, i, profileName->c_str(), inDataFormat));
fail:
return e;
}
NvDlaError CGT_NVDLA::beginWithNamedProfile(const TestAppArgs* appArgs, TestInfo* i) {
NvDlaError e = NvDlaSuccess;
nvdla::IProfiler* profiler;
nvdla::IProfile* profile;
profiler = i->wisdom->getProfiler();
if ( !profiler ) {
ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profiler not initialized");
}
profile = profiler->getProfile(appArgs->profileName.c_str());
if ( !profile ) {
ORIGINATE_ERROR_FAIL(NvDlaError_NotInitialized, "Profile %s not initialized", appArgs->profileName.c_str());
}
fail:
return e;
}
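// updateProfileWithCmdLineArgs() transfers the command-line choices onto the NVDLA
// profile: compute precision and input data format are set first, then the input
// surface format is picked from their combination (NHWC + HALF -> A16B16G16R16_F,
// NHWC + INT8 -> A8B8G8R8, everything else -> FEATURE, or FEATURE_X8 for the
// "opendla-small" target). INT8 additionally enables PER_TENSOR scaling and a
// per-kernel or per-filter quantization mode.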
NvDlaError CGT_NVDLA::updateProfileWithCmdLineArgs
(
const TestAppArgs* appArgs, TestInfo* i, const char* profileName, nvdla::DataFormat inDataFormat
) {
NvDlaError e = NvDlaSuccess;
nvdla::IProfiler* profiler;
nvdla::IProfile* profile;
profiler = i->wisdom->getProfiler();
if (!profiler)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "wisdom->getProfiler() failed");
profile = profiler->getProfile(profileName);
if (!profile)
ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "profiler->getProfile() failed");
PROPAGATE_ERROR_FAIL(profile->setComputePrecision(appArgs->computePrecision));
PROPAGATE_ERROR_FAIL(profile->setNetworkInputDataFormat(inDataFormat));
// determine input surface format
switch(inDataFormat) {
case nvdla::DataFormat::NHWC:
if (appArgs->computePrecision == nvdla::DataType::HALF) {
PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A16B16G16R16_F));
} else if (appArgs->computePrecision == nvdla::DataType::INT8) {
PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::A8B8G8R8));
} else {
ORIGINATE_ERROR_FAIL(NvDlaError_NotSupported, "NHWC and compute precision %u is not yet supported",
appArgs->computePrecision.v());
}
break;
case nvdla::DataFormat::NCxHWx:
case nvdla::DataFormat::NCHW:
case nvdla::DataFormat::UNKNOWN: // at least start the test with feature data format
default:
if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0)
PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8));
else
PROPAGATE_ERROR_FAIL(profile->setNetworkInputSurfaceFormat(nvdla::PixelFormat::FEATURE));
}
// determine int8 cfgs
if (appArgs->computePrecision == nvdla::DataType::INT8) {
PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::PER_TENSOR));
switch(appArgs->quantizationMode) {
case nvdla::QuantizationMode::PER_FILTER:
PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_FILTER));
break;
case nvdla::QuantizationMode::PER_KERNEL:
case nvdla::QuantizationMode::NONE: // default to per-kernel; find a way to run int8 tests w/ NONE qtzMode cleanly
default:
PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::PER_KERNEL));
}
} else {
PROPAGATE_ERROR_FAIL(profile->setTensorScalingMode(nvdla::TensorScalingMode::NONE));
PROPAGATE_ERROR_FAIL(profile->setQuantizationMode(nvdla::QuantizationMode::NONE));
}
PROPAGATE_ERROR_FAIL(profile->setNetworkOutputDataFormat(nvdla::DataFormat::NCxHWx));
if (std::strcmp(appArgs->configtarget.c_str(), "opendla-small") == 0)
PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE_X8));
else
PROPAGATE_ERROR_FAIL(profile->setNetworkOutputSurfaceFormat(nvdla::PixelFormat::FEATURE));
if (appArgs->numBatches > 0)
PROPAGATE_ERROR_FAIL(profile->setMultiBatchSize(appArgs->numBatches));
fail:
return e;
}
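// generateTensorScales() is the fallback used when no calibration table is given:
// every network input gets a scale of 1 and every layer output a scale of 127, and
// the dynamic range is applied uniformly to all channels (cIndex = -1) as
// [-127 * scale, 127 * scale].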
NvDlaError CGT_NVDLA::generateTensorScales(const TestAppArgs* appArgs, TestInfo* i, nvdla::INetwork* network) {
NvDlaError e = NvDlaSuccess;
std::vector<nvdla::ILayer*> networkLayers = network->getLayers();
std::vector<nvdla::ITensor*> networkInputs = network->getInputs();
std::vector<nvdla::ILayer*>::iterator li = networkLayers.begin();
std::vector<nvdla::ITensor*>::iterator nii = networkInputs.begin();
// set scaling factor for the network input tensors
for (; nii != networkInputs.end(); ++nii) {
NvF32 scale = 1;
NvF32 min = scale * -127.0f;
NvF32 max = scale * 127.0f;
std::string tName = (*nii)->getName();
DEBUG(errs() << "INPUT NAME: " << tName << "\n");
// set same dynamic range for all channels of the tensor (cIndex = -1)
PROPAGATE_ERROR_FAIL( (*nii)->setChannelDynamicRange(-1, min, max) );
const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale));
if (0)
NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", tName.c_str(), scale);
}
for (; li != networkLayers.end(); ++li) {
NvF32 scale = 127;
NvF32 min = scale * -127.0f;
NvF32 max = scale * 127.0f;
std::string lName = (*li)->getName();
nvdla::ITensor* outTensor = (*li)->getOutput(0);
DEBUG(errs() << "LAYER NAME: " << lName << "\n");
// set same dynamic range for all channels of the tensor (cIndex = -1)
PROPAGATE_ERROR_FAIL( outTensor->setChannelDynamicRange(-1, min, max) );
const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(lName, scale));
if (0)
NvDlaDebugPrintf("setting dynamic range of: %s to %f\n", lName.c_str(), scale);
}
fail:
return e;
}
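// readTensorScales() parses a plain-text calibration table, one "name: scale" pair
// per line with whitespace ignored; the entry named "input" is applied to the
// network inputs, per-layer entries are matched by layer name, and layers without
// an entry fall back to a scale of 1. An illustrative table (the layer names below
// are hypothetical) might look like:
//   input: 0.0078125
//   conv1: 0.0625
//   gemm1: 0.125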
NvDlaError CGT_NVDLA::readTensorScales(const TestAppArgs* appArgs, TestInfo *i, nvdla::INetwork* network) {
NvDlaError e = NvDlaSuccess;
NvDlaStatType stat;
std::string calibTableFile = /*i->calibTablesPath + "/" + */appArgs->calibTable;
//PROPAGATE_ERROR_FAIL(NvDlaStat(calibTableFile.c_str(), &stat));
DEBUG(errs() << "***********READING TENSOR SCALESi*************\n");
std::ifstream infile(calibTableFile.c_str());
std::string line;
std::map<std::string, float> LayerNameToScaleMap;
while (std::getline(infile, line)) {
DEBUG(errs() << "READ LINE: " << line << "\n");
line.erase(std::remove(line.begin(), line.end(), ' '), line.end());
DEBUG(errs() << "READ LINE WITHOUT WHITE SPACES: " << line << "\n");
std::string delimiter = ":";
std::string layer_name = line.substr(0, line.find(delimiter));
std::string Scale = line.substr(line.find(delimiter) + 1);
DEBUG(errs() << "LAYER NAME: " << layer_name << "\n");
DEBUG(errs() << "SCALE: " << Scale << "\n");
size_t size;
LayerNameToScaleMap[layer_name] = std::stof(Scale, &size);
}
infile.close();
DEBUG(errs() << "GOT TENSOR SCALES FROM CALIB TABLE\n");
std::vector<nvdla::ILayer*> networkLayers = network->getLayers();
std::vector<nvdla::ITensor*> networkInputs = network->getInputs();
for (auto *Input : networkInputs) {
NvF32 scale = 0.0f;
NvF32 min = 0.0f;
NvF32 max = 0.0f;
DEBUG(errs() << "SET SCALE FOR INPUT\n");
scale = LayerNameToScaleMap["input"];
DEBUG(errs() << "INPUT SCALE: " << scale << "\n");
min = scale * -127.0f;
max = scale * 127.0f;
PROPAGATE_ERROR_FAIL(Input->setChannelDynamicRange(-1, min, max) );
const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>("data", scale));
}
DEBUG(errs() << "PER LAYER CALIB\n");
for (auto *Layer : networkLayers) {
NvF32 scale = 0.0f;
NvF32 min = 0.0f;
NvF32 max = 0.0f;
std::string tName = Layer->getName();
DEBUG(errs() << "SETTING SCALE FOR LAYER NAME: " << tName << "\n");
nvdla::ITensor* outTensor = Layer->getOutput(0);
auto it = LayerNameToScaleMap.find(tName);
if (it != LayerNameToScaleMap.end()) {
DEBUG(errs() << "SET SCALE FOR NAME: " << tName << "\n");
DEBUG(errs() << "SCALE: " << it->second << "\n");
scale = it->second;
min = scale * -127.0f;
max = scale * 127.0f;
} else {
DEBUG(errs() << "SET DEFAULT SCALE FOR NAME: " << tName << "\n");
DEBUG(errs() << "SCALE: 1\n");
scale = 1;
min = scale * -127.0f;
max = scale * 127.0f;
}
//else {
// ORIGINATE_ERROR_FAIL(NvDlaError_BadParameter, "Atleast 1 of scale or min-max should be specified for %s\n", tName.c_str());
//}
PROPAGATE_ERROR_FAIL( outTensor->setChannelDynamicRange(-1, min, max) );
const_cast<TestAppArgs*>(appArgs)->tensorScales.insert(std::pair<std::string, NvF32>(tName, scale));
}
DEBUG(errs() << "DONE PARSING CALIBRATION TABLE\n");
fail:
return e;
}