From 91d350a2e573c2f62825ab3baef84a3fbc14e549 Mon Sep 17 00:00:00 2001
From: Akash Kothari <akashk4@tyler.cs.illinois.edu>
Date: Tue, 8 Dec 2020 12:17:37 -0600
Subject: [PATCH] Add support for node ID intrinsic

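This patch adds a new intrinsic, llvm.visc.node.id, which tags an HPVM leaf
node with a numeric ID so the tensor runtime can attribute execution to a
specific dataflow node:

- GenVISC lowers frontend __visc__node_id() calls to the intrinsic, and
  also removes the GenVISC timer instrumentation.
- The CUDNN and Wrapper-API backends lower the intrinsic to the
  tensor_set_node_id() call of the hpvm-tensor-rt runtime.
- Tensor-node fusion keeps only the first node's ID in a fused sequence.
- ClearDFG erases any llvm.visc.node.id calls that remain.

As a usage sketch (illustrative only: the frontend call is assumed to
mirror the other __visc__ tensor calls, and the node function below is
hypothetical):

    void relu_node(void *t, size_t bytes) {
      __visc__node_id(1);               // lowered to tensor_set_node_id(1)
      void *r = __visc__tensor_relu(t);
      __visc__return(2, r, (size_t)0);
    }
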
---
 llvm/include/llvm/IR/IntrinsicsVISC.td        |   3 +
 llvm/lib/Transforms/ClearDFG/ClearDFG.cpp     |  23 +-
 .../DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp         |  46 +++-
 .../DFG2LLVM_WrapperAPI.cpp                   | 198 ++++++++----------
 .../FuseHPVMTensorNodes.cpp                   |  76 +++++--
 llvm/lib/Transforms/GenVISC/GenVISC.cpp       |  98 +--------
 6 files changed, 217 insertions(+), 227 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td
index ab22372d80..404903648f 100644
--- a/llvm/include/llvm/IR/IntrinsicsVISC.td
+++ b/llvm/include/llvm/IR/IntrinsicsVISC.td
@@ -325,4 +325,7 @@ let TargetPrefix = "visc" in {
                                                             llvm_i32_ty,
                                                             llvm_i32_ty], []>;
 
+  // Tags the enclosing leaf node with an i32 ID for the tensor runtime;
+  // the i8* result is not used by the backends
+  def int_visc_node_id : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], []>;
+
 }
diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 37f74325a7..84f9bec04f 100644
--- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -118,6 +120,19 @@ bool ClearDFG::runOnModule(Module &M) {
   VC->replaceAllUsesWith(UndefValue::get(VC->getType()));
   VC->eraseFromParent();
 
+
+  // Delete llvm.visc.node.id intrinsic calls, if any exist
+  Function* VN = M.getFunction("llvm.visc.node.id");
+  if (VN != NULL) {
+    // Collect the calls first: erasing an instruction while iterating over
+    // the function's use list would invalidate the user_iterator
+    std::vector<Instruction*> NodeIDCalls;
+    for (Value::user_iterator ui = VN->user_begin(), ue = VN->user_end(); ui != ue; ui++)
+      NodeIDCalls.push_back(cast<Instruction>(*ui));
+    for (Instruction *I : NodeIDCalls)
+      I->eraseFromParent();
+
+    VN->replaceAllUsesWith(UndefValue::get(VN->getType()));
+    VN->eraseFromParent();
+  }
+
   // Visitor for Code Generation Graph Traversal
   TreeTraversal *Visitor = new TreeTraversal(M, DFG);
 
diff --git a/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp b/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
index abc4e9ef89..f18325588c 100644
--- a/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
@@ -264,9 +273,13 @@ void CGT_CUDNN::codeGen(DFLeafNode* N) {
 
     if (BuildDFG::isViscIntrinsic(I)) {
       IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
-        && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+      // Note: the tensor-intrinsics-only assert is removed, since leaf nodes
+      // may now also contain llvm.visc.node.id, handled as a case below.
 
       /********************* Handle VISC Tensor intrinsics ********************/
       switch (II->getIntrinsicID()) {
 
@@ -427,7 +440,8 @@ void CGT_CUDNN::codeGen(DFLeafNode* N) {
       { /* llvm.visc.tensor.pool.max */
         DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor_pool_max\n");
 
-        // Argument list - tensorPooling(input, poolFunction, window_height, window_width, vertical_pad, horizontal_pad,
+        // Argument list - tensorPooling(input, poolFunction, window_height,
+        //                               window_width, vertical_pad, horizontal_pad,
 	//                               vertical_stride, horizontal_stride);
         std::vector<Value*> Args;
         Args.push_back(II->getOperand(0));
@@ -540,6 +554,28 @@ void CGT_CUDNN::codeGen(DFLeafNode* N) {
         IItoRemove.push_back(II);
       }
       break;
+
+      case Intrinsic::visc_node_id:
+      { /* llvm.visc.node.id */
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling node ID intrinsic\n");
+        // Get the uint32 node ID argument
+        Value *Op = II->getOperand(0);
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(Op);
+
+        // Create hpvm-tensor-rt function call
+        Constant* tensor_set_node_id;
+        DECLARE(tensor_set_node_id);
+        CallInst::Create(tensor_set_node_id, Args, "", II);
+
+        // Mark the intrinsic for removal at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
       default:
         llvm_unreachable("Unknown VISC Intrinsic!");
         break;
diff --git a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
index ac5fa450c0..ecec258dfe 100644
--- a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
@@ -73,6 +73,7 @@ private:
 
   std::vector<Value*> Args;
   std::vector<IntrinsicInst*> IIs;
+  std::vector<IntrinsicInst*> IIs_remove; // Intrinsics to remove
   AbstractState *current;
 
 public:
@@ -88,6 +89,10 @@ public:
     return M;
   }
 
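+  // Returns the hpvm-tensor-rt runtime module, so that states can pull
+  // runtime function types from it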
+  Module *getRtModule() {
+    return RtM;
+  }
+
   void addArgument(Value *Arg) {
     Args.push_back(Arg);
   }
@@ -96,6 +101,10 @@ public:
     IIs.push_back(II);
   }
 
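+  // Records an intrinsic that has already been lowered and must be erased
+  // after code generation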
+  void addIntrinsicToRemove(IntrinsicInst *II) {
+    IIs_remove.push_back(II);
+  }
+
   IntrinsicInst *getIntrinsicInstAt(unsigned idx) {
     return IIs[idx];
   }
@@ -288,6 +298,31 @@ void InitialState::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) {
         Mch->setCurrent(new FullyConnectedLayer_1());
         }
         break;
+
+      case Intrinsic::visc_node_id:
+        {
+        DEBUG(errs() << "\t: Handling __visc__node_id \n");
+        // Get the uint32 node ID argument
+        Value *Op = II->getOperand(0);
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(Op);
+
+        // Create the call to the tensor_set_node_id runtime function, taking
+        // its type from the runtime module
+        Module *M = Mch->getModule();
+        Module *RtM = Mch->getRtModule();
+        Constant* visc_node_id_call =
+          M->getOrInsertFunction(StringRef("tensor_set_node_id"),
+            RtM->getFunction(StringRef("tensor_set_node_id"))->getFunctionType());
+        CallInst::Create(visc_node_id_call, Args, "", II);
+
+        // The intrinsic is fully lowered here: mark it for removal and stay
+        // in the initial state
+        Mch->addIntrinsicToRemove(II);
+        Mch->setCurrent(new InitialState());
+        }
+        break;
+
       default: // Other HPVM intrinsic
         {
         Mch->addIntrinsicInst(II);
@@ -438,14 +473,15 @@ void ConvolutionLayer_1::transition(CodeGenStateMachine *Mch,
     Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv
     Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv
     Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv
-//    Mch->addArgument(ConstantInt::get(
-//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
-//    Mch->addArgument(ConstantInt::get(
-//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
-//    Mch->addArgument(ConstantInt::get(
-//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
-//    Mch->addArgument(ConstantInt::get(
-//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
 
     // No pooling
     // 0 for unused pool arguments:
@@ -470,9 +506,9 @@ void ConvolutionLayer_2::transition(CodeGenStateMachine *Mch,
     switch (II->getIntrinsicID()) {
       case Intrinsic::visc_tensor_tanh:
         {
-        // Type of activation : TanH
-//        Mch->addArgument(ConstantInt::get(
-//                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+        // Type of activation : TanH
         Mch->addIntrinsicInst(II);
 
         Mch->setCurrent(new ConvolutionLayer_3());
@@ -480,9 +516,9 @@ void ConvolutionLayer_2::transition(CodeGenStateMachine *Mch,
         break;
       case Intrinsic::visc_tensor_relu:
         {
-        // Type of activation : ReLU
-//        Mch->addArgument(ConstantInt::get(
-//                         Type::getInt32Ty(Mch->getModule()->getContext()), 1));
+        // Type of activation : ReLU
         Mch->addIntrinsicInst(II);
 
         Mch->setCurrent(new ConvolutionLayer_3());
@@ -490,9 +526,9 @@ void ConvolutionLayer_2::transition(CodeGenStateMachine *Mch,
         break;
       case Intrinsic::visc_tensor_clipped_relu:
         {
-        // Type of activation : Clipped ReLU
-//        Mch->addArgument(ConstantInt::get(
-//                         Type::getInt32Ty(Mch->getModule()->getContext()), 2));
+        // Type of activation : Clipped ReLU
         Mch->addIntrinsicInst(II);
 
         Mch->setCurrent(new ConvolutionLayer_3());
@@ -839,12 +878,14 @@ void CodeGenStateMachine::codeGen(DFNode *N, Function *F, const StringRef &strRe
     CI->insertBefore(IIlast);
     IIlast->replaceAllUsesWith(CI);
 
   } else { // SINGLE_TENSOR_OPERATION
     assert((IIs.size() == 1) &&
             "Unexpected size of intrinsics vector in code gen state machine.\n");
     assert(Args.empty() && "Unexpected arguments found in code gen state machine.\n");
     IntrinsicInst *TensorII = IIs[0];
-errs() << "TensorII: " << *TensorII << "\n";
+
+    DEBUG(errs() << "TensorII: " << *TensorII << "\n");
 
     switch (TensorII->getIntrinsicID()) {
       case Intrinsic::visc_tensor_group_convolution:
@@ -1047,13 +1089,11 @@ errs() << "TensorII: " << *TensorII << "\n";
         Value *Op = TensorII->getOperand(0);
 
         // Test the intrinsic operand for in place operation.
-        bool inplace = isValidOperandForInPlaceOperation(Op, F, N, IPP);
         // Code generation will not continue if this is false, because the target
         // may provide an in place operation(safe choice)
-        assert(inplace &&
-               "Operand not valid for in place operation. Code gen aborted.\n");
-
-        // Argument list for the runtime call
+        // FIXME: the in-place operand check is disabled here - must re-check
+        // for in-place before relying on this path
 
         // Create string for node name, as first argument for wrapper API call
         Constant *ConstArray = ConstantDataArray::getString(M->getContext(),
@@ -1110,15 +1150,6 @@ errs() << "TensorII: " << *TensorII << "\n";
         // Tensor softmax(a) is in place for argument a.
         Value *Op = TensorII->getOperand(0);
 
-        // Test the intrinsic operand for in place operation.
-        bool inplace = isValidOperandForInPlaceOperation(Op, F, N, IPP);
-        // Code generation will not continue if this is false, because the target
-        // may provide an in place operation(safe choice)
-        assert(inplace &&
-               "Operand not valid for in place operation. Code gen aborted.\n");
-
-        // Argument list for the runtime call
-
         // Create string for node name, as first argument for wrapper API call
         Constant *ConstArray = ConstantDataArray::getString(M->getContext(),
                                                             strRef, true);
@@ -1146,62 +1177,8 @@ errs() << "TensorII: " << *TensorII << "\n";
         TensorII->replaceAllUsesWith(TensorII->getOperand(0));
       }
       break;
-/*
-      case Intrinsic::visc_image_fft_transform:
-      { // llvm.hpvm.image.fft.transform - Or another image intrinsic
-        // All will be treated as not in place
-        DEBUG(errs() << F->getName() << "\t: Handling fft transform \n");
-
-        // Create argument list for the runtime call - stored in Args
 
-        // All interfaces will have a string as first argument, which will be
-        // used to identify the dataflow node at runtime
-        // Create string for node name, as first argument for wrapper API call
-        Constant *ConstArray = ConstantDataArray::getString(M->getContext(),
-                                                            strRef, true);
-        GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(),
-                               true, GlobalValue::ExternalLinkage, ConstArray, "");
-        // Create GEP expression to access it
-        Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0);
-        Constant* GEPIndices[] = { Int_0, Int_0 };
-        Constant* GEPConst =
-          ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(),
-                                         GV, GEPIndices);
-
-        Args.push_back(GEPConst);
-
-        // Here, use you will access the appropriate arruments of the intrinsic
-        // and push_back, in order to create the argument list of runtime call
-        Args.push_back(TensorII->getOperand(0));
-        Args.push_back(TensorII->getOperand(1));
-        Args.push_back(TensorII->getOperand(2));
-        Args.push_back(TensorII->getOperand(3));
-        Args.push_back(TensorII->getOperand(4));
-        Args.push_back(TensorII->getOperand(5));
-
-        Constant *conv_mode = ConstantInt::get(Type::getInt32Ty(M->getContext()), 1);
-        Args.push_back(conv_mode);
-
-        Args.push_back(TensorII->getOperand(7));
-
-        // Done with argument list.
-
-        // Create wrapper API runtime function call
-        // Appropriately set the name of the function of the runtime that you
-        // want to call
-        // Note: the Constant * is what we need to pass to the callInst.
-        // This name does not have to match, but does so for similarity.
-        Constant* wrapper_tensorGroupConvolution;
-          M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"),
-            RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType());
-        CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution,
-                                        Args, "", TensorII);
-        // We can replace the call to hpvm.tensor.xxx with the runtime call
-        TensorII->replaceAllUsesWith(CI);
-      }
-      break;
-
-*/
       default:
         llvm_unreachable("Unknown VISC Intrinsic!");
         break;
@@ -1219,6 +1196,13 @@ errs() << "TensorII: " << *TensorII << "\n";
     (*ri)->eraseFromParent();
   }
 
+
+  // Erase the node ID intrinsics that were already lowered by the state machine
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IIs_remove.rbegin(),
+       re = IIs_remove.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    (*ri)->eraseFromParent();
+  }
+
 }
 
 // DFG2LLVM_WrapperAPI - The first implementation.
@@ -1380,20 +1366,13 @@ void CGT_WrapperAPI::codeGen(DFLeafNode* N) {
     return;
   }
 
-//  For wrapper API, we generate code for every leaf node.
-//  No need to check for hints from frontend
-//  // Generate code only if it has the right hint
-//  if (!checkPreferredTarget(N, visc::PROMISE_TARGET)) {
-//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
-//    return;
-//  }
 
   // Increment the node ID, for current node.
   ++nodeID;
 
   // Get the function associated with the dataflow node
   Function *F = N->getFuncPointer();
-errs() << "Node Function: " << *F << "\n";
+  DEBUG(errs() << "Node Function: " << *F << "\n");
   // Look up if we have visited this function before. If we have, then just
   // get the cloned function pointer from DFNode. Otherwise, create the cloned
   // function and add it to the DFNode GenFunc.
@@ -1461,15 +1442,12 @@ errs() << "Node Function: " << *F << "\n";
   //CGM.codeGen(N, F_wrapper_api, N->getFuncPointer()->getName(), *IPP);
   CGM.codeGen(N, F_wrapper_api, StringRef(std::to_string(nodeID)), *IPP);
 
-//errs() << "-----------------------------------\n";
-//errs() << *F_wrapper_api << "\n";
-
   return;
 }
 
 bool DFG2LLVM_WrapperAPI::runOnModule(Module &M) {
   errs() << "\nDFG2LLVM_WrapperAPI PASS\n";
 
   // Get the BuildDFG Analysis Results:
   // - Dataflow graph
   BuildDFG &DFG = getAnalysis<BuildDFG>();
@@ -1477,9 +1455,8 @@ bool DFG2LLVM_WrapperAPI::runOnModule(Module &M) {
   // Get the In Place Analysis Results
   InPlaceDFGAnalysis::InPlaceDFGParameter IPP =
     (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP();
-  // Print results
-//  printInPlaceDFGParameter(IPP);
 
   std::vector<DFInternalNode*> Roots = DFG.getRoots();
  
   // Visitor for Code Generation Graph Traversal
 
diff --git a/llvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp b/llvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
index d9a3c588b5..541efe4e1d 100644
--- a/llvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
+++ b/llvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
@@ -78,17 +78,35 @@ static bool isIncomingEdgeArgument(unsigned argno,
   return false;
 }
 
 // Check that this is a valid HPVM Tensor Node (starts with an HPVM intrinsic)
 // Return the node intrinsic function
 static IntrinsicInst *isValidHPVMTensorNode(DFNode *N) {
   Function *F = N->getFuncPointer();
-  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*(inst_begin(F)));
-  assert(II &&
-         "HPVM tensor intrinsic expected as first instruction of HPVM tensor node\n");
-  assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") &&
-         "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+  // The first instruction is no longer guaranteed to be a tensor intrinsic:
+  // llvm.visc.node.id calls may now precede it. Return the first HPVM tensor
+  // intrinsic in the node function instead.
+  IntrinsicInst *II = NULL;
+  for (auto I = inst_begin(F), E = inst_end(F); I != E; I++) {
+    if (IntrinsicInst *CandII = dyn_cast<IntrinsicInst>(&*I)) {
+      if ((CandII->getCalledFunction()->getName()).startswith("llvm.visc.tensor")) {
+        II = CandII;
+        DEBUG(errs() << "** Tensor Intrinsic = " << *II << "\n");
+        break;
+      }
+    }
+  }
+  assert(II && "HPVM tensor intrinsic expected in HPVM tensor node\n");
+
   return II;
 }
 
 // Returns the next node in a node sequence, or NULL if it does not exist.
 // We consider two nodes a sequence if SrcN has a single successor, DstN,
@@ -340,9 +358,9 @@ Function* FuseHPVMTensorNodes::createEmptyDFNodeFunction(IntrinsicInst* II1,
            the body of the fused function instead                             *
  * OutVs: This maps the output struct field index to the stored value         */
 void FuseHPVMTensorNodes::inlineFirstNodeFunction(Module &M, Function *F1,
-                                                 Function *Ffused,
-                                                 ValueMap<Value*, Value*> &VMap,
-                                                 std::vector<Value*> &OutVs) {
+                                                  Function *Ffused,
+                                                  ValueMap<Value*, Value*> &VMap,
+                                                  std::vector<Value*> &OutVs) {
 
   ReturnInst *RI = cast<ReturnInst>(Ffused->getEntryBlock().getTerminator());
 
@@ -356,8 +374,9 @@ void FuseHPVMTensorNodes::inlineFirstNodeFunction(Module &M, Function *F1,
     }
 
     IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-    assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
-      && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+    assert(((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+            || (II->getCalledFunction()->getName()).startswith("llvm.visc.node.id"))
+           && "Only HPVM tensor and node ID intrinsics allowed in ApproxHPVM leaf nodes\n");
 
     std::vector<Value*> Args;
     for(unsigned i = 0; i < II->getNumArgOperands(); i++) {
@@ -409,9 +429,14 @@ void FuseHPVMTensorNodes::inlineSecondNodeFunction(Module &M, Function *F2,
     Instruction *I = &(*f2_i);
     if ((BuildDFG::isViscIntrinsic(I))) {
       IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+      assert(((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+              || (II->getCalledFunction()->getName()).startswith("llvm.visc.node.id"))
         && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
 
+      if ((II->getCalledFunction()->getName()).startswith("llvm.visc.node.id")) {
+        continue; // Skip visc.node.id calls in nodes other than the first node
+      }
       std::vector<Value*> Args;
       for(unsigned i = 0; i < II->getNumArgOperands(); i++) {
         Value *V = II->getArgOperand(i);
@@ -506,10 +531,11 @@ Function* FuseHPVMTensorNodes::createLeafDFNodeFunction(IntrinsicInst* II1,
     ++fused_arg_it;
   }
 
-//  for(const auto& v: FusedValueMap) {
-//    errs() << "key = " << *(v.first) << "\t";
-//    errs() << "value = " << *(v.second) << "\n";
-//  }
 
   // Invoke function that inlines F1 into Ffused, using and updating mappings
   inlineFirstNodeFunction(M, F1, Ffused, FusedValueMap, OutValues);
@@ -802,9 +830,9 @@ void FindFusionTargetsTraversal::codeGen(DFLeafNode *N) {
     return;
   }
 
-//  if(N->getTargetHint() != visc::PROMISE_TARGET) {
   if(!preferredTargetIncludes(N, visc::PROMISE_TARGET)) {
-    // Only fuse if we plan to target PROMISE
+    // Only fuse if we plan to target PROMISE/Layers API
     // The CUDNN backend would be able to generate calls for the fused node,
     // but not the other way around
     DEBUG(errs() << "No PROMISE hint. Skipping node: "
@@ -820,6 +848,14 @@ void FindFusionTargetsTraversal::codeGen(DFLeafNode *N) {
   std::vector<IntrinsicInst*> CurrentNodeSequence;
 
   switch(II->getIntrinsicID()) {
     case Intrinsic::visc_tensor_convolution:
       { // Found beginning of pattern conv-bias-activation-pooling.
         // Look for the rest
@@ -931,9 +967,9 @@ void FindFusionTargetsTraversal::codeGen(DFLeafNode *N) {
 }
 
 bool FuseHPVMTensorNodesWrapper::runOnModule(Module &M) {
   errs() << "\nFUSE HPVM TENSOR NODES PASS\n";
 
-// Get the BuildDFG Analysis Results:
+  // Get the BuildDFG Analysis Results:
   // - Dataflow graph
   BuildDFG &DFG = getAnalysis<BuildDFG>();
 
@@ -952,7 +988,7 @@ bool FuseHPVMTensorNodesWrapper::runOnModule(Module &M) {
   FuseHPVMTensorNodes::FusionTargets &FTs = FTTVisitor->getFusionTargets();
 
   FuseHPVMTensorNodes Fuse;
-//  Fuse.printFusionTargets(FTs);
 
   Fuse.run(M, FTs);
 
diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
index faab312087..a4d9f2c2a4 100644
--- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -27,21 +27,14 @@
 #include "llvm/SupportVISC/VISCUtils.h"
 
 
-#define TIMER(X) do { if (VISCTimer) { X; } } while (0)
-
 using namespace llvm;
 using namespace viscUtils;
 
 
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer"));
-
 namespace genvisc {
 
 // Helper Functions
 
-static inline ConstantInt* getTimerID(Module&, enum visc_TimerID);
 static Function* transformReturnTypeToStruct(Function* F);
 static Type* getReturnTypeFromReturnInst(Function* F);
 
@@ -178,6 +171,9 @@ IS_VISC_CALL(tensor_tanh)
 IS_VISC_CALL(tensor_sigmoid)
 IS_VISC_CALL(tensor_softmax)
 
+IS_VISC_CALL(node_id)
+
 // Return the constant integer represented by value V
 static unsigned getNumericValue(Value* V) {
   assert(isa<ConstantInt>(V)
@@ -816,53 +812,17 @@ bool GenVISC::runOnModule(Module &M) {
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
-  assert(LLVM_SRC_ROOT != NULL &&
-         "Define LLVM_SRC_ROOT environment variable!");
-
-  Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot+"/../build/projects/visc-rt/visc-rt.ll";
-  errs() << llvmSrcRoot << "\n";
-
-  std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-  //    Module* runtimeModule =
-  // ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll",
-  // Err, M.getContext());
-
-  if(runtimeModule == NULL)
-    DEBUG(errs() << Err.getMessage());
-  else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
-
-  llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet",
-                                 runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_initializeTimerSet);
-
-  llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer",
-                            runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_switchToTimer);
-
-  llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet",
-                            runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_printTimerSet);
-
   // Insert init context in main
   DEBUG(errs() << "Locate __visc__init()\n");
   Function* VI = M.getFunction("__visc__init");
   assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
   Instruction* I = cast<Instruction>(*VI->user_begin());
 
-  DEBUG(errs() << "Initialize Timer Set\n");
-  initializeTimerSet(I);
-  switchToTimer(visc_TimerID_NONE, I);
-
   // Insert print instruction at visc exit
   DEBUG(errs() << "Locate __visc__cleanup()\n");
   Function* VC = M.getFunction("__visc__cleanup");
   assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
   I = cast<Instruction>(*VC->user_begin());
-  printTimerSet(I);
-
 
   DEBUG(errs() << "-------- Searching for launch sites ----------\n");
 
@@ -1308,6 +1268,12 @@ bool GenVISC::runOnModule(Module &M) {
       if (isVISCCall_tensor_softmax(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_softmax, &toBeErased);
       }
+
+      // New Intrinsic to set Node ID
+      if (isVISCCall_node_id(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_node_id, &toBeErased);
+      }
     }
 
     // Erase the __visc__node calls
@@ -1530,7 +1496,6 @@ void GenVISC::genHost(CallInst* CI, Function* KernelF, unsigned levels, unsigned
   StructType* ArgStructTy = StructType::create(ArgList, "struct.arg", true);
   DEBUG(errs() << *ArgStructTy << "\n");
 
-  switchToTimer(visc_TimerID_ARG_PACK, CI);
   // Insert alloca inst for this argument struct type
   AllocaInst* AI = new AllocaInst(ArgStructTy, "in.addr", CI);
 
@@ -1544,8 +1509,6 @@ void GenVISC::genHost(CallInst* CI, Function* KernelF, unsigned levels, unsigned
                  "args",
                  CI);
 
-  switchToTimer(visc_TimerID_NONE, CI);
-
   // Bitcast Root function to i8*
   Constant* Root_i8ptr = ConstantExpr::getPointerCast(Root, Type::getInt8PtrTy(Ctx));
   // Replace CI with launch call to a Root function
@@ -1575,49 +1538,6 @@ void GenVISC::genHost(CallInst* CI, Function* KernelF, unsigned levels, unsigned
   // Get result (optional)
 }
 
-void GenVISC::initializeTimerSet(Instruction* InsertBefore) {
-  Value* TimerSetAddr;
-  StoreInst* SI;
-  TIMER(TimerSet = new GlobalVariable(*M,
-                                      Type::getInt8PtrTy(M->getContext()),
-                                      false,
-                                      GlobalValue::CommonLinkage,
-                                      Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
-                                      "viscTimerSet_GenVISC"));
-  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n");
-  DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
-
-  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
-                                        None,
-                                        "",
-                                        InsertBefore));
-  DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n");
-  TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
-  DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n");
-}
-
-void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
-  Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)};
-  TIMER(CallInst::Create(llvm_visc_switchToTimer,
-                         ArrayRef<Value*>(switchArgs, 2),
-                         "",
-                         InsertBefore));
-}
-
-void GenVISC::printTimerSet(Instruction* InsertBefore) {
-  Value* TimerName;
-  TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore));
-  Value* printArgs[] = {TimerSet, TimerName};
-  TIMER(CallInst::Create(llvm_visc_printTimerSet,
-                         ArrayRef<Value*>(printArgs, 2),
-                         "",
-                         InsertBefore));
-}
-
-static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
-  return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
-}
-
 static Function* transformReturnTypeToStruct(Function* F) {
   // Currently only works for void return types
   DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
-- 
GitLab