From bc478ebef86ec8ed8e1f0cb3de63f3441701e6bc Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Wed, 26 Nov 2014 21:24:23 +0000
Subject: [PATCH] (1) Modified X86 pass to now not do code gen if any visiting
 node has a genFunc     set. This is important so as not to fail in case of
 2-level PTX kernel,     where the intermediate node does not have a gen
 function, but would pass     because its parent has one. (2) Modified PTX
 pass to     i.  Correct mapping of arguments between intermediate and kernel
 launch node     ii. Allow both constants and arguments as node limit values  
   iii.Generate get_group_id and get_global_id correctly

M    Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
M    Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
---
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 132 ++++++++++++------
 .../Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp  |   8 +-
 2 files changed, 98 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 7df93bd099..190b9af288 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -75,20 +75,29 @@ public:
   unsigned blockDim;
   std::vector<Value*> globalWGSize;
   std::vector<Value*> localWGSize;
+  std::vector<int> localDimMap;
 
   std::vector<unsigned> getInArgMap() {
     return inArgMap;
   }
+  void setInArgMap(std::vector<unsigned> map) {
+    inArgMap = map;
+  }
+
+  void setLocalWGSize(std::vector<Value*> V) {
+    localWGSize = V;
+  }
+
 
   bool hasLocalWG() {
-    return blockDim == 0;
+    return blockDim != 0;
   }
 };
 
 // Helper function declarations
 static void getExecuteNodeParams(Value* &, Value* &, Value* &, Kernel*,
                                  ValueToValueMapTy&, Instruction*);
-static Value* genWorkGroupPtr(std::vector<Value*>&, ValueToValueMapTy&,
+static Value* genWorkGroupPtr(std::vector<Value*>, ValueToValueMapTy&,
                               Instruction*, const Twine& WGName = "WGSize");
 static bool hasAttribute(Function*, unsigned, Attribute::AttrKind);
 static std::string getPTXFilename(const Module&);
@@ -615,10 +624,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
 
   // Read all the pointer arguments which had side effects i.e., had out
   // attribute
+  errs() << "Output Pointers : " << OutputPointers.size() << "\n";
   for(auto output: OutputPointers) {
-    DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
-    DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
-    DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
+    errs() << "Read: " << *output.d_ptr << "\n";
+    errs() << "\tTo: " << *output.h_ptr << "\n";
+    errs() << "\t#bytes: " << *output.bytes << "\n";
 
     Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
     CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput,
@@ -703,7 +713,6 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
   } else {
     DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
     // Keep track of the arguments order.
-
     std::vector<unsigned> map1 = N->getInArgMap();
     std::vector<unsigned> map2 = kernel->getInArgMap();
 
@@ -711,7 +720,33 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     for (unsigned i = 0; i < map2.size(); i++) {
       map2[i] = map1[map2[i]];
     }
-
+    kernel->setInArgMap(map2);
+
+    // Track the source of local dimlimits for the kernel
+    // Dimension limit can either be a constant or an argument of parent
+    // function. Since Internal node would no longer exist, we need to insert the
+    // localWGSize with values from the parent of N.
+    std::vector<Value*> localWGSizeMapped;
+    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if (isa<Constant>(kernel->localWGSize[i])) {
+        // if constant, use as it is
+        localWGSizeMapped.push_back(kernel->localWGSize[i]);
+      }
+      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+        // if argument, find the argument location in N. Use InArgMap of N to
+        // find the source location in Parent of N. Retrieve the argument from
+        // parent to insert in the vector.
+        unsigned argNum = Arg->getArgNo();
+        unsigned parentArgNum = N->getInArgMap()[argNum];
+        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        localWGSizeMapped.push_back(A);
+      }
+      else {
+        assert(false && "LocalWGsize using value which is neither argument nor constant!");
+      }
+    }
+    // Update localWGSize vector of kernel
+    kernel->setLocalWGSize(localWGSizeMapped);
   }
 
 }
@@ -736,20 +771,6 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
   if (!pLevel || !pReplFactor) {
     KernelLaunchNode = PNode;
     kernel = new Kernel(NULL, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits());
-    // TODO: Find a good way of choosing parameters - is this required?
-    //kernel->gridDim = N->getNumOfDim();
-    //kernel->blockDim = N->getNumOfDim();
-    //kernel->globalWGSize = N->getDimLimits();
-    //kernel->localWGSize = N->getDimLimits();
-    //FIXME: Comment this out as we can provide localWGSize as null
-    //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
-    // TODO: How to choose the div factor;
-    //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
-    //std::vector<Value*> tmp(kernel->gridDim, divFactor);
-    //for (unsigned i = 0; i < kernel->gridDim; i++) {
-    //  BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]);
-    //  kernel->localWGSize.push_back(SDivInst);
-    //}
   }
   else {
     // Converting a 2-level DFG to opencl kernel
@@ -757,15 +778,10 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
     KernelLaunchNode = PNode->getParent();
     assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
-    std::vector<Value*> globalWGSizeInsts;
-    for (unsigned i = 0; i < PNode->getNumOfDim(); i++) {
-      BinaryOperator* MulInst = BinaryOperator::CreateMul(PNode->getDimLimits()[i],N->getDimLimits()[i]);
-      globalWGSizeInsts.push_back(MulInst);
-    }
     kernel = new Kernel(NULL,                 // kernel function
                         N->getInArgMap(),     // kenel argument mapping
                         PNode->getNumOfDim(), // gridDim
-                        globalWGSizeInsts,    // grid size
+                        PNode->getDimLimits(),// grid size
                         N->getNumOfDim(),     // blockDim
                         N->getDimLimits());   // block size
 
@@ -885,21 +901,26 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
 
         // The following is to find which function to call
         Function * OpenCLFunction;
-        int parentLevel = ParentDFNode->getLevel();
-        int parentReplFactor = ParentDFNode->getNumOfDim();
+        int parentLevel = N->getParent()->getLevel();
+        int parentReplFactor = N->getParent()->getNumOfDim();
+        errs() << "Parent Level = " << parentLevel << "\n";
+        errs() << "Parent Repl factor = " << parentReplFactor << "\n";
 
-        if (!parentLevel || !parentReplFactor) {
+        if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
-          // launch, so we need to specify a global id
-
+          // launch, so we need to specify a global id.
+          // We can translate this only if the argument is the current node
+          // itself
+          errs() << "Substitute with get_global_id()\n";
+          errs() << *II << "\n";
           FunctionType* FT =
             FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
                               std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                               false);
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's id with respect to its parent
           // this is a local id call
           FunctionType* FT =
@@ -918,7 +939,11 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT));
         } else {
-          assert(false && "Unable to translate this intrinsic");
+          errs() << N->getFuncPointer()->getName() << "\n";
+          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
+          errs() << *II << "\n";
+
+          assert(false && "Unable to translate getNodeInstanceID intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
@@ -973,7 +998,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
                               false);
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's instances
           // this is a local size (block dim) call
           FunctionType* FT =
@@ -992,7 +1017,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT));
         } else {
-          assert(false && "Unable to translate this intrinsic");
+          assert(false && "Unable to translate getNumNodeInstances intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
@@ -1294,11 +1319,20 @@ static void getExecuteNodeParams(Value* &workDim, Value* &LocalWGPtr, Value*
     LocalWGPtr = genWorkGroupPtr(kernel->localWGSize, VMap, IB, "LocalWGSize");
   }
 
-  GlobalWGPtr = genWorkGroupPtr(kernel->globalWGSize, VMap, IB, "GlobalWGSize");
+  // For OpenCL, global work group size is the total bumber of instances in each
+  // dimension. So, multiply local and global dim limits.
+  std::vector<Value*> globalWGSizeInsts;
+  for (unsigned i = 0; i < kernel->gridDim; i++) {
+    BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+    globalWGSizeInsts.push_back(MulInst);
+  }
+  GlobalWGPtr = genWorkGroupPtr(globalWGSizeInsts, VMap, IB, "GlobalWGSize");
   DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
-static Value* genWorkGroupPtr(std::vector<Value*>& WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
+// CodeGen for allocating space for Work Group on stack and returning a pointer
+// to its address
+static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
   Value* WGPtr;
   // Get int64_t and or ease of use
   Type* Int64Ty = Type::getInt64Ty(getGlobalContext());
@@ -1316,18 +1350,34 @@ static Value* genWorkGroupPtr(std::vector<Value*>& WGSize, ValueToValueMapTy& VM
   // size in that dimension
   for(unsigned i=0; i < WGSize.size(); i++) {
     assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+    // If WGSize[i] is not a constant or a instruction, use mapped value in the new function
+    Value* WGSizeMapped;
+    if(isa<Argument>(WGSize[i]))
+      WGSizeMapped = VMap[WGSize[i]];
+    else {
+      WGSizeMapped = WGSize[i];
+      errs() << "Mapping value is not required: ";
+      errs() << *WGSize[i] << "\n";
+    }
     if(WGSize[i]->getType() != Int64Ty) {
       // If number of dimensions are mentioned in any other integer format,
       // generate code to extend it to i64. We need to use the mapped value in
       // the new generated function, hence the use of VMap
       // FIXME: Why are we changing the kernel WGSize vector here?
-      WGSize[i] = BitCastInst::CreateIntegerCast(VMap[WGSize[i]], Int64Ty, true, "", IB);
-      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+      errs() << "Not i64. Zero extend required.\n";
+      errs() << *WGSize[i] << "\n";
+      errs() << *WGSizeMapped << "\n";
+      CastInst* CI = BitCastInst::CreateIntegerCast(WGSizeMapped, Int64Ty, true, "", IB);
+      errs() << "Bitcast done.\n";
+      StoreInst* SI = new StoreInst(CI, nextDim, IB);
+      errs() << "Zero extend done.\n";
       DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
     } else {
       // Store the value representing work group size in ith dimension on
       // stack
-      StoreInst* SI = new StoreInst(VMap[WGSize[i]], nextDim, IB);
+      StoreInst* SI = new StoreInst(WGSizeMapped, nextDim, IB);
+
       DEBUG(errs() << "\t Work group size: " << *SI << "\n");
     }
     if(i+1 < WGSize.size()) {
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 32ffaa1341..818be97f6f 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -90,9 +90,15 @@ public:
   void codeGenLaunch(DFInternalNode* Root);
 
   virtual void visit(DFInternalNode* N) {
+    // If code has already been generated for this internal node, skip the
+    // children
+    if(N->getGenFunc() != NULL)
+      return;
+
+    DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
+
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
-    DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
     for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
         e = N->getChildGraph()->end(); i != e; ++i) {
       DFNode* child = *i;
-- 
GitLab