From 3f87f33f80d95015136468d0e89359087e2797e3 Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Fri, 14 Nov 2014 21:27:49 +0000
Subject: [PATCH] (1) Moved Kernel to a separate class (2) Made all the local
 functions static so that they are not visible outside this file (3) Runs
 correctly. Now have to finish support for attributes

Point to note: Writing a module out corrupts its in-memory copy, hence it must
always be done last, once the module is not going to be used anymore.
---
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 172 +++++++++++++-----
 1 file changed, 125 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 4fc3934246..2961a966af 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
 
 #include <sstream>
 
@@ -36,11 +37,13 @@ using namespace builddfg;
 namespace {
 
 // Helper function declarations
-  void changeDataLayout(Module &);
-  void changeTargetTriple(Module &);
-  std::string printType(Type*);
-  std::string convertInt(int);
-  void findReturnInst(Function *, std::vector<ReturnInst *> &);
+  static std::string getPTXFilename(const Module&);
+  static std::string getFilenameFromModule(const Module& M);
+  static void changeDataLayout(Module &);
+  static void changeTargetTriple(Module &);
+  static std::string printType(Type*);
+  static std::string convertInt(int);
+  static void findReturnInst(Function *, std::vector<ReturnInst *> &);
 
   // DFG2LLVM_NVPTX - The first implementation.
   struct DFG2LLVM_NVPTX : public ModulePass {
@@ -64,6 +67,26 @@ namespace {
   };
 
   // Visitor for Code generation traversal (tree traversal for now)
+  class Kernel {
+  public:
+    Kernel(Function* _KF, unsigned _gridDim = 0, std::vector<Value*>
+        _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0,
+        std::vector<Value*> _localWGSize = std::vector<Value*>()) : KernelFunction(_KF),
+    gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
+    localWGSize(_localWGSize) {
+      assert(gridDim == globalWGSize.size()
+          && "gridDim should be same as the size of vector globalWGSize");
+      assert(blockDim == localWGSize.size()
+          && "blockDim should be same as the size of vector localWGSize");
+    }
+
+    Function* KernelFunction;
+    unsigned gridDim;
+    unsigned blockDim;
+    std::vector<Value*> globalWGSize;
+    std::vector<Value*> localWGSize;
+  };
+
   class CodeGenTraversal : public DFNodeVisitor {
 
   private:
@@ -71,14 +94,8 @@ namespace {
     Module &M;
     Module &KernelM;
     BuildDFG &DFG;
-    DFNode * KernelLaunchNode;
-    struct { Function * KF;
-             unsigned gridDim;
-             unsigned blockDim;
-             std::vector<Value*> localWGSize;
-             std::vector<Value*> globalWGSize;
-           } kernel;
-
+    DFNode* KernelLaunchNode;
+    Kernel* kernel;
     // Map from Old function associated with DFNode to new cloned function with
     // extra index and dimension arguments. This map also serves to find out if
     // we already have an index and dim extended function copy or not (i.e.,
@@ -110,7 +127,7 @@ namespace {
     Argument* getArgumentAt(Function* F, unsigned offset);
     Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
                         Instruction* InsertBefore);
-    void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& Kernel);
+    void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName);
 
     void codeGen(DFInternalNode* N);
     void codeGen(DFLeafNode* N);
@@ -326,11 +343,14 @@ namespace {
   // used to generate a function to associate with this leaf node. The function
   // is responsible for all the memory allocation/transfer and invoking the
   // kernel call on the device
-  void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& KernelName) {
+  void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) {
     // Check if clone already exists. If it does, it means we have visited this
-    // function before and nothing else needs to be done for this leaf node.
+    // function before.
     assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
+    // Fail if no Kernel object has been created yet (kernel is still NULL)
+    assert(kernel != NULL && "No kernel found!!");
+
     DEBUG(errs() << "Generating kernel call code\n");
 
     Function* F = N->getFuncPointer();
@@ -401,10 +421,12 @@ namespace {
 
     DEBUG(errs() << "Initializing commandQ" << "\n");
     // Initialize command queue
-    Value* file = getStringPointer(FileName, RI, "Filename");
-    Value* kernel = getStringPointer(KernelName, RI,"KernelName");
+    Value* fileStr = getStringPointer(FileName, RI, "Filename");
+    errs() << *fileStr << "\n";
+    errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n";
+    Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName");
 
-    Value* LaunchInstArgs[] = {file, kernel};
+    Value* LaunchInstArgs[] = {fileStr, kernelStr};
 
     DEBUG(errs() << "Inserting launch call" << "\n");
     CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
@@ -489,15 +511,46 @@ namespace {
     // Need work dim, localworksize, globalworksize
     // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work
     // size and global work size
+    // Allocate size_t[numDims] space on stack. Store the work group sizes and
+    // pass it as an argument to ExecNode
+    Type* Int64Ty = Type::getInt64Ty(M.getContext());
+    Type* GlobalWGTy = ArrayType::get(Int64Ty, kernel->gridDim);
+    AllocaInst* GlobalWG = new AllocaInst(GlobalWGTy, "GlobalWGSize", RI);
+    Value* GlobalWGPtr = BitCastInst::CreatePointerCast(GlobalWG, Int64Ty->getPointerTo(), GlobalWG->getName()+".0", RI);
+    Value* nextDim = GlobalWGPtr;
+    errs() << *GlobalWGPtr << "\n";
+    Constant* IntOne = ConstantInt::get(Int64Ty, 1);
+    errs() << *IntOne << "\n";
+    for(unsigned i=0; i < kernel->gridDim; i++) {
+      errs() << *kernel->globalWGSize[i]->getType() << "\n";
+      errs() << *nextDim->getType() << "\n";
+      assert(kernel->globalWGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+      if(kernel->globalWGSize[i]->getType() != Int64Ty) {
+        kernel->globalWGSize[i] = BitCastInst::CreateIntegerCast(VMap[kernel->globalWGSize[i]], Int64Ty, true, "", RI);
+        StoreInst* SI = new StoreInst(kernel->globalWGSize[i], nextDim, RI);
+        errs() << *SI << "\n";
+      } else {
+        StoreInst* SI = new StoreInst(VMap[kernel->globalWGSize[i]], nextDim, RI);
+        errs() << *SI << "\n";
+      }
+      if(i+1 < kernel->gridDim) {
+        GetElementPtrInst* GEP = GetElementPtrInst::Create(nextDim, ArrayRef<Value*>(IntOne), GlobalWG->getName()+"."+Twine(i+1), RI);
+        errs() << *GEP << "\n";
+        nextDim = GEP;
+      }
+    }
+    errs() << *llvm_visc_ptx_executeNode << "\n";
+    errs() << *GlobalWGPtr << "\n";
     Value* ExecNodeArgs[] = {GraphID,
                             ConstantInt::get(Type::getInt32Ty(M.getContext()), C->getNumOfDim()),
                             Constant::getNullValue(Type::getInt64PtrTy(M.getContext())),
-                            Constant::getNullValue(Type::getInt64PtrTy(M.getContext()))
+                            GlobalWGPtr
                             };
     CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode,
                                        ArrayRef<Value*>(ExecNodeArgs, 4),
                                        "event."+CF->getName(),
                                        RI);
+    errs() << *Event << "\n";
     // Wait for Kernel to Finish
     CallInst::Create(llvm_visc_ptx_wait,
                      ArrayRef<Value*>(GraphID),
@@ -615,9 +668,9 @@ namespace {
 
       // Now the remaining nodes to be visited should be ignored
       KernelLaunchNode = NULL;
-      writeKernelsModule();
       errs() << "Insert Runtime calls\n";
-      insertRuntimeCalls(N, getKernelsModuleName(M), "matrixMul");
+      insertRuntimeCalls(N, getPTXFilename(M));
+      writeKernelsModule();
 
     } else {
       DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
@@ -643,31 +696,36 @@ namespace {
 
     if (!pLevel || !pReplFactor) {
       KernelLaunchNode = PNode;
+      kernel = new Kernel(NULL, N->getNumOfDim(), N->getDimLimits());
       // TODO: Find a better way of choosing parameters
-      kernel.gridDim = N->getNumOfDim();
-      kernel.blockDim = N->getNumOfDim();
-      kernel.globalWGSize = N->getDimLimits();
-      IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
+      //kernel->gridDim = N->getNumOfDim();
+      //kernel->blockDim = N->getNumOfDim();
+      //kernel->globalWGSize = N->getDimLimits();
+      //kernel->localWGSize = N->getDimLimits();
+      //FIXME: Comment this out as we can provide localWGSize as null
+      //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
       // TODO: How to choose the div factor;
-      ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
-      std::vector<Value*> tmp(kernel.gridDim, divFactor);
-      for (unsigned i = 0; i < kernel.gridDim; i++) {
-        BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel.globalWGSize[i],tmp[i]);
-        kernel.localWGSize.push_back(SDivInst);
-      }
+      //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
+      //std::vector<Value*> tmp(kernel->gridDim, divFactor);
+      //for (unsigned i = 0; i < kernel->gridDim; i++) {
+      //  BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]);
+      //  kernel->localWGSize.push_back(SDivInst);
+      //}
     }
     else {
+      errs() << "*************** Entering else part **************\n";
+      /*
       KernelLaunchNode = PNode->getParent();
-      kernel.gridDim = PNode->getNumOfDim();
-      kernel.blockDim = N->getNumOfDim();
+      kernel->gridDim = PNode->getNumOfDim();
+      kernel->blockDim = N->getNumOfDim();
       // TODO: Handle different number of dimensions
-      assert((kernel.gridDim == kernel.blockDim) && "Dimension number must match");
+      assert((kernel->gridDim == kernel->blockDim) && "Dimension number must match");
       std::vector<Value*> numOfBlocks = PNode->getDimLimits();
-      kernel.localWGSize = N->getDimLimits();
-      for (unsigned i = 0; i < kernel.gridDim; i++) {
-        BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel.localWGSize[i],numOfBlocks[i]);
-        kernel.globalWGSize.push_back(MulInst);
-      }
+      kernel->localWGSize = N->getDimLimits();
+      for (unsigned i = 0; i < kernel->gridDim; i++) {
+        //BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel->localWGSize[i],numOfBlocks[i]);
+        //kernel->globalWGSize.push_back(MulInst);
+      }*/
     }
 
     std::vector<IntrinsicInst *> IItoRemove;
@@ -922,7 +980,8 @@ namespace {
       (*ri)->eraseFromParent();
 
     addCLMetadata(F_nvptx);
-    kernel.KF = F_nvptx;
+    kernel->KernelFunction = F_nvptx;
+    errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
     DEBUG(errs() << KernelM);
 
     return;
@@ -945,13 +1004,19 @@ namespace {
     // Initiate code generation for root DFNode
     CGTVisitor->visit(Root);
     //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+    delete CGTVisitor;
 
     return true;
   }
 
   std::string CodeGenTraversal::getKernelsModuleName(Module &M) {
+    /*SmallString<128> currentDir;
+    llvm::sys::fs::current_path(currentDir);
+    std::string fileName = getFilenameFromModule(M);
+    Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+    return output.str().append(".kernels.ll");*/
     std::string mid = M.getModuleIdentifier();
-    return mid.append("_kernels.ll");
+    return mid.append(".kernels.ll");
   }
 
   void CodeGenTraversal::fixValueAddrspace(Value* V, unsigned addrspace) {
@@ -1174,9 +1239,22 @@ namespace {
  *                              Helper functions                              *
  ******************************************************************************/
 
+  // Get generated PTX file name: the module identifier with ".nvptx.s"
+  // appended. The identifier is used as-is; nothing is stripped from it.
+  static std::string getPTXFilename(const Module& M) {
+    const std::string& moduleID = M.getModuleIdentifier();
+    return moduleID + ".nvptx.s";
+  }
+
+  // Strip any directory prefix from the module ID, leaving the file name
+  static std::string getFilenameFromModule(const Module& M) {
+    const std::string& Path = M.getModuleIdentifier();
+    return Path.substr(Path.rfind("/") + 1); // npos + 1 == 0: no '/' keeps all
+  }
+
   // Changes the data layout of the Module to be compiled with NVPTX backend
   // TODO: Figure out when to call it, probably after duplicating the modules
-  void changeDataLayout(Module &M) {
+  static void changeDataLayout(Module &M) {
     std::string nvptx32_layoutStr = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
     std::string nvptx64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
 
@@ -1189,7 +1267,7 @@ namespace {
     return;
   }
 
-  void changeTargetTriple(Module &M) {
+  static void changeTargetTriple(Module &M) {
     std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
     std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
@@ -1203,7 +1281,7 @@ namespace {
   }
 
 // Helper function, generate a string representation of a type
-  std::string printType(Type* ty) {
+  static std::string printType(Type* ty) {
     std::string type_str;
     raw_string_ostream rso(type_str);
     ty->print(rso);
@@ -1211,14 +1289,14 @@ namespace {
   }
 
 // Helper function, convert int to string
-  std::string convertInt(int number) {
+  static std::string convertInt(int number) {
    std::stringstream ss;//create a stringstream
    ss << number;//add number to the stream
    return ss.str();//return a string with the contents of the stream
   }
 
 // Helper function, populate a vector with all return statements in a function
-  void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
+  static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
     for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
       Instruction *I = &(*i);
       ReturnInst* RI = dyn_cast<ReturnInst>(I);
-- 
GitLab