From b9461eeadc955b5f3b2461199eb572cd8dd38e7a Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Thu, 6 Nov 2014 04:36:20 +0000
Subject: [PATCH] (1) Modified test case to use i64 for pointer data size and
 not i32. Using i32 is incompatible with visc-rt. (2) Set GenFunc pointer to
 NULL. Completely removed FMap from both backend passes. Now generated func
 info is contained in DFNode itself. (3) Added Makefile to visc-rt for easy
 compilation to generate visc-rt.ll.

---
 llvm/include/llvm/IR/DFGraph.h                |  2 +-
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 25 ++++----
 .../Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp  | 58 ++++++++-----------
 llvm/projects/visc-rt/Makefile                | 11 ++++
 llvm/projects/visc-rt/visc-rt.cpp             |  4 +-
 .../VISC/MatrixMultiplication/visc_gemm.ll    | 31 +++++-----
 6 files changed, 65 insertions(+), 66 deletions(-)
 create mode 100644 llvm/projects/visc-rt/Makefile

diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index 6093fe9297..2ab5f91c83 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -167,7 +167,7 @@ private:
   // Important things that make up a Dataflow Node
   IntrinsicInst* II;              ///< Associated IntrinsicInst/Value
   Function* FuncPointer;          ///< Associated Function
-  Function* GenFunc;              ///< Associated Function generated by backend
+  Function* GenFunc = NULL;       ///< Associated Function generated by backend
   DFInternalNode* Parent;         ///< Pointer to parent dataflow Node
   unsigned NumOfDim;              ///< Number of dimensions
   std::vector<Value*> DimLimits;  ///< Number of instances in each dimension
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index d08a3ae170..8d37637fdc 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -76,7 +76,6 @@ namespace {
     // extra index and dimension arguments. This map also serves to find out if
     // we already have an index and dim extended function copy or not (i.e.,
     // "Have we visited this function before?")
-    ValueMap<Function*, Function*> FMap; 
     DenseMap<DFNode*, Value*> OutputMap;
 
     // VISC Runtime API
@@ -288,7 +287,7 @@ namespace {
       assert(OutputMap.count(SrcDF)
              && "Source node call not found. Dependency violation!");
 
-      // Find CallInst associated with the Source DFNode using FMap
+      // Find CallInst associated with the Source DFNode using OutputMap
       Value* CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
@@ -323,7 +322,7 @@ namespace {
   void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& KernelName) {
     // Check if clone already exists. If it does, it means we have visited this
     // function before and nothing else needs to be done for this leaf node.
-    assert(N->getGenFunc() != NULL && "Code already generated for this node");
+    assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
     DEBUG(errs() << "Generating kernel call code\n");
 
@@ -355,7 +354,7 @@ namespace {
     ReturnInst* RI = ReturnInst::Create(M.getContext(),
                                         UndefValue::get(F_X86->getReturnType()), BB);
 
-    //Add old func: new func pair to the FMap
+    //Add the generated function info to DFNode
     N->setGenFunc(F_X86, DFNode::X86);
 
     // FIXME: Adding Index and Dim arguments are probably not required except
@@ -645,13 +644,11 @@ namespace {
     Function *F = N->getFuncPointer();
 
     // Look up if we have visited this function before. If we have, then just
-    // get the cloned function pointer from FMap. Otherwise, create the cloned
-    // function and add it to the FMap.
-    Function *F_nvptx;
-    if(FMap.count(F)) {
-      F_nvptx = FMap[F];
-    }
-    else {
+    // get the cloned function pointer from DFNode. Otherwise, create the cloned
+    // function and add it to the DFNode GenFunc.
+    Function *F_nvptx = N->getGenFunc();
+    if(F_nvptx == NULL) {
+      errs() << "GenFunc is NULL\n";
       // Clone the function
       ValueToValueMapTy VMap;
       F_nvptx = CloneFunction(F, VMap, true);
@@ -662,8 +659,10 @@ namespace {
       DEBUG(errs() << *F_nvptx->getType());
       DEBUG(errs() << *F_nvptx);
 
-      //Add old func: new func pair to the FMap
-      FMap[F] = F_nvptx;
+      //Add generated function info to DFNode
+      N->setGenFunc(F_nvptx, DFNode::PTX);
+    } {
+      errs() << "GenFunc is not NULL\n" << *F_nvptx;
     }
 
     transformFunctionToVoid(F_nvptx);
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 8b61d42af9..2659e9e444 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -56,11 +56,7 @@ private:
   Module &M;
   BuildDFG &DFG;
 
-  // Map from Old function associated with DFNode to new cloned function with
-  // extra index and dimension arguments. This map also serves to find out if
-  // we already have an index and dim extended function copy or not (i.e.,
-  // "Have we visited this function before?")
-  ValueMap<Function*, Function*> FMap;
+  // Map from Node to Instruction containing output of a node
   DenseMap<DFNode*, Value*> OutputMap;
 
   // VISC Runtime API
@@ -82,7 +78,6 @@ private:
       Instruction* InsertBefore);
   void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
       Instruction* InsertBefore);
-  void codeGenLaunch(DFInternalNode* Root);
   void codeGen(DFInternalNode* N);
   void codeGen(DFLeafNode* N);
 public:
@@ -91,6 +86,8 @@ public:
     initRuntimeAPI();
   }
 
+  void codeGenLaunch(DFInternalNode* Root);
+
   virtual void visit(DFInternalNode* N) {
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
@@ -112,6 +109,7 @@ public:
     DEBUG(errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n");
   }
 
+
 };
 
 bool DFG2LLVM_X86::runOnModule(Module &M) {
@@ -130,6 +128,14 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
 
   // Initiate code generation for root DFNode
   CGTVisitor->visit(Root);
+
+
+  // Replace the launch intrinsic for the root node with a pthread call.
+  // TODO: Later on, we might like to do this in a separate pass, which would
+  // allow us the flexibility to switch between complete static code generation
+  // for DFG or having a customized runtime+scheduler
+  CGTVisitor->codeGenLaunch(Root);
+
   delete CGTVisitor;
   return true;
 }
@@ -323,7 +329,7 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
 
   DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and 
-  Function* RootF_X86 = FMap[Root->getFuncPointer()];
+  Function* RootF_X86 = Root->getGenFunc();
   // Generate a call to RootF_X86 with null parameters for now 
   std::vector<Value*>Args;
   for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
@@ -416,7 +422,7 @@ Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* Paren
     assert(OutputMap.count(SrcDF)
            && "Source node call not found. Dependency violation!");
 
-    // Find CallInst associated with the Source DFNode using FMap
+    // Find CallInst associated with the Source DFNode using OutputMap
     Value* CI = OutputMap[SrcDF];
 
     // Extract element at source position from this call instruction
@@ -483,13 +489,12 @@ void CodeGenTraversal::invokeChild_X86(DFNode* C, Function* F_X86,
 }
 
 void CodeGenTraversal::codeGen(DFInternalNode* N) {
-  Function* F = N->getFuncPointer();
-
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-  if(FMap.count(F))
+  if(N->getGenFunc() != NULL)
     return;
 
+  Function* F = N->getFuncPointer();
   // Create of clone of F with no instructions. Only the type is the same as F
   // without the extra arguments.
   Function* F_X86;
@@ -515,10 +520,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
   ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
                                       UndefValue::get(F_X86->getReturnType()), BB);
 
-  //Add old func: new func pair to the FMap
-  // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap
-  // is required
-  FMap[F] = F_X86;
+  //Add generated function info to DFNode
   N->setGenFunc(F_X86, DFNode::X86);
 
   // Add Index and Dim arguments except for the root node
@@ -591,16 +593,6 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
   ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
   ReplaceInstWithInst(RI, newRI);
 
-  // If it is a root node, we can go ahead and replace the launch intrinsic with
-  // pthead call, otherwise return now.
-  // TODO: Later on, we might like to do this in a separate pass, which would
-  // allow us the flexibility to switch between complete static code generation
-  // for DFG or having a customized runtime+scheduler
-  if(!N->isRoot())
-    return;
-  
-  codeGenLaunch(N);
-
 }
 
 // Code generation for leaf nodes
@@ -611,6 +603,11 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
     return;
   }
 
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+  if(N->getGenFunc() != NULL)
+    return;
+
   std::vector<IntrinsicInst *> IItoRemove;
   std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
   BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
@@ -618,21 +615,14 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
   // Get the function associated woth the dataflow node
   Function *F = N->getFuncPointer();
 
-  // Check if clone already exists. If it does, it means we have visited this
-  // function before and nothing else needs to be done for this leaf node.
-  if(FMap.count(F))
-    return;
-
   // Clone the function, if we are seeing this function for the first time.
   Function *F_X86;
   ValueToValueMapTy VMap;
   F_X86 = CloneFunction(F, VMap, true);
   // Insert the cloned function into the module
   M.getFunctionList().push_back(F_X86);
-  // Add old func: new func pair to the FMap
-  // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap
-  // is required
-  FMap[F] = F_X86;
+
+  // Add generated function info to DFNode
   N->setGenFunc(F_X86, DFNode::X86);
 
   // Add the new argument to the argument list
diff --git a/llvm/projects/visc-rt/Makefile b/llvm/projects/visc-rt/Makefile
new file mode 100644
index 0000000000..68e3e5f79c
--- /dev/null
+++ b/llvm/projects/visc-rt/Makefile
@@ -0,0 +1,11 @@
+# Emits $(TARGET).ll from $(TARGET).cpp with clang; LLVM_INSTALL is overridable.
+LLVM_INSTALL?=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
+TARGET:=visc-rt
+LLVM_CC:=$(LLVM_INSTALL)/bin/clang
+
+.PHONY: all clean
+all: $(TARGET:%=%.ll)
+$(TARGET:%=%.ll):%.ll:%.cpp
+	$(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@
+clean :
+	rm -f $(TARGET).ll
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 0a5eab8a87..6c032289e7 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -95,7 +95,7 @@ void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t
       CL_MEM_COPY_HOST_PTR, size, input, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
   errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
-  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+  checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument");
   return d_input;
 }
 
@@ -107,7 +107,7 @@ void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
       size, NULL, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
   errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
-  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+  checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument");
   return d_output;
 }
 
diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll
index 9b19a8c9a1..2f4a6f8277 100644
--- a/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll
+++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll
@@ -131,8 +131,8 @@ declare i32 @printf(i8* nocapture, ...) #1
 
 ; --------------- VISC Intrinsics ---------------
 ; Return Type of VISC Compute Matrix Mul
-%rtype = type {float*, i32}
-%struct.arg = type <{ float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }>
+%rtype = type {float*, i64}
+%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }>
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.launch(i8*, i8*) #0
@@ -178,12 +178,12 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; ----------------- VISC intrinsics end ------------------
 
 ; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
+define %rtype @matrixMul(float* nocapture %A, i64 %bytes_A, float* nocapture %B, i64 %bytes_B, float* %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
 entry:
   ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
   
   ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
+  ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x
   ; Replaced statement -- 
   ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
   ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
@@ -232,13 +232,13 @@ for.end:                                          ; preds = %for.body, %entry
   ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
   ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
   %.fca.0.insert = insertvalue %rtype undef, float* %C, 0
-  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1
+  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i64 %bytes_C, 1
   ret %rtype %.fca.1.insert
 }
 
 ; ----------------- VISC SGEMM root node ----------------
-define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
+define %rtype @MatrixMulRoot(float* %h_A, i64 %bytes_A, float* %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
+  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
   ; Bind Inputs
   call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
   call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
@@ -256,7 +256,7 @@ define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_
 }
 
 ; Function Attrs: noinline nounwind uwtable
-;define %rtype @computeMatrixMul(float* nocapture %h_A, i32 %bytes_A, float* nocapture %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
+;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
 ;entry:
 ;  %cmp18 = icmp eq i32 %m, 0
 ;  %cmp215 = icmp eq i32 %n, 0
@@ -272,7 +272,7 @@ define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_
 ;
 ;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
 ;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
-;  %call.us = tail call %rtype @matrixMul(float* %h_A, i32 undef, float* %h_B, i32 undef, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
+;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
 ;  %inc.us = add i32 %j.016.us, 1
 ;  %exitcond = icmp eq i32 %inc.us, %n
 ;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
@@ -347,19 +347,18 @@ randomInit.exit41:                                ; preds = %for.body.i40
   %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
 
   store float* %0, float** %in.addr.h_A
-  store i32 4194304, i32* %in.addr.bytes_A
+  store i64 4194304, i64* %in.addr.bytes_A
   store float* %1, float** %in.addr.h_B
-  store i32 4194304, i32* %in.addr.bytes_B
+  store i64 4194304, i64* %in.addr.bytes_B
   store float* %2, float** %in.addr.h_C
-  store i32 4194304, i32* %in.addr.bytes_C
+  store i64 4194304, i64* %in.addr.bytes_C
   store i32 1024, i32* %in.addr.WA
   store i32 1024, i32* %in.addr.WB
   store i32 1024, i32* %in.addr.HA
 
   ; Change type to i8* and VISC Launch call
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-  ;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304)
+  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
 
   ; Wait for result
   call void @llvm.visc.wait(i8* %graphID)
@@ -367,8 +366,8 @@ randomInit.exit41:                                ; preds = %for.body.i40
   ; Get the result
   %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
   %out = load %rtype* %out.addr
-  ; -------------------------------- Completed VISC Launch Call --------------------------------  
-  
+  ; -------------------------------- Completed VISC Launch Call --------------------------------
+
   %3 = extractvalue %rtype %out, 0
   %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %3)
   %tobool = icmp eq i32 %call14, 0
-- 
GitLab