From b9461eeadc955b5f3b2461199eb572cd8dd38e7a Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava <psrivas2@illinois.edu> Date: Thu, 6 Nov 2014 04:36:20 +0000 Subject: [PATCH] (1) Modified test case to use i64 for pointer data size and not i32. Using i32 is incompatible with visc-rt (2) Set GenFunc pointer to NULL. Completely removed FMap from both backend passes. Now generated func info is contained in DFNode itself. (3) Added Makefile to visc-rt for easy compilation to generate visc-rt.ll --- llvm/include/llvm/IR/DFGraph.h | 2 +- .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp | 25 ++++---- .../Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp | 58 ++++++++----------- llvm/projects/visc-rt/Makefile | 11 ++++ llvm/projects/visc-rt/visc-rt.cpp | 4 +- .../VISC/MatrixMultiplication/visc_gemm.ll | 31 +++++----- 6 files changed, 65 insertions(+), 66 deletions(-) create mode 100644 llvm/projects/visc-rt/Makefile diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h index 6093fe9297..2ab5f91c83 100644 --- a/llvm/include/llvm/IR/DFGraph.h +++ b/llvm/include/llvm/IR/DFGraph.h @@ -167,7 +167,7 @@ private: // Important things that make up a Dataflow Node IntrinsicInst* II; ///< Associated IntrinsicInst/Value Function* FuncPointer; ///< Associated Function - Function* GenFunc; ///< Associated Function generated by backend + Function* GenFunc = NULL; ///< Associated Function generated by backend DFInternalNode* Parent; ///< Pointer to parent dataflow Node unsigned NumOfDim; ///< Number of dimensions std::vector<Value*> DimLimits; ///< Number of instances in each dimension diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index d08a3ae170..8d37637fdc 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -76,7 +76,6 @@ namespace { // extra index and dimension arguments. 
This map also serves to find out if // we already have an index and dim extended function copy or not (i.e., // "Have we visited this function before?") - ValueMap<Function*, Function*> FMap; DenseMap<DFNode*, Value*> OutputMap; // VISC Runtime API @@ -288,7 +287,7 @@ namespace { assert(OutputMap.count(SrcDF) && "Source node call not found. Dependency violation!"); - // Find CallInst associated with the Source DFNode using FMap + // Find CallInst associated with the Source DFNode using OutputMap Value* CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction @@ -323,7 +322,7 @@ namespace { void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& KernelName) { // Check if clone already exists. If it does, it means we have visited this // function before and nothing else needs to be done for this leaf node. - assert(N->getGenFunc() != NULL && "Code already generated for this node"); + assert(N->getGenFunc() == NULL && "Code already generated for this node"); DEBUG(errs() << "Generating kernel call code\n"); @@ -355,7 +354,7 @@ namespace { ReturnInst* RI = ReturnInst::Create(M.getContext(), UndefValue::get(F_X86->getReturnType()), BB); - //Add old func: new func pair to the FMap + //Add the generated function info to DFNode N->setGenFunc(F_X86, DFNode::X86); // FIXME: Adding Index and Dim arguments are probably not required except @@ -645,13 +644,11 @@ namespace { Function *F = N->getFuncPointer(); // Look up if we have visited this function before. If we have, then just - // get the cloned function pointer from FMap. Otherwise, create the cloned - // function and add it to the FMap. - Function *F_nvptx; - if(FMap.count(F)) { - F_nvptx = FMap[F]; - } - else { + // get the cloned function pointer from DFNode. Otherwise, create the cloned + // function and add it to the DFNode GenFunc. 
+ Function *F_nvptx = N->getGenFunc(); + if(F_nvptx == NULL) { + DEBUG(errs() << "GenFunc is NULL\n"); // Clone the function ValueToValueMapTy VMap; F_nvptx = CloneFunction(F, VMap, true); @@ -662,8 +659,10 @@ namespace { DEBUG(errs() << *F_nvptx->getType()); DEBUG(errs() << *F_nvptx); - //Add old func: new func pair to the FMap - FMap[F] = F_nvptx; + //Add generated function info to DFNode + N->setGenFunc(F_nvptx, DFNode::PTX); + } else { + DEBUG(errs() << "GenFunc is not NULL\n" << *F_nvptx); } transformFunctionToVoid(F_nvptx); diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 8b61d42af9..2659e9e444 100644 --- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -56,11 +56,7 @@ private: Module &M; BuildDFG &DFG; - // Map from Old function associated with DFNode to new cloned function with - // extra index and dimension arguments. This map also serves to find out if - // we already have an index and dim extended function copy or not (i.e., - // "Have we visited this function before?") - ValueMap<Function*, Function*> FMap; + // Map from Node to Instruction containing output of a node DenseMap<DFNode*, Value*> OutputMap; // VISC Runtime API @@ -82,7 +78,6 @@ private: Instruction* InsertBefore); void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, Instruction* InsertBefore); - void codeGenLaunch(DFInternalNode* Root); void codeGen(DFInternalNode* N); void codeGen(DFLeafNode* N); public: @@ -91,6 +86,8 @@ public: initRuntimeAPI(); } + void codeGenLaunch(DFInternalNode* Root); + virtual void visit(DFInternalNode* N) { // Follows a bottom-up approach for code generation. 
// First generate code for all the child nodes @@ -112,6 +109,7 @@ public: DEBUG(errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"); } + }; bool DFG2LLVM_X86::runOnModule(Module &M) { @@ -130,6 +128,14 @@ bool DFG2LLVM_X86::runOnModule(Module &M) { // Initiate code generation for root DFNode CGTVisitor->visit(Root); + + + // Go ahead and replace the launch intrinsic with pthread call, otherwise return now. + // TODO: Later on, we might like to do this in a separate pass, which would + // allow us the flexibility to switch between complete static code generation + // for DFG or having a customized runtime+scheduler + CGTVisitor->codeGenLaunch(Root); + delete CGTVisitor; return true; } @@ -323,7 +329,7 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { DEBUG(errs() << "Created Empty Launch Function\n"); // Find the X86 function generated for Root and - Function* RootF_X86 = FMap[Root->getFuncPointer()]; + Function* RootF_X86 = Root->getGenFunc(); // Generate a call to RootF_X86 with null parameters for now std::vector<Value*>Args; for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) { @@ -416,7 +422,7 @@ Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* Paren assert(OutputMap.count(SrcDF) && "Source node call not found. Dependency violation!"); - // Find CallInst associated with the Source DFNode using FMap + // Find CallInst associated with the Source DFNode using OutputMap Value* CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction @@ -483,13 +489,12 @@ void CodeGenTraversal::invokeChild_X86(DFNode* C, Function* F_X86, } void CodeGenTraversal::codeGen(DFInternalNode* N) { - Function* F = N->getFuncPointer(); - // Check if clone already exists. If it does, it means we have visited this // function before and nothing else needs to be done for this leaf node. 
- if(FMap.count(F)) + if(N->getGenFunc() != NULL) return; + Function* F = N->getFuncPointer(); // Create of clone of F with no instructions. Only the type is the same as F // without the extra arguments. Function* F_X86; @@ -515,10 +520,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), UndefValue::get(F_X86->getReturnType()), BB); - //Add old func: new func pair to the FMap - // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap - // is required - FMap[F] = F_X86; + //Add generated function info to DFNode N->setGenFunc(F_X86, DFNode::X86); // Add Index and Dim arguments except for the root node @@ -591,16 +593,6 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); ReplaceInstWithInst(RI, newRI); - // If it is a root node, we can go ahead and replace the launch intrinsic with - // pthead call, otherwise return now. - // TODO: Later on, we might like to do this in a separate pass, which would - // allow us the flexibility to switch between complete static code generation - // for DFG or having a customized runtime+scheduler - if(!N->isRoot()) - return; - - codeGenLaunch(N); - } // Code generation for leaf nodes @@ -611,6 +603,11 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { return; } + // Check if clone already exists. If it does, it means we have visited this + // function before and nothing else needs to be done for this leaf node. + if(N->getGenFunc() != NULL) + return; + std::vector<IntrinsicInst *> IItoRemove; std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace; BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; @@ -618,21 +615,14 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { // Get the function associated woth the dataflow node Function *F = N->getFuncPointer(); - // Check if clone already exists. 
If it does, it means we have visited this - // function before and nothing else needs to be done for this leaf node. - if(FMap.count(F)) - return; - // Clone the function, if we are seeing this function for the first time. Function *F_X86; ValueToValueMapTy VMap; F_X86 = CloneFunction(F, VMap, true); // Insert the cloned function into the module M.getFunctionList().push_back(F_X86); - // Add old func: new func pair to the FMap - // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap - // is required - FMap[F] = F_X86; + + // Add generated function info to DFNode N->setGenFunc(F_X86, DFNode::X86); // Add the new argument to the argument list diff --git a/llvm/projects/visc-rt/Makefile b/llvm/projects/visc-rt/Makefile new file mode 100644 index 0000000000..68e3e5f79c --- /dev/null +++ b/llvm/projects/visc-rt/Makefile @@ -0,0 +1,11 @@ +LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install +TARGET:=visc-rt +LLVM_CC:=$(LLVM_INSTALL)/bin/clang + +all: $(TARGET:%=%.ll) + +$(TARGET:%=%.ll):%.ll:%.cpp + $(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@ + +clean : + rm -f $(TARGET).ll diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index 0a5eab8a87..6c032289e7 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -95,7 +95,7 @@ void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t CL_MEM_COPY_HOST_PTR, size, input, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); - checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); + checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument"); return d_input; } @@ -107,7 +107,7 @@ void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) { size, NULL, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on 
device"); errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); - checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); + checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument"); return d_output; } diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll index 9b19a8c9a1..2f4a6f8277 100644 --- a/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll +++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll @@ -131,8 +131,8 @@ declare i32 @printf(i8* nocapture, ...) #1 ; --------------- VISC Intrinsics --------------- ; Return Type of VISC Compute Matrix Mul -%rtype = type {float*, i32} -%struct.arg = type <{ float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }> +%rtype = type {float*, i64} +%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }> ; Function Attrs: nounwind declare i8* @llvm.visc.launch(i8*, i8*) #0 @@ -178,12 +178,12 @@ declare void @llvm.visc.bind.output(i8*, i32, i32) ; ----------------- VISC intrinsics end ------------------ ; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { +define %rtype @matrixMul(float* nocapture %A, i64 %bytes_A, float* nocapture %B, i64 %bytes_B, float* %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { entry: ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x + ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x ; Replaced statement -- ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 
(i32, ...)*)(i32 1) #5 @@ -232,13 +232,13 @@ for.end: ; preds = %for.body, %entry ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) %.fca.0.insert = insertvalue %rtype undef, float* %C, 0 - %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1 + %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i64 %bytes_C, 1 ret %rtype %.fca.1.insert } ; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA) +define %rtype @MatrixMulRoot(float* %h_A, i64 %bytes_A, float* %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { + %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA) ; Bind Inputs call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A @@ -256,7 +256,7 @@ define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_ } ; Function Attrs: noinline nounwind uwtable -;define %rtype @computeMatrixMul(float* nocapture %h_A, i32 %bytes_A, float* nocapture %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { +;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { ;entry: ; %cmp18 = icmp eq i32 %m, 0 ; %cmp215 = icmp eq i32 %n, 0 @@ -272,7 +272,7 @@ define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_ ; ;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us 
; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] -; %call.us = tail call %rtype @matrixMul(float* %h_A, i32 undef, float* %h_B, i32 undef, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) +; %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) ; %inc.us = add i32 %j.016.us, 1 ; %exitcond = icmp eq i32 %inc.us, %n ; br i1 %exitcond, label %for.inc4.us, label %for.body3.us @@ -347,19 +347,18 @@ randomInit.exit41: ; preds = %for.body.i40 %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 store float* %0, float** %in.addr.h_A - store i32 4194304, i32* %in.addr.bytes_A + store i64 4194304, i64* %in.addr.bytes_A store float* %1, float** %in.addr.h_B - store i32 4194304, i32* %in.addr.bytes_B + store i64 4194304, i64* %in.addr.bytes_B store float* %2, float** %in.addr.h_C - store i32 4194304, i32* %in.addr.bytes_C + store i64 4194304, i64* %in.addr.bytes_C store i32 1024, i32* %in.addr.WA store i32 1024, i32* %in.addr.WB store i32 1024, i32* %in.addr.HA ; Change type to i8* and VISC Launch call %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - ;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) ; Wait for result call void @llvm.visc.wait(i8* %graphID) @@ -367,8 +366,8 @@ randomInit.exit41: ; preds = %for.body.i40 ; Get the result %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 %out = load %rtype* %out.addr - ; -------------------------------- Completed VISC Launch Call -------------------------------- - + ; 
-------------------------------- Completed VISC Launch Call -------------------------------- + %3 = extractvalue %rtype %out, 0 %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %3) %tobool = icmp eq i32 %call14, 0 -- GitLab