diff --git a/README.md b/README.md index d285c6e1f6852b9f49d02ae91bd193dde07e79fc..5d493035026121da19c8f95ddbfda9da8a77f6a0 100644 --- a/README.md +++ b/README.md @@ -39,13 +39,6 @@ Some common options that can be used with CMake are: * -DLLVM_ENABLE_ASSERTIONS=On --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types). -## Building hpvm runtime -HPVM also includes a runtime library which comprises of low-level, target-specific wrappers required by HPVM's code generation. -```shell -cd projects/visc-rt -make -cd .. -``` To use hpvm to compile benchmarks set environment variable `LLVM_SRC_ROOT` to llvm directory in your local repository ```shell export LLVM_SRC_ROOT=<full path to hpvm>/llvm diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index d830574c54da3600a452768cd3731a7bc9bc86f5..6498b46cd9a56ad69df35d4497b463b9dda98c87 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -8,55 +8,34 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "DFG2LLVM_X86" -#include "SupportVISC/DFG2LLVM.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Linker/Linker.h" #include "llvm/Pass.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Constant.h" +#include "SupportVISC/DFG2LLVM.h" using namespace llvm; using namespace builddfg; using namespace dfg2llvm; // VISC Command line option to use timer or not -static cl::opt<bool> VISCTimer_X86("visc-timers-x86", - cl::desc("Enable visc timers")); -// Command line option to enable device abstraction or not static cl::opt<bool> - DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, - cl::desc("Enable visc device abstraction")); +VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers")); namespace { -// Helper Functions -static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { - if (!isa<CallInst>(I)) - return false; - CallInst *CI = cast<CallInst>(I); - return (CI->getCalledValue()->stripPointerCasts()->getName()) - .equals("llvm_visc_policy_getVersion"); -} - -CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { - for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) { - Instruction *I = &*ib; - if (isVISCCall_llvm_visc_policy_getVersion(I)) - return cast<CallInst>(I); - } - return NULL; -} // DFG2LLVM_X86 - The first implementation. struct DFG2LLVM_X86 : public DFG2LLVM { static char ID; // Pass identification, replacement for typeid - DFG2LLVM_X86() : DFG2LLVM(ID) {} + DFG2LLVM_X86() :DFG2LLVM(ID) {} private: // Member variables @@ -71,7 +50,7 @@ public: class CGT_X86 : public CodeGenTraversal { private: - // Member variables + //Member variables FunctionCallee malloc; // VISC Runtime API @@ -88,46 +67,41 @@ private: FunctionCallee llvm_visc_createEdgeBuffer; FunctionCallee llvm_visc_createLastInputBuffer; FunctionCallee llvm_visc_createThread; - // Constant* llvm_visc_freeThreads; FunctionCallee llvm_visc_bufferPush; FunctionCallee llvm_visc_bufferPop; FunctionCallee llvm_visc_x86_dstack_push; FunctionCallee llvm_visc_x86_dstack_pop; FunctionCallee llvm_visc_x86_getDimLimit; FunctionCallee llvm_visc_x86_getDimInstance; - - // Functions - std::vector<IntrinsicInst *> *getUseList(Value *LI); - Value *addLoop(Instruction *I, Value *limit, const Twine &indexName = ""); - void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *); + + //Functions + std::vector<IntrinsicInst*>* getUseList(Value* LI); + Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = ""); + void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*); Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *); - Argument *getArgumentFromEnd(Function *F, unsigned offset); - Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86, - Instruction *InsertBefore); - void invokeChild_X86(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap, - Instruction *InsertBefore); - void invokeChild_PTX(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap, - Instruction *InsertBefore); - StructType *getArgumentListStructTy(DFNode *); - Function *createFunctionFilter(DFNode *C); - void startNodeThread(DFNode *, std::vector<Value *>, - DenseMap<DFEdge *, Value *>, Value *, Value *, - Instruction *); - Function *createLaunchFunction(DFInternalNode *); - Function *createPushFunction(DFInternalNode *); - Function *createPopFunction(DFInternalNode *); - Function *createWaitFunction(DFInternalNode *); - + Argument* getArgumentFromEnd(Function* F, unsigned offset); + Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, + Instruction* InsertBefore); + void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); + void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); + StructType* getArgumentListStructTy(DFNode*); + Function* createFunctionFilter(DFNode* C); + void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>, + Value*, Value*, Instruction*); + Function* createLaunchFunction(DFInternalNode*); + // Virtual Functions void init() { VISCTimer = VISCTimer_X86; TargetName = "X86"; } void initRuntimeAPI(); - void codeGen(DFInternalNode *N); - void codeGen(DFLeafNode *N); - Function *codeGenStreamPush(DFInternalNode *N); - Function *codeGenStreamPop(DFInternalNode *N); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + Function* codeGenStreamPush(DFInternalNode* N); + Function* codeGenStreamPop(DFInternalNode* N); public: // Constructor @@ -136,20 +110,20 @@ public: initRuntimeAPI(); } - void codeGenLaunch(DFInternalNode *Root); - void codeGenLaunchStreaming(DFInternalNode *Root); + void codeGenLaunch(DFInternalNode* Root); + void codeGenLaunchStreaming(DFInternalNode* Root); }; bool DFG2LLVM_X86::runOnModule(Module &M) { - DEBUG(errs() << "\nDFG2LLVM_X86 PASS\n"); + errs() << "\nDFG2LLVM_X86 PASS\n"; // Get the BuildDFG Analysis Results: // - Dataflow graph // - Maps from i8* hansles to DFNode and DFEdge BuildDFG &DFG = getAnalysis<BuildDFG>(); - // DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode *> Roots = DFG.getRoots(); + //DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); @@ -157,17 +131,16 @@ bool DFG2LLVM_X86::runOnModule(Module &M) { CGT_X86 *CGTVisitor = new CGT_X86(M, DFG); // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode : Roots) { + for (auto &rootNode: Roots) { // Initiate code generation for root DFNode CGTVisitor->visit(rootNode); - // Go ahead and replace the launch intrinsic with pthread call, otherwise - // return now. + // Go ahead and replace the launch intrinsic with pthread call, otherwise return now. // TODO: Later on, we might like to do this in a separate pass, which would - // allow us the flexibility to switch between complete static code - // generation for DFG or having a customized runtime+scheduler - + // allow us the flexibility to switch between complete static code generation + // for DFG or having a customized runtime+scheduler + // Do streaming code generation if root node is streaming. Usual otherwise - if (rootNode->isChildGraphStreaming()) + if(rootNode->isChildGraphStreaming()) CGTVisitor->codeGenLaunchStreaming(rootNode); else CGTVisitor->codeGenLaunch(rootNode); @@ -183,7 +156,7 @@ void CGT_X86::initRuntimeAPI() { // Load Runtime API Module SMDiagnostic Err; - char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; @@ -212,7 +185,6 @@ void CGT_X86::initRuntimeAPI() { DECLARE(llvm_visc_createEdgeBuffer); DECLARE(llvm_visc_createLastInputBuffer); DECLARE(llvm_visc_createThread); - // DECLARE(llvm_visc_freeThreads); DECLARE(llvm_visc_bufferPush); DECLARE(llvm_visc_bufferPop); DECLARE(llvm_visc_x86_dstack_push); @@ -224,66 +196,28 @@ void CGT_X86::initRuntimeAPI() { initTimerAPI(); // Insert init context in main - Function *VI = M.getFunction("llvm.visc.init"); + Function* VI = M.getFunction("llvm.visc.init"); assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); DEBUG(errs() << "Inserting x86 timer initialization\n"); - Instruction *I = cast<Instruction>(*VI->user_begin()); + Instruction* I = cast<Instruction>(*VI->user_begin()); initializeTimerSet(I); switchToTimer(visc_TimerID_NONE, I); - // Insert code for initializing the sceduling policy - FunctionCallee IP = M.getOrInsertFunction( - "llvm_visc_policy_init", - runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()); - CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); - DEBUG(errs() << *IPCallInst << "\n"); - - // If device abstraction is enabled, we add a runtime call to start the - // device status simulation - if (DeviceAbstraction) { - FunctionCallee ID = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_start", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_start") - ->getFunctionType()); - CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); - DEBUG(errs() << *IDCallInst << "\n"); - } - // Insert print instruction at visc exit - Function *VC = M.getFunction("llvm.visc.cleanup"); + Function* VC = M.getFunction("llvm.visc.cleanup"); assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); - // Insert code for clearing the sceduling policy - I = cast<Instruction>(*VC->user_begin()); - IP = M.getOrInsertFunction( - "llvm_visc_policy_clear", - runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()); - IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); - DEBUG(errs() << *IPCallInst << "\n"); - DEBUG(errs() << "Inserting x86 timer print\n"); printTimerSet(I); - - // If device abstraction is enabled, we add a runtime call to end the - // device status simulation - if (DeviceAbstraction) { - FunctionCallee ID = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_end", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_end") - ->getFunctionType()); - CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); - DEBUG(errs() << *IDCallInst << "\n"); - } } /* Returns vector of all wait instructions */ -std::vector<IntrinsicInst *> *CGT_X86::getUseList(Value *GraphID) { - std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>(); +std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) { + std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>(); // It must have been loaded from memory somewhere - for (Value::user_iterator ui = GraphID->user_begin(), - ue = GraphID->user_end(); - ui != ue; ++ui) { - if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) { + for(Value::user_iterator ui = GraphID->user_begin(), + ue = GraphID->user_end(); ui!=ue; ++ui) { + if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) { UseList->push_back(waitI); } else { llvm_unreachable("Error: Operation on Graph ID not supported!\n"); @@ -295,14 +229,14 @@ std::vector<IntrinsicInst *> *CGT_X86::getUseList(Value *GraphID) { /* Traverse the function argument list in reverse order to get argument at a * distance offset fromt he end of argument list of function F */ -Argument *CGT_X86::getArgumentFromEnd(Function *F, unsigned offset) { - assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) && - "Invalid offset to access arguments!"); +Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) { + assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) + && "Invalid offset to access arguments!"); Function::arg_iterator e = F->arg_end(); // Last element of argument iterator is dummy. Skip it. e--; - Argument *arg; - for (; offset != 0; e--) { + Argument* arg; + for( ; offset != 0; e--) { offset--; arg = &*e; } @@ -320,24 +254,25 @@ Argument *CGT_X86::getArgumentFromEnd(Function *F, unsigned offset) { * which loops over bidy if true and goes to end if false * (5) Update phi node of body */ -void CGT_X86::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart, - Instruction *BodyEnd, Value *TerminationCond) { - BasicBlock *Entry = CondBlockStart->getParent(); - BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); - BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); - BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); +void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart, + Instruction* BodyEnd, Value* TerminationCond) { + BasicBlock* Entry = CondBlockStart->getParent(); + BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); + BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); + BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); // Replace the terminator instruction of conditional with new conditional // branch which goes to while.body if true and branches to while.end otherwise - BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); + BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); ReplaceInstWithInst(CondBlock->getTerminator(), BI); // While Body should jump to condition block - BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock); + BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock); ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch); + } -Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, +Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, BasicBlock *Body) { Module *M = Entry->getParent()->getParent(); Type *Int64Ty = Type::getInt64Ty(M->getContext()); @@ -347,10 +282,10 @@ Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB); ConstantInt *IConst = - ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); + ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); Instruction *CounterIncr = - BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, - "cnt_incr", Body->getTerminator()); + BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, + "cnt_incr", Body->getTerminator()); // Set incoming values for Phi node IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true); @@ -372,40 +307,39 @@ Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, * which loops over bidy if true and goes to end if false * (5) Update phi node of body */ -Value *CGT_X86::addLoop(Instruction *I, Value *limit, const Twine &indexName) { - BasicBlock *Entry = I->getParent(); - BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body"); +Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) { + BasicBlock* Entry = I->getParent(); + BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body"); BasicBlock::iterator i(I); ++i; - Instruction *NextI = &*i; + Instruction* NextI = &*i; // Next Instruction should also belong to the same basic block as the basic // block will have a terminator instruction - assert(NextI->getParent() == ForBody && - "Next Instruction should also belong to the same basic block!"); - BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end"); + assert(NextI->getParent() == ForBody + && "Next Instruction should also belong to the same basic block!"); + BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end"); + // Add Phi Node for index variable - PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2, - "index." + indexName, I); + PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), + 2, "index."+indexName, I); // Add incoming edge to phi IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0), Entry); // Increment index variable - BinaryOperator *IndexInc = BinaryOperator::Create( - Instruction::Add, IndexPhi, - ConstantInt::get(Type::getInt64Ty(I->getContext()), 1), - "index." + indexName + ".inc", ForBody->getTerminator()); + BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add, + IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1), + "index."+indexName+".inc", ForBody->getTerminator()); // Compare index variable with limit - CmpInst *Cond = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit, - "cond." + indexName, ForBody->getTerminator()); + CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, + limit, "cond."+indexName, ForBody->getTerminator()); // Replace the terminator instruction of for.body with new conditional // branch which loops over body if true and branches to for.end otherwise - BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond); + BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond); ReplaceInstWithInst(ForBody->getTerminator(), BI); // Add incoming edge to phi node in body @@ -417,274 +351,260 @@ Value *CGT_X86::addLoop(Instruction *I, Value *limit, const Twine &indexName) { // types, output types and isLastInput buffer type. All the streaming // inputs/outputs are converted to i8*, since this is the type of buffer // handles. -StructType *CGT_X86::getArgumentListStructTy(DFNode *C) { - std::vector<Type *> TyList; +StructType* CGT_X86::getArgumentListStructTy(DFNode* C) { + std::vector<Type*> TyList; // Input types - Function *CF = C->getFuncPointer(); - for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end(); - ai != ae; ++ai) { - if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge()) + Function* CF = C->getFuncPointer(); + for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end(); + ai != ae; ++ai) { + if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge()) TyList.push_back(Type::getInt8PtrTy(CF->getContext())); - else + else TyList.push_back(ai->getType()); } // Output Types - StructType *OutStructTy = cast<StructType>(CF->getReturnType()); + StructType* OutStructTy = cast<StructType>(CF->getReturnType()); for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) { // All outputs of a node are streaming edge - assert(C->getOutDFEdgeAt(i)->isStreamingEdge() && - "All output edges of child node have to be streaming"); + assert(C->getOutDFEdgeAt(i)->isStreamingEdge() + && "All output edges of child node have to be streaming"); TyList.push_back(Type::getInt8PtrTy(CF->getContext())); } // isLastInput buffer element TyList.push_back(Type::getInt8PtrTy(CF->getContext())); - StructType *STy = - StructType::create(CF->getContext(), TyList, - Twine("struct.thread." + CF->getName()).str(), true); + StructType* STy = StructType::create(CF->getContext(), TyList, + Twine("struct.thread."+CF->getName()).str(), true); return STy; + } -void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args, - DenseMap<DFEdge *, Value *> EdgeBufferMap, - Value *isLastInputBuffer, Value *graphID, - Instruction *IB) { - DEBUG(errs() << "Starting Pipeline for child node: " - << C->getFuncPointer()->getName() << "\n"); +void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*> + EdgeBufferMap, Value* isLastInputBuffer, Value* graphID, + Instruction* IB) { + DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n"); // Create a filter/pipeline function for the child node - Function *C_Pipeline = createFunctionFilter(C); - Function *CF = C->getFuncPointer(); + Function* C_Pipeline = createFunctionFilter(C); + Function* CF = C->getFuncPointer(); // Get module context and i32 0 constant, as they would be frequently used in // this function. - LLVMContext &Ctx = IB->getParent()->getContext(); - Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + LLVMContext& Ctx = IB->getParent()->getContext(); + Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); // Marshall arguments // Create a packed struct type with inputs of C followed by outputs and then // another i8* to indicate isLastInput buffer. Streaming inputs are replaced // by i8* // - StructType *STy = getArgumentListStructTy(C); + StructType* STy = getArgumentListStructTy(C); // Allocate the struct on heap *NOT* stack and bitcast i8* to STy* - CallInst *CI = - CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)), - C->getFuncPointer()->getName() + ".inputs", IB); - CastInst *Struct = BitCastInst::CreatePointerCast( - CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB); - // AllocaInst* AI = new AllocaInst(STy, - // C->getFuncPointer()->getName()+".inputs", IB); + CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)), + C->getFuncPointer()->getName()+".inputs", IB); + CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB); + //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB); // Insert elements in the struct - DEBUG(errs() << "Marshall inputs for child node: " - << C->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n"); // Marshall Inputs - for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) { + for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) { // Create constant int (i) - Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); + Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); // Get Element pointer instruction - Value *GEPIndices[] = {IntZero, Int_i}; - GetElementPtrInst *GEP = GetElementPtrInst::Create( - nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), - Struct->getName() + ".arg_" + Twine(i), IB); - DFEdge *E = C->getInDFEdgeAt(i); + Value* GEPIndices[] = { IntZero, Int_i }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, + ArrayRef<Value*>(GEPIndices, 2), + Struct->getName()+".arg_"+Twine(i), + IB); + DFEdge* E = C->getInDFEdgeAt(i); if (E->getSourceDF()->isEntryNode()) { // This is a Bind Input Edge - if (E->isStreamingEdge()) { + if(E->isStreamingEdge()) { // Streaming Bind Input edge. Get buffer corresponding to it - assert(EdgeBufferMap.count(E) && - "No mapping buffer for a Streaming Bind DFEdge!"); + assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!"); new StoreInst(EdgeBufferMap[E], GEP, IB); - } else { + } + else { // Non-streaming Bind edge new StoreInst(Args[i], GEP, IB); } - } else { - // This is an edge between siblings. + } + else { + // This is an edge between siblings. // This must be an streaming edge. As it is our assumption that all edges // between two nodes in a DFG are streaming. - assert(EdgeBufferMap.count(E) && - "No mapping buffer for a Streaming DFEdge!"); + assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!"); new StoreInst(EdgeBufferMap[E], GEP, IB); } } unsigned numInputs = CF->getFunctionType()->getNumParams(); unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements(); // Marshall Outputs - DEBUG(errs() << "Marshall outputs for child node: " - << C->getFuncPointer()->getName() << "\n"); - for (unsigned i = 0; i < numOutputs; i++) { + DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n"); + for(unsigned i = 0; i < numOutputs; i++ ) { // Create constant int (i+numInputs) - Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs); + Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs); // Get Element pointer instruction - Value *GEPIndices[] = {IntZero, Int_i}; - GetElementPtrInst *GEP = GetElementPtrInst::Create( - nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), - Struct->getName() + ".out_" + Twine(i), IB); - DFEdge *E = C->getOutDFEdgeAt(i); - assert(E->isStreamingEdge() && - "Output Edge must be streaming of all nodes"); - assert(EdgeBufferMap.count(E) && - "No mapping buffer for a Out Streaming DFEdge!"); + Value* GEPIndices[] = { IntZero, Int_i }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, + ArrayRef<Value*>(GEPIndices, 2), + Struct->getName()+".out_"+Twine(i), + IB); + DFEdge* E = C->getOutDFEdgeAt(i); + assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes"); + assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!"); new StoreInst(EdgeBufferMap[E], GEP, IB); } // Marshall last argument. isLastInput buffer - DEBUG(errs() << "Marshall isLastInput for child node: " - << C->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n"); // Create constant int (i+numInputs) - Constant *Int_index = - ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs); + Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs); // Get Element pointer instruction - Value *GEPIndices[] = {IntZero, Int_index}; - GetElementPtrInst *GEP = GetElementPtrInst::Create( - nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), - Struct->getName() + ".isLastInput", IB); + Value* GEPIndices[] = { IntZero, Int_index }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, + ArrayRef<Value*>(GEPIndices, 2), + Struct->getName()+".isLastInput", IB); new StoreInst(isLastInputBuffer, GEP, IB); // AllocaInst AI points to memory with all the arguments packed // Call runtime to create the thread with these arguments - DEBUG(errs() << "Start Thread for child node: " - << C->getFuncPointer()->getName() << "\n"); - // DEBUG(errs() << *llvm_visc_createThread << "\n"); + DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n"); +// DEBUG(errs() << *llvm_visc_createThread << "\n"); DEBUG(errs() << *graphID->getType() << "\n"); DEBUG(errs() << *C_Pipeline->getType() << "\n"); DEBUG(errs() << *Struct->getType() << "\n"); // Bitcast AI to i8* - CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), - Struct->getName(), IB); - Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI}; - CallInst *CreateThread = CallInst::Create( - llvm_visc_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); + CastInst* BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB); + Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI}; + CallInst::Create(llvm_visc_createThread, ArrayRef<Value*>(CreateThreadArgs, 3), "", IB); + } -Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { +Function* CGT_X86::createLaunchFunction(DFInternalNode* N) { DEBUG(errs() << "Generating Streaming Launch Function\n"); // Get Function associated with Node N - Function *NF = N->getFuncPointer(); + Function* NF = N->getFuncPointer(); - // Map from Streaming edge to buffer - DenseMap<DFEdge *, Value *> EdgeBufferMap; + // Map from Streaming edge to buffer + DenseMap<DFEdge*, Value*> EdgeBufferMap; /* Now we have all the necessary global declarations necessary to generate the - * Launch function, pointer to which can be passed to pthread utils to execute - * DFG. The Launch function has just one input: i8* data.addr - * This is the address of the all the input data that needs to be passed to - * this function. In our case it contains the input arguments of the Root - * function in the correct order. - * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) - * (2) Extract each of inputs from data.addr - * (3) create Buffers for all the streaming edges - * - Put buffers in the context - * (4) Go over each child node - * - marshall its arguments together (use buffers in place of streaming - * arguments) - * - Start the threads - * (5) The return value from Root is stored in memory, pointer to which is - * passed to pthread_exit call. - */ + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. + * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) + * (2) Extract each of inputs from data.addr + * (3) create Buffers for all the streaming edges + * - Put buffers in the context + * (4) Go over each child node + * - marshall its arguments together (use buffers in place of streaming + * arguments) + * - Start the threads + * (5) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ // (1) Create Launch Function of type void (i8* args, i8* GraphID) - Type *i8Ty = Type::getInt8Ty(M.getContext()); - Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; - FunctionType *LaunchFuncTy = FunctionType::get( - Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false); - Function *LaunchFunc = Function::Create( - LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M); + Type* i8Ty = Type::getInt8Ty(M.getContext()); + Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; + FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()), + ArrayRef<Type*>(ArgTypes, 2), false); + Function* LaunchFunc = Function::Create(LaunchFuncTy, + NF->getLinkage(), + NF->getName()+".LaunchFunction", + &M); DEBUG(errs() << "Generating Code for Streaming Launch Function\n"); // Give a name to the argument which is used pass data to this thread - Argument *data = &*LaunchFunc->arg_begin(); + Argument* data = &*LaunchFunc->arg_begin(); // NOTE-HS: Check correctness with Maria - Argument *graphID = &*(LaunchFunc->arg_begin() + 1); + Argument* graphID = &*(LaunchFunc->arg_begin() + 1); data->setName("data.addr"); graphID->setName("graphID"); // Add a basic block to this empty function and a return null statement to it DEBUG(errs() << *LaunchFunc->getReturnType() << "\n"); - BasicBlock *BB = - BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); - ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB); + BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); + ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(), + BB); DEBUG(errs() << "Created Empty Launch Function\n"); // (2) Extract each of inputs from data.addr - std::vector<Type *> TyList; + std::vector<Type*> TyList; std::vector<std::string> names; - std::vector<Value *> Args; + std::vector<Value*> Args; for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end(); - ai != ae; ++ai) { - if (N->getChildGraph() - ->getEntry() - ->getOutDFEdgeAt(ai->getArgNo()) - ->isStreamingEdge()) { + ai != ae; ++ai) { + if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) { TyList.push_back(i8Ty->getPointerTo()); - names.push_back(Twine(ai->getName() + "_buffer").str()); + names.push_back(Twine(ai->getName()+"_buffer").str()); continue; } TyList.push_back(ai->getType()); names.push_back(ai->getName()); } Args = extractElements(data, TyList, names, RI); - DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc - << "\n"); + DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc << "\n"); // (3) Create buffers for all the streaming edges - for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), - de = N->getChildGraph()->dfedge_end(); - di != de; ++di) { - DFEdge *Edge = *di; + for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), + de = N->getChildGraph()->dfedge_end(); di != de; ++di) { + DFEdge* Edge = *di; DEBUG(errs() << *Edge->getType() << "\n"); - Value *size = ConstantExpr::getSizeOf(Edge->getType()); - Value *CallArgs[] = {graphID, size}; + Value* size = ConstantExpr::getSizeOf(Edge->getType()); + Value* CallArgs[] = {graphID, size}; if (Edge->isStreamingEdge()) { - CallInst *CI; + CallInst* CI; // Create a buffer call - if (Edge->getSourceDF()->isEntryNode()) { + if(Edge->getSourceDF()->isEntryNode()) { // Bind Input Edge - Constant *Int_ArgNo = ConstantInt::get( - Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition()); - Value *BindInCallArgs[] = {graphID, size, Int_ArgNo}; - CI = CallInst::Create( - llvm_visc_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3), - "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI); - } else if (Edge->getDestDF()->isExitNode()) { + Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()), + Edge->getSourcePosition()); + Value* BindInCallArgs[] = {graphID, size, Int_ArgNo}; + CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3), + "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(), + RI); + } + else if(Edge->getDestDF()->isExitNode()) { // Bind Output Edge - CI = CallInst::Create( - llvm_visc_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2), - "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI); - } else { + CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2), + "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(), + RI); + } + else { // Streaming Edge - CI = CallInst::Create( - llvm_visc_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2), - Edge->getSourceDF()->getFuncPointer()->getName() + "." + - Edge->getDestDF()->getFuncPointer()->getName(), - RI); + CI = CallInst::Create(llvm_visc_createEdgeBuffer, + ArrayRef<Value*>(CallArgs, 2), + Edge->getSourceDF()->getFuncPointer()->getName()+"." + +Edge->getDestDF()->getFuncPointer()->getName(), + RI); } EdgeBufferMap[Edge] = CI; } } // Create buffer for isLastInput for all the child nodes - DFGraph *G = N->getChildGraph(); - DenseMap<DFNode *, Value *> NodeLastInputMap; - for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; - ++ci) { - DFNode *child = *ci; - if (child->isDummyNode()) + DFGraph* G = N->getChildGraph(); + DenseMap<DFNode*, Value*> NodeLastInputMap; + for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) { + DFNode* child = *ci; + if(child->isDummyNode()) continue; - Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); - Value *CallArgs[] = {graphID, size}; - CallInst *CI = CallInst::Create( - llvm_visc_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2), - "BindIn.isLastInput." + child->getFuncPointer()->getName(), RI); + Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); + Value* CallArgs[] = {graphID, size}; + CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2), + "BindIn.isLastInput."+child->getFuncPointer()->getName(), + RI); NodeLastInputMap[child] = CI; } - DEBUG(errs() << "Start Each child node filter\n"); + DEBUG(errs() << "Start Each child node filter\n"); // (4) Marshall arguments for each child node and start the thread with its // pipeline funtion - for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); - ci != ce; ++ci) { - DFNode *C = *ci; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; // Skip dummy node call if (C->isDummyNode()) continue; - + // Marshall all the arguments for this node into an i8* // Pass to the runtime to create the thread // Start the thread for child node C @@ -697,23 +617,7 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { return LaunchFunc; } -Function *CGT_X86::createPushFunction(DFInternalNode *N) { - DEBUG(errs() << "Generating Push function\n"); - Function *PushFunc; - return PushFunc; -} - -Function *CGT_X86::createPopFunction(DFInternalNode *N) { - DEBUG(errs() << "Generating Pop function\n"); - Function *PushFunc; - return PushFunc; -} -Function *CGT_X86::createWaitFunction(DFInternalNode *N) { - DEBUG(errs() << "Generating Wait function\n"); - Function *PushFunc; - return PushFunc; -} /* This fuction does the steps necessary to launch a streaming graph * Steps * Create Pipeline/Filter function for each node in child graph of Root @@ -721,162 +625,167 @@ Function *CGT_X86::createWaitFunction(DFInternalNode *N) { * Modify each of the instrinsic in host code * Launch, Push, Pop, Wait */ -void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) { - IntrinsicInst *LI = Root->getInstruction(); - Function *RootLaunch = createLaunchFunction(Root); - // Function* RootPush = createPushFunction(Root); - // Function* RootPop = createPopFunction(Root); - // Function* RootWait = createWaitFunction(Root); +void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) { + IntrinsicInst* LI = Root->getInstruction(); + Function* RootLaunch = createLaunchFunction(Root); // Substitute launch intrinsic main - DEBUG(errs() << "Substitute launch intrinsic\n"); - Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)}; - CallInst *LaunchInst = CallInst::Create( - llvm_visc_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2), - "graph" + Root->getFuncPointer()->getName(), LI); - // ReplaceInstWithInst(LI, LaunchInst); + DEBUG(errs() << "Substitute launch intrinsic\n"); + Value* LaunchInstArgs[] = {RootLaunch, + LI->getArgOperand(1) + }; + CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch, + ArrayRef<Value*>(LaunchInstArgs,2), + "graph"+Root->getFuncPointer()->getName(), LI); DEBUG(errs() << *LaunchInst << "\n"); // Replace all wait instructions with x86 specific wait instructions - DEBUG(errs() << "Substitute wait, push, pop intrinsics\n"); - std::vector<IntrinsicInst *> *UseList = getUseList(LI); - for (unsigned i = 0; i < UseList->size(); ++i) { - IntrinsicInst *II = UseList->at(i); - CallInst *CI; - Value *PushArgs[] = {LaunchInst, II->getOperand(1)}; - switch (II->getIntrinsicID()) { + DEBUG(errs() << "Substitute wait, push, pop intrinsics\n"); + std::vector<IntrinsicInst*>* UseList = getUseList(LI); + for(unsigned i=0; i < UseList->size(); ++i) { + IntrinsicInst* II = UseList->at(i); + CallInst* CI; + Value* PushArgs[] = {LaunchInst, II->getOperand(1)}; + switch(II->getIntrinsicID()) { case Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_streamWait, ArrayRef<Value *>(LaunchInst), + CI = CallInst::Create(llvm_visc_streamWait, + ArrayRef<Value*>(LaunchInst), ""); break; case Intrinsic::visc_push: CI = CallInst::Create(llvm_visc_streamPush, - ArrayRef<Value *>(PushArgs, 2), ""); + ArrayRef<Value*>(PushArgs, 2), + ""); break; case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_streamPop, ArrayRef<Value *>(LaunchInst), + CI = CallInst::Create(llvm_visc_streamPop, + ArrayRef<Value*>(LaunchInst), ""); break; default: - llvm_unreachable( - "GraphID is used by an instruction other than wait, push, pop"); + llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); }; DEBUG(errs() << "Replace:\n\t" << *II << "\n"); ReplaceInstWithInst(II, CI); DEBUG(errs() << "\twith " << *CI << "\n"); } + + } -void CGT_X86::codeGenLaunch(DFInternalNode *Root) { +void CGT_X86::codeGenLaunch(DFInternalNode* Root) { // TODO: Place an assert to check if the constant passed by launch intrinsic // as the number of arguments to DFG is same as the number of arguments of the // root of DFG DEBUG(errs() << "Generating Launch Function\n"); // Get Launch Instruction - IntrinsicInst *LI = Root->getInstruction(); + IntrinsicInst* LI = Root->getInstruction(); switchToTimer(visc_TimerID_PTHREAD_CREATE, LI); DEBUG(errs() << "Generating Launch Function\n"); /* Now we have all the necessary global declarations necessary to generate the - * Launch function, pointer to which can be passed to pthread utils to execute - * DFG. The Launch function has just one input: i8* data.addr - * This is the address of the all the input data that needs to be passed to - * this function. In our case it contains the input arguments of the Root - * function in the correct order. - * (1) Create an empty Launch function of type i8*(i8*) - * (2) Extract each of inputs from data.addr and pass them as arguments to the - * call to Root function - * (3) The return value from Root is stored in memory, pointer to which is - * passed to pthread_exit call. - */ + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. + * (1) Create an empty Launch function of type i8*(i8*) + * (2) Extract each of inputs from data.addr and pass them as arguments to the + * call to Root function + * (3) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ // Create Launch Function of type i8*(i8*) which calls the root function - Type *i8Ty = Type::getInt8Ty(M.getContext()); - FunctionType *AppFuncTy = FunctionType::get( - i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false); - Function *AppFunc = - Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(), - "LaunchDataflowGraph", &M); + Type* i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), + ArrayRef<Type*>(i8Ty->getPointerTo()), + false); + Function* AppFunc = Function::Create(AppFuncTy, + Root->getFuncPointer()->getLinkage(), + "LaunchDataflowGraph", + &M); DEBUG(errs() << "Generating Launch Function\n"); // Give a name to the argument which is used pass data to this thread - Value *data = &*AppFunc->arg_begin(); + Value* data = &*AppFunc->arg_begin(); data->setName("data.addr"); // Add a basic block to this empty function and a return null statement to it BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc); - ReturnInst *RI = - ReturnInst::Create(AppFunc->getContext(), - Constant::getNullValue(AppFunc->getReturnType()), BB); + ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(), + Constant::getNullValue(AppFunc->getReturnType()), + BB); switchToTimer(visc_TimerID_ARG_UNPACK, RI); DEBUG(errs() << "Created Empty Launch Function\n"); // Find the X86 function generated for Root and - // Function* RootF_X86 = Root->getGenFunc(); - Function *RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); +// Function* RootF_X86 = Root->getGenFunc(); + Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); assert(RootF_X86 && "Error: No generated CPU function for Root node\n"); assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) && "Error: Generated Function for Root node with no x86 wrapper\n"); // Generate a call to RootF_X86 with null parameters for now - std::vector<Value *> Args; - for (unsigned i = 0; i < RootF_X86->getFunctionType()->getNumParams(); i++) { - Args.push_back( - Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i))); + std::vector<Value*>Args; + for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) { + Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i))); } - CallInst *CI = - CallInst::Create(RootF_X86, Args, RootF_X86->getName() + ".output", RI); + CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI); // Extract input data from i8* data.addr and patch them to correct argument of // call to RootF_X86. For each argument - std::vector<Type *> TyList; + std::vector<Type*> TyList; std::vector<std::string> names; - for (Function::arg_iterator ai = RootF_X86->arg_begin(), - ae = RootF_X86->arg_end(); - ai != ae; ++ai) { + for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end(); + ai != ae; ++ai) { TyList.push_back(ai->getType()); names.push_back(ai->getName()); } - std::vector<Value *> elements = extractElements(data, TyList, names, CI); + std::vector<Value*> elements = extractElements(data, TyList, names, CI); // Patch the elements to the call arguments - for (unsigned i = 0; i < CI->getNumArgOperands(); i++) + for(unsigned i=0; i<CI->getNumArgOperands(); i++) CI->setArgOperand(i, elements[i]); // Add timers around Call to RootF_X86 function switchToTimer(visc_TimerID_COMPUTATION, CI); switchToTimer(visc_TimerID_OUTPUT_PACK, RI); - StructType *RootRetTy = - cast<StructType>(RootF_X86->getFunctionType()->getReturnType()); + StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType()); - // if Root has non empty return + // if Root has non empty return if (RootRetTy->getNumElements()) { // We can't access the type of the arg struct - build it - std::vector<Type *> TyList; - for (Function::arg_iterator ai = RootF_X86->arg_begin(), - ae = RootF_X86->arg_end(); - ai != ae; ++ai) { + std::vector<Type*> TyList; + for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end(); + ai != ae; ++ai) { TyList.push_back(ai->getType()); } TyList.push_back(CI->getType()); - StructType *ArgStructTy = StructType::create( - M.getContext(), ArrayRef<Type *>(TyList), - (RootF_X86->getName() + ".arg.struct.ty").str(), true); + StructType* ArgStructTy = StructType::create(M.getContext(), + ArrayRef<Type*>(TyList), + (RootF_X86->getName()+".arg.struct.ty").str(), true); // Cast the data pointer to the type of the arg struct - CastInst *OutputAddrCast = CastInst::CreatePointerCast( - data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI); + CastInst* OutputAddrCast = CastInst::CreatePointerCast(data, + ArgStructTy->getPointerTo(), + "argStructCast.addr", + RI); // Result struct is the last element of the packed struct passed to launch unsigned outStructIdx = ArgStructTy->getNumElements() - 1; - ConstantInt *IntZero = - ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); - ConstantInt *IntIdx = - ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx); + ConstantInt *IntZero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + ConstantInt *IntIdx = ConstantInt::get(Type::getInt32Ty(M.getContext()), + outStructIdx); - Value *GEPIIdxList[] = {IntZero, IntIdx}; + Value* GEPIIdxList[] = { IntZero, + IntIdx + }; // Get data pointer to the last element of struct - result field - GetElementPtrInst *OutGEPI = GetElementPtrInst::Create( - ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2), - CI->getName() + ".addr", RI); + GetElementPtrInst *OutGEPI = + GetElementPtrInst::Create(ArgStructTy, + OutputAddrCast, + ArrayRef<Value*>(GEPIIdxList, 2), + CI->getName()+".addr", + RI); // Store result there new StoreInst(CI, OutGEPI, RI); } else { @@ -885,8 +794,10 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { // We were casting the data pointer to the result type of Root, and // returning result there. This would work at the LLVM level, but not // at the C level, thus the rewrite. - CastInst *OutputAddrCast = CastInst::CreatePointerCast( - data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI); + CastInst* OutputAddrCast = CastInst::CreatePointerCast(data, + CI->getType()->getPointerTo(), + CI->getName()+".addr", + RI); new StoreInst(CI, OutputAddrCast, RI); } @@ -896,109 +807,114 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { DEBUG(errs() << *AppFunc << "\n"); // Substitute launch intrinsic main - Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)}; - CallInst *LaunchInst = CallInst::Create( - llvm_visc_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2), - "graph" + Root->getFuncPointer()->getName(), LI); - // ReplaceInstWithInst(LI, LaunchInst); + Value* LaunchInstArgs[] = {AppFunc, + LI->getArgOperand(1) + }; + CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch, + ArrayRef<Value*>(LaunchInstArgs,2), + "graph"+Root->getFuncPointer()->getName(), LI); + //ReplaceInstWithInst(LI, LaunchInst); DEBUG(errs() << *LaunchInst << "\n"); // Replace all wait instructions with x86 specific wait instructions - std::vector<IntrinsicInst *> *UseList = getUseList(LI); - for (unsigned i = 0; i < UseList->size(); ++i) { - IntrinsicInst *II = UseList->at(i); - CallInst *CI; - switch (II->getIntrinsicID()) { + std::vector<IntrinsicInst*>* UseList = getUseList(LI); + for(unsigned i=0; i < UseList->size(); ++i) { + IntrinsicInst* II = UseList->at(i); + CallInst* CI; + switch(II->getIntrinsicID()) { case Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_x86_wait, ArrayRef<Value *>(LaunchInst), + CI = CallInst::Create(llvm_visc_x86_wait, + ArrayRef<Value*>(LaunchInst), ""); break; case Intrinsic::visc_push: - CI = CallInst::Create(llvm_visc_bufferPush, ArrayRef<Value *>(LaunchInst), + CI = CallInst::Create(llvm_visc_bufferPush, + ArrayRef<Value*>(LaunchInst), ""); break; case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_bufferPop, ArrayRef<Value *>(LaunchInst), + CI = CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value*>(LaunchInst), ""); break; default: - llvm_unreachable( - "GraphID is used by an instruction other than wait, push, pop"); + llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); }; ReplaceInstWithInst(II, CI); DEBUG(errs() << *CI << "\n"); } + } -Value *CGT_X86::getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86, - Instruction *InsertBefore) { +Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) { // TODO: Assumption is that each input port of a node has just one // incoming edge. May change later on. // Find the incoming edge at the requested input port - DFEdge *E = Child->getInDFEdgeAt(i); + DFEdge* E = Child->getInDFEdgeAt(i); assert(E && "No incoming edge or binding for input element!"); // Find the Source DFNode associated with the incoming edge - DFNode *SrcDF = E->getSourceDF(); + DFNode* SrcDF = E->getSourceDF(); // If Source DFNode is a dummyNode, edge is from parent. Get the // argument from argument list of this internal node - Value *inputVal; - if (SrcDF->isEntryNode()) { + Value* inputVal; + if(SrcDF->isEntryNode()) { inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition()); - DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); - } else { + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { // edge is from a sibling // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) && - "Source node call not found. Dependency violation!"); + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); // Find CallInst associated with the Source DFNode using OutputMap - Value *CI = OutputMap[SrcDF]; + Value* CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction std::vector<unsigned> IndexList; IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n"); - ExtractValueInst *EI = - ExtractValueInst::Create(CI, IndexList, "", InsertBefore); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "", InsertBefore); inputVal = EI; } return inputVal; } -void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, - ValueToValueMapTy &VMap, Instruction *IB) { - Function *CF = C->getFuncPointer(); +void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86, + ValueToValueMapTy &VMap,Instruction* IB) { + Function* CF = C->getFuncPointer(); - // Function* CF_X86 = C->getGenFunc(); +// Function* CF_X86 = C->getGenFunc(); Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET); - assert(CF_X86 != NULL && - "Found leaf node for which code generation has not happened yet!\n"); + assert(CF_X86 != NULL + && "Found leaf node for which code generation has not happened yet!\n"); assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) && - "The generated function to be called from x86 backend is not an x86 " - "function\n"); + "The generated function to be called from x86 backend is not an x86 function\n"); DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); - std::vector<Value *> Args; + std::vector<Value*> Args; // Create argument list to pass to call instruction // First find the correct values using the edges // The remaing six values are inserted as constants for now. - for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) { + for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { Args.push_back(getInValueAt(C, i, F_X86, IB)); } - Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0); - for (unsigned j = 0; j < 6; j++) + Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0); + for(unsigned j=0; j<6; j++) Args.push_back(I64Zero); - DEBUG(errs() << "Gen Function type: " << *CF_X86->getType() << "\n"); - DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n"); - DEBUG(errs() << "Arguments: " << Args.size() << "\n"); + errs() << "Gen Function type: " << *CF_X86->getType() << "\n"; + errs() << "Node Function type: " << *CF->getType() << "\n"; + errs() << "Arguments: " << Args.size() << "\n"; // Call the F_X86 function associated with this node - CallInst *CI = - CallInst::Create(CF_X86, Args, CF_X86->getName() + "_output", IB); + CallInst* CI = CallInst::Create(CF_X86, Args, + CF_X86->getName()+"_output", + IB); DEBUG(errs() << *CI << "\n"); OutputMap[C] = CI; @@ -1006,56 +922,55 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, // Based on number of dimensions, insert loop instructions std::string varNames[3] = {"x", "y", "z"}; unsigned numArgs = CI->getNumArgOperands(); - for (unsigned j = 0; j < C->getNumOfDim(); j++) { - Value *indexLimit = NULL; + for(unsigned j=0; j < C->getNumOfDim(); j++) { + Value* indexLimit = NULL; // Limit can either be a constant or an arguement of the internal node. // In case of constant we can use that constant value directly in the // new F_X86 function. In case of an argument, we need to get the mapped // value using VMap - if (isa<Constant>(C->getDimLimits()[j])) { + if(isa<Constant>(C->getDimLimits()[j])) { indexLimit = C->getDimLimits()[j]; DEBUG(errs() << "In Constant case:\n" - << " indexLimit type = " << *indexLimit->getType() << "\n"); - } else { + << " indexLimit type = " << *indexLimit->getType() << "\n"); + } + else { indexLimit = VMap[C->getDimLimits()[j]]; DEBUG(errs() << "In VMap case:" - << " indexLimit type = " << *indexLimit->getType() << "\n"); + <<" indexLimit type = " << *indexLimit->getType() << "\n"); } assert(indexLimit && "Invalid dimension limit!"); // Insert loop - Value *indexVar = addLoop(CI, indexLimit, varNames[j]); + Value* indexVar = addLoop(CI, indexLimit, varNames[j]); DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n"); // Insert index variable and limit arguments - CI->setArgOperand(numArgs - 6 + j, indexVar); - CI->setArgOperand(numArgs - 3 + j, indexLimit); + CI->setArgOperand(numArgs-6+j, indexVar); + CI->setArgOperand(numArgs-3+j, indexLimit); } // Insert call to runtime to push the dim limits and instanceID on the depth // stack - Value *args[] = { - ConstantInt::get(Type::getInt32Ty(CI->getContext()), - C->getNumOfDim()), // numDim - CI->getArgOperand(numArgs - 3 + 0), // limitX - CI->getArgOperand(numArgs - 6 + 0), // iX - CI->getArgOperand(numArgs - 3 + 1), // limitY - CI->getArgOperand(numArgs - 6 + 1), // iY - CI->getArgOperand(numArgs - 3 + 2), // limitZ - CI->getArgOperand(numArgs - 6 + 2) // iZ + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim + CI->getArgOperand(numArgs-3+0), // limitX + CI->getArgOperand(numArgs-6+0), // iX + CI->getArgOperand(numArgs-3+1), // limitY + CI->getArgOperand(numArgs-6+1), // iY + CI->getArgOperand(numArgs-3+2), // limitZ + CI->getArgOperand(numArgs-6+2) // iZ }; - CallInst *Push = CallInst::Create(llvm_visc_x86_dstack_push, - ArrayRef<Value *>(args, 7), "", CI); + CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI); DEBUG(errs() << "Push on stack: " << *Push << "\n"); // Insert call to runtime to pop the dim limits and instanceID from the depth // stack BasicBlock::iterator i(CI); ++i; - Instruction *NextI = &*i; + Instruction* NextI = &*i; // Next Instruction should also belong to the same basic block as the basic // block will have a terminator instruction - assert(NextI->getParent() == CI->getParent() && - "Next Instruction should also belong to the same basic block!"); + assert(NextI->getParent() == CI->getParent() + && "Next Instruction should also belong to the same basic block!"); - CallInst *Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); + CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); DEBUG(errs() << "Pop from stack: " << *Pop << "\n"); DEBUG(errs() << *CI->getParent()->getParent()); } @@ -1076,33 +991,34 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, // Add runtime API calls to push output for each of the streaming outputs // Add loop around the basic block, which exits the loop if isLastInput is false -Function *CGT_X86::createFunctionFilter(DFNode *C) { - DEBUG(errs() << "*********Creating Function filter for " - << C->getFuncPointer()->getName() << "*****\n"); +Function* CGT_X86::createFunctionFilter(DFNode* C) { + DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n"); /* Create a function with same argument list as child.*/ DEBUG(errs() << "\tCreate a function with the same argument list as child\n"); // Get the generated function for child node - Function *CF = C->getFuncPointer(); + Function* CF = C->getFuncPointer(); // Create Filter Function of type i8*(i8*) which calls the root function - Type *i8Ty = Type::getInt8Ty(M.getContext()); - FunctionType *CF_PipelineTy = FunctionType::get( - i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false); - Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(), - CF->getName() + "_Pipeline", &M); - DEBUG(errs() << "Generating Pipline Function\n"); + Type* i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(), + ArrayRef<Type*>(i8Ty->getPointerTo()), + false); + Function* CF_Pipeline = Function::Create(CF_PipelineTy, + CF->getLinkage(), + CF->getName()+"_Pipeline", + &M); + DEBUG(errs() << "Generating Pipeline Function\n"); // Give a name to the argument which is used pass data to this thread - Value *data = &*CF_Pipeline->arg_begin(); + Value* data = &*CF_Pipeline->arg_begin(); data->setName("data.addr"); // Create a new basic block DEBUG(errs() << "\tCreate new BB and add a return function\n"); // Add a basic block to this empty function - BasicBlock *BB = - BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline); + BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline); // Add a return instruction to the basic block - ReturnInst *RI = - ReturnInst::Create(CF_Pipeline->getContext(), - UndefValue::get(CF_Pipeline->getReturnType()), BB); + ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(), + UndefValue::get(CF_Pipeline->getReturnType()), BB); + /* Extract the elements from the aggregate argument to the function. * Replace the streaming inputs with i8* types signifying handle to @@ -1113,24 +1029,25 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n"); // These Args will be used when passing arguments to the generated function // inside loop, and reading outputs as well. - std::vector<Value *> Args; - std::vector<Type *> TyList; + std::vector<Value*> Args; + std::vector<Type*> TyList; std::vector<std::string> names; // Adding inputs - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e; - ++i) { - if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { TyList.push_back(i8Ty->getPointerTo()); - names.push_back((Twine(i->getName()) + "_buffer").str()); - } else { + names.push_back((Twine(i->getName())+"_buffer").str()); + } + else { TyList.push_back(i->getType()); names.push_back(i->getName()); } } // Adding outputs. FIXME: Since we assume all outputs to be streaming edges, // because we get there buffer handles - StructType *RetTy = cast<StructType>(CF->getReturnType()); - for (unsigned i = 0; i < RetTy->getNumElements(); i++) { + StructType* RetTy = cast<StructType>(CF->getReturnType()); + for (unsigned i=0; i<RetTy->getNumElements(); i++) { TyList.push_back(i8Ty->getPointerTo()); names.push_back("out"); } @@ -1139,54 +1056,66 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { TyList.push_back(i8Ty->getPointerTo()); names.push_back("isLastInput_buffer"); - // Extract the inputs, outputs and + // Extract the inputs, outputs Args = extractElements(data, TyList, names, RI); - for (unsigned i = 0; i < Args.size(); i++) { + for(unsigned i=0; i<Args.size(); i++) { DEBUG(errs() << *Args[i] << "\n"); } // Split the Args vector into, input output and isLastInput unsigned numInputs = CF->getFunctionType()->getNumParams(); unsigned numOutputs = RetTy->getNumElements(); - std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs); - std::vector<Value *> OutputArgs(Args.begin() + numInputs, - Args.begin() + numInputs + numOutputs); - Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]); + std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs); + std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs); + Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]); /* Add runtime API calls to get input for each of the streaming input edges */ - DEBUG(errs() << "\tAdd runtime API calls to get input for each of the " - "streaming input edges\n"); + DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n"); // First read the termination condition variable islastInput - CallInst *isLastInputPop = CallInst::Create( - llvm_visc_bufferPop, ArrayRef<Value *>(isLastInput), "", RI); - - CastInst *BI = BitCastInst::CreateIntegerCast( - isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false, - "isLastInput", RI); + CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value*>(isLastInput), + "", + RI); + + CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop, + Type::getInt64Ty(CF_Pipeline->getContext()), + false, + "isLastInput", + RI); isLastInput = BI; // Create a loop termination condition - CmpInst *Cond = CmpInst::Create( - Instruction::ICmp, CmpInst::ICMP_NE, isLastInput, - Constant::getNullValue(Type::getInt64Ty(CF->getContext())), - "isLastInputNotZero", RI); + CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, + isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero", + RI); // Get input from buffers of all the incoming streaming edges - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e; - ++i) { - if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { - CallInst *bufferIn = - CallInst::Create(llvm_visc_bufferPop, - ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI); - CastInst *BI; - if (i->getType()->isPointerTy()) { - BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(), - i->getName() + ".addr", RI); - } else if (i->getType()->isFloatTy()) { - BI = CastInst::CreateFPCast(bufferIn, i->getType(), - i->getName() + ".addr", RI); - } else { - BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false, - i->getName() + ".addr", RI); + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value*>(InputArgs[i->getArgNo()]), + "", + RI); + CastInst* BI; + if(i->getType()->isPointerTy()) { + BI = CastInst::Create(CastInst::IntToPtr, + bufferIn, + i->getType(), + i->getName()+".addr", + RI); + } + else if(i->getType()->isFloatTy()) { + BI = CastInst::CreateFPCast(bufferIn, + i->getType(), + i->getName()+".addr", + RI); + } + else { + BI = CastInst::CreateIntegerCast(bufferIn, + i->getType(), + false, + i->getName()+".addr", + RI); } // Replace the argument in Args vector. We would be using the vector as // parameters passed to the call @@ -1195,88 +1124,52 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { } /* Add a call to the generated function of the child node */ DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); - // DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); - // CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, - // C->getGenFunc()->getName()+".output", RI); +// DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); +// CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, +// C->getGenFunc()->getName()+".output", RI); Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); - DEBUG(errs() << "Type: " << *CGenF->getType() << "\n"); - CallInst *CI = - CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI); + DEBUG(errs() << "Type: " + << *CGenF->getType() + << "\n"); + CallInst* CI = CallInst::Create(CGenF, + InputArgs, + CGenF->getName()+".output", + RI); /* Add runtime API calls to push output for each of the streaming outputs */ // FIXME: Assumption // All edges between siblings are streaming edges - DEBUG(errs() << "\tAdd runtime API calls to push output for each of the " - "streaming outputs\n"); - for (unsigned i = 0; i < numOutputs; i++) { + DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n"); + for (unsigned i=0; i< numOutputs; i++) { // Extract output - ExtractValueInst *EI = - ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI); + ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), + "",RI); // Convert to i64 - CastInst *BI; - if (EI->getType()->isPointerTy()) - BI = - CastInst::Create(CastInst::PtrToInt, EI, - Type::getInt64Ty(CF_Pipeline->getContext()), "", RI); + CastInst* BI; + if(EI->getType()->isPointerTy()) + BI = CastInst::Create(CastInst::PtrToInt,EI, + Type::getInt64Ty(CF_Pipeline->getContext()), + "", + RI); else - BI = CastInst::CreateIntegerCast( - EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI); + BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()), + false, "", RI); // Push to Output buffer - Value *bufferOutArgs[] = {OutputArgs[i], BI}; - CallInst *bufferOut = CallInst::Create( - llvm_visc_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI); + Value* bufferOutArgs[] = {OutputArgs[i], BI}; + CallInst::Create(llvm_visc_bufferPush, + ArrayRef<Value*>(bufferOutArgs, 2), + "", + RI); } - // Add loop around the basic block, which exits the loop if isLastInput is - // false Pointers to keep the created loop structure + // Add loop around the basic block, which exits the loop if isLastInput is false + // Pointers to keep the created loop structure BasicBlock *EntryBB, *CondBB, *BodyBB; Instruction *CondStartI = cast<Instruction>(isLastInputPop); Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode(); EntryBB = CondStartI->getParent(); addWhileLoop(CondStartI, BodyStartI, RI, Cond); - CondBB = CondStartI->getParent(); - BodyBB = CI->getParent(); - Instruction *CntI = NULL; - CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF); - - // If the node function calls the visc runtime call to get policy, we update - // it with the counter information. This means we need to pass an additional - // argument to the generated function, that is the iteration number, and then - // use it as an argument to the policy_getVersion call - if (GetPolicyCI) { - CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB); - assert(CntI && "Counter instruction not found\n"); - - // Create new function type (with additional argument for iteration number) - Type *NewRetTy = CGenF->getFunctionType()->getReturnType(); - std::vector<Type *> NewArgTypes; - for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end(); - ai != ae; ++ai) { - NewArgTypes.push_back(ai->getType()); - } - NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); - FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false); - Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false); - // At least one (the last) argument exists (we added it) - Function::arg_iterator ae = NewCGenF->arg_end(); - --ae; - Argument *CntArg = &*ae; - CntArg->setName("iteration"); - // Replace the old cpu gen func with this one - C->addGenFunc(NewCGenF, visc::CPU_TARGET, true); - - // Add counter to the actual parameter list, to create the new call - InputArgs.push_back(CntI); - CallInst *newCI = - CallInst::Create(NewCGenF, InputArgs, NewCGenF->getName() + ".output"); - ReplaceInstWithInst(CI, newCI); - - // Set second operand of the policy_getVersion call to the last function - // argument - GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF); - GetPolicyCI->setArgOperand(1, CntArg); - } // Return the Function pointer DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n"); @@ -1284,19 +1177,19 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { return CF_Pipeline; } -void CGT_X86::codeGen(DFInternalNode *N) { +void CGT_X86::codeGen(DFInternalNode* N) { // Check if N is root node and its graph is streaming. We do not do codeGen // for Root in such a case - if (N->isRoot() && N->isChildGraphStreaming()) + if(N->isRoot() && N->isChildGraphStreaming()) return; // Check if clone already exists. If it does, it means we have visited this // function before and nothing else needs to be done for this leaf node. - // if(N->getGenFunc() != NULL) - // return; +// if(N->getGenFunc() != NULL) +// return; if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { - DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() - << " : skipping it\n"); + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; return; } @@ -1309,10 +1202,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { // Only process if all children have a CPU x86 function // Otherwise skip to end bool codeGen = true; - for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); - ci != ce; ++ci) { - DFNode *C = *ci; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; // Skip dummy node call if (C->isDummyNode()) continue; @@ -1327,18 +1219,17 @@ void CGT_X86::codeGen(DFInternalNode *N) { } if (codeGen) { - Function *F = N->getFuncPointer(); + Function* F = N->getFuncPointer(); // Create of clone of F with no instructions. Only the type is the same as F // without the extra arguments. - Function *F_X86; - + Function* F_X86; + // Clone the function, if we are seeing this function for the first time. We // only need a clone in terms of type. ValueToValueMapTy VMap; - + // Create new function with the same type - F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), - F->getName(), &M); + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); // Loop over the arguments, copying the names of arguments over. Function::arg_iterator dest_iterator = F_X86->arg_begin(); @@ -1351,19 +1242,19 @@ void CGT_X86::codeGen(DFInternalNode *N) { // Add a basic block to this empty function BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86); - ReturnInst *RI = ReturnInst::Create( - F_X86->getContext(), UndefValue::get(F_X86->getReturnType()), BB); + ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), + UndefValue::get(F_X86->getReturnType()), BB); - // Add Index and Dim arguments except for the root node and the child graph - // of parent node is not streaming - if (!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + // Add Index and Dim arguments except for the root node and the child graph of + // parent node is not streaming + if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) F_X86 = addIdxDimArgs(F_X86); BB = &*F_X86->begin(); RI = cast<ReturnInst>(BB->getTerminator()); - - // Add generated function info to DFNode - // N->setGenFunc(F_X86, visc::CPU_TARGET); + + //Add generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); N->addGenFunc(F_X86, visc::CPU_TARGET, true); // Loop over the arguments, to create the VMap. @@ -1376,59 +1267,59 @@ void CGT_X86::codeGen(DFInternalNode *N) { } // Iterate over children in topological order - for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); - ci != ce; ++ci) { - DFNode *C = *ci; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; // Skip dummy node call if (C->isDummyNode()) continue; - + // Create calls to CPU function of child node invokeChild_X86(C, F_X86, VMap, RI); + } - + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings // Get Exit node - DFNode *C = N->getChildGraph()->getExit(); + DFNode* C = N->getChildGraph()->getExit(); // Get OutputType of this node - StructType *OutTy = N->getOutputType(); + StructType* OutTy = N->getOutputType(); Value *retVal = UndefValue::get(F_X86->getReturnType()); // Find all the input edges to exit node - for (unsigned i = 0; i < OutTy->getNumElements(); i++) { + for (unsigned i=0; i < OutTy->getNumElements(); i++) { DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port - DFEdge *E = C->getInDFEdgeAt(i); - + DFEdge* E = C->getInDFEdgeAt(i); + assert(E && "No Binding for output element!"); // Find the Source DFNode associated with the incoming edge - DFNode *SrcDF = E->getSourceDF(); - - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() - << "\n"); - + DFNode* SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + // If Source DFNode is a dummyNode, edge is from parent. Get the // argument from argument list of this internal node - Value *inputVal; - if (SrcDF->isEntryNode()) { + Value* inputVal; + if(SrcDF->isEntryNode()) { inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); - } else { + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { // edge is from a internal node // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) && - "Source node call not found. Dependency violation!"); - + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); + // Find Output Value associated with the Source DFNode using OutputMap - Value *CI = OutputMap[SrcDF]; - + Value* CI = OutputMap[SrcDF]; + // Extract element at source position from this call instruction std::vector<unsigned> IndexList; IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI - << "\n"); - ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); inputVal = EI; } std::vector<unsigned> IdxList; @@ -1437,8 +1328,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { } DEBUG(errs() << "Extracted all\n"); retVal->setName("output"); - ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); ReplaceInstWithInst(RI, newRI); + } //-------------------------------------------------------------------------// @@ -1455,22 +1347,24 @@ void CGT_X86::codeGen(DFInternalNode *N) { bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); - DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " - << N->getTag() << "\n"); - DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n"); + DEBUG(errs() << "Before editing\n"); + DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"); + DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"); DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"); - DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n"); + DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"); DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"); + if (N->getTag() == visc::None) { // No code is available for this node. This (usually) means that this // node is a node that // - from the accelerator backends has been mapped to an intermediate // node, and thus they have not produced a genFunc - // - a child node had no CPU hint, thus no code gen for CPU could + // - a child node had no CPU hint, thus no code gen for CPU could // take place DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node " - << N->getFuncPointer()->getName() << "\n"); + << N->getFuncPointer()->getName() << "\n"); } else if (viscUtils::isSingleTargetTag(N->getTag())) { // There is a single version for this node according to code gen hints. // Therefore, we do not need to check the policy, we simply use the @@ -1478,226 +1372,55 @@ void CGT_X86::codeGen(DFInternalNode *N) { // Sanity check - to be removed TODO switch (N->getTag()) { - case visc::CPU_TARGET: - assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); - assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); - break; - case visc::GPU_TARGET: - assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); - break; - default: - assert(false && "Unreachable: we checked that tag was single target!\n"); - break; - } - - // If device abstraction is enabled, then we may need to edit the node - // function. In case this is a GPU or SPIR gen func, we issue a call to - // the runtime that waits for the device to be available - if (DeviceAbstraction) { - Function *NodeGenFunc = NULL; - switch (N->getTag()) { + case visc::CPU_TARGET: + assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + break; case visc::GPU_TARGET: - NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); break; default: + assert(false && "Unreachable: we checked that tag was single target!\n"); break; - } - - if (NodeGenFunc) { - // If we found a function to edit, we add the call to the runtime as - // its first statement - BasicBlock *BB = &*NodeGenFunc->begin(); - std::vector<Value *> Args; // TODO: add the device type as argument? - FunctionCallee RTF = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_waitOnDeviceStatus", - runtimeModule - ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus") - ->getFunctionType()); - CallInst *RTFInst = - CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); - } - } - - Function *Ftmp = N->getGenFuncForTarget(N->getTag()); - N->removeGenFuncForTarget(visc::GPU_TARGET); - N->setTag(visc::None); - N->addGenFunc(Ftmp, visc::CPU_TARGET, true); - N->setTag(visc::CPU_TARGET); - - // Sanity checks - to be removed TODO - CF = N->getGenFuncForTarget(visc::CPU_TARGET); - GF = N->getGenFuncForTarget(visc::GPU_TARGET); - - CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); - - DEBUG(errs() << "After editing\n"); - DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " - << N->getTag() << "\n"); - DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n"); - DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"); - DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n"); - DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"); - - DEBUG(errs() << "Node Name (for policy) : " - << N->getFuncPointer()->getName() << "\n"); - - Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); - Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); - - bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); - - // These assertions express what we can support with the current runtime. - // Code generation works the same way even for other target combinations. - // For now, we want either CPU and GPU, or CPU and SPIR - assert((CF && (GF || !GF)) && "Invalid target selection\n"); - assert((CFx86 && (GFx86 || !GFx86)) && - "Generated functions without appropriate x86 wrapper\n"); - - FunctionType *FT = CF->getFunctionType(); - if (GF) - assert(FT == GF->getFunctionType() && - "Type mismatch between generated functions for GPU and CPU " - "targets.\n"); - - // Code generation of wrapper function - Function *F_wrapper; - ValueToValueMapTy VMap; - F_wrapper = - Function::Create(FT, CF->getLinkage(), CF->getName() + "_wrapper", &M); - - // Copy argument names over - Function::arg_iterator dest_iterator = F_wrapper->arg_begin(); - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e; - ++i) { - dest_iterator->setName(i->getName()); - VMap[&*i] = &*dest_iterator; - ++dest_iterator; - } - // Gather all arguments of wrapper in a vector, to prepare the call to - // the individual gen functions - std::vector<Value *> GenFuncCallArgs; - for (Function::arg_iterator i = F_wrapper->arg_begin(), - e = F_wrapper->arg_end(); - i != e; ++i) { - GenFuncCallArgs.push_back(&*i); - } - - BasicBlock *BBcurrent, *BBtrue, *BBfalse; - - BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper); - - StringRef FName = N->getFuncPointer()->getName(); - size_t nameSize = FName.size() + 1; - std::vector<Constant *> NameV; - for (char c : FName) { - NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c)); - } - NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0')); - ArrayType *NameType = - ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize); - AllocaInst *AI = new AllocaInst(NameType, 0, nullptr, "", BBcurrent); - Constant *NameConst = ConstantArray::get(NameType, NameV); - StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent); - CastInst *BI = BitCastInst::CreatePointerCast( - AI, Type::getInt8PtrTy(M.getContext()), "", BBcurrent); - std::vector<Value *> Args; - Args.push_back(BI); - Args.push_back( - ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); - FunctionCallee RTF = M.getOrInsertFunction( - "llvm_visc_policy_getVersion", - runtimeModule->getFunction("llvm_visc_policy_getVersion") - ->getFunctionType()); - CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent); - - ConstantInt *CmpConst = - ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true); - CmpInst *CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - RTFInst, CmpConst, "", BBcurrent); - - BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper); - BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper); - BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); - - CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue); - ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); - - // Switch basic block pointers - BBcurrent = BBfalse; - if (GF) { - // We have a GPU version. Generate policy check and call - CmpConst = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true); - CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, RTFInst, - CmpConst, "", BBcurrent); - BBtrue = BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper); - BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper); - BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); - - GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue); - RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); - - if (DeviceAbstraction) { - // Prepare arguments and function for call to wait for device runtime - // call - std::vector<Value *> Args; // TODO: add the device type as argument? - FunctionCallee RTF = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_waitOnDeviceStatus", - runtimeModule - ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus") - ->getFunctionType()); - CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); - } } + + N->addGenFunc(N->getGenFuncForTarget(N->getTag()), + visc::CPU_TARGET, + true); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->setTag(visc::CPU_TARGET); + + // Sanity checks - to be removed TODO + CF = N->getGenFuncForTarget(visc::CPU_TARGET); + GF = N->getGenFuncForTarget(visc::GPU_TARGET); + + CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + + DEBUG(errs() << "After editing\n"); + DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"); + DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"); + DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"); + DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"); + DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"); + + } + else { + assert(false && "Multiple tags unsupported!"); + } - // Switch basic block pointers - BBcurrent = BBfalse; - // if (SF) { - // We have a GPU version. Generate policy check and call - // CmpConst = - // ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true); - // CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - // RTFInst, CmpConst, "", BBcurrent); - // BBtrue = BasicBlock::Create(M.getContext(), "version_spir", F_wrapper); - // BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper); - // BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); - - // GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue); - // RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); - - // if (DeviceAbstraction) { - // Prepare arguments and function for call to wait for device runtime call - // std::vector<Value *> Args; // TODO: add the device type as argument? - // FunctionCallee RTF = - // M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", - // runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()); - // CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); - // } - // } - - RI = ReturnInst::Create(M.getContext(), - UndefValue::get(FT->getReturnType()), BBfalse); - - // Now, make the node cpu gen func to be this one - // Remove all other versions and update the tag - N->addGenFunc(F_wrapper, visc::CPU_TARGET, true); - N->removeGenFuncForTarget(visc::GPU_TARGET); - N->setTag(visc::CPU_TARGET); - - // assert(false && "got to the point where we have to combine\n"); - } } // Code generation for leaf nodes -void CGT_X86::codeGen(DFLeafNode *N) { +void CGT_X86::codeGen(DFLeafNode* N) { // Skip code generation if it is a dummy node - if (N->isDummyNode()) { + if(N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); return; } @@ -1714,24 +1437,21 @@ void CGT_X86::codeGen(DFLeafNode *N) { // Check if clone already exists. If it does, it means we have visited this // function before and nothing else needs to be done for this leaf node. - // if(N->getGenFunc() != NULL) - // return; +// if(N->getGenFunc() != NULL) +// return; if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { - DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() - << " : skipping it\n"); - - DEBUG(errs() << "Check for cudnn or promise hint for node " - << N->getFuncPointer()->getName() << "\n"); + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; switch (N->getTag()) { - case visc::GPU_TARGET: - // A leaf node should not have an x86 function for GPU - // by design of DFG2LLVM_NVPTX backend - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); - break; - default: - break; + case visc::GPU_TARGET: + // A leaf node should not have an x86 function for GPU + // by design of DFG2LLVM_NVPTX backend + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "Leaf node not expected to have GPU GenFunc"); + break; + default: + break; } return; @@ -1741,7 +1461,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { "Error: Visiting a node for which code already generated\n"); std::vector<IntrinsicInst *> IItoRemove; - std::vector<std::pair<IntrinsicInst *, Value *>> IItoReplace; + std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace; BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; // Get the function associated woth the dataflow node @@ -1757,61 +1477,61 @@ void CGT_X86::codeGen(DFLeafNode *N) { // Add the new argument to the argument list. Add arguments only if the cild // graph of parent node is not streaming - if (!N->getParent()->isChildGraphStreaming()) + if(!N->getParent()->isChildGraphStreaming()) F_X86 = addIdxDimArgs(F_X86); // Add generated function info to DFNode - // N->setGenFunc(F_X86, visc::CPU_TARGET); +// N->setGenFunc(F_X86, visc::CPU_TARGET); N->addGenFunc(F_X86, visc::CPU_TARGET, true); // Go through the arguments, and any pointer arguments with in attribute need // to have x86_argument_ptr call to get the x86 ptr of the argument // Insert these calls in a new BB which would dominate all other BBs // Create new BB - BasicBlock *EntryBB = &*F_X86->begin(); - BasicBlock *BB = - BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); - BranchInst *Terminator = BranchInst::Create(EntryBB, BB); + BasicBlock* EntryBB = &*F_X86->begin(); + BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); + BranchInst* Terminator = BranchInst::Create(EntryBB, BB); // Insert calls - for (Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); - ai != ae; ++ai) { - if (F_X86->getAttributes().hasAttribute(ai->getArgNo() + 1, - Attribute::In)) { - assert(ai->getType()->isPointerTy() && - "Only pointer arguments can have visc in/out attributes "); + for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); + ai != ae; ++ai) { + if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) { + assert(ai->getType()->isPointerTy() + && "Only pointer arguments can have visc in/out attributes "); Function::arg_iterator aiNext = ai; ++aiNext; - Argument *size = &*aiNext; - assert(size->getType() == Type::getInt64Ty(M.getContext()) && - "Next argument after a pointer should be an i64 type"); - CastInst *BI = BitCastInst::CreatePointerCast( - &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr", - Terminator); - Value *ArgPtrCallArgs[] = {BI, size}; + Argument* size = &*aiNext; + assert(size->getType() == Type::getInt64Ty(M.getContext()) + && "Next argument after a pointer should be an i64 type"); + CastInst* BI = BitCastInst::CreatePointerCast(&*ai, + Type::getInt8PtrTy(M.getContext()), + ai->getName()+".i8ptr", + Terminator); + Value* ArgPtrCallArgs[] = {BI, size}; CallInst::Create(llvm_visc_x86_argument_ptr, - ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator); + ArrayRef<Value*>(ArgPtrCallArgs, 2), + "", + Terminator); + } } - DEBUG(errs() << *BB << "\n"); + errs() << *BB << "\n"; // Go through all the instructions for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { Instruction *I = &(*i); DEBUG(errs() << *I << "\n"); // Leaf nodes should not contain VISC graph intrinsics or launch - assert(!BuildDFG::isViscLaunchIntrinsic(I) && - "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isViscGraphIntrinsic(I) && - "VISC graph intrinsic within a leaf dataflow node!"); + assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); if (BuildDFG::isViscQueryIntrinsic(I)) { - IntrinsicInst *II = cast<IntrinsicInst>(I); - IntrinsicInst *ArgII; - DFNode *ArgDFNode; + IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; /*********************************************************************** - * Handle VISC Query intrinsics * - ***********************************************************************/ + * Handle VISC Query intrinsics * + ***********************************************************************/ switch (II->getIntrinsicID()) { /**************************** llvm.visc.getNode() *******************/ case Intrinsic::visc_getNode: { @@ -1840,9 +1560,8 @@ void CGT_X86::codeGen(DFLeafNode *N) { // get the appropriate field ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim(); - IntegerType *IntTy = Type::getInt32Ty(M.getContext()); - ConstantInt *numOfDimConstant = - ConstantInt::getSigned(IntTy, (int64_t)numOfDim); + IntegerType* IntTy = Type::getInt32Ty(M.getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); II->replaceAllUsesWith(numOfDimConstant); IItoRemove.push_back(II); @@ -1858,46 +1577,46 @@ void CGT_X86::codeGen(DFLeafNode *N) { // The dfnode argument should be an ancestor of this leaf node or // the leaf node itself int parentLevel = N->getAncestorHops(ArgDFNode); - assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) && - "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); // Get specified dimension // (dim = 0) => x // (dim = 1) => y // (dim = 2) => z - int dim = - (int)(II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x); - assert((dim >= 0) && (dim < 3) && - "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic " - "ID!"); + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!"); // For immediate ancestor, use the extra argument introduced in // F_X86 int numParamsF = F->getFunctionType()->getNumParams(); int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); - assert( - (numParamsF_X86 - numParamsF == 6) && - "Difference of arguments between function and its clone is not 6!"); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); - if (parentLevel == 0) { + if(parentLevel == 0) { // Case when the query is for this node itself - unsigned offset = 3 + (3 - dim); + unsigned offset = 3 + (3-dim); // Traverse argument list of F_X86 in reverse order to find the // correct index or dim argument. - Argument *indexVal = getArgumentFromEnd(F_X86, offset); + Argument* indexVal = getArgumentFromEnd(F_X86, offset); assert(indexVal && "Index argument not found. Invalid offset!"); DEBUG(errs() << *II << " replaced with " << *indexVal << "\n"); II->replaceAllUsesWith(indexVal); IItoRemove.push_back(II); - } else { + } + else { // Case when query is for an ancestor - Value *args[] = { - ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), - ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; - CallInst *CI = CallInst::Create(llvm_visc_x86_getDimInstance, - ArrayRef<Value *>(args, 2), + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance, + ArrayRef<Value*>(args, 2), "nodeInstanceID", II); DEBUG(errs() << *II << " replaced with " << *CI << "\n"); II->replaceAllUsesWith(CI); @@ -1916,46 +1635,46 @@ void CGT_X86::codeGen(DFLeafNode *N) { // The dfnode argument should be an ancestor of this leaf node or // the leaf node itself int parentLevel = N->getAncestorHops(ArgDFNode); - assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) && - "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); // Get specified dimension // (dim = 0) => x // (dim = 1) => y // (dim = 2) => z - int dim = - (int)(II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x); - assert((dim >= 0) && (dim < 3) && - "Invalid dimension for getNumNodeInstances_[xyz]. Check " - "Intrinsic ID!"); + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!"); // For immediate ancestor, use the extra argument introduced in // F_X86 int numParamsF = F->getFunctionType()->getNumParams(); int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); - assert( - (numParamsF_X86 - numParamsF == 6) && - "Difference of arguments between function and its clone is not 6!"); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); - if (parentLevel == 0) { + if(parentLevel == 0) { // Case when the query is for this node itself unsigned offset = 3 - dim; // Traverse argument list of F_X86 in reverse order to find the // correct index or dim argument. - Argument *limitVal = getArgumentFromEnd(F_X86, offset); + Argument* limitVal = getArgumentFromEnd(F_X86, offset); assert(limitVal && "Limit argument not found. Invalid offset!"); - DEBUG(errs() << *II << " replaced with " << *limitVal << "\n"); + DEBUG(errs() << *II << " replaced with " << *limitVal << "\n"); II->replaceAllUsesWith(limitVal); IItoRemove.push_back(II); - } else { + } + else { // Case when query is from the ancestor - Value *args[] = { - ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), - ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; - CallInst *CI = CallInst::Create(llvm_visc_x86_getDimLimit, - ArrayRef<Value *>(args, 2), + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit, + ArrayRef<Value*>(args, 2), "numNodeInstances", II); DEBUG(errs() << *II << " replaced with " << *CI << "\n"); II->replaceAllUsesWith(CI); @@ -1965,26 +1684,20 @@ void CGT_X86::codeGen(DFLeafNode *N) { break; } default: - DEBUG(errs() << "Found unknown intrinsic with ID = " - << II->getIntrinsicID() << "\n"); + DEBUG(errs() << "Found unknown intrinsic with ID = " << + II->getIntrinsicID() << "\n"); assert(false && "Unknown VISC Intrinsic!"); break; } } else { - // TODO: how to handle address space qualifiers in load/store } + } - // TODO: - // When to replace the uses? - // In which order is it safe to replace the instructions in - // IItoReplace? - // Probably in the reverse order in the vectors - // It is a good idea to have them in one vector and chech the type - // using dyn_cast in order to determine if we replace with inst or value - // TODO: maybe leave these instructions to be removed by a later DCE pass + + // Remove them in reverse order for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin(); i != IItoRemove.end(); ++i) { (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType())); @@ -1997,7 +1710,8 @@ void CGT_X86::codeGen(DFLeafNode *N) { } // End of namespace char DFG2LLVM_X86::ID = 0; -static RegisterPass<DFG2LLVM_X86> - X("dfg2llvm-x86", "Dataflow Graph to LLVM for X86 backend", - false /* does not modify the CFG */, - true /* transformation, not just analysis */); +static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86", + "Dataflow Graph to LLVM for X86 backend", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); + diff --git a/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt b/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt deleted file mode 100644 index 7069470a1a6f8b1a49eea2824f27204ebdf3fb26..0000000000000000000000000000000000000000 --- a/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt +++ /dev/null @@ -1,2 +0,0 @@ -10 -10 15 10 16 15 30 15 25 20 15 diff --git a/hpvm/projects/visc-rt/device_abstraction.h b/hpvm/projects/visc-rt/device_abstraction.h deleted file mode 100644 index 7e77d100deb6b23b6ed9ca994796cd1cb108b0d4..0000000000000000000000000000000000000000 --- a/hpvm/projects/visc-rt/device_abstraction.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef __DEVICE_ABSTRACTION__ -#define __DEVICE_ABSTRACTION__ - -#include <fstream> -#include <iostream> -#include <stdio.h> -#include <stdlib.h> -#include <thread> -#include <time.h> -#include <vector> - -#define MIN_INTERVAL 2 -#define MAX_INTERVAL 8 -#define NUM_INTERVALS 10 - -// Device status variable: true if the device is available for use -volatile bool deviceStatus = true; -// Intervals at which to change the device status -std::vector<unsigned> Intervals; - -// Set to true when program execution ends and so we can end the device -// simulation -volatile bool executionEnd = false; - -void initializeDeviceStatusIntervals() { - - unsigned sz = 0; - unsigned tmp = 0; - - const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/" - "deviceStatusSwitchIntervals.txt"; - std::ifstream infile; - infile.open(fn); - if (!infile.is_open()) { - std::cout << "Failed to open " << fn << " for reading\n"; - return; - } - infile >> sz; - - if (sz) { - // We have data. Read them into the vector - for (unsigned i = 0; i < sz; i++) { - infile >> tmp; - Intervals.push_back(tmp); - } - infile.close(); - } else { - // We have no data. Create random data and write them into the file - infile.close(); - std::ofstream outfile; - outfile.open(fn); - if (!outfile.is_open()) { - std::cout << "Failed to open " << fn << " for writing\n"; - return; - } - sz = 1 + rand() % NUM_INTERVALS; - outfile << sz; - for (unsigned i = 0; i < sz; i++) { - Intervals.push_back(MIN_INTERVAL + - rand() % (MAX_INTERVAL - MIN_INTERVAL)); - outfile << Intervals[i]; - } - outfile.close(); - } - - return; -} - -void updateDeviceStatus() { - - unsigned i = 0; - while (!executionEnd) { - std::this_thread::sleep_for(std::chrono::seconds(Intervals[i])); - deviceStatus = !deviceStatus; - std::cout << "Changed device status to " << deviceStatus << "\n"; - i = (i + 1) % Intervals.size(); - } -} - -#endif // __DEVICE_ABSTRACTION__ diff --git a/hpvm/projects/visc-rt/makefile b/hpvm/projects/visc-rt/makefile deleted file mode 100644 index adcc6323356d2537eca6ed653cad6d17a1d1ef0e..0000000000000000000000000000000000000000 --- a/hpvm/projects/visc-rt/makefile +++ /dev/null @@ -1,29 +0,0 @@ -#LLVM_SRC_ROOT = -LLVM_BUILD_ROOT = ${LLVM_SRC_ROOT}/../build/ - -CUDA_INC_PATH = /software/cuda-9.1/include/CL/ - - -ifeq ($(NUM_CORES),) - NUM_CORES=1 -endif - -CPP_FLAGS = -I$(LLVM_SRC_ROOT)/include -I$(LLVM_BUILD_ROOT)/include -I$(CUDA_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS -TARGET:=visc-rt - -LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang -LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++ - -OPTS = - -ifeq ($(DEBUG),1) - OPTS+=-DDEBUG_BUILD -endif - -all: $(TARGET:%=%.ll) - -$(TARGET:%=%.ll):%.ll:%.cpp %.h - $(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@ - -clean : - rm -f $(TARGET).ll diff --git a/hpvm/projects/visc-rt/policy.h b/hpvm/projects/visc-rt/policy.h deleted file mode 100644 index d50e65868b376bfbcc3d4bd00d4919db677722b8..0000000000000000000000000000000000000000 --- a/hpvm/projects/visc-rt/policy.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef __POLICY__ -#define __POLICY__ - -#include "device_abstraction.h" -#include <string> - -/************************* Policies *************************************/ -class Policy { -public: - virtual int getVersion(const char *, int64_t) = 0; - virtual ~Policy(){}; -}; - -class ConstPolicy : public Policy { -public: - ConstPolicy(int deviceID) : deviceID(deviceID) {} - - int getVersion(const char *, int64_t) override { return deviceID; } - -private: - int deviceID; -}; - -class NodePolicy : public Policy { - virtual int getVersion(const char *name, int64_t it) override { - std::string s(name); - // std::string NodeNames[1] = { - // "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" }; - std::string NodeNames[] = { - "WrapperGaussianSmoothing_cloned", - "WrapperlaplacianEstimate_cloned", - "WrapperComputeZeroCrossings_cloned", - "WrapperComputeGradient_cloned", - "WrapperComputeMaxGradient_cloned", - "WrapperRejectZeroCrossings_cloned", - }; - // if (!s.compare(NodeNames[4])) { - // std::cout << s << ": CPU" << "\n"; - // return 0; - //} - return 2; - } -}; - -class IterationPolicy : public Policy { - virtual int getVersion(const char *name, int64_t it) override { - if ((it % 10 == 0) || (it % 10 == 1)) - return 0; - else - return 2; - } -}; - -class DeviceStatusPolicy : public Policy { - virtual int getVersion(const char *name, int64_t it) override { - if (deviceStatus) { - // std::cout << "Returning GPU\n"; - return 2; - } else { - // std::cout << "Returning CPU\n"; - return 0; - } - } -}; - -/* ------------------------------------------------------------------------- */ -// Added for the CFAR interactive policy demo. - -class InteractivePolicy : public Policy { -private: - // 0 :for CPU, 1 for GPU, 2 for Vector - unsigned int userTargetDeviceChoice; - // Used to end thread execution - bool end; - // Thread that will update userTargetDeviceChoice - std::thread userTargetDeviceChoiceThread; - // Thread function - void updateUserTargetChoice() { - while (!end) { - std::cout << "Select target device (0 for CPU, 1 fpr GPU): "; - std::cin >> userTargetDeviceChoice; - if (userTargetDeviceChoice > 1) { - std::cout << "Invalid target device. Selecting GPU instead.\n"; - userTargetDeviceChoice = 1; - } - } - } - -public: - // Inherited method, erquired for every policy object - virtual int getVersion(const char *name, int64_t it) { - return userTargetDeviceChoice; - } - - InteractivePolicy() { - userTargetDeviceChoice = 1; - end = false; - userTargetDeviceChoiceThread = - std::thread(&InteractivePolicy::updateUserTargetChoice, this); - } - - ~InteractivePolicy() { - end = true; - userTargetDeviceChoiceThread.join(); - } -}; - -#endif // __POLICY__ diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/visc-rt/visc-rt.cpp index 53d3b516f22b59857b1a17aecba32a6b723998f0..df5b1b80f7ae71ca49f461a50f36f81064028ef9 100644 --- a/hpvm/projects/visc-rt/visc-rt.cpp +++ b/hpvm/projects/visc-rt/visc-rt.cpp @@ -52,7 +52,6 @@ cl_context globalOCLContext; cl_device_id *clDevices; cl_command_queue globalCommandQue; -Policy *policy = NULL; MemTracker MTracker; vector<DFGDepth> DStack; // Mutex to prevent concurrent access by multiple thereads in pipeline @@ -69,55 +68,6 @@ static inline void checkErr(cl_int err, cl_int success, const char *name) { } } -/************************* Policies *************************************/ -void llvm_visc_policy_init() { - cout << "Initializing policy object ...\n"; - // policy = new NodePolicy(); - // policy = new IterationPolicy(); - // policy = new DeviceStatusPolicy(); - // policy = new InteractivePolicy(); - policy = new ConstPolicy(0); - cout << "DONE: Initializing policy object.\n"; -} - -void llvm_visc_policy_clear() { - if (policy) - free(policy); -} - -int llvm_visc_policy_getVersion(const char *name, int64_t i) { - return policy->getVersion(name, i); -} - -/******************** Device Abstraction ********************************/ -std::thread deviceStatusThread; - -void llvm_visc_deviceAbstraction_start() { - cout << "Starting device status simulation ...\n"; - // Initialize vector with points where ti switch device status - initializeDeviceStatusIntervals(); - // Create a thread that performs the changes - deviceStatusThread = std::thread(updateDeviceStatus); - cout << "Started device status simulation thread ...\n"; - return; -} - -void llvm_visc_deviceAbstraction_end() { - cout << "Ending device status simulation thread ...\n"; - // Set the variable that allows the thread to know that execution has ended - executionEnd = true; - // Wait for the thread that manages device status to terminate - deviceStatusThread.join(); - cout << "Ended device status simulation.\n"; - return; -} - -void llvm_visc_deviceAbstraction_waitOnDeviceStatus() { - while (!deviceStatus) { - }; - return; -} - /************************* Depth Stack Routines ***************************/ void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, diff --git a/hpvm/projects/visc-rt/visc-rt.h b/hpvm/projects/visc-rt/visc-rt.h index 3ad315768bf90584a68c1d620ac68936e62a17f0..d9d946f1da14245f8cde426e7b5ea92f791537f5 100644 --- a/hpvm/projects/visc-rt/visc-rt.h +++ b/hpvm/projects/visc-rt/visc-rt.h @@ -15,8 +15,6 @@ #include "../../include/SupportVISC/VISCHint.h" #include "../../include/SupportVISC/VISCTimer.h" -#include "device_abstraction.h" -#include "policy.h" #ifndef DEBUG_BUILD #define DEBUG(s) \ @@ -29,17 +27,6 @@ using namespace std; extern "C" { -/************************* Policies *************************************/ - -void llvm_visc_policy_init(); -void llvm_visc_policy_clear(); -int llvm_visc_policy_getVersion(const char *, int64_t); - -/******************** Device Abstraction ********************************/ -void llvm_visc_deviceAbstraction_start(); -void llvm_visc_deviceAbstraction_end(); -void llvm_visc_deviceAbstraction_waitOnDeviceStatus(); - /********************* DFG Depth Stack **********************************/ class DFGDepth { private: diff --git a/hpvm/test/README.md b/hpvm/test/README.md index b391bd2186b696685adb2645c288c01ea7006850..e709ef04195c90e0f91c2a4b4a4b1d2f0b716d1f 100644 --- a/hpvm/test/README.md +++ b/hpvm/test/README.md @@ -4,9 +4,7 @@ Tests are provided, along with a template Makefile for user projects. ## Parboil Several tests from the [parboil suite](http://impact.crhc.illinois.edu/parboil/parboil.aspx) have been ported to HPVM. To run one of these tests, navigate to its directory under `parboil/benchmarks/`. -Tests may be built for the cpu or gpu with hpvm, and openCL versions are provided for comparison. -Check under the `src/` directory in each benchmark to see which versions are available, -denoted by the names of the subdirectories. +Tests may be built for the cpu or gpu with hpvm. ``` # sgemm example cd parboil/benchmarks/sgemm @@ -16,9 +14,6 @@ make run TARGET=seq VERSION=visc # HPVM gpu make TARGET=gpu VERSION=visc make run TARGET=gpu VERSION=visc -# openCL -make VERSION=opencl_base -make run VERSION=opencl_base ``` ## Cava diff --git a/hpvm/test/hpvm-cava/Makefile b/hpvm/test/hpvm-cava/Makefile index 24027cb1a9f78599f2a41a5c1f960701c112395e..0054af8c4d9cc39c21b00e73a5b53c8ac2a089b8 100644 --- a/hpvm/test/hpvm-cava/Makefile +++ b/hpvm/test/hpvm-cava/Makefile @@ -52,7 +52,7 @@ OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS) CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) -VISC_RT_PATH = $(VISC_BUILD_DIR)/tools/hpvm/projects/visc-rt +VISC_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/visc-rt VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc @@ -82,9 +82,7 @@ OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) KERNEL = $(TEST_OBJS).kernels.ll -ifeq ($(TARGET),seq) -else - KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll +ifeq ($(TARGET),gpu) KERNEL_OCL = $(TEST_OBJS).kernels.cl endif @@ -102,7 +100,7 @@ default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(EXE) #default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE) $(KERNEL_OCL) : $(KERNEL) - $(OCLBE) --debug $< -o $@ + $(OCLBE) $< -o $@ $(EXE) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ @@ -111,7 +109,7 @@ $(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) $(LLVM_LINK) $^ -S -o $@ $(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) - $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) + $(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -123,6 +121,6 @@ $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c $(CC) $(CFLAGS) -emit-llvm -S -o $@ $< $(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll - $(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@ + $(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@ ## END HPVM MAKEFILE diff --git a/hpvm/test/hpvm-cava/Makefile.config b/hpvm/test/hpvm-cava/Makefile.config index ffb2942911313421ce9b9186a392578d7bcaf4c7..1bdb62dec493fc63e581ae8204ea736c45ed5f7d 100644 --- a/hpvm/test/hpvm-cava/Makefile.config +++ b/hpvm/test/hpvm-cava/Makefile.config @@ -3,23 +3,21 @@ CUDA_LIB_PATH=$(CUDA_PATH)/lib64 OPENCL_PATH=/software/cuda-9.1 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64 -LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-release/hpvm/llvm/ -# NOTE: You may need to configure this based on your root path. -VISC_SRC_ROOT=$(LLVM_SRC_ROOT) +LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-reorg-9-temp/hpvm/llvm/ -VISC_BUILD_DIR =$(VISC_SRC_ROOT)/../build -CC = $(VISC_BUILD_DIR)/bin/clang -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include -OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe +LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build +CC = $(LLVM_BUILD_DIR)/bin/clang +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include +OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe -CXX = $(VISC_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include +CXX = $(LLVM_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include -LINKER = $(VISC_BUILD_DIR)/bin/clang++ +LINKER = $(LLVM_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH) -LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib -LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin +LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib +LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/hpvm-cava/Makefile.config.example b/hpvm/test/hpvm-cava/Makefile.config.example index 269f0b7df273c958f0cd20a0f935716a329e00ae..2627ca508f17acb96c858bf4473eed4d89ebec20 100644 --- a/hpvm/test/hpvm-cava/Makefile.config.example +++ b/hpvm/test/hpvm-cava/Makefile.config.example @@ -1,23 +1,21 @@ -CUDA_PATH=/usr/local/cuda -CUDA_LIB_PATH=/usr/local/cuda/lib64 -OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64 -OPENCL_LIB_PATH=$(OPENCL_PATH)/lib +CUDA_PATH=/software/cuda-9.1 +CUDA_LIB_PATH=$(CUDA_PATH)/lib64 +OPENCL_PATH=/software/cuda-9.1 +OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64 -# NOTE: You may need to configure this based on your root path. -VISC_SRC_ROOT=$(LLVM_SRC_ROOT) +LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build +CC = $(LLVM_BUILD_DIR)/bin/clang +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include +OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe -VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build -CC = $(VISC_BUILD_DIR)/bin/clang -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include +CXX = $(LLVM_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include -CXX = $(VISC_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include - -LINKER = $(VISC_BUILD_DIR)/bin/clang++ +LINKER = $(LLVM_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH) -LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib -LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin +LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib +LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/Makefile deleted file mode 100644 index 25131be39c23baa95b34e95444f7e19e1d03b389..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# (c) 2010 The Board of Trustees of the University of Illinois. - -LANGUAGE=opencl -SRCDIR_OBJS=main.o lbm.o ocl.o -APP_CUDALDFLAGS=-lm -APP_CFLAGS=-ffast-math -O3 -APP_CXXFLAGS=-ffast-math -O3 -KERNEL_OBJS=kernel_offline.nvptx.s diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel.cl b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel.cl deleted file mode 100644 index 3f34ea5ef25943ac7eeb18eead429f70ec4cf807..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel.cl +++ /dev/null @@ -1,176 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef LBM_KERNEL_CL -#define LBM_KERNEL_CL - -#include "layout_config.h" -#include "lbm_macros.h" -/******************************************************************************/ - -__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid ) -{ - srcGrid += MARGIN; - dstGrid += MARGIN; - - - //Using some predefined macros here. Consider this the declaration - // and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z - - SWEEP_VAR - SWEEP_X = get_local_id(0); - SWEEP_Y = get_group_id(0); - SWEEP_Z = get_group_id(1); - - float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB; - float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ; - float tempSB, tempET, tempEB, tempWT, tempWB ; - - //Load all of the input fields - //This is a gather operation of the SCATTER preprocessor variable - // is undefined in layout_config.h, or a "local" read otherwise - tempC = SRC_C(srcGrid); - - tempN = SRC_N(srcGrid); - tempS = SRC_S(srcGrid); - tempE = SRC_E(srcGrid); - tempW = SRC_W(srcGrid); - tempT = SRC_T(srcGrid); - tempB = SRC_B(srcGrid); - - tempNE = SRC_NE(srcGrid); - tempNW = SRC_NW(srcGrid); - tempSE = SRC_SE(srcGrid); - tempSW = SRC_SW(srcGrid); - tempNT = SRC_NT(srcGrid); - tempNB = SRC_NB(srcGrid); - tempST = SRC_ST(srcGrid); - tempSB = SRC_SB(srcGrid); - tempET = SRC_ET(srcGrid); - tempEB = SRC_EB(srcGrid); - tempWT = SRC_WT(srcGrid); - tempWB = SRC_WB(srcGrid); - - //Test whether the cell is fluid or obstacle - if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) { - - //Swizzle the inputs: reflect any fluid coming into this cell - // back to where it came from - temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ; - temp_swp = tempE ; tempE = tempW ; tempW = temp_swp; - temp_swp = tempT ; tempT = tempB ; tempB = temp_swp; - temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp; - temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp; - temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp; - temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp; - temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp; - temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp; - } - else { - - //The math meat of LBM: ignore for optimization - float ux, uy, uz, rho, u2; - float temp1, temp2, temp_base; - rho = tempC + tempN - + tempS + tempE - + tempW + tempT - + tempB + tempNE - + tempNW + tempSE - + tempSW + tempNT - + tempNB + tempST - + tempSB + tempET - + tempEB + tempWT - + tempWB; - - ux = + tempE - tempW - + tempNE - tempNW - + tempSE - tempSW - + tempET + tempEB - - tempWT - tempWB; - - uy = + tempN - tempS - + tempNE + tempNW - - tempSE - tempSW - + tempNT + tempNB - - tempST - tempSB; - - uz = + tempT - tempB - + tempNT - tempNB - + tempST - tempSB - + tempET - tempEB - + tempWT - tempWB; - - ux /= rho; - uy /= rho; - uz /= rho; - - if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) { - - ux = 0.005f; - uy = 0.002f; - uz = 0.000f; - } - - u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f; - temp_base = OMEGA*rho; - temp1 = DFL1*temp_base; - - //Put the output values for this cell in the shared memory - temp_base = OMEGA*rho; - temp1 = DFL1*temp_base; - temp2 = 1.0f-OMEGA; - tempC = temp2*tempC + temp1*( - u2); - temp1 = DFL2*temp_base; - tempN = temp2*tempN + temp1*( uy*(4.5f*uy + 3.0f) - u2); - tempS = temp2*tempS + temp1*( uy*(4.5f*uy - 3.0f) - u2); - tempT = temp2*tempT + temp1*( uz*(4.5f*uz + 3.0f) - u2); - tempB = temp2*tempB + temp1*( uz*(4.5f*uz - 3.0f) - u2); - tempE = temp2*tempE + temp1*( ux*(4.5f*ux + 3.0f) - u2); - tempW = temp2*tempW + temp1*( ux*(4.5f*ux - 3.0f) - u2); - temp1 = DFL3*temp_base; - tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2); - tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2); - tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2); - tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2); - tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2); - tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2); - tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2); - tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2); - tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2); - tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2); - tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2); - tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2); - } - - //Write the results computed above - //This is a scatter operation of the SCATTER preprocessor variable - // is defined in layout_config.h, or a "local" write otherwise - DST_C ( dstGrid ) = tempC; - - DST_N ( dstGrid ) = tempN; - DST_S ( dstGrid ) = tempS; - DST_E ( dstGrid ) = tempE; - DST_W ( dstGrid ) = tempW; - DST_T ( dstGrid ) = tempT; - DST_B ( dstGrid ) = tempB; - - DST_NE( dstGrid ) = tempNE; - DST_NW( dstGrid ) = tempNW; - DST_SE( dstGrid ) = tempSE; - DST_SW( dstGrid ) = tempSW; - DST_NT( dstGrid ) = tempNT; - DST_NB( dstGrid ) = tempNB; - DST_ST( dstGrid ) = tempST; - DST_SB( dstGrid ) = tempSB; - DST_ET( dstGrid ) = tempET; - DST_EB( dstGrid ) = tempEB; - DST_WT( dstGrid ) = tempWT; - DST_WB( dstGrid ) = tempWB; -} - -#endif // LBM_KERNEL_CL diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl deleted file mode 100644 index 3f34ea5ef25943ac7eeb18eead429f70ec4cf807..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl +++ /dev/null @@ -1,176 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef LBM_KERNEL_CL -#define LBM_KERNEL_CL - -#include "layout_config.h" -#include "lbm_macros.h" -/******************************************************************************/ - -__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid ) -{ - srcGrid += MARGIN; - dstGrid += MARGIN; - - - //Using some predefined macros here. Consider this the declaration - // and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z - - SWEEP_VAR - SWEEP_X = get_local_id(0); - SWEEP_Y = get_group_id(0); - SWEEP_Z = get_group_id(1); - - float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB; - float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ; - float tempSB, tempET, tempEB, tempWT, tempWB ; - - //Load all of the input fields - //This is a gather operation of the SCATTER preprocessor variable - // is undefined in layout_config.h, or a "local" read otherwise - tempC = SRC_C(srcGrid); - - tempN = SRC_N(srcGrid); - tempS = SRC_S(srcGrid); - tempE = SRC_E(srcGrid); - tempW = SRC_W(srcGrid); - tempT = SRC_T(srcGrid); - tempB = SRC_B(srcGrid); - - tempNE = SRC_NE(srcGrid); - tempNW = SRC_NW(srcGrid); - tempSE = SRC_SE(srcGrid); - tempSW = SRC_SW(srcGrid); - tempNT = SRC_NT(srcGrid); - tempNB = SRC_NB(srcGrid); - tempST = SRC_ST(srcGrid); - tempSB = SRC_SB(srcGrid); - tempET = SRC_ET(srcGrid); - tempEB = SRC_EB(srcGrid); - tempWT = SRC_WT(srcGrid); - tempWB = SRC_WB(srcGrid); - - //Test whether the cell is fluid or obstacle - if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) { - - //Swizzle the inputs: reflect any fluid coming into this cell - // back to where it came from - temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ; - temp_swp = tempE ; tempE = tempW ; tempW = temp_swp; - temp_swp = tempT ; tempT = tempB ; tempB = temp_swp; - temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp; - temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp; - temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp; - temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp; - temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp; - temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp; - } - else { - - //The math meat of LBM: ignore for optimization - float ux, uy, uz, rho, u2; - float temp1, temp2, temp_base; - rho = tempC + tempN - + tempS + tempE - + tempW + tempT - + tempB + tempNE - + tempNW + tempSE - + tempSW + tempNT - + tempNB + tempST - + tempSB + tempET - + tempEB + tempWT - + tempWB; - - ux = + tempE - tempW - + tempNE - tempNW - + tempSE - tempSW - + tempET + tempEB - - tempWT - tempWB; - - uy = + tempN - tempS - + tempNE + tempNW - - tempSE - tempSW - + tempNT + tempNB - - tempST - tempSB; - - uz = + tempT - tempB - + tempNT - tempNB - + tempST - tempSB - + tempET - tempEB - + tempWT - tempWB; - - ux /= rho; - uy /= rho; - uz /= rho; - - if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) { - - ux = 0.005f; - uy = 0.002f; - uz = 0.000f; - } - - u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f; - temp_base = OMEGA*rho; - temp1 = DFL1*temp_base; - - //Put the output values for this cell in the shared memory - temp_base = OMEGA*rho; - temp1 = DFL1*temp_base; - temp2 = 1.0f-OMEGA; - tempC = temp2*tempC + temp1*( - u2); - temp1 = DFL2*temp_base; - tempN = temp2*tempN + temp1*( uy*(4.5f*uy + 3.0f) - u2); - tempS = temp2*tempS + temp1*( uy*(4.5f*uy - 3.0f) - u2); - tempT = temp2*tempT + temp1*( uz*(4.5f*uz + 3.0f) - u2); - tempB = temp2*tempB + temp1*( uz*(4.5f*uz - 3.0f) - u2); - tempE = temp2*tempE + temp1*( ux*(4.5f*ux + 3.0f) - u2); - tempW = temp2*tempW + temp1*( ux*(4.5f*ux - 3.0f) - u2); - temp1 = DFL3*temp_base; - tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2); - tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2); - tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2); - tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2); - tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2); - tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2); - tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2); - tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2); - tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2); - tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2); - tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2); - tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2); - } - - //Write the results computed above - //This is a scatter operation of the SCATTER preprocessor variable - // is defined in layout_config.h, or a "local" write otherwise - DST_C ( dstGrid ) = tempC; - - DST_N ( dstGrid ) = tempN; - DST_S ( dstGrid ) = tempS; - DST_E ( dstGrid ) = tempE; - DST_W ( dstGrid ) = tempW; - DST_T ( dstGrid ) = tempT; - DST_B ( dstGrid ) = tempB; - - DST_NE( dstGrid ) = tempNE; - DST_NW( dstGrid ) = tempNW; - DST_SE( dstGrid ) = tempSE; - DST_SW( dstGrid ) = tempSW; - DST_NT( dstGrid ) = tempNT; - DST_NB( dstGrid ) = tempNB; - DST_ST( dstGrid ) = tempST; - DST_SB( dstGrid ) = tempSB; - DST_ET( dstGrid ) = tempET; - DST_EB( dstGrid ) = tempEB; - DST_WT( dstGrid ) = tempWT; - DST_WB( dstGrid ) = tempWB; -} - -#endif // LBM_KERNEL_CL diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h deleted file mode 100644 index d44088661d313eeca6d44612549337b5a2630e04..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h +++ /dev/null @@ -1,86 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/*############################################################################*/ - -#ifndef _LAYOUT_CONFIG_H_ -#define _LAYOUT_CONFIG_H_ - -/*############################################################################*/ - -// Unchangeable settings: volume simulation size for the given example -#define SIZE_X (120) -#define SIZE_Y (120) -#define SIZE_Z (150) - -// Changeable settings -// Padding in each dimension -#define PADDING_X (8) -#define PADDING_Y (0) -#define PADDING_Z (4) - -// Pitch in each dimension -#define PADDED_X (SIZE_X + PADDING_X) -#define PADDED_Y (SIZE_Y + PADDING_Y) -#define PADDED_Z (SIZE_Z + PADDING_Z) - -#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) - -// Flattening function -// This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x, y, z, e) \ - (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) - -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) - -// Set this value to 1 for GATHER, or 0 for SCATTER -#if 1 -#define GATHER -#else -#define SCATTER -#endif - -// OpenCL block size (not trivially changeable here) -#define BLOCK_SIZE SIZE_X - -/*############################################################################*/ - -typedef enum { - C = 0, - N, - S, - E, - W, - T, - B, - NE, - NW, - SE, - SW, - NT, - NB, - ST, - SB, - ET, - EB, - WT, - WB, - FLAGS, - N_CELL_ENTRIES -} CELL_ENTRIES; - -#define N_DISTR_FUNCS FLAGS - -typedef enum { - OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2 -} CELL_FLAGS; - -#endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c deleted file mode 100644 index 14ffa4211b3763d7c1c6538e693a76be61a0b158..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c +++ /dev/null @@ -1,321 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/*############################################################################*/ - -// includes, system -#include <CL/cl.h> -#include <float.h> -#include <math.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -// includes, project -#include "layout_config.h" -#include "lbm.h" -#include "lbm_macros.h" -#include "ocl.h" - -/******************************************************************************/ - -void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, - cl_mem dstGrid) { - - cl_int clStatus; - - clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid); - CHECK_ERROR("clSetKernelArg") - - size_t dimBlock[3] = {SIZE_X, 1, 1}; - size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; - clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, - dimGrid, dimBlock, 0, NULL, NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") -} -/*############################################################################*/ - -void LBM_allocateGrid(float **ptr) { - const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - - *ptr = (float *)malloc(size); - if (!*ptr) { - printf("LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0 * 1024.0)); - exit(1); - } - - memset(*ptr, 0, size); - - printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); - - *ptr += MARGIN; -} - -/******************************************************************************/ - -void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { - const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - cl_int clStatus; - *ptr = - clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") -} - -/*############################################################################*/ - -void LBM_freeGrid(float **ptr) { - free(*ptr - MARGIN); - *ptr = NULL; -} - -/******************************************************************************/ - -void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } - -/*############################################################################*/ - -void LBM_initializeGrid(LBM_Grid grid) { - SWEEP_VAR - - SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) - SRC_C(grid) = DFL1; - SRC_N(grid) = DFL2; - SRC_S(grid) = DFL2; - SRC_E(grid) = DFL2; - SRC_W(grid) = DFL2; - SRC_T(grid) = DFL2; - SRC_B(grid) = DFL2; - SRC_NE(grid) = DFL3; - SRC_NW(grid) = DFL3; - SRC_SE(grid) = DFL3; - SRC_SW(grid) = DFL3; - SRC_NT(grid) = DFL3; - SRC_NB(grid) = DFL3; - SRC_ST(grid) = DFL3; - SRC_SB(grid) = DFL3; - SRC_ET(grid) = DFL3; - SRC_EB(grid) = DFL3; - SRC_WT(grid) = DFL3; - SRC_WB(grid) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP(grid); - SWEEP_END -} - -/******************************************************************************/ - -void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, - LBM_Grid h_grid) { - const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, - h_grid - MARGIN, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") -} - -void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, - LBM_Grid h_grid) { - const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, - h_grid - MARGIN, 0, NULL, NULL); - CHECK_ERROR("clEnqueueReadBuffer") -} - -/*############################################################################*/ - -void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; -} - -/*############################################################################*/ - -void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { - int x, y, z; - - FILE *file = fopen(filename, "rb"); - - for (z = 0; z < SIZE_Z; z++) { - for (y = 0; y < SIZE_Y; y++) { - for (x = 0; x < SIZE_X; x++) { - if (fgetc(file) != '.') - SET_FLAG(grid, x, y, z, OBSTACLE); - } - fgetc(file); - } - fgetc(file); - } - - fclose(file); -} - -/*############################################################################*/ - -void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { - int x, y, z; - - for (z = -2; z < SIZE_Z + 2; z++) { - for (y = 0; y < SIZE_Y; y++) { - for (x = 0; x < SIZE_X; x++) { - if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 || - z == SIZE_Z - 1) { - SET_FLAG(grid, x, y, z, OBSTACLE); - } else { - if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && - y < SIZE_Y - 2) { - SET_FLAG(grid, x, y, z, ACCEL); - } - } - } - } - } -} - -/*############################################################################*/ - -void LBM_showGridStatistics(LBM_Grid grid) { - int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) - rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + - LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + - LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + - LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + - LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); - - if (rho < minRho) - minRho = rho; - if (rho > maxRho) - maxRho = rho; - mass += rho; - - if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { - nObstacleCells++; - } else { - if (TEST_FLAG_SWEEP(grid, ACCEL)) - nAccelCells++; - else - nFluidCells++; - - ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + - LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - - LOCAL(grid, WT) - LOCAL(grid, WB); - uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - - LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - - LOCAL(grid, ST) - LOCAL(grid, SB); - uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + - LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + - LOCAL(grid, WT) - LOCAL(grid, WB); - u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); - if (u2 < minU2) - minU2 = u2; - if (u2 > maxU2) - maxU2 = u2; - } - SWEEP_END - - printf("LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, - sqrt(minU2), sqrt(maxU2)); -} - -/*############################################################################*/ - -static void storeValue(FILE *file, OUTPUT_PRECISION *v) { - const int litteBigEndianTest = 1; - if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ - const char *vPtr = (char *)v; - char buffer[sizeof(OUTPUT_PRECISION)]; - int i; - - for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) - buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; - - fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); - } else { /* little endian */ - fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); - } -} - -/*############################################################################*/ - -static void loadValue(FILE *file, OUTPUT_PRECISION *v) { - const int litteBigEndianTest = 1; - if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ - char *vPtr = (char *)v; - char buffer[sizeof(OUTPUT_PRECISION)]; - int i; - - fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); - - for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) - vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; - } else { /* little endian */ - fread(v, sizeof(OUTPUT_PRECISION), 1, file); - } -} - -/*############################################################################*/ - -void LBM_storeVelocityField(LBM_Grid grid, const char *filename, - const int binary) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE *file = fopen(filename, (binary ? "wb" : "w")); - - SWEEP_VAR - SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) - rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + - SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + - SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + - SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + - SRC_WB(grid); - ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - - SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); - uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - - SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); - uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - - SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); - ux /= rho; - uy /= rho; - uz /= rho; - - if (binary) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue(file, &ux); - storeValue(file, &uy); - storeValue(file, &uz); - } else - fprintf(file, "%e %e %e\n", ux, uy, uz); - - SWEEP_END; - - fclose(file); -} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h deleted file mode 100644 index 64a617feb862bdffdcb0c6aa57b0f1b09c26debb..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h +++ /dev/null @@ -1,39 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/*############################################################################*/ - -#ifndef _LBM_H_ -#define _LBM_H_ - -/*############################################################################*/ - -void LBM_allocateGrid(float **ptr); -void LBM_freeGrid(float **ptr); -void LBM_initializeGrid(LBM_Grid grid); -void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); -void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); -void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); -void LBM_showGridStatistics(LBM_Grid Grid); -void LBM_storeVelocityField(LBM_Grid grid, const char *filename, - const BOOL binary); - -/* OpenCL *********************************************************************/ - -void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); -void OpenCL_LBM_freeGrid(cl_mem ptr); -void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, - LBM_Grid h_grid); -void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, - LBM_Grid h_grid); -void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, - cl_mem dstGrid); - -/*############################################################################*/ - -#endif /* _LBM_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h deleted file mode 100644 index 99c50c048a14bb47bb3659b61f088db95706bb0c..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h +++ /dev/null @@ -1,198 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef _LBM_MARCOS_H -#define _LBM_MACROS_H_ - -#define OMEGA (1.95f) - -#define OUTPUT_PRECISION float - -#define BOOL int -#define TRUE (-1) -#define FALSE (0) - -#define DFL1 (1.0f / 3.0f) -#define DFL2 (1.0f / 18.0f) -#define DFL3 (1.0f / 36.0f) - -/*############################################################################*/ - -typedef float - *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid *LBM_GridPtr; - -/*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ -#define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; - -#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ - for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ - for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ - for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { - -#define SWEEP_END \ - } \ - } \ - } - -#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ - ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) - -#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) -#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) -#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) -#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) -#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) -#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) -#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) -#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) -#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) -#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) -#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) -#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) -#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) -#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) -#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) -#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) -#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) -#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) -#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) -#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) - -#ifdef SCATTER - -#define SRC_C(g) (LOCAL(g, C)) -#define SRC_N(g) (LOCAL(g, N)) -#define SRC_S(g) (LOCAL(g, S)) -#define SRC_E(g) (LOCAL(g, E)) -#define SRC_W(g) (LOCAL(g, W)) -#define SRC_T(g) (LOCAL(g, T)) -#define SRC_B(g) (LOCAL(g, B)) -#define SRC_NE(g) (LOCAL(g, NE)) -#define SRC_NW(g) (LOCAL(g, NW)) -#define SRC_SE(g) (LOCAL(g, SE)) -#define SRC_SW(g) (LOCAL(g, SW)) -#define SRC_NT(g) (LOCAL(g, NT)) -#define SRC_NB(g) (LOCAL(g, NB)) -#define SRC_ST(g) (LOCAL(g, ST)) -#define SRC_SB(g) (LOCAL(g, SB)) -#define SRC_ET(g) (LOCAL(g, ET)) -#define SRC_EB(g) (LOCAL(g, EB)) -#define SRC_WT(g) (LOCAL(g, WT)) -#define SRC_WB(g) (LOCAL(g, WB)) - -#define DST_C(g) (NEIGHBOR_C(g, C)) -#define DST_N(g) (NEIGHBOR_N(g, N)) -#define DST_S(g) (NEIGHBOR_S(g, S)) -#define DST_E(g) (NEIGHBOR_E(g, E)) -#define DST_W(g) (NEIGHBOR_W(g, W)) -#define DST_T(g) (NEIGHBOR_T(g, T)) -#define DST_B(g) (NEIGHBOR_B(g, B)) -#define DST_NE(g) (NEIGHBOR_NE(g, NE)) -#define DST_NW(g) (NEIGHBOR_NW(g, NW)) -#define DST_SE(g) (NEIGHBOR_SE(g, SE)) -#define DST_SW(g) (NEIGHBOR_SW(g, SW)) -#define DST_NT(g) (NEIGHBOR_NT(g, NT)) -#define DST_NB(g) (NEIGHBOR_NB(g, NB)) -#define DST_ST(g) (NEIGHBOR_ST(g, ST)) -#define DST_SB(g) (NEIGHBOR_SB(g, SB)) -#define DST_ET(g) (NEIGHBOR_ET(g, ET)) -#define DST_EB(g) (NEIGHBOR_EB(g, EB)) -#define DST_WT(g) (NEIGHBOR_WT(g, WT)) -#define DST_WB(g) (NEIGHBOR_WB(g, WB)) - -#else /* GATHER */ - -#define SRC_C(g) (NEIGHBOR_C(g, C)) -#define SRC_N(g) (NEIGHBOR_S(g, N)) -#define SRC_S(g) (NEIGHBOR_N(g, S)) -#define SRC_E(g) (NEIGHBOR_W(g, E)) -#define SRC_W(g) (NEIGHBOR_E(g, W)) -#define SRC_T(g) (NEIGHBOR_B(g, T)) -#define SRC_B(g) (NEIGHBOR_T(g, B)) -#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) -#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) -#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) -#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) -#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) -#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) -#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) -#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) -#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) -#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) -#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) -#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) - -#define DST_C(g) (LOCAL(g, C)) -#define DST_N(g) (LOCAL(g, N)) -#define DST_S(g) (LOCAL(g, S)) -#define DST_E(g) (LOCAL(g, E)) -#define DST_W(g) (LOCAL(g, W)) -#define DST_T(g) (LOCAL(g, T)) -#define DST_B(g) (LOCAL(g, B)) -#define DST_NE(g) (LOCAL(g, NE)) -#define DST_NW(g) (LOCAL(g, NW)) -#define DST_SE(g) (LOCAL(g, SE)) -#define DST_SW(g) (LOCAL(g, SW)) -#define DST_NT(g) (LOCAL(g, NT)) -#define DST_NB(g) (LOCAL(g, NB)) -#define DST_ST(g) (LOCAL(g, ST)) -#define DST_SB(g) (LOCAL(g, SB)) -#define DST_ET(g) (LOCAL(g, ET)) -#define DST_EB(g) (LOCAL(g, EB)) -#define DST_WT(g) (LOCAL(g, WT)) -#define DST_WB(g) (LOCAL(g, WB)) - -#endif /* GATHER */ - -#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) -#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g, f) \ - { \ - FLAG_VAR(LOCAL(g, FLAGS)); \ - (*_aux_) |= (f); \ - } -#define CLEAR_FLAG_SWEEP(g, f) \ - { \ - FLAG_VAR(LOCAL(g, FLAGS)); \ - (*_aux_) &= ~(f); \ - } -#define CLEAR_ALL_FLAGS_SWEEP(g) \ - { \ - FLAG_VAR(LOCAL(g, FLAGS)); \ - (*_aux_) = 0; \ - } - -#define TEST_FLAG(g, x, y, z, f) \ - ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g, x, y, z, f) \ - { \ - FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ - (*_aux_) |= (f); \ - } -#define CLEAR_FLAG(g, x, y, z, f) \ - { \ - FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ - (*_aux_) &= ~(f); \ - } -#define CLEAR_ALL_FLAGS(g, x, y, z) \ - { \ - FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ - (*_aux_) = 0; \ - } - -/*############################################################################*/ - -#endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c deleted file mode 100644 index 18320b7394e5d499339ee820a992b00acd9b368e..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c +++ /dev/null @@ -1,288 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/*############################################################################*/ - -#include <CL/cl.h> -#include <parboil.h> -#include <stdio.h> -#include <stdlib.h> -#include <sys/stat.h> - -#include "layout_config.h" -#include "lbm.h" -#include "lbm_macros.h" -#include "main.h" -#include "ocl.h" - -/*############################################################################*/ - -static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; - -/*############################################################################*/ - -struct pb_TimerSet timers; -int main(int nArgs, char *arg[]) { - MAIN_Param param; - int t; - - OpenCL_Param prm; - - struct pb_Parameters *params; - params = pb_ReadParameters(&nArgs, arg); - - // Setup TEMP datastructures - MAIN_parseCommandLine(nArgs, arg, ¶m, params); - MAIN_printInfo(¶m); - - /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined - - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - // Setup TEMP datastructures - LBM_allocateGrid((float **)&TEMP_srcGrid); - LBM_allocateGrid((float **)&TEMP_dstGrid); - LBM_initializeGrid(TEMP_srcGrid); - LBM_initializeGrid(TEMP_dstGrid); - - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - if (param.obstacleFilename != NULL) { - LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename); - LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename); - } - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); - LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_showGridStatistics(TEMP_srcGrid); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - OpenCL_initialize(&prm); - - // Setup DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid); - OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid); - - // Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - - for (t = 1; t <= param.nTimeSteps; t++) { - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); - - /*if( (t & 63) == 0 ) {*/ - /*printf( "timestep: %i\n", t );*/ -#if 0 - CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); - LBM_showGridStatistics( *TEMP_srcGrid ); -#endif - /*}*/ - } - - /*MAIN_finalize( ¶m, &prm );*/ // inlined - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - OpenCL_LBM_freeGrid(OpenCL_srcGrid); - OpenCL_LBM_freeGrid(OpenCL_dstGrid); - - clReleaseProgram(prm.clProgram); - clReleaseKernel(prm.clKernel); - clReleaseCommandQueue(prm.clCommandQueue); - clReleaseContext(prm.clContext); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE); - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_freeGrid((float **)&TEMP_srcGrid); - LBM_freeGrid((float **)&TEMP_dstGrid); - - pb_FreeParameters(params); - return 0; -} - -/*############################################################################*/ - -void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, - struct pb_Parameters *params) { - struct stat fileStat; - - if (nArgs < 2) { - printf("syntax: lbm <time steps>\n"); - exit(1); - } - - param->nTimeSteps = atoi(arg[1]); - - if (params->inpFiles[0] != NULL) { - param->obstacleFilename = params->inpFiles[0]; - - if (stat(param->obstacleFilename, &fileStat) != 0) { - printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename); - exit(1); - } - if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { - printf("MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int)fileStat.st_size, - SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); - exit(1); - } - } else - param->obstacleFilename = NULL; - - param->resultFilename = params->outFile; -} - -/*############################################################################*/ - -void MAIN_printInfo(const MAIN_Param *param) { - printf("MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, - param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" - : param->obstacleFilename); -} - -/*############################################################################*/ - -void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - // Setup TEMP datastructures - LBM_allocateGrid((float **)&TEMP_srcGrid); - LBM_allocateGrid((float **)&TEMP_dstGrid); - LBM_initializeGrid(TEMP_srcGrid); - LBM_initializeGrid(TEMP_dstGrid); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if (param->obstacleFilename != NULL) { - LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); - LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); - LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - - // Setup DEVICE datastructures - OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); - OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); - - // Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics(TEMP_srcGrid); - - LBM_freeGrid((float **)&TEMP_srcGrid); - LBM_freeGrid((float **)&TEMP_dstGrid); -} - -/*############################################################################*/ - -void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; - - // Setup TEMP datastructures - LBM_allocateGrid((float **)&TEMP_srcGrid); - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics(TEMP_srcGrid); - - LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - - LBM_freeGrid((float **)&TEMP_srcGrid); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - OpenCL_LBM_freeGrid(OpenCL_srcGrid); - OpenCL_LBM_freeGrid(OpenCL_dstGrid); - - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); -} - -void OpenCL_initialize(OpenCL_Param *prm) { - cl_int clStatus; - - clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL); - CHECK_ERROR("clGetPlatformIDs") - - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; - - clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1, - &(prm->clDevice), NULL); - CHECK_ERROR("clGetDeviceIDs") - - prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL, - NULL, &clStatus); - CHECK_ERROR("clCreateContextFromType") - - prm->clCommandQueue = clCreateCommandQueue( - prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); - - const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - prm->clProgram = - clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[100]; - sprintf(clOptions, "-I src/opencl_nvidia"); - - clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions, - NULL, NULL); - CHECK_ERROR("clBuildProgram") - - prm->clKernel = - clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus); - CHECK_ERROR("clCreateKernel") - - free((void *)clSource[0]); - - /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", - * "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, - * &prm->clProgram, &prm->clKernel);*/ -} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h deleted file mode 100644 index 9d8e145c93b37488a3826e77b964c56699377d2a..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h +++ /dev/null @@ -1,32 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef _MAIN_H_ -#define _MAIN_H_ - -/*############################################################################*/ - -typedef struct { - int nTimeSteps; - char *resultFilename; - char *obstacleFilename; -} MAIN_Param; - -/*############################################################################*/ - -void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, - struct pb_Parameters *); -void MAIN_printInfo(const MAIN_Param *param); -void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); -void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); - -void OpenCL_initialize(OpenCL_Param *prm); - -/*############################################################################*/ - -#endif /* _MAIN_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c deleted file mode 100644 index 4f232db0d9776f4f2d0eb4b2444036f35ff27257..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c +++ /dev/null @@ -1,36 +0,0 @@ -#include "ocl.h" -#include <CL/cl.h> -#include <stdio.h> -#include <stdlib.h> - -char *readFile(char *fileName) { - FILE *fp; - fp = fopen(fileName, "r"); - - if (fp == NULL) { - printf("Error 1!\n"); - return NULL; - } - - fseek(fp, 0, SEEK_END); - long size = ftell(fp); - rewind(fp); - - char *buffer = malloc(sizeof(char) * (size + 1)); - if (buffer == NULL) { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } - - size_t res = fread(buffer, 1, size, fp); - if (res != size) { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } - - buffer[size] = 0; - fclose(fp); - return buffer; -} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h deleted file mode 100644 index 5d5d984ba698d6ac71af3e51de3e6724a79135aa..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __OCLH__ -#define __OCLH__ - -typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; -} OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if (clStatus != CL_SUCCESS) { \ - printf("Error: %s!\n", errorMessage); \ - printf("Line: %d\n", __LINE__); \ - exit(1); \ - } - -char *readFile(char *); - -#endif diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/Makefile deleted file mode 100644 index 36b421ec6f1359114ea0035d21048ab0b95bf30e..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# (c) 2010 The Board of Trustees of the University of Illinois. - -LANGUAGE=opencl -SRCDIR_OBJS=main.o io.o #compute_gold.o -APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS=-ffast-math -O3 -APP_CXXFLAGS=-ffast-math -O3 -KERNEL_OBJS=kernel_offline.nvptx.s diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc deleted file mode 100644 index 04744f404ebaf6e669c2bbe91600519742b57dc9..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc +++ /dev/null @@ -1,84 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/* I/O routines for reading and writing matrices in column-major - * layout - */ - -#include <fstream> -#include <iostream> -#include <vector> - -char *readFile(const char *fileName) { - std::fstream f(fileName, std::fstream::in); - if (!f.good()) { - std::cerr << "Error Reading File!!" << std::endl; - return NULL; - } - - f.seekg(0, std::ios::end); - int length = f.tellg(); - f.seekg(0, std::ios::beg); - - char *buffer; - - if (length > 0) { - buffer = new char[length]; - f.read(buffer, length); - buffer[length - 1] = 0; - } else { - buffer = new char; - buffer[0] = 0; - } - - f.close(); - - return buffer; -} - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, - std::vector<float> &v) { - std::cerr << "Opening file:" << fn << std::endl; - std::fstream f(fn, std::fstream::in); - if (!f.good()) { - return false; - } - - // Read # of rows and cols - f >> nr_row; - f >> nr_col; - - float data; - std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; - while (f.good()) { - f >> data; - v.push_back(data); - } - v.pop_back(); // remove the duplicated last element - return true; -} - -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, - std::vector<float> &v) { - std::cerr << "Opening file:" << fn << " for write." << std::endl; - std::fstream f(fn, std::fstream::out); - if (!f.good()) { - return false; - } - - // Read # of rows and cols - f << nr_row << " " << nr_col << " "; - - float data; - std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; - for (int i = 0; i < v.size(); ++i) { - f << v[i] << ' '; - } - f << "\n"; - return true; -} diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl deleted file mode 100644 index 5ee2fd5df0a57bc59c1f714bc3efb6b3670b0386..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl +++ /dev/null @@ -1,25 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/* - * Kernel of dense matrix-matrix multiplication kernel. - */ - -__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta ) -{ - float c = 0.0f; - int m = get_global_id(0); - int n = get_global_id(1); - - /*for (int i = 0; i < k; ++i) {*/ - /*float a = A[m + i * lda]; */ - /*float b = B[n + i * ldb];*/ - /*c += a * b;*/ - /*}*/ - C[m+n*ldc] = beta + alpha * c; -} diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl deleted file mode 100644 index f376a27d90003e3c7c18dafb9f64a8b459a40029..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl +++ /dev/null @@ -1,25 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/* - * Kernel of dense matrix-matrix multiplication kernel. - */ - -__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta ) -{ - float c = 0.0f; - int m = get_global_id(0); - int n = get_global_id(1); - - for (int i = 0; i < k; ++i) { - float a = A[m + i * lda]; - float b = B[n + i * ldb]; - c += a * b; - } - C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c; -} diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc deleted file mode 100644 index 5489f6a55ce6e8ba3676b0c98ad4b37ac7f4a7fd..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc +++ /dev/null @@ -1,242 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/* - * Main entry of dense matrix-matrix multiplication kernel - */ - -#include <CL/cl.h> -#include <iostream> -#include <malloc.h> -#include <math.h> -#include <parboil.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include <vector> - -// I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, - std::vector<float> &v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, - std::vector<float> &); -extern char *readFile(const char *); - -// Parameters of tile sizes -#define TILE_SZ 16 - -#define CHECK_ERROR(errorMessage) \ - if (clStatus != CL_SUCCESS) { \ - std::cout << errorMessage << ": " << clStatus << " Error!\n"; \ - std::cout << "Line: " << __LINE__ << "\n"; \ - exit(1); \ - } - -void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, - cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, - int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) { - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; - return; - } - - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } - - // In this code we assume the matrix sizes are multiple of tile size - if ((m % TILE_SZ) || (n % TILE_SZ)) { - std::cerr << "unsupported size of matrix. m should be multiple of " - << TILE_SZ << "; n should be multiple of " << TILE_SZ - << std::endl; - } - - size_t db[2] = {TILE_SZ, TILE_SZ}; - size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - - cl_int clStatus; - - clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); - clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); - clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); - clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); - clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); - clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); - clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); - clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); - clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); - CHECK_ERROR("clSetKernelArg") - - clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, - 0, NULL, NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") -} - -int main(int argc, char *argv[]) { - - struct pb_Parameters *params; - struct pb_TimerSet timers; - - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || - (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } - - /* Read in data */ - // load A - readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - // load B^T - readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - - pb_InitializeTimerSet(&timers); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1, &clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, - (cl_context_properties)clPlatform, 0}; - cl_context clContext = - clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_command_queue clCommandQueue = clCreateCommandQueue( - clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char *clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")}; - // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - // cl_program clProgram = - // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - cl_kernel clKernel; - cl_program clProgram; - pb_CreateAndBuildKernelFromBinary( - "build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", - &clContext, &clDevice, &clProgram, &clKernel); - // cl_program clProgram = - // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - // CHECK_ERROR("clCreateProgramWithSource") - - // char clOptions[50]; - // sprintf(clOptions,""); - - // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - // CHECK_ERROR("clBuildProgram") - - // size_t binarySizes = 0; - // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, - // sizeof(size_t), &binarySizes, NULL); CHECK_ERROR("clGetProgramInfo") - - // std::cout << "Binary Size = " << binarySizes << "\n"; - - // unsigned char* binaries = (unsigned char*) malloc(binarySizes); - // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySizes, - // &binaries, NULL); CHECK_ERROR("clGetProgramInfo") - - // std::cout << "Binary = \n" << binaries << "\n"; - - // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - // CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - // copy A to device memory - A_sz = matArow * matAcol * sizeof(float); - B_sz = matBrow * matBcol * sizeof(float); - - // allocate space for C - C_sz = matArow * matBcol * sizeof(float); - - // OpenCL memory allocation - std::vector<float> matC(matArow * matBcol); - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - cl_mem dA = - clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - cl_mem dB = - clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - cl_mem dC = - clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - - // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, - &matA.front(), 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, - &matBT.front(), 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - for (int i = 0; i < matC.size(); i++) - matC[i] = 0.0f; - - clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, - &matC.front(), 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, - matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, - NULL, NULL); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseMemObject(dA); - clStatus = clReleaseMemObject(dB); - clStatus = clReleaseMemObject(dC); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - - if (params->outFile) { - - /* Write C to file */ - // pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); - } - - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9 - << std::endl; - pb_FreeParameters(params); - - // free((void*)clSource[0]); - - return 0; -} diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/Makefile deleted file mode 100644 index 5774b2827bd0588b16cfc32b2ac787346c5b7b1d..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# (c) 2010 The Board of Trustees of the University of Illinois. - -LANGUAGE=opencl -TOOLS_SRC=common_src/convert-dataset -SRCDIR_OBJS=main.o gpu_info.o file.o ocl.o -APP_CUDALDFLAGS=-lm -APP_CFLAGS=-ffast-math -O3 -I$(TOOLS_SRC) -APP_CXXFLAGS=-ffast-math -O3 -KERNEL_OBJS=kernel_offline.nvptx.s - -include $(TOOLS_SRC)/commontools.mk diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c deleted file mode 100644 index 22397498f7f43b3f60926bf51c2ddbff91529787..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c +++ /dev/null @@ -1,72 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2007 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#include <endian.h> -#include <inttypes.h> -#include <malloc.h> -#include <stdio.h> -#include <stdlib.h> - -#if __BYTE_ORDER != __LITTLE_ENDIAN -#error "File I/O is not implemented for this system: wrong endianness." -#endif - -void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, - int *pad, float **h_data, int **h_indices, int **h_ptr, - int **h_perm, int **h_nzcnt) { - FILE *fid = fopen(fName, "rb"); - - if (fid == NULL) { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); - int _len = len[0]; - int _depth = depth[0]; - int _dim = dim[0]; - int _pad = pad[0]; - int _nzcnt_len = nzcnt_len[0]; - - *h_data = (float *)malloc(_len * sizeof(float)); - fread(*h_data, sizeof(float), _len, fid); - - *h_indices = (int *)malloc(_len * sizeof(int)); - fread(*h_indices, sizeof(int), _len, fid); - - *h_ptr = (int *)malloc(_depth * sizeof(int)); - fread(*h_ptr, sizeof(int), _depth, fid); - - *h_perm = (int *)malloc(_dim * sizeof(int)); - fread(*h_perm, sizeof(int), _dim, fid); - - *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); - fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); - - fclose(fid); -} - -void input_vec(char *fName, float *h_vec, int dim) { - FILE *fid = fopen(fName, "rb"); - fread(h_vec, sizeof(float), dim, fid); - fclose(fid); -} - -void outputData(char *fName, float *h_Ax_vector, int dim) { - FILE *fid = fopen(fName, "w"); - uint32_t tmp32; - if (fid == NULL) { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = dim; - fwrite(&tmp32, sizeof(uint32_t), 1, fid); - fwrite(h_Ax_vector, sizeof(float), dim, fid); - - fclose(fid); -} diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h deleted file mode 100644 index 5e38a6875e9e5f8be4d01b68569d80adf8c49548..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h +++ /dev/null @@ -1,18 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2007 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ -#ifndef __FILEH__ -#define __FILEH__ - -void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, - int *pad, float **h_data, int **h_indices, int **h_ptr, - int **h_perm, int **h_nzcnt); - -void input_vec(char *fNanme, float *h_vec, int dim); -void outputData(char *fName, float *h_Ax_vector, int dim); - -#endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c deleted file mode 100644 index 90beedd747480ede3fd1e5da4017ed0051e043be..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c +++ /dev/null @@ -1,44 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ -#include <endian.h> -#include <inttypes.h> -#include <malloc.h> -#include <stdio.h> -#include <stdlib.h> - -#include "gpu_info.h" - -void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, - int major, int minor, int sm) { - int max_thread; - int max_block = 8; - if (major == 1) { - if (minor >= 2) - max_thread = 1024; - else - max_thread = 768; - } else if (major == 2) - max_thread = 1536; - else - // newer GPU //keep using 2.0 - max_thread = 1536; - - int _grid; - int _thread; - - if (task * pad > sm * max_thread) { - _thread = max_thread / max_block; - _grid = ((task * pad + _thread - 1) / _thread) * _thread; - } else { - _thread = pad; - _grid = task * pad; - } - - thread[0] = _thread; - grid[0] = _grid; -} diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h deleted file mode 100644 index ab1af7d0b8ba92f87c643582171e48cee0a9b95e..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h +++ /dev/null @@ -1,15 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef __GPUINFOH__ -#define __GPUINFOH__ - -void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, - int major, int minor, int sm); - -#endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel.cl b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel.cl deleted file mode 100644 index 8c19a22511cd4c65364cb55c5df1a22d12b182b7..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel.cl +++ /dev/null @@ -1,73 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ -#define WARP_BITS 5 - -__kernel void spmv_jds(__global float *dst_vector, __global float *d_data, - __global int *d_index, __global int *d_perm, - __global float *x_vec, const int dim, - __constant int *jds_ptr_int, - __constant int *sh_zcnt_int) -{ - int ix = get_global_id(0); - int warp_id=ix>>WARP_BITS; - - if(ix<dim) - { - float sum=0.0f; - int bound=sh_zcnt_int[warp_id]; - //prefetch 0 - int j=jds_ptr_int[0]+ix; - float d = d_data[j]; - int i = d_index[j]; - float t = x_vec[i]; - - if (bound>1) //bound >=2 - { - //prefetch 1 - j=jds_ptr_int[1]+ix; - i = d_index[j]; - int in; - float dn; - float tn; - for(int k=2;k<bound;k++ ) - { - //prefetch k-1 - dn = d_data[j]; - //prefetch k - j=jds_ptr_int[k]+ix; - in = d_index[j]; - //prefetch k-1 - tn = x_vec[i]; - - //compute k-2 - sum += d*t; - //sweep to k - i = in; - //sweep to k-1 - d = dn; - t =tn; - } - - //fetch last - dn = d_data[j]; - tn = x_vec[i]; - - //compute last-1 - sum += d*t; - //sweep to last - d=dn; - t=tn; - } - //compute last - sum += d*t; // 3 3 - - //write out data - dst_vector[d_perm[ix]]=sum; - } -} - diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl deleted file mode 100644 index 9a17a299afdd032fb9c07d8ff5b559055704a573..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl +++ /dev/null @@ -1,73 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ -#define WARP_BITS 5 - -__kernel void spmv_jds(__global float *dst_vector, __global float *d_data, - __global int *d_index, __global int *d_perm, - __global float *x_vec, const int dim, - __constant int *jds_ptr_int, - __constant int *sh_zcnt_int) -{ - int ix = get_global_id(0); - int warp_id=ix>>WARP_BITS; - - if(ix<dim) - { - float sum=0.0f; - int bound=sh_zcnt_int[warp_id]; - //prefetch 0 - int j=jds_ptr_int[0]+ix; - float d = d_data[j]; - int i = d_index[j]; - float t = x_vec[i]; - - if (bound>1) //bound >=2 - { - //prefetch 1 - j=jds_ptr_int[1]+ix; - i = d_index[j]; - int in; - float dn; - float tn; - for(int k=2;k<bound;k++ ) - { - //prefetch k-1 - dn = d_data[j]; - //prefetch k - j=jds_ptr_int[k]+ix; - in = d_index[j]; - //prefetch k-1 - tn = x_vec[i]; - - //compute k-2 - sum += d*t; - //sweep to k - i = in; - //sweep to k-1 - d = dn; - t =tn; - } - - //fetch last - dn = d_data[j]; - tn = x_vec[i]; - - //compute last-1 - sum += d*t; - //sweep to last - d=dn; - t=tn; - } - //compute last - sum += d*t; // 3 3 - - //write out data - dst_vector[d_perm[ix]]=sum; - } -} - diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c deleted file mode 100644 index 343814149aa74139930380c2178e2f447c64e806..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c +++ /dev/null @@ -1,283 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#include <CL/cl.h> -#include <CL/cl_ext.h> -#include <parboil.h> -#include <stdio.h> -#include <stdlib.h> - -#include "convert_dataset.h" -#include "file.h" -#include "gpu_info.h" -#include "ocl.h" - -static int generate_vector(float *x_vector, int dim) { - srand(54321); - int i; - for (i = 0; i < dim; i++) { - x_vector[i] = (rand() / (float)RAND_MAX); - } - return 0; -} - -int main(int argc, char **argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " - "Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { - fprintf(stderr, "Expecting one two filenames\n"); - exit(-1); - } - - // load matrix from files - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - // parameters declaration - int len; - int depth; - int dim; - int pad = 32; - int nzcnt_len; - - // host memory allocation - // matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - // vector - float *h_Ax_vector; - float *h_x_vector; - - coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, - &len, &nzcnt_len, &depth); - - h_Ax_vector = (float *)malloc(sizeof(float) * dim); - h_x_vector = (float *)malloc(sizeof(float) * dim); - input_vec(parameters->inpFiles[1], h_x_vector, dim); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1, &clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, - (cl_context_properties)clPlatform, 0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = - clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue( - clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = - clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions, ""); - clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds", &clStatus); - CHECK_ERROR("clCreateKernel") - /*cl_kernel clKernel;*/ - /*cl_program clProgram;*/ - /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", - * "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);*/ - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - // device memory allocation - // matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_perm; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), &(clDeviceProp.major), NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), &(clDeviceProp.minor), NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = - clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), - &(clDeviceProp.multiProcessorCount), NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - // memory allocation - d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), - NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), - NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, - &clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), - NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, - dim * sizeof(float), NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), - NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, - nzcnt_len * sizeof(int), NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); - - // memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, - len * sizeof(float), h_data, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, - len * sizeof(int), h_indices, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, - dim * sizeof(int), h_perm, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, - dim * sizeof(int), h_x_vector, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, - depth * sizeof(int), h_ptr, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = - clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, - nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, - clDeviceProp.minor, clDeviceProp.multiProcessorCount); - - clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - // main execution - - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - int i; - for (i = 0; i < 50; i++) { - clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, - &block, 0, NULL, NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - // HtoD memory copy - clStatus = - clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, - dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile, h_Ax_vector, dim); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - // free((void*)clSource[0]); - - free(h_data); - free(h_indices); - free(h_ptr); - free(h_perm); - free(h_nzcnt); - free(h_Ax_vector); - free(h_x_vector); - - pb_FreeParameters(parameters); - - return 0; -} diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c deleted file mode 100644 index 2990031255acae7fe480b0fe7cdc79db7cb08287..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c +++ /dev/null @@ -1,46 +0,0 @@ -#include "ocl.h" -#include <CL/cl.h> -#include <stdio.h> -#include <string.h> - -char *readFile(const char *fileName) { - FILE *fp; - fp = fopen(fileName, "r"); - if (fp == NULL) { - printf("Error 1!\n"); - exit(1); - } - - fseek(fp, 0, SEEK_END); - long size = ftell(fp); - rewind(fp); - - char *buffer = (char *)malloc(sizeof(char) * (size + 1)); - if (buffer == NULL) { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } - - size_t res = fread(buffer, 1, size, fp); - if (res != size) { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } - - buffer[size] = 0; - fclose(fp); - return buffer; -} - -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, - size_t size) { - cl_int clStatus; - char *temp = (char *)malloc(size); - memset(temp, val, size); - clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, - 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); -} diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h deleted file mode 100644 index 42ff7b4d1059550293b56325d0cce2afea6c004b..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __OCLH__ -#define __OCLH__ - -typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; -} OpenCLDeviceProp; - -void clMemSet(cl_command_queue, cl_mem, int, size_t); -char *readFile(const char *); - -#define CHECK_ERROR(errorMessage) \ - if (clStatus != CL_SUCCESS) { \ - printf("Error: %s!\n", errorMessage); \ - printf("Line: %d\n", __LINE__); \ - exit(1); \ - } - -#endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/Makefile b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/Makefile deleted file mode 100644 index c4ef6c0dac3a1646d63aca309b2d44138abb55d3..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# (c) 2010 The Board of Trustees of the University of Illinois. - -LANGUAGE=opencl -SRCDIR_OBJS=main.o file.o -APP_CUDALDFLAGS=-lm -APP_CFLAGS=-ffast-math -O3 -APP_CXXFLAGS=-ffast-math -O3 -KERNEL_OBJS=kernel_offline.nvptx.s diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h deleted file mode 100644 index 1a682890b3619ef712c5e5e3a7313e325935ec6f..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h +++ /dev/null @@ -1,12 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef _COMMON_H_ -#define _COMMON_H_ -#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) -#endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc deleted file mode 100644 index 95cd65c4a0e013c60c6edd43077346a7efdad1ae..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc +++ /dev/null @@ -1,76 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#include <endian.h> -#include <inttypes.h> -#include <malloc.h> -#include <stdio.h> -#include <stdlib.h> - -#if __BYTE_ORDER != __LITTLE_ENDIAN -#error "File I/O is not implemented for this system: wrong endianness." -#endif - -extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { - FILE *fid = fopen(fName, "r"); - - if (fid == NULL) { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int), 1, fid); - fread(ny, sizeof(int), 1, fid); - fread(nz, sizeof(int), 1, fid); - fclose(fid); -} - -extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { - FILE *fid = fopen(fName, "w"); - uint32_t tmp32; - if (fid == NULL) { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx * ny * nz; - fwrite(&tmp32, sizeof(uint32_t), 1, fid); - fwrite(h_A0, sizeof(float), tmp32, fid); - - fclose(fid); -} - -extern "C" char *readFile(const char *fileName) { - FILE *fp; - fp = fopen(fileName, "r"); - if (fp == NULL) { - printf("Error 1!\n"); - exit(1); - } - - fseek(fp, 0, SEEK_END); - long size = ftell(fp); - rewind(fp); - - char *buffer = (char *)malloc(sizeof(char) * (size + 1)); - if (buffer == NULL) { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } - - size_t res = fread(buffer, 1, size, fp); - if (res != size) { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } - - buffer[size] = 0; - fclose(fp); - return buffer; -} diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h deleted file mode 100644 index b45c42371bbde3c3a39d88277adf39a8f537baab..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h +++ /dev/null @@ -1,23 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ -#ifndef __FILEH__ -#define __FILEH__ - -#ifdef __cplusplus -extern "C" { -#endif - -void inputData(char *fName, int *nx, int *ny, int *nz); -void outputData(char *fName, float *h_A0, int nx, int ny, int nz); -char *readFile(const char *fileName); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel.cl b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel.cl deleted file mode 100644 index 4c5d1263db5948e4e61ea2baa27261613cd0ea06..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel.cl +++ /dev/null @@ -1,28 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#include "common.h" - -__kernel void naive_kernel(float c0,float c1,__global float* A0,__global float *Anext,int nx,int ny,int nz) -{ - int i = get_global_id(0)+1; - int j = get_global_id(1)+1; - int k = get_global_id(2)+1; - -if(i<nx-1) -{ - Anext[Index3D (nx, ny, i, j, k)] = c1 * - ( A0[Index3D (nx, ny, i, j, k + 1)] + - A0[Index3D (nx, ny, i, j, k - 1)] + - A0[Index3D (nx, ny, i, j + 1, k)] + - A0[Index3D (nx, ny, i, j - 1, k)] + - A0[Index3D (nx, ny, i + 1, j, k)] + - A0[Index3D (nx, ny, i - 1, j, k)] ) - - A0[Index3D (nx, ny, i, j, k)] * c0; -} -} diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel_offline.cl b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel_offline.cl deleted file mode 100644 index 4c5d1263db5948e4e61ea2baa27261613cd0ea06..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel_offline.cl +++ /dev/null @@ -1,28 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#include "common.h" - -__kernel void naive_kernel(float c0,float c1,__global float* A0,__global float *Anext,int nx,int ny,int nz) -{ - int i = get_global_id(0)+1; - int j = get_global_id(1)+1; - int k = get_global_id(2)+1; - -if(i<nx-1) -{ - Anext[Index3D (nx, ny, i, j, k)] = c1 * - ( A0[Index3D (nx, ny, i, j, k + 1)] + - A0[Index3D (nx, ny, i, j, k - 1)] + - A0[Index3D (nx, ny, i, j + 1, k)] + - A0[Index3D (nx, ny, i, j - 1, k)] + - A0[Index3D (nx, ny, i + 1, j, k)] + - A0[Index3D (nx, ny, i - 1, j, k)] ) - - A0[Index3D (nx, ny, i, j, k)] * c0; -} -} diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c deleted file mode 100644 index ec47c22227648df094cbf03ea1b667943207207e..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c +++ /dev/null @@ -1,231 +0,0 @@ - -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#include <CL/cl.h> -#include <parboil.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "common.h" -#include "file.h" - -#define CHECK_ERROR(errorMessage) \ - if (clStatus != CL_SUCCESS) { \ - printf("Error: %s!\n", errorMessage); \ - printf("Line: %d\n", __LINE__); \ - exit(1); \ - } - -static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { - int s = 0; - int i, j, k; - for (i = 0; i < nz; i++) { - for (j = 0; j < ny; j++) { - for (k = 0; k < nx; k++) { - fread(A0 + s, sizeof(float), 1, fp); - s++; - } - } - } - return 0; -} - -int main(int argc, char **argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - // declaration - unsigned nx, ny, nz; - unsigned size; - int iteration; - float c0 = 1.0f / 6.0f; - float c1 = 1.0f / 6.0f / 6.0f; - - if (argc < 5) { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx < 1) - return -1; - ny = atoi(argv[2]); - if (ny < 1) - return -1; - nz = atoi(argv[3]); - if (nz < 1) - return -1; - iteration = atoi(argv[4]); - if (iteration < 1) - return -1; - - // host data - float *h_A0; - float *h_Anext; - // load data from files - - size = nx * ny * nz; - - h_A0 = (float *)malloc(sizeof(float) * size); - h_Anext = (float *)malloc(sizeof(float) * size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx, ny, nz, fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1, &clPlatform, NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, - (cl_context_properties)clPlatform, 0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = - clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue( - clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - /*cl_program clProgram;*/ - /*cl_kernel clKernel;*/ - - /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", - * "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/ - const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = - clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions, "-I src/opencl_base"); - clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus); - CHECK_ERROR("clCreateKernel") - - // device - cl_mem d_A0; - cl_mem d_Anext; - - memcpy(h_Anext, h_A0, sizeof(float) * size); - - // memory allocation - d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), - NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), - NULL, &clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - // memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, - size * sizeof(float), h_A0, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, - size * sizeof(float), h_Anext, 0, NULL, NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - // only use 1D thread block - unsigned tx = 256; - size_t block[3] = {tx, 1, 1}; - size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; - // size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1, 1, 1}; - // printf("block x is %d and y is %d z \n",block[0],block[1]); - // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - - clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); - clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); - clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); - clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); - clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); - clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); - clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); - CHECK_ERROR("clSetKernelArg") - - // main execution - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - int t; - for (t = 0; t < iteration; t++) { - clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, - block, 0, NULL, NULL); - // printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); - clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); - } - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, - size * sizeof(float), h_Anext, 0, NULL, NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile, h_Anext, nx, ny, nz); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - // free((void*)clSource[0]); - - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); - - return 0; -} diff --git a/hpvm/test/parboil/common/mk/visc.mk b/hpvm/test/parboil/common/mk/visc.mk index 7ef5b017f054661f2c4b6235bed43839a66d87ae..0a8984deeac5696557f4b6a220b4f0758f5aefcf 100755 --- a/hpvm/test/parboil/common/mk/visc.mk +++ b/hpvm/test/parboil/common/mk/visc.mk @@ -172,17 +172,9 @@ clean : if [ -d $(BUILDDIR) ]; then rm -rf $(BUILDDIR); fi if [ -d $(RUNDIR) ]; then rm -rf $(RUNDIR); fi -#$(APP_BINS) : $(KERNEL_OCL) $(BIN) - #echo Generating $(APP_BINS) ... - -#TODO: Fix this. Remove echo to actually compile kernel to ptx $(KERNEL_OCL) : $(KERNEL_OPT) $(OCLBE) $< -o $@ -$(SPIR_ASSEMBLY) : $(KERNEL_OPT) - python $(PYTHON_LLVM_40_34) $< $(BUILDDIR)/kernel_34.ll - $(LLVM_34_AS) $(BUILDDIR)/kernel_34.ll -o $@ - $(KERNEL_OPT) : $(KERNEL) $(OPT) $(APP_OPTFLAGS) -S $< -o $@ @@ -193,7 +185,7 @@ $(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(VISC_RT_LIB) $(LLVM_LINK) $^ -S -o $@ $(HOST) $(KERNEL): $(BUILDDIR)/$(VISC_OBJS) - $(OPT) --debug $(VISC_OPTFLAGS) -S $< -o $(HOST) + $(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST) $(RUNDIR) : mkdir -p $(RUNDIR) @@ -212,9 +204,6 @@ $(BUILDDIR)/%.ll : $(SRCDIR)/%.cpp $(BUILDDIR)/%.visc.ll: $(BUILDDIR)/%.ll $(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@ - cat $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil/RUN.parboil.script $@ > $@.tmp - mv $@.tmp $(BUILDDIR)/$(APP).visc.ll - #@cp $(VISC_OBJS) $(BUILDDIR)/$(VISC_OBJS) $(BUILDDIR)/%.o : $(SRCDIR)/%.c $(CC) $(CFLAGS) -c $< -o $@ diff --git a/hpvm/test/pipeline/Makefile b/hpvm/test/pipeline/Makefile index 52ac5a9b7e33448249ac3cb74a4bc9a473787d4b..e3572ecdfc4322ecd12c25517880b87f94c0f9e1 100644 --- a/hpvm/test/pipeline/Makefile +++ b/hpvm/test/pipeline/Makefile @@ -39,8 +39,8 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS) CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) -VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt -VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll +VISC_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/visc-rt +VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce @@ -94,9 +94,6 @@ $(EXE) : $(HOST_LINKED) $(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp - make -C $(LLVM_LIB_PATH) - $(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) diff --git a/hpvm/test/pipeline/Makefile.config b/hpvm/test/pipeline/Makefile.config index ffb2942911313421ce9b9186a392578d7bcaf4c7..1bdb62dec493fc63e581ae8204ea736c45ed5f7d 100644 --- a/hpvm/test/pipeline/Makefile.config +++ b/hpvm/test/pipeline/Makefile.config @@ -3,23 +3,21 @@ CUDA_LIB_PATH=$(CUDA_PATH)/lib64 OPENCL_PATH=/software/cuda-9.1 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64 -LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-release/hpvm/llvm/ -# NOTE: You may need to configure this based on your root path. -VISC_SRC_ROOT=$(LLVM_SRC_ROOT) +LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-reorg-9-temp/hpvm/llvm/ -VISC_BUILD_DIR =$(VISC_SRC_ROOT)/../build -CC = $(VISC_BUILD_DIR)/bin/clang -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include -OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe +LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build +CC = $(LLVM_BUILD_DIR)/bin/clang +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include +OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe -CXX = $(VISC_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include +CXX = $(LLVM_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include -LINKER = $(VISC_BUILD_DIR)/bin/clang++ +LINKER = $(LLVM_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH) -LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib -LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin +LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib +LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/pipeline/Makefile.config.example b/hpvm/test/pipeline/Makefile.config.example index 269f0b7df273c958f0cd20a0f935716a329e00ae..2627ca508f17acb96c858bf4473eed4d89ebec20 100644 --- a/hpvm/test/pipeline/Makefile.config.example +++ b/hpvm/test/pipeline/Makefile.config.example @@ -1,23 +1,21 @@ -CUDA_PATH=/usr/local/cuda -CUDA_LIB_PATH=/usr/local/cuda/lib64 -OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64 -OPENCL_LIB_PATH=$(OPENCL_PATH)/lib +CUDA_PATH=/software/cuda-9.1 +CUDA_LIB_PATH=$(CUDA_PATH)/lib64 +OPENCL_PATH=/software/cuda-9.1 +OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64 -# NOTE: You may need to configure this based on your root path. -VISC_SRC_ROOT=$(LLVM_SRC_ROOT) +LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build +CC = $(LLVM_BUILD_DIR)/bin/clang +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include +OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe -VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build -CC = $(VISC_BUILD_DIR)/bin/clang -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include +CXX = $(LLVM_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include -CXX = $(VISC_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include - -LINKER = $(VISC_BUILD_DIR)/bin/clang++ +LINKER = $(LLVM_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH) -LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib -LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin +LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib +LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/template/Makefile b/hpvm/test/template/Makefile index 82d7c61e2b77947da770aab506c45919ee139a63..3aa4bd1d6f2c3f7bb2be07ba5e662c5b6faf1655 100644 --- a/hpvm/test/template/Makefile +++ b/hpvm/test/template/Makefile @@ -41,8 +41,8 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS) CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) -VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt -VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll +VISC_RT_PATH = $(LLVM_BUILD_ROOT)/tools/hpvm/projects/visc-rt +VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce @@ -69,9 +69,7 @@ OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) KERNEL = $(TEST_OBJS).kernels.ll -ifeq ($(TARGET),seq) -else - KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll +ifeq ($(TARGET),gpu) KERNEL_OCL = $(TEST_OBJS).kernels.cl endif @@ -88,7 +86,7 @@ endif default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(EXE) $(KERNEL_OCL) : $(KERNEL) - $(OCLBE) -debug $< -o $@ + $(OCLBE) $< -o $@ $(EXE) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ @@ -96,11 +94,8 @@ $(EXE) : $(HOST_LINKED) $(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp - make -C $(LLVM_LIB_PATH) - $(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) - $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) + $(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -112,6 +107,6 @@ $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc $(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $< $(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll - $(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@ + $(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@ ## END HPVM MAKEFILE diff --git a/hpvm/test/template/Makefile.config.example b/hpvm/test/template/Makefile.config.example index 269f0b7df273c958f0cd20a0f935716a329e00ae..9b182a3b7998f3a302a758796a9fe950740b1366 100644 --- a/hpvm/test/template/Makefile.config.example +++ b/hpvm/test/template/Makefile.config.example @@ -1,23 +1,23 @@ -CUDA_PATH=/usr/local/cuda -CUDA_LIB_PATH=/usr/local/cuda/lib64 -OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64 -OPENCL_LIB_PATH=$(OPENCL_PATH)/lib +# These paths should be set to your local directories of CUDA and OpenCL +CUDA_PATH=/software/cuda-9.1 +CUDA_LIB_PATH=$(CUDA_PATH)/lib64 +OPENCL_PATH=$(CUDA_PATH) +OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64 -# NOTE: You may need to configure this based on your root path. -VISC_SRC_ROOT=$(LLVM_SRC_ROOT) +# These paths should be set based on your HPVM build and source locations. -VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build -CC = $(VISC_BUILD_DIR)/bin/clang -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include +LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build +CC = $(LLVM_BUILD_DIR)/bin/clang +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I$(LLVM_BUILD_DIR)/include -CXX = $(VISC_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include +CXX = $(LLVM_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I$(LLVM_BUILD_DIR)/include -LINKER = $(VISC_BUILD_DIR)/bin/clang++ +LINKER = $(LLVM_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH) -LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib -LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin +LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib +LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/template/README.md b/hpvm/test/template/README.md new file mode 100644 index 0000000000000000000000000000000000000000..198604817d5a8463e555451a1188b426ec4e31cd --- /dev/null +++ b/hpvm/test/template/README.md @@ -0,0 +1,37 @@ +Compilation of a project with HPVM is a multi-step process. +Let's look at the compilation of the `pipeline` test for gpu as an example. + +`clang` is used to produce an LLVM IR file that contains the HPVM intrinsics in the form of function calls. +``` +/.../hpvm/build/bin/clang -Isrc/ -I -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -DDEVICE=GPU_TARGET -emit-llvm -S -o build/main.ll src/main.cc +``` + +`opt` is used to invoke the GenVISC pass, which converts the HPVM function calls to LLVM intrinsics. +``` +/.../hpvm/build/bin/opt -debug-only=genvisc -load LLVMGenVISC.so -genvisc -globaldce -visc-timers-gen build/main.ll -S -o build/main.visc.ll +``` + +`opt` is used again to invoke the BuildDFG pass, which converts the textual representation to the internal HPVM representation. +``` +/.../hpvm/build/bin/opt -debug -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -visc-timers-x86 -visc-timers-ptx -S build/main.visc.ll -o build/pipeline-gpu.host.ll +``` + +`llvm-cbe` is a C backend for LLVM. It is used here to create the OpenCL kernel. +``` +/.../hpvm/build/bin/llvm-cbe -debug build/gpu/main.visc.ll.kernels.ll -o build/gpu/main.visc.ll.kernels.cl +``` + +`clang` is used again to compile a separate source file that contains I/O code. +``` +/.../hpvm/build/bin/clang -Isrc/ -I -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -emit-llvm -S -o build/gpu/io.ll src/io.cc +``` + +`llvm-link` is used to link against the VISC runtime. +``` +/.../hpvm/build/bin/llvm-link build/gpu/pipeline-gpu.host.ll build/gpu/io.ll /.../hpvm/llvm/tools/hpvm/projects/visc-rt/visc-rt.ll -S -o build/gpu/pipeline-gpu.linked.ll +``` + +`clang++` is used to do the final linking against OpenCL and emit the binary. +``` +/.../hpvm/build/bin/clang++ -O3 `pkg-config opencv --libs` -lm -lpthread -lrt -lOpenCL -L/software/cuda-9.1/lib64 build/gpu/pipeline-gpu.linked.ll -o pipeline-gpu +``` \ No newline at end of file