diff --git a/.gitignore b/.gitignore index a17e2716a5e90ee10ac32c19c3fc2f29f953f286..0da6a3671489a915ad13194ada7007d94dd13321 100644 --- a/.gitignore +++ b/.gitignore @@ -34,5 +34,5 @@ hpvm/install/ hpvm/llvm/ hpvm/llvm-*.src.tar.xz hpvm/llvm-*.src/ -hpvm/projects/visc-rt/visc-rt.ll +hpvm/projects/hpvm-rt/hpvm-rt.ll hpvm/test/**/build/ diff --git a/hpvm/include/BuildDFG/BuildDFG.h b/hpvm/include/BuildDFG/BuildDFG.h index 28230e135beb68c07c998e607fa3d03d40a66791..ca4c616da5f4076528b1294992ec8ad3ab768809 100644 --- a/hpvm/include/BuildDFG/BuildDFG.h +++ b/hpvm/include/BuildDFG/BuildDFG.h @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "SupportVISC/DFGraph.h" +#include "SupportHPVM/DFGraph.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -58,10 +58,10 @@ public: // Functions virtual bool runOnModule(Module &M); - static bool isViscLaunchIntrinsic(Instruction *I); - static bool isViscGraphIntrinsic(Instruction *I); - static bool isViscQueryIntrinsic(Instruction *I); - static bool isViscIntrinsic(Instruction *I); + static bool isHPVMLaunchIntrinsic(Instruction *I); + static bool isHPVMGraphIntrinsic(Instruction *I); + static bool isHPVMQueryIntrinsic(Instruction *I); + static bool isHPVMIntrinsic(Instruction *I); static bool isTypeCongruent(Type *L, Type *R); // TODO: Maybe make these fields const diff --git a/hpvm/include/GenVISC/GenVISC.h b/hpvm/include/GenHPVM/GenHPVM.h similarity index 67% rename from hpvm/include/GenVISC/GenVISC.h rename to hpvm/include/GenHPVM/GenHPVM.h index 1db9929be70fdc4335e23d7e879248f0ebb45c07..24798bc2740e2299f67cc7f515437339f2fe8310 100644 --- a/hpvm/include/GenVISC/GenVISC.h +++ b/hpvm/include/GenHPVM/GenHPVM.h @@ -1,4 +1,4 @@ -//== GenVISC.h - Header file for "LLVM IR to VISC IR Pass" =// +//== GenHPVM.h - Header file for "LLVM IR to HPVM IR Pass" =// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // 
//===----------------------------------------------------------------------===// -#include "SupportVISC/VISCTimer.h" +#include "SupportHPVM/HPVMTimer.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" @@ -18,24 +18,24 @@ using namespace llvm; -namespace genvisc { -// GenVISC - The first implementation. -struct GenVISC : public ModulePass { +namespace genhpvm { +// GenHPVM - The first implementation. +struct GenHPVM : public ModulePass { static char ID; // Pass identification, replacement for typeid - GenVISC() : ModulePass(ID) {} + GenHPVM() : ModulePass(ID) {} private: // Member variables Module *M; - FunctionCallee llvm_visc_initializeTimerSet; - FunctionCallee llvm_visc_switchToTimer; - FunctionCallee llvm_visc_printTimerSet; + FunctionCallee llvm_hpvm_initializeTimerSet; + FunctionCallee llvm_hpvm_switchToTimer; + FunctionCallee llvm_hpvm_printTimerSet; GlobalVariable *TimerSet; // Functions void initializeTimerSet(Instruction *); - void switchToTimer(enum visc_TimerID, Instruction *); + void switchToTimer(enum hpvm_TimerID, Instruction *); void printTimerSet(Instruction *); Value *getStringPointer(const Twine &S, Instruction *InsertBefore, const Twine &Name = ""); @@ -45,4 +45,4 @@ public: virtual bool runOnModule(Module &M); }; -} // namespace genvisc +} // namespace genhpvm diff --git a/hpvm/include/SupportVISC/DFG2LLVM.h b/hpvm/include/SupportHPVM/DFG2LLVM.h similarity index 82% rename from hpvm/include/SupportVISC/DFG2LLVM.h rename to hpvm/include/SupportHPVM/DFG2LLVM.h index b9e4cc4158b71ab18fbeadf2e4d094055feb6149..07147c6d909f5352dd886b5f8bc1a2b0ae434ffe 100644 --- a/hpvm/include/SupportVISC/DFG2LLVM.h +++ b/hpvm/include/SupportHPVM/DFG2LLVM.h @@ -1,7 +1,7 @@ #ifndef __DFG2LLVM_H__ #define __DFG2LLVM_H__ -//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===// +//===---- DFG2LLVM.h - Header file for "HPVM Dataflow Graph to Target" ----===// // // The LLVM Compiler Infrastructure 
// @@ -11,9 +11,9 @@ //===----------------------------------------------------------------------===// #include "BuildDFG/BuildDFG.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCTimer.h" -#include "SupportVISC/VISCUtils.h" +#include "SupportHPVM/HPVMHint.h" +#include "SupportHPVM/HPVMTimer.h" +#include "SupportHPVM/HPVMUtils.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -26,7 +26,7 @@ using namespace builddfg; #define TIMER(X) \ do { \ - if (VISCTimer) { \ + if (HPVMTimer) { \ X; \ } \ } while (0) @@ -37,8 +37,8 @@ using namespace builddfg; namespace dfg2llvm { // Helper Functions -static inline ConstantInt *getTimerID(Module &, enum visc_TimerID); -static inline ConstantInt *getTimerID(Module &, enum visc::Target); +static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID); +static inline ConstantInt *getTimerID(Module &, enum hpvm::Target); bool hasAttribute(Function *, unsigned, Attribute::AttrKind); @@ -69,7 +69,7 @@ protected: // Member variables Module &M; BuildDFG &DFG; - bool VISCTimer = false; + bool HPVMTimer = false; std::string TargetName = "None"; // Map from Old function associated with DFNode to new cloned function with @@ -78,12 +78,12 @@ protected: // "Have we visited this function before?") DenseMap<DFNode *, Value *> OutputMap; - // VISC Runtime API + // HPVM Runtime API std::unique_ptr<Module> runtimeModule; - FunctionCallee llvm_visc_initializeTimerSet; - FunctionCallee llvm_visc_switchToTimer; - FunctionCallee llvm_visc_printTimerSet; + FunctionCallee llvm_hpvm_initializeTimerSet; + FunctionCallee llvm_hpvm_switchToTimer; + FunctionCallee llvm_hpvm_printTimerSet; GlobalVariable *TimerSet; GlobalVariable *GraphIDAddr; Instruction *InitCall; @@ -109,7 +109,7 @@ protected: // Virtual Functions virtual void initializeTimerSet(Instruction *); - virtual void switchToTimer(enum visc_TimerID, Instruction *); + virtual void switchToTimer(enum hpvm_TimerID, Instruction 
*); virtual void printTimerSet(Instruction *); virtual ~CodeGenTraversal() {} @@ -118,9 +118,9 @@ public: // Constructor CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} - static bool checkPreferredTarget(DFNode *N, visc::Target T); - static bool preferredTargetIncludes(DFNode *N, visc::Target T); - visc::Target getPreferredTarget(DFNode *N); + static bool checkPreferredTarget(DFNode *N, hpvm::Target T); + static bool preferredTargetIncludes(DFNode *N, hpvm::Target T); + hpvm::Target getPreferredTarget(DFNode *N); virtual void visit(DFInternalNode *N) { // If code has already been generated for this internal node, skip the @@ -157,25 +157,25 @@ public: // -------------- CodeGenTraversal Implementation ----------------- -bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) { +bool CodeGenTraversal::checkPreferredTarget(DFNode *N, hpvm::Target T) { Function *F = N->getFuncPointer(); Module *M = F->getParent(); NamedMDNode *HintNode; switch (T) { - case visc::GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + case hpvm::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu"); break; - case visc::SPIR_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + case hpvm::SPIR_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_spir"); break; - case visc::CUDNN_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); + case hpvm::CUDNN_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cudnn"); break; - case visc::PROMISE_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); + case hpvm::PROMISE_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise"); break; - case visc::CPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + case hpvm::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu"); break; default: llvm_unreachable("Target Not supported yet!"); @@ -190,37 +190,37 @@ bool 
CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) { return false; } -visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) { - return viscUtils::getPreferredTarget(N->getFuncPointer()); +hpvm::Target CodeGenTraversal::getPreferredTarget(DFNode *N) { + return hpvmUtils::getPreferredTarget(N->getFuncPointer()); } -bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, visc::Target T) { +bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, hpvm::Target T) { Function *F = N->getFuncPointer(); Module *M = F->getParent(); std::vector<NamedMDNode *> HintNode; switch (T) { - case visc::GPU_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + case hpvm::GPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu")); break; - case visc::SPIR_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + case hpvm::SPIR_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_spir")); + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir")); break; - case visc::CPU_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + case hpvm::CPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir")); break; - case visc::CUDNN_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn")); + case hpvm::CUDNN_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cudnn")); break; - case visc::PROMISE_TARGET: - 
HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise")); + case hpvm::PROMISE_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_promise")); break; - case visc::CPU_OR_GPU_TARGET: - case visc::CPU_OR_SPIR_TARGET: + case hpvm::CPU_OR_GPU_TARGET: + case hpvm::CPU_OR_SPIR_TARGET: assert(false && "Target should be one of CPU/GPU/SPIR\n"); break; default: @@ -308,11 +308,11 @@ Function *CodeGenTraversal::addArgument(Function *F, Type *Ty, Function *newF = Function::Create(FTy, F->getLinkage(), F->getName() + "_cloned", F->getParent()); renameNewArgument(newF, name); - newF = viscUtils::cloneFunction(F, newF, false); + newF = hpvmUtils::cloneFunction(F, newF, false); // Check if the function is used by a metadata node if (F->isUsedByMetadata()) { - viscUtils::fixHintMetadata(*F->getParent(), F, newF); + hpvmUtils::fixHintMetadata(*F->getParent(), F, newF); } return newF; @@ -396,32 +396,32 @@ Argument *CodeGenTraversal::getArgumentAt(Function *F, unsigned offset) { } void CodeGenTraversal::initTimerAPI() { - DECLARE(llvm_visc_initializeTimerSet); - DECLARE(llvm_visc_switchToTimer); - DECLARE(llvm_visc_printTimerSet); + DECLARE(llvm_hpvm_initializeTimerSet); + DECLARE(llvm_hpvm_switchToTimer); + DECLARE(llvm_hpvm_printTimerSet); } // Timer Routines // Initialize the timer set void CodeGenTraversal::initializeTimerSet(Instruction *InsertBefore) { - // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << + // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet << // "\n"); TIMER(TimerSet = new GlobalVariable( M, Type::getInt8PtrTy(M.getContext()), false, GlobalValue::CommonLinkage, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - Twine("viscTimerSet_") + TargetName); + Twine("hpvmTimerSet_") + TargetName); DEBUG(errs() << "New global variable: " << *TimerSet << "\n"); - Value *TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, + Value *TimerSetAddr = 
CallInst::Create(llvm_hpvm_initializeTimerSet, None, "", InsertBefore); new StoreInst(TimerSetAddr, TimerSet, InsertBefore);); } -void CodeGenTraversal::switchToTimer(enum visc_TimerID timer, +void CodeGenTraversal::switchToTimer(enum hpvm_TimerID timer, Instruction *InsertBefore) { Value *switchArgs[] = {TimerSet, getTimerID(M, timer)}; - TIMER(CallInst::Create(llvm_visc_switchToTimer, + TIMER(CallInst::Create(llvm_hpvm_switchToTimer, ArrayRef<Value *>(switchArgs, 2), "", InsertBefore)); } @@ -430,16 +430,16 @@ void CodeGenTraversal::printTimerSet(Instruction *InsertBefore) { TIMER(TimerName = getStringPointer(TargetName + Twine("_Timer"), InsertBefore)); Value *printArgs[] = {TimerSet, TimerName}; - TIMER(CallInst::Create(llvm_visc_printTimerSet, + TIMER(CallInst::Create(llvm_hpvm_printTimerSet, ArrayRef<Value *>(printArgs, 2), "", InsertBefore)); } // Implementation of Helper Functions -static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) { +static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); } -static inline ConstantInt *getTargetID(Module &M, enum visc::Target T) { +static inline ConstantInt *getTargetID(Module &M, enum hpvm::Target T) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), T); } diff --git a/hpvm/include/SupportVISC/DFGTreeTraversal.h b/hpvm/include/SupportHPVM/DFGTreeTraversal.h similarity index 100% rename from hpvm/include/SupportVISC/DFGTreeTraversal.h rename to hpvm/include/SupportHPVM/DFGTreeTraversal.h diff --git a/hpvm/include/SupportVISC/DFGraph.h b/hpvm/include/SupportHPVM/DFGraph.h similarity index 94% rename from hpvm/include/SupportVISC/DFGraph.h rename to hpvm/include/SupportHPVM/DFGraph.h index 0c224a344c4ec342f52f4816280e101518ba43dd..d904e2401d7e9a58a38e9bca024de1a437cd56d1 100644 --- a/hpvm/include/SupportVISC/DFGraph.h +++ b/hpvm/include/SupportHPVM/DFGraph.h @@ -20,8 +20,8 @@ #ifndef LLVM_IR_DFGRAPH_H #define 
LLVM_IR_DFGRAPH_H -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCUtils.h" +#include "SupportHPVM/HPVMHint.h" +#include "SupportHPVM/HPVMUtils.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -158,7 +158,7 @@ public: } }; -// DFNode represents a single VISC Dataflow Node in LLVM. +// DFNode represents a single HPVM Dataflow Node in LLVM. // // A Dataflow Node basically consists of // 1. Pointer to a function describing this dataflow node @@ -210,8 +210,8 @@ private: ///< hierarchy unsigned Rank; ///< Ordering based on toplogical sort const DFNodeKind Kind; ///< Kind of Node Internal/Leaf - visc::Target Tag; ///< Code Generated for which backend - visc::Target Hint; ///< To store preferred backend + hpvm::Target Tag; ///< Code Generated for which backend + hpvm::Target Hint; ///< To store preferred backend public: virtual ~DFNode() { @@ -287,13 +287,13 @@ public: DFNodeKind getKind() const { return Kind; } - DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, + DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, DFInternalNode *_Parent, unsigned _NumOfDim, std::vector<Value *> _DimLimits, DFNodeKind _K); bool isRoot() const { // It is a root node is it was created from a launch intrinsic - if (II->getCalledFunction()->getName().equals("llvm.visc.launch")) { + if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) { assert(Level == 0 && "Root node's level is zero."); return true; } @@ -326,9 +326,9 @@ public: unsigned getRank() const { return Rank; } - void setTag(visc::Target T) { Tag = T; } + void setTag(hpvm::Target T) { Tag = T; } - visc::Target getTag() const { return Tag; } + hpvm::Target getTag() const { return Tag; } void *getProperty(PropertyKind PType) { assert(PropertyList.count(PType) == 1 && @@ -342,24 +342,24 @@ public: PropertyList[PType] = PValue; } - void setGenFunc(Function *F, visc::Target T) { + void setGenFunc(Function *F, 
hpvm::Target T) { GenFunc = F; Tag = T; } Function *getGenFunc() const { return GenFunc; } - void setHasX86FuncForTarget(visc::Target T, bool isX86Func) { + void setHasX86FuncForTarget(hpvm::Target T, bool isX86Func) { switch (T) { - case visc::None: + case hpvm::None: return; // Do nothing. - case visc::CPU_TARGET: + case hpvm::CPU_TARGET: GenFuncInfo.cpu_hasX86Func = isX86Func; break; - case visc::GPU_TARGET: + case hpvm::GPU_TARGET: GenFuncInfo.gpu_hasX86Func = isX86Func; break; - case visc::CPU_OR_GPU_TARGET: + case hpvm::CPU_OR_GPU_TARGET: break; default: assert(false && "Unknown target\n"); @@ -368,15 +368,15 @@ public: return; } - bool hasX86GenFuncForTarget(visc::Target T) const { + bool hasX86GenFuncForTarget(hpvm::Target T) const { switch (T) { - case visc::None: + case hpvm::None: return false; - case visc::CPU_TARGET: + case hpvm::CPU_TARGET: return GenFuncInfo.cpu_hasX86Func; - case visc::GPU_TARGET: + case hpvm::GPU_TARGET: return GenFuncInfo.gpu_hasX86Func; - case visc::CPU_OR_GPU_TARGET: + case hpvm::CPU_OR_GPU_TARGET: assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n"); default: assert(false && "Unknown target\n"); @@ -384,10 +384,10 @@ public: return false; } - void addGenFunc(Function *F, visc::Target T, bool isX86Func) { + void addGenFunc(Function *F, hpvm::Target T, bool isX86Func) { switch (T) { - case visc::CPU_TARGET: + case hpvm::CPU_TARGET: if (GenFuncs.CPUGenFunc != NULL) { DEBUG(errs() << "Warning: Second generated CPU function for node " << FuncPointer->getName() << "\n"); @@ -395,7 +395,7 @@ public: GenFuncs.CPUGenFunc = F; GenFuncInfo.cpu_hasX86Func = isX86Func; break; - case visc::GPU_TARGET: + case hpvm::GPU_TARGET: if (GenFuncs.GPUGenFunc != NULL) { DEBUG(errs() << "Warning: Second generated GPU function for node " << FuncPointer->getName() << "\n"); @@ -403,25 +403,25 @@ public: GenFuncs.GPUGenFunc = F; GenFuncInfo.gpu_hasX86Func = isX86Func; break; - case visc::CPU_OR_GPU_TARGET: + case 
hpvm::CPU_OR_GPU_TARGET: assert(false && "A node function should be set with a tag specifying its \ type, not the node hint itself\n"); default: assert(false && "Unknown target for generated function\n"); } - Tag = viscUtils::getUpdatedTag(Tag, T); + Tag = hpvmUtils::getUpdatedTag(Tag, T); } - Function *getGenFuncForTarget(visc::Target T) const { + Function *getGenFuncForTarget(hpvm::Target T) const { switch (T) { - case visc::None: + case hpvm::None: return NULL; - case visc::CPU_TARGET: + case hpvm::CPU_TARGET: return GenFuncs.CPUGenFunc; - case visc::GPU_TARGET: + case hpvm::GPU_TARGET: return GenFuncs.GPUGenFunc; - case visc::CPU_OR_GPU_TARGET: + case hpvm::CPU_OR_GPU_TARGET: assert(false && "Requesting genarated node function with dual tag instead of \ CPU/GPU/SPIR/CUDNN/PROMISE\n"); @@ -431,19 +431,19 @@ public: return NULL; } - void removeGenFuncForTarget(visc::Target T) { + void removeGenFuncForTarget(hpvm::Target T) { switch (T) { - case visc::None: + case hpvm::None: return; - case visc::CPU_TARGET: + case hpvm::CPU_TARGET: GenFuncs.CPUGenFunc = NULL; GenFuncInfo.cpu_hasX86Func = false; break; - case visc::GPU_TARGET: + case hpvm::GPU_TARGET: GenFuncs.GPUGenFunc = NULL; GenFuncInfo.gpu_hasX86Func = false; break; - case visc::CPU_OR_GPU_TARGET: + case hpvm::CPU_OR_GPU_TARGET: assert(false && "Removing genarated node function with dual tag instead of \ CPU/GPU/SPIR/CUDNN/PROMISE\n"); @@ -453,9 +453,9 @@ public: return; } - void setTargetHint(visc::Target T) { Hint = T; } + void setTargetHint(hpvm::Target T) { Hint = T; } - visc::Target getTargetHint() const { return Hint; } + hpvm::Target getTargetHint() const { return Hint; } bool isDummyNode() const { return isEntryNode() || isExitNode(); } @@ -496,7 +496,7 @@ private: DFGraph *childGraph; ///< Pointer to dataflow graph // Constructor - DFInternalNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, + DFInternalNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, DFInternalNode 
*Parent, int NumOfDim, std::vector<Value *> DimLimits) : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, @@ -508,7 +508,7 @@ private: public: static DFInternalNode * Create(IntrinsicInst *II, Function *FuncPointer, - visc::Target Hint = visc::CPU_TARGET, DFInternalNode *Parent = NULL, + hpvm::Target Hint = hpvm::CPU_TARGET, DFInternalNode *Parent = NULL, int NumOfDim = 0, std::vector<Value *> DimLimits = std::vector<Value *>()) { @@ -539,14 +539,14 @@ class DFLeafNode : public DFNode { private: // Constructor - DFLeafNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, + DFLeafNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, DFInternalNode *Parent, int NumOfDim = 0, std::vector<Value *> DimLimits = std::vector<Value *>()) : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {} public: static DFLeafNode * - Create(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, + Create(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, DFInternalNode *Parent, int NumOfDim = 0, std::vector<Value *> DimLimits = std::vector<Value *>()) { return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits); @@ -558,7 +558,7 @@ public: // void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ }; -// DFEdge represents a single VISC Dataflow Edge in LLVM. +// DFEdge represents a single HPVM Dataflow Edge in LLVM. // // A Dataflow Edge basically consists of // 1. 
Pointer to the dataflow node that is the source of this edge @@ -634,8 +634,8 @@ DFGraph::DFGraph(DFInternalNode *P) { Parent = P; // Create dummy entry and exit nodes and add them to the graph Entry = - DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent); - Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent); + DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent); + Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent); addChildDFNode(Entry); addChildDFNode(Exit); } @@ -655,7 +655,7 @@ bool DFGraph::isStreaming() { } //===--------------------- DFNode Outlined Functions --------------===// -DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, +DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, DFInternalNode *_Parent, unsigned _NumOfDim, std::vector<Value *> _DimLimits, DFNodeKind _K) : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim), @@ -663,7 +663,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, Type *Ty = FuncPointer->getFunctionType()->getReturnType(); - // Allow the return type to be void too, in the hVISC IR. If return type is + // Allow the return type to be void too, in the hHPVM IR. If return type is // void, create an empty struct type and keep that as the return type of the // node. if (Ty->isVoidTy()) @@ -683,7 +683,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, Level = (_Parent) ? 
_Parent->getLevel() + 1 : 0; Rank = 0; - Tag = visc::None; + Tag = hpvm::None; GenFuncs.CPUGenFunc = NULL; GenFuncs.GPUGenFunc = NULL; GenFuncs.SPIRGenFunc = NULL; diff --git a/hpvm/include/SupportVISC/VISCHint.h b/hpvm/include/SupportHPVM/HPVMHint.h similarity index 78% rename from hpvm/include/SupportVISC/VISCHint.h rename to hpvm/include/SupportHPVM/HPVMHint.h index 99266b071843ab0417ea73c6e4533dfa381d52cd..1ef4c6eb3b986328080caa9e99e96f444978c03e 100644 --- a/hpvm/include/SupportVISC/VISCHint.h +++ b/hpvm/include/SupportHPVM/HPVMHint.h @@ -1,4 +1,4 @@ -//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===// +//===------------- HPVMHint.h - Header file for "HPVM Hint API" -----------===// // // The LLVM Compiler Infrastructure // @@ -7,12 +7,12 @@ // //===----------------------------------------------------------------------===// -#ifndef VISC_HINT_HEADER -#define VISC_HINT_HEADER +#ifndef HPVM_HINT_HEADER +#define HPVM_HINT_HEADER /************************** Hint Routines ***************************/ #ifdef __cplusplus -namespace visc { +namespace hpvm { #endif enum Target { @@ -32,4 +32,4 @@ enum Target { } #endif -#endif // VISC_HINT_HEADER +#endif // HPVM_HINT_HEADER diff --git a/hpvm/include/SupportHPVM/HPVMTimer.h b/hpvm/include/SupportHPVM/HPVMTimer.h new file mode 100644 index 0000000000000000000000000000000000000000..05b24d41d6d50c61cd38b458676dbf79d28a917f --- /dev/null +++ b/hpvm/include/SupportHPVM/HPVMTimer.h @@ -0,0 +1,151 @@ +//===------------ HPVMTimer.h - Header file for "HPVM Timer API" ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef HPVM_TIMER_HEADER +#define HPVM_TIMER_HEADER + +/************************** Timer Routines ***************************/ +extern "C" { + +/* A time or duration.
*/ +//#if _POSIX_VERSION >= 200112L +typedef unsigned long long hpvm_Timestamp; /* time in microseconds */ +//#else +//# error "Timestamps not implemented" +//#endif + +enum hpvm_TimerState { + hpvm_Timer_STOPPED, + hpvm_Timer_RUNNING, +}; + +struct hpvm_Timer { + enum hpvm_TimerState state; + hpvm_Timestamp elapsed; /* Amount of time elapsed so far */ + hpvm_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interval otherwise. */ +}; + +/* Reset a timer. + * Use this to initialize a timer or to clear + * its elapsed time. The reset timer is stopped. + */ +void hpvm_ResetTimer(struct hpvm_Timer *timer); + +/* Start a timer. The timer is set to RUNNING mode and + * time elapsed while the timer is running is added to + * the timer. + * The timer should not already be running. + */ +void hpvm_StartTimer(struct hpvm_Timer *timer); + +/* Stop a timer. + * This stops adding elapsed time to the timer. + * The timer should not already be stopped. + */ +void hpvm_StopTimer(struct hpvm_Timer *timer); + +/* Get the elapsed time in seconds. */ +double hpvm_GetElapsedTime(struct hpvm_Timer *timer); + +/* Execution time is assigned to one of these categories.
*/ +enum hpvm_TimerID { + hpvm_TimerID_NONE = 0, + hpvm_TimerID_IO, /* Time spent in input/output */ + hpvm_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + hpvm_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + hpvm_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + hpvm_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + hpvm_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + hpvm_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ + // GPU FUNCTION + hpvm_TimerID_INIT_CTX, + hpvm_TimerID_CLEAR_CTX, + hpvm_TimerID_COPY_SCALAR, + hpvm_TimerID_COPY_PTR, + hpvm_TimerID_MEM_FREE, + hpvm_TimerID_READ_OUTPUT, + hpvm_TimerID_SETUP, + hpvm_TimerID_MEM_TRACK, + hpvm_TimerID_MEM_UNTRACK, + hpvm_TimerID_MISC, + // LAUNCH FUNCTION + hpvm_TimerID_PTHREAD_CREATE, + hpvm_TimerID_ARG_PACK, + hpvm_TimerID_ARG_UNPACK, + hpvm_TimerID_COMPUTATION, + hpvm_TimerID_OUTPUT_PACK, + hpvm_TimerID_OUTPUT_UNPACK, + + hpvm_TimerID_LAST /* Number of timer IDs */ +}; + +/* Dynamic list of asynchronously tracked times between events */ +struct hpvm_async_time_marker_list { + char *label; // actually just a pointer to a string + enum hpvm_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void *marker; + // cudaEvent_t marker; /* The driver event for this marker */ + struct hpvm_async_time_marker_list *next; +}; + +struct hpvm_SubTimer { + char *label; + struct hpvm_Timer timer; + struct hpvm_SubTimer *next; +}; + +struct hpvm_SubTimerList { + struct hpvm_SubTimer *current; + struct hpvm_SubTimer *subtimer_list; +}; + +/* A set of timers for 
recording execution times. */ +struct hpvm_TimerSet { + enum hpvm_TimerID current; + struct hpvm_async_time_marker_list *async_markers; + hpvm_Timestamp async_begin; + hpvm_Timestamp wall_begin; + struct hpvm_Timer timers[hpvm_TimerID_LAST]; + struct hpvm_SubTimerList *sub_timer_list[hpvm_TimerID_LAST]; +}; + +/* Reset all timers in the set. */ +void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers); + +void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label, + enum hpvm_TimerID hpvm_Category); + +/* Select which timer the next interval of time should be accounted + * to. The selected timer is started and other timers are stopped. + * Using hpvm_TimerID_NONE stops all timers. */ +inline void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, + enum hpvm_TimerID timer); + +void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, + enum hpvm_TimerID category); + +/* Print timer values to standard output. */ +void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers); + +/* Release timer resources */ +void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers); +} +#endif // HPVM_TIMER_HEADER diff --git a/hpvm/include/SupportVISC/VISCUtils.h b/hpvm/include/SupportHPVM/HPVMUtils.h similarity index 84% rename from hpvm/include/SupportVISC/VISCUtils.h rename to hpvm/include/SupportHPVM/HPVMUtils.h index 0efd20b5b5eb57943de1feb6d2afa886c6c48a5c..25b9880180f2cb4590f5b5fcbb3f3f2fbe025f8f 100644 --- a/hpvm/include/SupportVISC/VISCUtils.h +++ b/hpvm/include/SupportHPVM/HPVMUtils.h @@ -1,5 +1,5 @@ // -//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===// +//===------- HPVMUtils.h - Header file for "HPVM Utility Functions" -------===// // // The LLVM Compiler Infrastructure // @@ -8,12 +8,12 @@ // //===----------------------------------------------------------------------===// -#ifndef VISC_UTILS_HEADER -#define VISC_UTILS_HEADER +#ifndef HPVM_UTILS_HEADER +#define HPVM_UTILS_HEADER #include <assert.h> -#include "SupportVISC/VISCHint.h"
+#include "SupportHPVM/HPVMHint.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -29,31 +29,31 @@ using namespace llvm; -namespace viscUtils { +namespace hpvmUtils { // Helper Functions -static bool isViscCreateNodeIntrinsic(Instruction *I) { +static bool isHPVMCreateNodeIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); return (II->getCalledFunction()->getName()) - .startswith("llvm.visc.createNode"); + .startswith("llvm.hpvm.createNode"); } -static bool isViscCreateNodeCall(Instruction *I) { +static bool isHPVMCreateNodeCall(Instruction *I) { if (!isa<CallInst>(I)) return false; CallInst *CI = cast<CallInst>(I); return (CI->getCalledValue()->stripPointerCasts()->getName()) - .startswith("__visc__createNode"); + .startswith("__hpvm__createNode"); } -static bool isViscLaunchCall(Instruction *I) { +static bool isHPVMLaunchCall(Instruction *I) { if (!isa<CallInst>(I)) return false; CallInst *CI = cast<CallInst>(I); return (CI->getCalledValue()->stripPointerCasts()->getName()) - .startswith("__visc__launch"); + .startswith("__hpvm__launch"); } // Creates a new createNode intrinsic, similar to II but with different // associated function F instead @@ -69,22 +69,22 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, ArrayRef<Value *> CreateNodeArgs; switch (II->getIntrinsicID()) { - case Intrinsic::visc_createNode: { + case Intrinsic::hpvm_createNode: { CreateNodeArgs = ArrayRef<Value *>(Fp); break; } - case Intrinsic::visc_createNode1D: { + case Intrinsic::hpvm_createNode1D: { Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)}; CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2); break; } - case Intrinsic::visc_createNode2D: { + case Intrinsic::hpvm_createNode2D: { Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2)}; CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3); break; } - case 
Intrinsic::visc_createNode3D: { + case Intrinsic::hpvm_createNode3D: { Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2), II->getArgOperand(3)}; CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4); @@ -101,7 +101,7 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, return CreateNodeII; } -// Fix VISC hints for this function +// Fix HPVM hints for this function void fixHintMetadata(Module &M, Function *F, Function *G) { Metadata *MD_F = ValueAsMetadata::getIfExists(F); MDTuple *MDT_F = @@ -119,9 +119,9 @@ void fixHintMetadata(Module &M, Function *F, Function *G) { } }; - FixHint("visc_hint_gpu"); - FixHint("visc_hint_cpu"); - FixHint("visc_hint_cpu_gpu"); + FixHint("hpvm_hint_gpu"); + FixHint("hpvm_hint_cpu"); + FixHint("hpvm_hint_cpu_gpu"); } // Assuming that the changed function is a node function, it is only used as a @@ -138,7 +138,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { ++i) { Instruction *I = &*i; // Grab pointer to Instruction - if (isViscCreateNodeIntrinsic(I)) { + if (isHPVMCreateNodeIntrinsic(I)) { IntrinsicInst *II = cast<IntrinsicInst>(I); // The found createNode is not associated with the changed function if (II->getArgOperand(0) != F) @@ -150,7 +150,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II); II->replaceAllUsesWith(CreateNodeII); toBeErased.push_back(II); - } else if (isViscCreateNodeCall(I)) { + } else if (isHPVMCreateNodeCall(I)) { CallInst *CI = cast<CallInst>(I); // The found createNode is not associated with the changed function if (CI->getArgOperand(1) != F) @@ -161,7 +161,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { // Replace use of F with use of G CI->setArgOperand(1, G); DEBUG(errs() << "Fixed use: " << *CI << "\n"); - } else if (isViscLaunchCall(I)) { + } else if (isHPVMLaunchCall(I)) { CallInst *CI = cast<CallInst>(I); // The found launch 
call is not associated with the changed function if (CI->getArgOperand(1)->stripPointerCasts() != F) @@ -370,21 +370,21 @@ Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg, //------------------- Helper Functions For Handling Hints -------------------// // Return true if 1st arg (tag) contains 2nd (target) -bool tagIncludesTarget(visc::Target Tag, visc::Target T) { +bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) { switch (Tag) { - case visc::None: + case hpvm::None: return false; - case visc::CPU_TARGET: - if (T == visc::CPU_TARGET) + case hpvm::CPU_TARGET: + if (T == hpvm::CPU_TARGET) return true; return false; - case visc::GPU_TARGET: - if (T == visc::GPU_TARGET) + case hpvm::GPU_TARGET: + if (T == hpvm::GPU_TARGET) return true; return false; - case visc::CPU_OR_GPU_TARGET: - if ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET) || - (T == visc::CPU_OR_GPU_TARGET)) + case hpvm::CPU_OR_GPU_TARGET: + if ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) || + (T == hpvm::CPU_OR_GPU_TARGET)) return true; return false; default: @@ -392,41 +392,41 @@ bool tagIncludesTarget(visc::Target Tag, visc::Target T) { } } -bool isSingleTargetTag(visc::Target T) { - return ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)); +bool isSingleTargetTag(hpvm::Target T) { + return ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET)); } // Add the specified target to the given tag -visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) { - assert(((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)) && +hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) { + assert(((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET)) && "The target is only allowed to be a single target: CPU, GPU, SPIR, " "CUDNN, PROMISE\n"); switch (Tag) { - case visc::None: + case hpvm::None: return T; - case visc::CPU_TARGET: - if (T == visc::CPU_TARGET) - return visc::CPU_TARGET; - if (T == visc::GPU_TARGET) - return visc::CPU_OR_GPU_TARGET; + case 
hpvm::CPU_TARGET: + if (T == hpvm::CPU_TARGET) + return hpvm::CPU_TARGET; + if (T == hpvm::GPU_TARGET) + return hpvm::CPU_OR_GPU_TARGET; return T; - case visc::GPU_TARGET: - if (T == visc::CPU_TARGET) - return visc::CPU_OR_GPU_TARGET; - if (T == visc::GPU_TARGET) - return visc::GPU_TARGET; + case hpvm::GPU_TARGET: + if (T == hpvm::CPU_TARGET) + return hpvm::CPU_OR_GPU_TARGET; + if (T == hpvm::GPU_TARGET) + return hpvm::GPU_TARGET; return T; - case visc::CPU_OR_GPU_TARGET: - return visc::CPU_OR_GPU_TARGET; + case hpvm::CPU_OR_GPU_TARGET: + return hpvm::CPU_OR_GPU_TARGET; default: assert(false && "Unknown Target\n"); } return T; } -// This functions add the hint as metadata in visc code -void addHint(Function *F, visc::Target T) { +// This functions add the hint as metadata in hpvm code +void addHint(Function *F, hpvm::Target T) { // Get Module Module *M = F->getParent(); DEBUG(errs() << "Set preferred target for " << F->getName() << ": "); @@ -434,17 +434,17 @@ void addHint(Function *F, visc::Target T) { // Based on the hint, get the hint metadata NamedMDNode *HintNode; switch (T) { - case visc::GPU_TARGET: + case hpvm::GPU_TARGET: DEBUG(errs() << "GPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu"); break; - case visc::CPU_TARGET: + case hpvm::CPU_TARGET: DEBUG(errs() << "CPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu"); break; - case visc::CPU_OR_GPU_TARGET: + case hpvm::CPU_OR_GPU_TARGET: DEBUG(errs() << "CPU or GPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"); break; default: llvm_unreachable("Unsupported Target Hint!"); @@ -457,8 +457,8 @@ void addHint(Function *F, visc::Target T) { HintNode->addOperand(N); } -// This function removes the hint as metadata in visc code -void removeHint(Function *F, 
visc::Target T) { +// This function removes the hint as metadata in hpvm code +void removeHint(Function *F, hpvm::Target T) { // Get Module Module *M = F->getParent(); DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T @@ -467,14 +467,14 @@ void removeHint(Function *F, visc::Target T) { // Based on the hint, get the hint metadata NamedMDNode *HintNode; switch (T) { - case visc::GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + case hpvm::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu"); break; - case visc::CPU_OR_GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + case hpvm::CPU_OR_GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"); break; - case visc::CPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + case hpvm::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu"); break; default: llvm_unreachable("Unsupported Target Hint!"); @@ -501,7 +501,7 @@ void removeHint(Function *F, visc::Target T) { } } -visc::Target getPreferredTarget(Function *F) { +hpvm::Target getPreferredTarget(Function *F) { DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); Module *M = F->getParent(); @@ -517,16 +517,16 @@ visc::Target getPreferredTarget(Function *F) { return false; }; - if (FoundPrefTarget("visc_hint_cpu")) - return visc::CPU_TARGET; - if (FoundPrefTarget("visc_hint_gpu")) - return visc::GPU_TARGET; - if (FoundPrefTarget("visc_hint_cpu_gpu")) - return visc::CPU_OR_GPU_TARGET; + if (FoundPrefTarget("hpvm_hint_cpu")) + return hpvm::CPU_TARGET; + if (FoundPrefTarget("hpvm_hint_gpu")) + return hpvm::GPU_TARGET; + if (FoundPrefTarget("hpvm_hint_cpu_gpu")) + return hpvm::CPU_OR_GPU_TARGET; - return visc::None; + return hpvm::None; } -} // namespace viscUtils +} // namespace hpvmUtils -#endif // VISC_UTILS_HEADER +#endif // HPVM_UTILS_HEADER diff --git a/hpvm/include/SupportVISC/VISCTimer.h 
b/hpvm/include/SupportVISC/VISCTimer.h deleted file mode 100644 index ce3dc8a5e0f7c77ff06fec5857f223ca4f0e142f..0000000000000000000000000000000000000000 --- a/hpvm/include/SupportVISC/VISCTimer.h +++ /dev/null @@ -1,151 +0,0 @@ -//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef VISC_TIMER_HEADER -#define VISC_TIMER_HEADER - -/************************** Timer Routines ***************************/ -extern "C" { - -/* A time or duration. */ -//#if _POSIX_VERSION >= 200112L -typedef unsigned long long visc_Timestamp; /* time in microseconds */ -//#else -//# error "Timestamps not implemented" -//#endif - -enum visc_TimerState { - visc_Timer_STOPPED, - visc_Timer_RUNNING, -}; - -struct visc_Timer { - enum visc_TimerState state; - visc_Timestamp elapsed; /* Amount of time elapsed so far */ - visc_Timestamp init; /* Beginning of the current time interval, - * if state is RUNNING. End of the last - * recorded time interfal otherwise. */ -}; - -/* Reset a timer. - * Use this to initialize a timer or to clear - * its elapsed time. The reset timer is stopped. - */ -void visc_ResetTimer(struct visc_Timer *timer); - -/* Start a timer. The timer is set to RUNNING mode and - * time elapsed while the timer is running is added to - * the timer. - * The timer should not already be running. - */ -void visc_StartTimer(struct visc_Timer *timer); - -/* Stop a timer. - * This stops adding elapsed time to the timer. - * The timer should not already be stopped. - */ -void visc_StopTimer(struct visc_Timer *timer); - -/* Get the elapsed time in seconds. */ -double visc_GetElapsedTime(struct visc_Timer *timer); - -/* Execution time is assigned to one of these categories. 
*/ -enum visc_TimerID { - visc_TimerID_NONE = 0, - visc_TimerID_IO, /* Time spent in input/output */ - visc_TimerID_KERNEL, /* Time spent computing on the device, - * recorded asynchronously */ - visc_TimerID_COPY, /* Time spent synchronously moving data - * to/from device and allocating/freeing - * memory on the device */ - visc_TimerID_DRIVER, /* Time spent in the host interacting with the - * driver, primarily for recording the time - * spent queueing asynchronous operations */ - visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ - visc_TimerID_COMPUTE, /* Time for all program execution other - * than parsing command line arguments, - * I/O, kernel, and copy */ - visc_TimerID_OVERLAP, /* Time double-counted in asynchronous and - * host activity: automatically filled in, - * not intended for direct usage */ - // GPU FUNCTION - visc_TimerID_INIT_CTX, - visc_TimerID_CLEAR_CTX, - visc_TimerID_COPY_SCALAR, - visc_TimerID_COPY_PTR, - visc_TimerID_MEM_FREE, - visc_TimerID_READ_OUTPUT, - visc_TimerID_SETUP, - visc_TimerID_MEM_TRACK, - visc_TimerID_MEM_UNTRACK, - visc_TimerID_MISC, - // LAUNCH FUNCTION - visc_TimerID_PTHREAD_CREATE, - visc_TimerID_ARG_PACK, - visc_TimerID_ARG_UNPACK, - visc_TimerID_COMPUTATION, - visc_TimerID_OUTPUT_PACK, - visc_TimerID_OUTPUT_UNPACK, - - visc_TimerID_LAST /* Number of timer IDs */ -}; - -/* Dynamic list of asynchronously tracked times between events */ -struct visc_async_time_marker_list { - char *label; // actually just a pointer to a string - enum visc_TimerID timerID; /* The ID to which the interval beginning - * with this marker should be attributed */ - void *marker; - // cudaEvent_t marker; /* The driver event for this marker */ - struct visc_async_time_marker_list *next; -}; - -struct visc_SubTimer { - char *label; - struct visc_Timer timer; - struct visc_SubTimer *next; -}; - -struct visc_SubTimerList { - struct visc_SubTimer *current; - struct visc_SubTimer *subtimer_list; -}; - -/* A set of timers for 
recording execution times. */ -struct visc_TimerSet { - enum visc_TimerID current; - struct visc_async_time_marker_list *async_markers; - visc_Timestamp async_begin; - visc_Timestamp wall_begin; - struct visc_Timer timers[visc_TimerID_LAST]; - struct visc_SubTimerList *sub_timer_list[visc_TimerID_LAST]; -}; - -/* Reset all timers in the set. */ -void visc_InitializeTimerSet(struct visc_TimerSet *timers); - -void visc_AddSubTimer(struct visc_TimerSet *timers, char *label, - enum visc_TimerID visc_Category); - -/* Select which timer the next interval of time should be accounted - * to. The selected timer is started and other timers are stopped. - * Using visc_TimerID_NONE stops all timers. */ -inline void visc_SwitchToTimer(struct visc_TimerSet *timers, - enum visc_TimerID timer); - -void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, - enum visc_TimerID category); - -/* Print timer values to standard output. */ -void visc_PrintTimerSet(struct visc_TimerSet *timers); - -/* Release timer resources */ -void visc_DestroyTimerSet(struct visc_TimerSet *timers); -} -#endif // VISC_RT_HEADER diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp index 058419f1dc80a8650e7a3b834090a88099741431..be3e6cae3dae775716fc3e2206879e978febddb0 100644 --- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp +++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp @@ -10,8 +10,8 @@ #define DEBUG_TYPE "buildDFG" #include "BuildDFG/BuildDFG.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCUtils.h" +#include "SupportHPVM/HPVMHint.h" +#include "SupportHPVM/HPVMUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/ValueSymbolTable.h" @@ -35,7 +35,7 @@ bool BuildDFG::runOnModule(Module &M) { for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &*i; // Grab pointer to Instruction - if (isViscLaunchIntrinsic(I)) { + if (isHPVMLaunchIntrinsic(I)) { DEBUG(errs() << 
"------------ Found launch site --------------\n"); II = cast<IntrinsicInst>(I); @@ -43,7 +43,7 @@ bool BuildDFG::runOnModule(Module &M) { // Intrinsic Instruction has been initialized from this point on. Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts()); - Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F)); + Root = DFInternalNode::Create(II, F, hpvmUtils::getPreferredTarget(F)); Roots.push_back(Root); BuildGraph(Root, F); @@ -118,37 +118,37 @@ void BuildDFG::removeElementFromHandleToDFEdgeMap(Value *V) { HandleToDFEdgeMap.erase(V); } -// Returns true if instruction I is a visc launch intrinsic, false otherwise -bool BuildDFG::isViscLaunchIntrinsic(Instruction *I) { +// Returns true if instruction I is a hpvm launch intrinsic, false otherwise +bool BuildDFG::isHPVMLaunchIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).equals("llvm.visc.launch"); + return (II->getCalledFunction()->getName()).equals("llvm.hpvm.launch"); } -// Returns true if instruction I is a visc graph intrinsic, false otherwise -bool BuildDFG::isViscGraphIntrinsic(Instruction *I) { +// Returns true if instruction I is a hpvm graph intrinsic, false otherwise +bool BuildDFG::isHPVMGraphIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") || - (II->getCalledFunction()->getName()).startswith("llvm.visc.bind"); + return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.create") || + (II->getCalledFunction()->getName()).startswith("llvm.hpvm.bind"); } -// Returns true if instruction I is a visc query intrinsic, false otherwise -bool BuildDFG::isViscQueryIntrinsic(Instruction *I) { +// Returns true if instruction I is a hpvm query intrinsic, false otherwise +bool BuildDFG::isHPVMQueryIntrinsic(Instruction *I) { 
if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.visc.get"); + return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.get"); } -// Returns true if instruction I is a visc intrinsic, false otherwise -bool BuildDFG::isViscIntrinsic(Instruction *I) { +// Returns true if instruction I is a hpvm intrinsic, false otherwise +bool BuildDFG::isHPVMIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.visc"); + return (II->getCalledFunction()->getName()).startswith("llvm.hpvm"); } // Two types are "congruent" if they are identical, or if they are both @@ -163,7 +163,7 @@ bool BuildDFG::isTypeCongruent(Type *L, Type *R) { return PL->getAddressSpace() == PR->getAddressSpace(); } -// Handles all the createNodeXX visc intrinsics. +// Handles all the createNodeXX hpvm intrinsics. void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { bool isInternalNode = false; @@ -173,7 +173,7 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { // internal node for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &*i; // Grab pointer to Instruction - if (isViscGraphIntrinsic(I)) + if (isHPVMGraphIntrinsic(I)) isInternalNode = true; } @@ -196,14 +196,14 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { // Create Internal DFNode, add it to the map and recursively build its // dataflow graph DFInternalNode *childDFNode = DFInternalNode::Create( - II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; BuildGraph(childDFNode, F); } else { // Create Leaf DFnode and add it to the map. 
DFLeafNode *childDFNode = DFLeafNode::Create( - II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; } @@ -336,11 +336,11 @@ void BuildDFG::handleBindOutput(DFInternalNode *N, IntrinsicInst *II) { void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) { DEBUG(errs() << "FUNCTION: " << F->getName() << "\n"); - // TODO: Place checks for valid visc functions. For example one of the - // check can be that any function that contains visc dataflow graph + // TODO: Place checks for valid hpvm functions. For example one of the + // check can be that any function that contains hpvm dataflow graph // construction intrinsics should not have other llvm IR statements. - // Iterate over all the instructions of a function and look for visc + // Iterate over all the instructions of a function and look for hpvm // intrinsics. for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &*i; // Grab pointer to Instruction @@ -349,25 +349,25 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) { DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " << II->getCalledFunction()->getName() << "\n"); switch (II->getIntrinsicID()) { - case Intrinsic::visc_createNode: - case Intrinsic::visc_createNode1D: - case Intrinsic::visc_createNode2D: - case Intrinsic::visc_createNode3D: + case Intrinsic::hpvm_createNode: + case Intrinsic::hpvm_createNode1D: + case Intrinsic::hpvm_createNode2D: + case Intrinsic::hpvm_createNode3D: handleCreateNode(N, II); break; - case Intrinsic::visc_createEdge: + case Intrinsic::hpvm_createEdge: handleCreateEdge(N, II); break; - case Intrinsic::visc_bind_input: + case Intrinsic::hpvm_bind_input: handleBindInput(N, II); break; - case Intrinsic::visc_bind_output: + case Intrinsic::hpvm_bind_output: handleBindOutput(N, II); break; // TODO: Reconsider launch within a 
dataflow graph (recursion?) - case Intrinsic::visc_wait: - case Intrinsic::visc_launch: + case Intrinsic::hpvm_wait: + case Intrinsic::hpvm_launch: DEBUG(errs() << "Error: Launch/wait intrinsic used within a dataflow graph\n\t" << *II << "\n"); @@ -375,7 +375,7 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) { default: DEBUG( - errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" + errs() << "Error: Invalid HPVM Intrinsic inside Internal node!\n\t" << *II << "\n"); break; } diff --git a/hpvm/lib/Transforms/CMakeLists.txt b/hpvm/lib/Transforms/CMakeLists.txt index 68724684e56648d307df52624e47ed7393bfd3f9..5c9b8b9fe026ea5612caa124535e02d28d619c53 100644 --- a/hpvm/lib/Transforms/CMakeLists.txt +++ b/hpvm/lib/Transforms/CMakeLists.txt @@ -2,5 +2,5 @@ add_subdirectory(BuildDFG) add_subdirectory(ClearDFG) add_subdirectory(DFG2LLVM_NVPTX) add_subdirectory(DFG2LLVM_X86) -add_subdirectory(GenVISC) +add_subdirectory(GenHPVM) add_subdirectory(LocalMem) diff --git a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp index 6dae9e6977d31a0b62a9fa903966ec10810a2f71..c23043e7829a8947a995f7ad97688091c46cf23d 100644 --- a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp +++ b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp @@ -18,7 +18,7 @@ using namespace llvm; using namespace builddfg; -// STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted"); +// STATISTIC(IntrinsicCounter, "Counts number of hpvm intrinsics greeted"); namespace { @@ -101,8 +101,8 @@ bool ClearDFG::runOnModule(Module &M) { // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - Function *VI = M.getFunction("llvm.visc.init"); - assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n"); + Function *VI = M.getFunction("llvm.hpvm.init"); + assert(VI->hasOneUse() && "More than one use of llvm.hpvm.init\n"); for (Value::user_iterator ui = 
VI->user_begin(), ue = VI->user_end(); ui != ue; ui++) { Instruction *I = dyn_cast<Instruction>(*ui); @@ -111,8 +111,8 @@ bool ClearDFG::runOnModule(Module &M) { VI->replaceAllUsesWith(UndefValue::get(VI->getType())); VI->eraseFromParent(); - Function *VC = M.getFunction("llvm.visc.cleanup"); - assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n"); + Function *VC = M.getFunction("llvm.hpvm.cleanup"); + assert(VC->hasOneUse() && "More than one use of llvm.hpvm.cleanup\n"); for (Value::user_iterator ui = VC->user_begin(), ue = VC->user_end(); ui != ue; ui++) { Instruction *I = dyn_cast<Instruction>(*ui); diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 8a36e3b8af5c031715d1e341f3ac166501c0a5b9..584da07e6e4786c8c1f06c89ff1cd2a8780f0cb2 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -15,40 +15,39 @@ #define SHARED_ADDRSPACE 3 #define DEBUG_TYPE "DFG2LLVM_NVPTX" +#include "SupportHPVM/DFG2LLVM.h" +#include "SupportHPVM/HPVMTimer.h" +#include "SupportHPVM/HPVMUtils.h" +#include "llvm-c/Core.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" #include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IR/Module.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Pass.h" #include "llvm/Support/FileSystem.h" -#include "llvm/IR/Attributes.h" -#include "llvm-c/Core.h" -#include "SupportVISC/VISCTimer.h" -#include "SupportVISC/DFG2LLVM.h" -#include "SupportVISC/VISCUtils.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include 
"llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/Support/ToolOutputFile.h" #include "llvm/IR/UseListOrder.h" - +#include "llvm/Support/ToolOutputFile.h" #include <sstream> using namespace llvm; using namespace builddfg; using namespace dfg2llvm; -using namespace viscUtils; +using namespace hpvmUtils; -// VISC Command line option to use timer or not -static cl::opt<bool> -VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers")); +// HPVM Command line option to use timer or not +static cl::opt<bool> HPVMTimer_NVPTX("hpvm-timers-ptx", + cl::desc("Enable hpvm timers")); namespace { // Helper class declarations @@ -57,94 +56,88 @@ namespace { // in bytes. Would have preferred to use tuple but support not yet available class OutputPtr { public: - OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) - : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} - Value* h_ptr; - Value* d_ptr; - Value* bytes; + Value *h_ptr; + Value *d_ptr; + Value *bytes; }; // Class to maintain important kernel info required for generating runtime // calls class Kernel { public: - Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap = - std::map<unsigned, unsigned>(), - std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap = - std::map<unsigned, std::pair<Value*, unsigned> >(), - std::vector<unsigned> _outArgMap = std::vector<unsigned>(), - unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), - unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) - : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), - sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim), - globalWGSize(_globalWGSize), blockDim(_blockDim), 
localWGSize(_localWGSize) { - - assert(gridDim == globalWGSize.size() - && "gridDim should be same as the size of vector globalWGSize"); - assert(blockDim == localWGSize.size() - && "blockDim should be same as the size of vector localWGSize"); + Kernel( + Function *_KF, DFLeafNode *_KLeafNode, + std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(), + std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap = + std::map<unsigned, std::pair<Value *, unsigned>>(), + std::vector<unsigned> _outArgMap = std::vector<unsigned>(), + unsigned _gridDim = 0, + std::vector<Value *> _globalWGSize = std::vector<Value *>(), + unsigned _blockDim = 0, + std::vector<Value *> _localWGSize = std::vector<Value *>()) + : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), + sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), + gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), + localWGSize(_localWGSize) { + + assert(gridDim == globalWGSize.size() && + "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() && + "blockDim should be same as the size of vector localWGSize"); } - Function* KernelFunction; - DFLeafNode* KernelLeafNode; + Function *KernelFunction; + DFLeafNode *KernelLeafNode; std::map<unsigned, unsigned> inArgMap; // Map for shared memory arguments - std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap; + std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap; // Fields for (potential) allocation node - DFLeafNode* AllocationNode; - Function* AllocationFunction; + DFLeafNode *AllocationNode; + Function *AllocationFunction; std::map<unsigned, unsigned> allocInArgMap; std::vector<unsigned> outArgMap; unsigned gridDim; - std::vector<Value*> globalWGSize; + std::vector<Value *> globalWGSize; unsigned blockDim; - std::vector<Value*> localWGSize; + std::vector<Value *> localWGSize; std::vector<int> localDimMap; - std::map<unsigned, unsigned> 
&getInArgMap() { - return inArgMap; - } - void setInArgMap(std::map<unsigned, unsigned> map) { - inArgMap = map; - } + std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; } + void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; } - std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() { + std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() { return sharedInArgMap; } - void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) { + void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) { sharedInArgMap = map; } - std::vector<unsigned> &getOutArgMap() { - return outArgMap; - } - void setOutArgMap(std::vector<unsigned> map) { - outArgMap = map; - } + std::vector<unsigned> &getOutArgMap() { return outArgMap; } + void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; } - void setLocalWGSize(std::vector<Value*> V) { - localWGSize = V; - } + void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; } - bool hasLocalWG() const { - return blockDim != 0; - } + bool hasLocalWG() const { return blockDim != 0; } }; // Helper function declarations -static bool canBePromoted(Argument* arg, Function* F); -static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*, - ValueToValueMapTy&, Instruction*); -static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&, - Instruction*, const Twine& WGName = "WGSize"); -static std::string getPTXFilename(const Module&); -static std::string getFilenameFromModule(const Module& M); +static bool canBePromoted(Argument *arg, Function *F); +static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&, + Kernel *, ValueToValueMapTy &, Instruction *); +static Value *genWorkGroupPtr(Module &M, std::vector<Value *>, + ValueToValueMapTy &, Instruction *, + const Twine &WGName = "WGSize"); +static std::string getPTXFilename(const Module &); +static std::string 
getFilenameFromModule(const Module &M); static void changeDataLayout(Module &); static void changeTargetTriple(Module &); static void findReturnInst(Function *, std::vector<ReturnInst *> &); -static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &); +static void findIntrinsicInst(Function *, Intrinsic::ID, + std::vector<IntrinsicInst *> &); static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID); static std::string getAtomicOpName(Intrinsic::ID); @@ -154,7 +147,6 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM { DFG2LLVM_NVPTX() : DFG2LLVM(ID) {} private: - public: bool runOnModule(Module &M); }; @@ -163,57 +155,60 @@ public: class CGT_NVPTX : public CodeGenTraversal { private: - //Member variables + // Member variables std::unique_ptr<Module> KernelM; - DFNode* KernelLaunchNode = NULL; - Kernel* kernel; - - // VISC Runtime API - FunctionCallee llvm_visc_ocl_launch; - FunctionCallee llvm_visc_ocl_wait; - FunctionCallee llvm_visc_ocl_initContext; - FunctionCallee llvm_visc_ocl_clearContext; - FunctionCallee llvm_visc_ocl_argument_shared; - FunctionCallee llvm_visc_ocl_argument_scalar; - FunctionCallee llvm_visc_ocl_argument_ptr; - FunctionCallee llvm_visc_ocl_output_ptr; - FunctionCallee llvm_visc_ocl_free; - FunctionCallee llvm_visc_ocl_getOutput; - FunctionCallee llvm_visc_ocl_executeNode; - - //Functions + DFNode *KernelLaunchNode = NULL; + Kernel *kernel; + + // HPVM Runtime API + FunctionCallee llvm_hpvm_ocl_launch; + FunctionCallee llvm_hpvm_ocl_wait; + FunctionCallee llvm_hpvm_ocl_initContext; + FunctionCallee llvm_hpvm_ocl_clearContext; + FunctionCallee llvm_hpvm_ocl_argument_shared; + FunctionCallee llvm_hpvm_ocl_argument_scalar; + FunctionCallee llvm_hpvm_ocl_argument_ptr; + FunctionCallee llvm_hpvm_ocl_output_ptr; + FunctionCallee llvm_hpvm_ocl_free; + FunctionCallee llvm_hpvm_ocl_getOutput; + FunctionCallee llvm_hpvm_ocl_executeNode; + + // Functions std::string getKernelsModuleName(Module &M); - void fixValueAddrspace(Value* V, 
unsigned addrspace); - std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*); - Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i); - void addCLMetadata(Function* F); - Function* transformFunctionToVoid(Function* F); - void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); + void fixValueAddrspace(Value *V, unsigned addrspace); + std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *, + Function *); + Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags, + unsigned i); + void addCLMetadata(Function *F); + Function *transformFunctionToVoid(Function *F); + void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName); // Virtual Functions void init() { - VISCTimer = VISCTimer_NVPTX; + HPVMTimer = HPVMTimer_NVPTX; TargetName = "NVPTX"; } void initRuntimeAPI(); - void codeGen(DFInternalNode* N); - void codeGen(DFLeafNode* N); + void codeGen(DFInternalNode *N); + void codeGen(DFLeafNode *N); public: - // Constructor - CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { + CGT_NVPTX(Module &_M, BuildDFG &_DFG) + : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { init(); initRuntimeAPI(); - errs() << "Old module pointer: " << &_M << "\n"; - errs() << "New module pointer: " << KernelM.get() << "\n"; + DEBUG(errs() << "Old module pointer: " << &_M << "\n"); + DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n"); - // Copying instead of creating new, in order to preserve required info (metadata) - // Remove functions, global variables and aliases - std::vector<GlobalVariable*> GVVect; + // Copying instead of creating new, in order to preserve required info + // (metadata) Remove functions, global variables and aliases + std::vector<GlobalVariable *> GVVect; for (Module::global_iterator mi = KernelM->global_begin(), - me = KernelM->global_end(); (mi != me); ++mi) { - GlobalVariable* GV = 
&*mi; + me = KernelM->global_end(); + (mi != me); ++mi) { + GlobalVariable *GV = &*mi; GVVect.push_back(GV); } for (auto *GV : GVVect) { @@ -221,10 +216,10 @@ public: GV->eraseFromParent(); } - std::vector<Function*> FuncVect; - for (Module::iterator mi = KernelM->begin(), - me = KernelM->end(); (mi != me); ++mi) { - Function* F = &*mi; + std::vector<Function *> FuncVect; + for (Module::iterator mi = KernelM->begin(), me = KernelM->end(); + (mi != me); ++mi) { + Function *F = &*mi; FuncVect.push_back(F); } for (auto *F : FuncVect) { @@ -232,10 +227,11 @@ public: F->eraseFromParent(); } - std::vector<GlobalAlias*> GAVect; + std::vector<GlobalAlias *> GAVect; for (Module::alias_iterator mi = KernelM->alias_begin(), - me = KernelM->alias_end(); (mi != me); ++mi) { - GlobalAlias* GA = &*mi; + me = KernelM->alias_end(); + (mi != me); ++mi) { + GlobalAlias *GA = &*mi; GAVect.push_back(GA); } for (auto *GA : GAVect) { @@ -246,73 +242,69 @@ public: changeDataLayout(*KernelM); changeTargetTriple(*KernelM); - DEBUG(errs() << *KernelM); - } void writeKernelsModule(); }; -// Initialize the VISC runtime API. This makes it easier to insert these calls +// Initialize the HPVM runtime API. 
This makes it easier to insert these calls void CGT_NVPTX::initRuntimeAPI() { // Load Runtime API Module SMDiagnostic Err; - char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; - Twine runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc"; + Twine runtimeAPI = + llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc"; runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - if(runtimeModule == nullptr) { + if (runtimeModule == nullptr) { DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n"); assert(false && "couldn't parse runtime"); - } - else - DEBUG(errs() << "Successfully loaded visc-rt API module\n"); + } else + DEBUG(errs() << "Successfully loaded hpvm-rt API module\n"); // Get or insert the global declarations for launch/wait functions - DECLARE(llvm_visc_ocl_launch); - DECLARE(llvm_visc_ocl_wait); - DECLARE(llvm_visc_ocl_initContext); - DECLARE(llvm_visc_ocl_clearContext); - DECLARE(llvm_visc_ocl_argument_shared); - DECLARE(llvm_visc_ocl_argument_scalar); - DECLARE(llvm_visc_ocl_argument_ptr); - DECLARE(llvm_visc_ocl_output_ptr); - DECLARE(llvm_visc_ocl_free); - DECLARE(llvm_visc_ocl_getOutput); - DECLARE(llvm_visc_ocl_executeNode); + DECLARE(llvm_hpvm_ocl_launch); + DECLARE(llvm_hpvm_ocl_wait); + DECLARE(llvm_hpvm_ocl_initContext); + DECLARE(llvm_hpvm_ocl_clearContext); + DECLARE(llvm_hpvm_ocl_argument_shared); + DECLARE(llvm_hpvm_ocl_argument_scalar); + DECLARE(llvm_hpvm_ocl_argument_ptr); + DECLARE(llvm_hpvm_ocl_output_ptr); + DECLARE(llvm_hpvm_ocl_free); + DECLARE(llvm_hpvm_ocl_getOutput); + DECLARE(llvm_hpvm_ocl_executeNode); // Get or insert timerAPI functions as well if you plan to use timers initTimerAPI(); // Insert init context in main DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n"); - Function* VI = 
M.getFunction("llvm.visc.init"); - assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); + Function *VI = M.getFunction("llvm.hpvm.init"); + assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once"); InitCall = cast<Instruction>(*VI->user_begin()); initializeTimerSet(InitCall); - switchToTimer(visc_TimerID_INIT_CTX, InitCall); - CallInst::Create(llvm_visc_ocl_initContext, - ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)), - "", InitCall); - switchToTimer(visc_TimerID_NONE, InitCall); + switchToTimer(hpvm_TimerID_INIT_CTX, InitCall); + CallInst::Create(llvm_hpvm_ocl_initContext, + ArrayRef<Value *>(getTargetID(M, hpvm::GPU_TARGET)), "", + InitCall); + switchToTimer(hpvm_TimerID_NONE, InitCall); - // Insert print instruction at visc exit + // Insert print instruction at hpvm exit DEBUG(errs() << "Gen Code to print NVPTX Timer\n"); - Function* VC = M.getFunction("llvm.visc.cleanup"); + Function *VC = M.getFunction("llvm.hpvm.cleanup"); DEBUG(errs() << *VC << "\n"); - assert(VC->getNumUses() == 1 && "__visc__clear should only be used once"); + assert(VC->getNumUses() == 1 && "__hpvm__clear should only be used once"); CleanupCall = cast<Instruction>(*VC->user_begin()); printTimerSet(CleanupCall); - - } // Generate Code to call the kernel @@ -320,36 +312,37 @@ void CGT_NVPTX::initRuntimeAPI() { // used to generate a function to associate with this leaf node. The function // is responsible for all the memory allocation/transfer and invoking the // kernel call on the device -void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { +void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, + const Twine &FileName) { // Check if clone already exists. If it does, it means we have visited this // function before. 
-// assert(N->getGenFunc() == NULL && "Code already generated for this node"); + // assert(N->getGenFunc() == NULL && "Code already generated for this node"); - assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL && + assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) == NULL && "Code already generated for this node"); // Useful values - Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); - Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); // If kernel struct has not been initialized with kernel function, then fail assert(K != NULL && "No kernel found!!"); DEBUG(errs() << "Generating kernel call code\n"); - Function* F = N->getFuncPointer(); - + Function *F = N->getFuncPointer(); // Create of clone of F with no instructions. Only the type is the same as F // without the extra arguments. - Function* F_X86; + Function *F_X86; // Clone the function, if we are seeing this function for the first time. We // only need a clone in terms of type. ValueToValueMapTy VMap; // Create new function with the same type - F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + F_X86 = + Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); // Loop over the arguments, copying the names of arguments over. 
Function::arg_iterator dest_iterator = F_X86->arg_begin(); @@ -362,26 +355,25 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi // Add a basic block to this empty function BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86); - ReturnInst* RI = ReturnInst::Create(M.getContext(), - UndefValue::get(F_X86->getReturnType()), BB); + ReturnInst *RI = ReturnInst::Create( + M.getContext(), UndefValue::get(F_X86->getReturnType()), BB); // FIXME: Adding Index and Dim arguments are probably not required except // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do // have those arguments) // Add Index and Dim arguments except for the root node - if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + if (!N->isRoot() && !N->getParent()->isChildGraphStreaming()) F_X86 = addIdxDimArgs(F_X86); BB = &*F_X86->begin(); RI = cast<ReturnInst>(BB->getTerminator()); - //Add the generated function info to DFNode -// N->setGenFunc(F_X86, visc::CPU_TARGET); - N->addGenFunc(F_X86, visc::GPU_TARGET, true); - errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " - << N->getFuncPointer()->getName() << "\n"; - + // Add the generated function info to DFNode + // N->setGenFunc(F_X86, hpvm::CPU_TARGET); + N->addGenFunc(F_X86, hpvm::GPU_TARGET, true); + DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " + << N->getFuncPointer()->getName() << "\n"); // Loop over the arguments, to create the VMap dest_iterator = F_X86->arg_begin(); @@ -414,51 +406,53 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi break; } - assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); + assert(C->isDummyNode() == false && "Internal Node only contains dummy + nodes!"); Function* CF = C->getFuncPointer(); */ - Function* KF = K->KernelLeafNode->getFuncPointer(); + Function *KF = K->KernelLeafNode->getFuncPointer(); // Initialize context - //DEBUG(errs() 
<< "Initializing context" << "\n"); - //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); + // DEBUG(errs() << "Initializing context" << "\n"); + // CallInst::Create(llvm_hpvm_ocl_initContext, None, "", RI); - DEBUG(errs() << "Initializing commandQ" << "\n"); + DEBUG(errs() << "Initializing commandQ" + << "\n"); // Initialize command queue - switchToTimer(visc_TimerID_SETUP, InitCall); - Value* fileStr = getStringPointer(FileName, InitCall, "Filename"); + switchToTimer(hpvm_TimerID_SETUP, InitCall); + Value *fileStr = getStringPointer(FileName, InitCall, "Filename"); DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); - DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n"); - Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName"); - - Value* LaunchInstArgs[] = {fileStr, kernelStr}; - - DEBUG(errs() << "Inserting launch call" << "\n"); - CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, - ArrayRef<Value*>(LaunchInstArgs, 2), - "graph"+KF->getName(), - InitCall); + DEBUG(errs() << "Generating code for kernel - " + << K->KernelFunction->getName() << "\n"); + Value *kernelStr = + getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName"); + + Value *LaunchInstArgs[] = {fileStr, kernelStr}; + + DEBUG(errs() << "Inserting launch call" + << "\n"); + CallInst *NVPTX_Ctx = CallInst::Create(llvm_hpvm_ocl_launch, + ArrayRef<Value *>(LaunchInstArgs, 2), + "graph" + KF->getName(), InitCall); DEBUG(errs() << *NVPTX_Ctx << "\n"); - GraphIDAddr = new GlobalVariable(M, - NVPTX_Ctx->getType(), - false, + GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false, GlobalValue::CommonLinkage, Constant::getNullValue(NVPTX_Ctx->getType()), - "graph"+KF->getName()+".addr"); + "graph" + KF->getName() + ".addr"); DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n"); - StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); + StoreInst *SI = new 
StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); DEBUG(errs() << *SI << "\n"); - switchToTimer(visc_TimerID_NONE, InitCall); - switchToTimer(visc_TimerID_SETUP, RI); - Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI); + switchToTimer(hpvm_TimerID_NONE, InitCall); + switchToTimer(hpvm_TimerID_SETUP, RI); + Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI); - // Iterate over the required input edges of the node and use the visc-rt API + // Iterate over the required input edges of the node and use the hpvm-rt API // to set inputs - DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); + DEBUG(errs() << "Iterate over input edges of node and insert hpvm api\n"); std::vector<OutputPtr> OutputPointers; - // Vector to hold the device memory object that need to be cleared before we release - // context - std::vector<Value*> DevicePointers; + // Vector to hold the device memory object that need to be cleared before we + // release context + std::vector<Value *> DevicePointers; std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap(); /* @@ -470,133 +464,134 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi */ - for(auto &InArgMapPair : kernelInArgMap) { + for (auto &InArgMapPair : kernelInArgMap) { unsigned i = InArgMapPair.first; - Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second); - DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); + Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second); + DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n"); // input value has been obtained. // Check if input is a scalar value or a pointer operand // For scalar values such as int, float, etc. 
the size is simply the size of // type on target machine, but for pointers, the size of data would be the // next integer argument - if(inputVal->getType()->isPointerTy()) { + if (inputVal->getType()->isPointerTy()) { - switchToTimer(visc_TimerID_COPY_PTR, RI); + switchToTimer(hpvm_TimerID_COPY_PTR, RI); // Pointer Input // CheckAttribute - Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False; - Value* isInput = ((hasAttribute(KF, i, Attribute::Out)) - && !(hasAttribute(KF, i, Attribute::In)))? False : True; - - Argument* A = getArgumentAt(KF, i); - if(isOutput == True) { + Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False; + Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) && + !(hasAttribute(KF, i, Attribute::In))) + ? False + : True; + + Argument *A = getArgumentAt(KF, i); + if (isOutput == True) { DEBUG(errs() << *A << " is an OUTPUT argument\n"); } - if(isInput == True) { + if (isInput == True) { DEBUG(errs() << *A << " is an INPUT argument\n"); } - - Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, - Type::getInt8PtrTy(M.getContext()), - inputVal->getName()+".i8ptr", - RI); + Value *inputValI8Ptr = CastInst::CreatePointerCast( + inputVal, Type::getInt8PtrTy(M.getContext()), + inputVal->getName() + ".i8ptr", RI); // Assert that the pointer argument size (next argument) is in the map - assert(kernelInArgMap.find(i+1) != kernelInArgMap.end()); - - Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]); - assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) - && "Pointer type input must always be followed by size (integer type)"); - Value* setInputArgs[] = {GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - inputSize, - isInput, - isOutput - }; - Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, - ArrayRef<Value*>(setInputArgs, 6), "", RI); + assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end()); + + Value *inputSize = getArgumentAt(F_X86, 
kernelInArgMap[i + 1]); + assert( + inputSize->getType() == Type::getInt64Ty(M.getContext()) && + "Pointer type input must always be followed by size (integer type)"); + Value *setInputArgs[] = { + GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()), i), + inputSize, + isInput, + isOutput}; + Value *d_ptr = + CallInst::Create(llvm_hpvm_ocl_argument_ptr, + ArrayRef<Value *>(setInputArgs, 6), "", RI); DevicePointers.push_back(d_ptr); // If this has out attribute, store the returned device pointer in // memory to read device memory later - if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); - } - else { - switchToTimer(visc_TimerID_COPY_SCALAR, RI); + if (isOutput == True) + OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + } else { + switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); // Scalar Input // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI); - StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); - - Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, - Type::getInt8PtrTy(M.getContext()), - inputVal->getName()+".i8ptr", - RI); - - Value* setInputArgs[] = {GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - ConstantExpr::getSizeOf(inputVal->getType()) - }; - CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + AllocaInst *inputValPtr = new AllocaInst( + inputVal->getType(), 0, inputVal->getName() + ".ptr", RI); + StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI); + + Value *inputValI8Ptr = CastInst::CreatePointerCast( + inputValPtr, Type::getInt8PtrTy(M.getContext()), + inputVal->getName() + ".i8ptr", RI); + + Value *setInputArgs[] = { + GraphID, inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()), i), + 
ConstantExpr::getSizeOf(inputVal->getType())}; + CallInst::Create(llvm_hpvm_ocl_argument_scalar, + ArrayRef<Value *>(setInputArgs, 4), "", RI); } } - DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n"); + DEBUG( + errs() << "Setup shared memory arguments of node and insert hpvm api\n"); // Check to see if all the allocation sizes are constant (determined // statically) bool constSizes = true; - for (auto& e: K->getSharedInArgMap()) { + for (auto &e : K->getSharedInArgMap()) { constSizes &= isa<Constant>(e.second.first); } // If the sizes are all constant if (constSizes) { - for (auto& e: K->getSharedInArgMap()) { + for (auto &e : K->getSharedInArgMap()) { unsigned argNum = e.first; - Value* allocSize = e.second.first; + Value *allocSize = e.second.first; - DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at " << argNum + << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position - switchToTimer(visc_TimerID_COPY_SCALAR, RI); + switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); - assert(isa<Constant>(allocSize) && "Constant shared memory size is expected"); + assert(isa<Constant>(allocSize) && + "Constant shared memory size is expected"); - Value* setInputArgs[] = {GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - allocSize - }; - CallInst::Create(llvm_visc_ocl_argument_shared, - ArrayRef<Value*>(setInputArgs, 3), "", RI); - } - else { + Value *setInputArgs[] = { + GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + allocSize}; + CallInst::Create(llvm_hpvm_ocl_argument_shared, + ArrayRef<Value *>(setInputArgs, 3), "", RI); + } else { // Sharem memory size argument - scalar at address position - switchToTimer(visc_TimerID_COPY_SCALAR, RI); + switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass the 
pointer to its // location - AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, - allocSize->getName()+".sharedMem.ptr", RI); - StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, - Type::getInt8PtrTy(M.getContext()), - allocSize->getName()+".sharedMem.i8ptr", - RI); - - Value* setInputArgs[] = {GraphID, - allocSizeI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - ConstantExpr::getSizeOf(allocSize->getType()) - }; - CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + AllocaInst *allocSizePtr = + new AllocaInst(allocSize->getType(), 0, + allocSize->getName() + ".sharedMem.ptr", RI); + StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value *allocSizeI8Ptr = CastInst::CreatePointerCast( + allocSizePtr, Type::getInt8PtrTy(M.getContext()), + allocSize->getName() + ".sharedMem.i8ptr", RI); + + Value *setInputArgs[] = { + GraphID, allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + ConstantExpr::getSizeOf(allocSize->getType())}; + CallInst::Create(llvm_hpvm_ocl_argument_scalar, + ArrayRef<Value *>(setInputArgs, 4), "", RI); } } } else { @@ -617,68 +612,64 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi ExtractValueInstVec.push_back(EI); } - for (auto& e: K->getSharedInArgMap()) { + for (auto &e : K->getSharedInArgMap()) { unsigned argNum = e.first; - Value* allocSize = ExtractValueInstVec[e.second.second/2]; + Value *allocSize = ExtractValueInstVec[e.second.second / 2]; - DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at " << argNum + << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position - switchToTimer(visc_TimerID_COPY_SCALAR, RI); - - Value* setInputArgs[] = 
{GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - allocSize - }; - CallInst::Create(llvm_visc_ocl_argument_shared, - ArrayRef<Value*>(setInputArgs, 3), "", RI); - } - else { + switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); + + Value *setInputArgs[] = { + GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + allocSize}; + CallInst::Create(llvm_hpvm_ocl_argument_shared, + ArrayRef<Value *>(setInputArgs, 3), "", RI); + } else { // Sharem memory size argument - scalar at address position - switchToTimer(visc_TimerID_COPY_SCALAR, RI); + switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, - allocSize->getName()+".sharedMem.ptr", RI); - StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, - Type::getInt8PtrTy(M.getContext()), - allocSize->getName()+".sharedMem.i8ptr", - RI); - - Value* setInputArgs[] = {GraphID, - allocSizeI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - ConstantExpr::getSizeOf(allocSize->getType()) - }; - CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + AllocaInst *allocSizePtr = + new AllocaInst(allocSize->getType(), 0, + allocSize->getName() + ".sharedMem.ptr", RI); + StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value *allocSizeI8Ptr = CastInst::CreatePointerCast( + allocSizePtr, Type::getInt8PtrTy(M.getContext()), + allocSize->getName() + ".sharedMem.i8ptr", RI); + + Value *setInputArgs[] = { + GraphID, allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + ConstantExpr::getSizeOf(allocSize->getType())}; + CallInst::Create(llvm_hpvm_ocl_argument_scalar, + ArrayRef<Value *>(setInputArgs, 4), "", RI); } } } - - DEBUG(errs() << "Setup output edges of node and insert visc api\n"); + 
DEBUG(errs() << "Setup output edges of node and insert hpvm api\n"); // Set output if struct is not an empty struct - StructType* OutputTy = K->KernelLeafNode->getOutputType(); - std::vector<Value*> d_Outputs; - if(!OutputTy->isEmptyTy()) { - switchToTimer(visc_TimerID_COPY_PTR, RI); + StructType *OutputTy = K->KernelLeafNode->getOutputType(); + std::vector<Value *> d_Outputs; + if (!OutputTy->isEmptyTy()) { + switchToTimer(hpvm_TimerID_COPY_PTR, RI); // Not an empty struct // Iterate over all elements of the struct and put them in - for(unsigned i=0; i < OutputTy->getNumElements(); i++) { - unsigned outputIndex = KF->getFunctionType()->getNumParams()+i; - Value* setOutputArgs[] = {GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), - ConstantExpr::getSizeOf(OutputTy->getElementType(i)) - }; - - CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, - ArrayRef<Value*>(setOutputArgs, 3), - "d_output."+KF->getName(), - RI); + for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { + unsigned outputIndex = KF->getFunctionType()->getNumParams() + i; + Value *setOutputArgs[] = { + GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex), + ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; + + CallInst *d_Output = CallInst::Create(llvm_hpvm_ocl_output_ptr, + ArrayRef<Value *>(setOutputArgs, 3), + "d_output." + KF->getName(), RI); d_Outputs.push_back(d_Output); } } @@ -688,50 +679,41 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi // Allocate size_t[numDims] space on stack. 
Store the work group sizes and // pass it as an argument to ExecNode - switchToTimer(visc_TimerID_MISC, RI); + switchToTimer(hpvm_TimerID_MISC, RI); Value *workDim, *LocalWGPtr, *GlobalWGPtr; getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI); - switchToTimer(visc_TimerID_KERNEL, RI); - Value* ExecNodeArgs[] = {GraphID, - workDim, - LocalWGPtr, - GlobalWGPtr - }; - CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, - ArrayRef<Value*>(ExecNodeArgs, 4), - "event."+KF->getName(), - RI); + switchToTimer(hpvm_TimerID_KERNEL, RI); + Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr}; + CallInst *Event = CallInst::Create(llvm_hpvm_ocl_executeNode, + ArrayRef<Value *>(ExecNodeArgs, 4), + "event." + KF->getName(), RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); // Wait for Kernel to Finish - CallInst::Create(llvm_visc_ocl_wait, - ArrayRef<Value*>(GraphID), - "", - RI); + CallInst::Create(llvm_hpvm_ocl_wait, ArrayRef<Value *>(GraphID), "", RI); - switchToTimer(visc_TimerID_READ_OUTPUT, RI); + switchToTimer(hpvm_TimerID_READ_OUTPUT, RI); // Read Output Struct if not empty - if(!OutputTy->isEmptyTy()) { - std::vector<Value*>h_Outputs; - Value* KernelOutput = UndefValue::get(OutputTy); - for(unsigned i=0; i < OutputTy->getNumElements(); i++) { - Value* GetOutputArgs[] = {GraphID, - Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - d_Outputs[i], - ConstantExpr::getSizeOf(OutputTy->getElementType(i)) - }; - CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput, - ArrayRef<Value*>(GetOutputArgs, 4), - "h_output."+KF->getName()+".addr", - RI); + if (!OutputTy->isEmptyTy()) { + std::vector<Value *> h_Outputs; + Value *KernelOutput = UndefValue::get(OutputTy); + for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { + Value *GetOutputArgs[] = { + GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; + CallInst 
*h_Output = CallInst::Create( + llvm_hpvm_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4), + "h_output." + KF->getName() + ".addr", RI); // Read each device pointer listed in output struct // Load the output struct - CastInst* BI = BitCastInst::CreatePointerCast(h_Output, - OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI); - - Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI); - KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i), - KF->getName()+"output", RI); + CastInst *BI = BitCastInst::CreatePointerCast( + h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr", + RI); + + Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI); + KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, + ArrayRef<unsigned>(i), + KF->getName() + "output", RI); } OutputMap[K->KernelLeafNode] = KernelOutput; } @@ -746,75 +728,76 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n"); DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); - Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; - CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, + output.bytes}; CallInst* CI = CallInst::Create(llvm_hpvm_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "", RI); }*/ - switchToTimer(visc_TimerID_MEM_FREE, RI); + switchToTimer(hpvm_TimerID_MEM_FREE, RI); // Clear Context and free device memory - DEBUG(errs() << "Clearing context" << "\n"); + DEBUG(errs() << "Clearing context" + << "\n"); // Free Device Memory - for(auto d_ptr: DevicePointers) { - CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); + for (auto d_ptr : DevicePointers) { + CallInst::Create(llvm_hpvm_ocl_free, ArrayRef<Value *>(d_ptr), "", RI); } - switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); + 
switchToTimer(hpvm_TimerID_CLEAR_CTX, CleanupCall); // Clear Context - LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); - CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); - switchToTimer(visc_TimerID_NONE, CleanupCall); + LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall); + CallInst::Create(llvm_hpvm_ocl_clearContext, ArrayRef<Value *>(LI), "", + CleanupCall); + switchToTimer(hpvm_TimerID_NONE, CleanupCall); - switchToTimer(visc_TimerID_MISC, RI); + switchToTimer(hpvm_TimerID_MISC, RI); DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings // Get Exit node - DFNode* C = N->getChildGraph()->getExit(); + DFNode *C = N->getChildGraph()->getExit(); // Get OutputType of this node - StructType* OutTy = N->getOutputType(); + StructType *OutTy = N->getOutputType(); Value *retVal = UndefValue::get(F_X86->getReturnType()); // Find the kernel's output arg map, to use instead of the bindings std::vector<unsigned> outArgMap = kernel->getOutArgMap(); // Find all the input edges to exit node - for (unsigned i=0; i < OutTy->getNumElements(); i++) { + for (unsigned i = 0; i < OutTy->getNumElements(); i++) { DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port - DFEdge* E = C->getInDFEdgeAt(i); + DFEdge *E = C->getInDFEdgeAt(i); assert(E && "No Binding for output element!"); // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); + DFNode *SrcDF = E->getSourceDF(); - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() + << "\n"); // If Source DFNode is a dummyNode, edge is from parent. 
Get the // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { + Value *inputVal; + if (SrcDF->isEntryNode()) { inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { + DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); + } else { // edge is from a internal node // Check - code should already be generated for this source dfnode // FIXME: Since the 2-level kernel code gen has aspecific structure, we // can assume the SrcDF is same as Kernel Leaf node. // Use outArgMap to get correct mapping SrcDF = K->KernelLeafNode; - assert(OutputMap.count(SrcDF) - && "Source node call not found. Dependency violation!"); + assert(OutputMap.count(SrcDF) && + "Source node call not found. Dependency violation!"); // Find Output Value associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; + Value *CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction std::vector<unsigned> IndexList; // i is the destination of DFEdge E // Use the mapping instead of the bindings -// IndexList.push_back(E->getSourcePosition()); + // IndexList.push_back(E->getSourcePosition()); IndexList.push_back(outArgMap[i]); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "",RI); + DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n"); + ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI); inputVal = EI; } std::vector<unsigned> IdxList; @@ -823,31 +806,33 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi } DEBUG(errs() << "Extracted all\n"); - switchToTimer(visc_TimerID_NONE, RI); + switchToTimer(hpvm_TimerID_NONE, RI); retVal->setName("output"); - ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), 
retVal); ReplaceInstWithInst(RI, newRI); } - // Right now, only targeting the one level case. In general, device functions // can return values so we don't need to change them -void CGT_NVPTX::codeGen(DFInternalNode* N) { - errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n"; - if(KernelLaunchNode == NULL) - errs () << "No kernel launch node\n"; +void CGT_NVPTX::codeGen(DFInternalNode *N) { + DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName() + << "\n"); + if (KernelLaunchNode == NULL) + DEBUG(errs() << "No kernel launch node\n"); else { - errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; + DEBUG(errs() << "KernelLaunchNode: " + << KernelLaunchNode->getFuncPointer()->getName() << "\n"); } if (!KernelLaunchNode) { - DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); + DEBUG(errs() + << "No code generated (host code for kernel launch complete).\n"); return; } if (N == KernelLaunchNode) { DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); - //TODO + // TODO // Now the remaining nodes to be visited should be ignored KernelLaunchNode = NULL; @@ -862,7 +847,8 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // TODO: Structure assumed: one thread node, one allocation node (at most), // TB node std::map<unsigned, unsigned> inmapFinal; - for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end(); + for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), + ie = inmap2.end(); ib != ie; ++ib) { inmapFinal[ib->first] = inmap1[ib->second]; } @@ -879,8 +865,9 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // 0 ... 
outmap2.size()-1 // The limit is the size of outmap2, because this is the number of kernel // output arguments for which the mapping matters - // For now, it reasonable to assume that all the kernel arguments are returned, - // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size() + // For now, it reasonable to assume that all the kernel arguments are + // returned, maybe plys some others from other nodes, thus outmap2.size() <= + // outmap1.size() for (unsigned i = 0; i < outmap2.size(); i++) { outmap1[i] = outmap2[outmap1[i]]; } @@ -888,15 +875,14 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // Track the source of local dimlimits for the kernel // Dimension limit can either be a constant or an argument of parent - // function. Since Internal node would no longer exist, we need to insert the - // localWGSize with values from the parent of N. - std::vector<Value*> localWGSizeMapped; + // function. Since Internal node would no longer exist, we need to insert + // the localWGSize with values from the parent of N. + std::vector<Value *> localWGSizeMapped; for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { if (isa<Constant>(kernel->localWGSize[i])) { // if constant, use as it is localWGSizeMapped.push_back(kernel->localWGSize[i]); - } - else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { + } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { // if argument, find the argument location in N. Use InArgMap of N to // find the source location in Parent of N. Retrieve the argument from // parent to insert in the vector. 
@@ -906,46 +892,49 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { assert(N->getInArgMap().find(argNum) != N->getInArgMap().end()); unsigned parentArgNum = N->getInArgMap()[argNum]; - Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); + Argument *A = + getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); localWGSizeMapped.push_back(A); - } - else { - assert(false && "LocalWGsize using value which is neither argument nor constant!"); + } else { + assert( + false && + "LocalWGsize using value which is neither argument nor constant!"); } } // Update localWGSize vector of kernel kernel->setLocalWGSize(localWGSizeMapped); } - } -void CGT_NVPTX::codeGen(DFLeafNode* N) { - errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n"; +void CGT_NVPTX::codeGen(DFLeafNode *N) { + DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName() + << "\n"); // Skip code generation if it is a dummy node - if(N->isDummyNode()) { + if (N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); return; } // Skip code generation if it is an allocation node - if(N->isAllocationNode()) { + if (N->isAllocationNode()) { DEBUG(errs() << "Skipping allocation node\n"); return; } // Generate code only if it has the right hint -// if(!checkPreferredTarget(N, visc::GPU_TARGET)) { -// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; -// return; -// } - if(!preferredTargetIncludes(N, visc::GPU_TARGET)) { - errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + // if(!checkPreferredTarget(N, hpvm::GPU_TARGET)) { + // errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + // return; + // } + if (!preferredTargetIncludes(N, hpvm::GPU_TARGET)) { + DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName() + << "\n"); return; } // Checking which node is the kernel launch - DFNode* PNode = N->getParent(); + DFNode *PNode = N->getParent(); int pLevel = PNode->getLevel(); int 
pReplFactor = PNode->getNumOfDim(); @@ -953,42 +942,40 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // (1) Parent is the top level node i.e., Root of DFG // OR // (2) Parent does not have multiple instances - errs() << "pLevel = " << pLevel << "\n"; - errs() << "pReplFactor = " << pReplFactor << "\n"; + DEBUG(errs() << "pLevel = " << pLevel << "\n"); + DEBUG(errs() << "pReplFactor = " << pReplFactor << "\n"); assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node."); // Only these options are supported - enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy; - if(pLevel == 1 || !pReplFactor) { - errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n"; + enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy; + if (pLevel == 1 || !pReplFactor) { + DEBUG(errs() + << "*************** Kernel Gen: 1-Level Hierarchy **************\n"); SelectedHierarchy = ONE_LEVEL; KernelLaunchNode = PNode; - kernel = new Kernel(NULL, - N, - N->getInArgMap(), - N->getSharedInArgMap(), - N->getOutArgMap(), - N->getNumOfDim(), - N->getDimLimits()); - } - else { + kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(), + N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits()); + } else { // Converting a 2-level DFG to opencl kernel - errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n"; - assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node."); + DEBUG(errs() + << "*************** Kernel Gen: 2-Level Hierarchy **************\n"); + assert((pLevel >= 2) && + "Selected node not nested deep enough to be Kernel Node."); SelectedHierarchy = TWO_LEVEL; KernelLaunchNode = PNode->getParent(); - assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); + assert((PNode->getNumOfDim() == N->getNumOfDim()) && + "Dimension number must match"); // Contains the instructions generating the kernel configuration parameters - kernel = new Kernel(NULL, // kernel function - 
N, // kernel leaf node - N->getInArgMap(), // kenel argument mapping + kernel = new Kernel(NULL, // kernel function + N, // kernel leaf node + N->getInArgMap(), // kenel argument mapping N->getSharedInArgMap(), - N->getOutArgMap(), // kernel output mapping from the leaf to the interemediate node - PNode->getNumOfDim(), // gridDim - PNode->getDimLimits(),// grid size - N->getNumOfDim(), // blockDim - N->getDimLimits()); // block size - + N->getOutArgMap(), // kernel output mapping from the + // leaf to the interemediate node + PNode->getNumOfDim(), // gridDim + PNode->getDimLimits(), // grid size + N->getNumOfDim(), // blockDim + N->getDimLimits()); // block size } std::vector<Instruction *> IItoRemove; @@ -1000,58 +987,62 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // Look up if we have visited this function before. If we have, then just // get the cloned function pointer from DFNode. Otherwise, create the cloned // function and add it to the DFNode GenFunc. -// Function *F_nvptx = N->getGenFunc(); - Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET); + // Function *F_nvptx = N->getGenFunc(); + Function *F_nvptx = N->getGenFuncForTarget(hpvm::GPU_TARGET); - assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated"); + assert(F_nvptx == NULL && + "Error: Visiting a node for which code already generated"); // Clone the function ValueToValueMapTy VMap; - //F_nvptx->setName(FName+"_nvptx"); + // F_nvptx->setName(FName+"_nvptx"); Twine FName = F->getName(); StringRef fStr = FName.getSingleStringRef(); - Twine newFName = Twine(fStr, "_nvptx"); + Twine newFName = Twine(fStr, "_nvptx"); F_nvptx = CloneFunction(F, VMap); F_nvptx->setName(newFName); - // errs() << "Old Function Name: " << F->getName() << "\n"; // errs() << "New Function Name: " << F_nvptx->getName() << "\n"; F_nvptx->removeFromParent(); - // Insert the cloned function into the kernels module KernelM->getFunctionList().push_back(F_nvptx); - - //TODO: Iterate over all the 
instructions of F_nvptx and identify the - //callees and clone them into this module. + // TODO: Iterate over all the instructions of F_nvptx and identify the + // callees and clone them into this module. DEBUG(errs() << *F_nvptx->getType()); DEBUG(errs() << *F_nvptx); // Transform the function to void and remove all target dependent attributes // from the function F_nvptx = transformFunctionToVoid(F_nvptx); - - //Add generated function info to DFNode -// N->setGenFunc(F_nvptx, visc::GPU_TARGET); - N->addGenFunc(F_nvptx, visc::GPU_TARGET, false); - DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n"); - F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes()); + // Add generated function info to DFNode + // N->setGenFunc(F_nvptx, hpvm::GPU_TARGET); + N->addGenFunc(F_nvptx, hpvm::GPU_TARGET, false); + + DEBUG( + errs() + << "Removing all attributes from Kernel Function and adding nounwind\n"); + F_nvptx->removeAttributes(AttributeList::FunctionIndex, + F_nvptx->getAttributes().getFnAttributes()); F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind); - //FIXME: For now, assume only one allocation node + // FIXME: For now, assume only one allocation node kernel->AllocationNode = NULL; - for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end(); + for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), + iee = N->indfedge_end(); ieb != iee; ++ieb) { DFNode *SrcDFNode = (*ieb)->getSourceDF(); - DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Found edge from node: " + << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n"); + DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode() + << "\n"); if 
(!SrcDFNode->isDummyNode()) { assert(SrcDFNode->isAllocationNode()); kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode); @@ -1065,19 +1056,22 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // If no allocation node was found, SharedMemArgs is empty if (kernel->AllocationNode) { + ValueToValueMapTy VMap; - Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); - //F_alloc->removeFromParent(); + Function *F_alloc = + CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); + // F_alloc->removeFromParent(); // Insert the cloned function into the kernels module - //M.getFunctionList().push_back(F_alloc); + // M.getFunctionList().push_back(F_alloc); - std::vector<IntrinsicInst *> ViscMallocInstVec; - findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec); + std::vector<IntrinsicInst *> HPVMMallocInstVec; + findIntrinsicInst(F_alloc, Intrinsic::hpvm_malloc, HPVMMallocInstVec); - for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) { - IntrinsicInst *II = ViscMallocInstVec[i]; - assert(II->hasOneUse() && "visc_malloc result is used more than once"); - II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); + for (unsigned i = 0; i < HPVMMallocInstVec.size(); i++) { + IntrinsicInst *II = HPVMMallocInstVec[i]; + assert(II->hasOneUse() && "hpvm_malloc result is used more than once"); + II->replaceAllUsesWith( + ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); II->eraseFromParent(); } kernel->AllocationFunction = F_alloc; @@ -1092,15 +1086,19 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { assert(RetStructTy && "Allocation node does not return a struct type"); unsigned numFields = RetStructTy->getNumElements(); */ - std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap(); - AllocationNodeProperty* APN = - (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation); - for (auto& AllocPair: APN->getAllocationList()) { + 
std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap = + kernel->getSharedInArgMap(); + AllocationNodeProperty *APN = + (AllocationNodeProperty *)kernel->AllocationNode->getProperty( + DFNode::Allocation); + for (auto &AllocPair : APN->getAllocationList()) { unsigned destPos = AllocPair.first->getDestPosition(); unsigned srcPos = AllocPair.first->getSourcePosition(); SharedMemArgs.push_back(destPos); - sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); - sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); + sharedInMap[destPos] = + std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); + sharedInMap[destPos + 1] = + std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); } kernel->setSharedInArgMap(sharedInMap); } @@ -1110,12 +1108,14 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // global address space unsigned argIndex = 0; std::vector<unsigned> GlobalMemArgs; - for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end(); - ai != ae; ++ai) { - if (ai->getType()->isPointerTy()) { - // If the arguement is already chosen for shared memory arguemnt list, skip. - // Else put it in Global memory arguement list - if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) { + for (Function::arg_iterator ai = F_nvptx->arg_begin(), + ae = F_nvptx->arg_end(); + ai != ae; ++ai) { + if (ai->getType()->isPointerTy()) { + // If the arguement is already chosen for shared memory arguemnt list, + // skip. 
Else put it in Global memory arguement list + if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == + 0) { GlobalMemArgs.push_back(argIndex); } } @@ -1129,20 +1129,21 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // Optimization: Gloabl memory arguments, which are not modified and whose // loads are not dependent on node id of current node, should be moved to // constant memory, subject to size of course - std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); + std::vector<unsigned> ConstantMemArgs = + globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE); -// Function to replace call instructions to functions in the kernel + // Function to replace call instructions to functions in the kernel std::map<Function *, Function *> OrgToClonedFuncMap; std::vector<Function *> FuncToBeRemoved; - auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) { - Function* NewFunc; + auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) { + Function *NewFunc; // Check if the called function has already been cloned before. auto It = OrgToClonedFuncMap.find(OrgFunc); - if(It == OrgToClonedFuncMap.end()) { + if (It == OrgToClonedFuncMap.end()) { ValueToValueMapTy VMap; NewFunc = CloneFunction(OrgFunc, VMap); OrgToClonedFuncMap[OrgFunc] = NewFunc; @@ -1151,43 +1152,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { NewFunc = (*It).second; } // Replace the calls to this function - std::vector<Value*> args; - for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + std::vector<Value *> args; + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { args.push_back(CI->getArgOperand(i)); } - CallInst* Inst = CallInst::Create(NewFunc, args, - OrgFunc->getReturnType()->isVoidTy()? 
"" : CI->getName(), CI); + CallInst *Inst = CallInst::Create( + NewFunc, args, + OrgFunc->getReturnType()->isVoidTy() ? "" : CI->getName(), CI); CI->replaceAllUsesWith(Inst); IItoRemove.push_back(CI); return NewFunc; }; - // Go through all the instructions - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; + ++i) { Instruction *I = &(*i); - // Leaf nodes should not contain VISC graph intrinsics or launch - assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + // Leaf nodes should not contain HPVM graph intrinsics or launch + assert(!BuildDFG::isHPVMLaunchIntrinsic(I) && + "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isHPVMGraphIntrinsic(I) && + "HPVM graph intrinsic within a leaf dataflow node!"); - if (BuildDFG::isViscIntrinsic(I)) { - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - IntrinsicInst* ArgII; - DFNode* ArgDFNode; + if (BuildDFG::isHPVMIntrinsic(I)) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + IntrinsicInst *ArgII; + DFNode *ArgDFNode; - /************************ Handle VISC Query intrinsics ************************/ + /************************ Handle HPVM Query intrinsics + * ************************/ switch (II->getIntrinsicID()) { - /**************************** llvm.visc.getNode() *****************************/ - case Intrinsic::visc_getNode: { + /**************************** llvm.hpvm.getNode() + * *****************************/ + case Intrinsic::hpvm_getNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); // add mapping <intrinsic, this node> to the node-specific map Leaf_HandleToDFNodeMap[II] = N; IItoRemove.push_back(II); - } - break; - /************************* llvm.visc.getParentNode() **************************/ - case 
Intrinsic::visc_getParentNode: { + } break; + /************************* llvm.hpvm.getParentNode() + * **************************/ + case Intrinsic::hpvm_getParentNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); // get the parent node of the arg node // get argument node @@ -1200,10 +1206,10 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); IItoRemove.push_back(II); - } - break; - /*************************** llvm.visc.getNumDims() ***************************/ - case Intrinsic::visc_getNumDims: { + } break; + /*************************** llvm.hpvm.getNumDims() + * ***************************/ + case Intrinsic::hpvm_getNumDims: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); // get node from map // get the appropriate field @@ -1211,47 +1217,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; int numOfDim = ArgDFNode->getNumOfDim(); DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); - IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext()); - ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); + IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext()); + ConstantInt *numOfDimConstant = + ConstantInt::getSigned(IntTy, (int64_t)numOfDim); // Replace the result of the intrinsic with the computed value II->replaceAllUsesWith(numOfDimConstant); IItoRemove.push_back(II); - } - break; - /*********************** llvm.visc.getNodeInstanceID() ************************/ - case Intrinsic::visc_getNodeInstanceID_x: - case Intrinsic::visc_getNodeInstanceID_y: - case Intrinsic::visc_getNodeInstanceID_z: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n"); + } break; + /*********************** llvm.hpvm.getNodeInstanceID() + * ************************/ + case Intrinsic::hpvm_getNodeInstanceID_x: + case Intrinsic::hpvm_getNodeInstanceID_y: + 
case Intrinsic::hpvm_getNodeInstanceID_z: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" + << "\t: " << *II << "\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; assert(ArgDFNode && "Arg node is NULL"); // A leaf node always has a parent - DFNode* ParentDFNode = ArgDFNode->getParent(); + DFNode *ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required dimension // FIXME: The order is important! // These three intrinsics need to be consecutive x,y,z - uint64_t dim = II->getIntrinsicID() - - Intrinsic::visc_getNodeInstanceID_x; + uint64_t dim = + II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt * DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - //ArrayRef<Value *> Args(DimConstant); + ConstantInt *DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + // ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function * OpenCLFunction; + Function *OpenCLFunction; - FunctionType* FT = - FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), - false); + FunctionType *FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + Type::getInt32Ty(KernelM->getContext()), false); if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) { // We only have one level in the hierarchy or the parent node is not // replicated. 
This indicates that the parent node is the kernel @@ -1260,838 +1267,867 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // itself DEBUG(errs() << "Substitute with get_global_id()\n"); DEBUG(errs() << *II << "\n"); - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)) + .getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { - //DEBUG(errs() << "Here inside cond 2\n"); + // DEBUG(errs() << "Here inside cond 2\n"); // We are asking for this node's id with respect to its parent // this is a local id call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee()); - //DEBUG(errs() << "exiting condition 2\n"); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)) + .getCallee()); + // DEBUG(errs() << "exiting condition 2\n"); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's id with respect to its // parent: this is a group id call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)) + .getCallee()); } else { - errs() << N->getFuncPointer()->getName() << "\n"; - errs() << N->getParent()->getFuncPointer()->getName() << "\n"; - errs() << *II << "\n"; + DEBUG(errs() << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << *II << "\n"); assert(false && "Unable to translate getNodeInstanceID intrinsic"); } - //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n"); - //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n"); - //DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); - //DEBUG(errs() << 
"Argument: " << Args[0] << "\n"); - //DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); + // DEBUG(errs() << "Create call instruction, insert it before the + // instrinsic\n"); DEBUG(errs() << "Function: " << *OpenCLFunction << + // "\n"); DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); + // DEBUG(errs() << "Argument: " << Args[0] << "\n"); + // DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); - //DEBUG(errs() << "Replace uses\n"); + CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + // DEBUG(errs() << "Replace uses\n"); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } - break; - /********************** llvm.visc.getNumNodeInstances() ***********************/ - case Intrinsic::visc_getNumNodeInstances_x: - case Intrinsic::visc_getNumNodeInstances_y: - case Intrinsic::visc_getNumNodeInstances_z: { + } break; + /********************** llvm.hpvm.getNumNodeInstances() + * ***********************/ + case Intrinsic::hpvm_getNumNodeInstances_x: + case Intrinsic::hpvm_getNumNodeInstances_y: + case Intrinsic::hpvm_getNumNodeInstances_z: { // TODO: think about whether this is the best way to go there are hw // specific registers. therefore it is good to have the intrinsic but // then, why do we need to keep that info in the graph? 
(only for the // kernel configuration during the call) - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); + DEBUG(errs() << F_nvptx->getName() + << "\t: Handling getNumNodeInstances\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; // A leaf node always has a parent - DFNode* ParentDFNode = ArgDFNode->getParent(); + DFNode *ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required dimension // FIXME: The order is important! // These three intrinsics need to be consecutive x,y,z - uint64_t dim = II->getIntrinsicID() - - Intrinsic::visc_getNumNodeInstances_x; + uint64_t dim = + II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt * DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - //ArrayRef<Value *> Args(DimConstant); + ConstantInt *DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + // ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function * OpenCLFunction; - FunctionType* FT = + Function *OpenCLFunction; + FunctionType *FT = FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), - false); + Type::getInt32Ty(KernelM->getContext()), false); if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) { // We only have one level in the hierarchy or the parent node is not // replicated. 
This indicates that the parent node is the kernel // launch, so the instances are global_size (gridDim x blockDim) - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)) + .getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { // We are asking for this node's instances // this is a local size (block dim) call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)) + .getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's instances // this is a (global_size/local_size) (grid dim) call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)) + .getCallee()); } else { assert(false && "Unable to translate getNumNodeInstances intrinsic"); } // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } - break; - case Intrinsic::visc_barrier: - { + } break; + case Intrinsic::hpvm_barrier: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n"); DEBUG(errs() << "Substitute with barrier()\n"); DEBUG(errs() << *II << "\n"); - FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()), - std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())), - false); - Function* OpenCLFunction = cast<Function> - 
((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee()); - CallInst* CI = CallInst::Create(OpenCLFunction, - ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)), - "", II); + FunctionType *FT = FunctionType::get( + Type::getVoidTy(KernelM->getContext()), + std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())), + false); + Function *OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("barrier"), FT)) + .getCallee()); + CallInst *CI = + CallInst::Create(OpenCLFunction, + ArrayRef<Value *>(ConstantInt::get( + Type::getInt32Ty(KernelM->getContext()), 1)), + "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); + } break; + case Intrinsic::hpvm_atomic_cmpxchg: + break; + case Intrinsic::hpvm_atomic_add: + case Intrinsic::hpvm_atomic_sub: + case Intrinsic::hpvm_atomic_xchg: + case Intrinsic::hpvm_atomic_min: + case Intrinsic::hpvm_atomic_max: + case Intrinsic::hpvm_atomic_and: + case Intrinsic::hpvm_atomic_or: + case Intrinsic::hpvm_atomic_xor: + // case Intrinsic::hpvm_atomic_inc: + // case Intrinsic::hpvm_atomic_dec: + { + DEBUG(errs() << *II << "\n"); + // Only have support for i32 atomic intrinsics + assert(II->getType() == Type::getInt32Ty(II->getContext()) && + "Only support i32 atomic intrinsics for now"); + // Substitute with atomicrmw instruction + assert(II->getNumArgOperands() == 2 && + "Expecting 2 operands for these atomics"); + Value *Ptr = II->getArgOperand(0); + Value *Val = II->getArgOperand(1); + assert(Ptr->getType()->isPointerTy() && + "First argument of supported atomics is expected to be a " + "pointer"); + PointerType *PtrTy = cast<PointerType>(Ptr->getType()); + PointerType *TargetTy = + Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()); + if (PtrTy != TargetTy) { + Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); + PtrTy = TargetTy; + } + + std::string name; + if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_add) + name = "atomic_add"; + 
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_sub) + name = "atomic_sub"; + else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xchg) + name = "atomic_xchg"; + else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_min) + name = "atomic_min"; + else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_max) + name = "atomic_max"; + else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_and) + name = "atomic_and"; + else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_or) + name = "atomic_or"; + else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xor) + name = "atomic_xor"; + Type *paramTypes[] = {PtrTy, Val->getType()}; + FunctionType *AtomFuncT = FunctionType::get( + II->getType(), ArrayRef<Type *>(paramTypes, 2), false); + FunctionCallee AtomFunc = + KernelM->getOrInsertFunction(name, AtomFuncT); + + Value *Params[] = {Ptr, Val}; + CallInst *AtomCI = CallInst::Create( + AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II); + DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); + II->replaceAllUsesWith(AtomCI); + IItoRemove.push_back(II); + } + break; + default: + llvm_unreachable("Unknown HPVM Intrinsic!"); + break; } - break; - case Intrinsic::visc_atomic_add: - case Intrinsic::visc_atomic_sub: - case Intrinsic::visc_atomic_xchg: - case Intrinsic::visc_atomic_min: - case Intrinsic::visc_atomic_max: - case Intrinsic::visc_atomic_and: - case Intrinsic::visc_atomic_or: - case Intrinsic::visc_atomic_xor: - { - DEBUG(errs() << *II << "\n"); - // Only have support for i32 atomic intrinsics - assert(II->getType() == Type::getInt32Ty(II->getContext()) - && "Only support i32 atomic intrinsics for now"); - // Substitute with atomicrmw instruction - assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics"); - Value* Ptr = II->getArgOperand(0); - Value* Val = II->getArgOperand(1); - assert(Ptr->getType()->isPointerTy() - && "First argument of supported atomics is expected to be a pointer"); - PointerType* PtrTy = 
cast<PointerType>(Ptr->getType()); - PointerType* TargetTy = Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()); - if (PtrTy != TargetTy) { - Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); - PtrTy = TargetTy; + + } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { + IRBuilder<> Builder(I); + Value *Source = MemCpyI->getSource(); + Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); + Value *Length = MemCpyI->getOperand(2); + DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); + DEBUG(errs() << "Source: " << *Source << "\n"); + DEBUG(errs() << "Destination: " << *Destination << "\n"); + DEBUG(errs() << "Length: " << *Length << "\n"); + + size_t memcpy_length; + unsigned int memcpy_count; + if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) { + if (CI->getBitWidth() <= 64) { + memcpy_length = CI->getSExtValue(); + DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); + Type *Source_Type = Source->getType()->getPointerElementType(); + DEBUG(errs() << "Source Type : " << *Source_Type << "\n"); + memcpy_count = + memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); + DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); + if (GetElementPtrInst *sourceGEPI = + dyn_cast<GetElementPtrInst>(Source)) { + if (GetElementPtrInst *destGEPI = + dyn_cast<GetElementPtrInst>(Destination)) { + Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); + Value *DestPtrOperand = destGEPI->getPointerOperand(); + for (int i = 0; i < memcpy_count; ++i) { + Constant *increment; + LoadInst *newLoadI; + StoreInst *newStoreI; + // First, need to increment the correct index for both source + // and dest This invluves checking to see how many indeces the + // GEP has Assume for now only 1 or 2 are the viable options. 
+ + std::vector<Value *> GEPlIndex; + if (sourceGEPI->getNumIndices() == 1) { + Value *Index = sourceGEPI->getOperand(1); + increment = ConstantInt::get(Index->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPlIndex.push_back(incAdd); + Value *newGEPIl = Builder.CreateGEP( + SourcePtrOperand, ArrayRef<Value *>(GEPlIndex)); + DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); + newLoadI = Builder.CreateLoad(newGEPIl); + DEBUG(errs() << "Load: " << *newLoadI << "\n"); + } else { + llvm_unreachable("Unhandled case where source GEPI has more " + "than 1 indices!\n"); + } + + std::vector<Value *> GEPsIndex; + if (destGEPI->getNumIndices() == 1) { + + } else if (destGEPI->getNumIndices() == 2) { + Value *Index0 = destGEPI->getOperand(1); + GEPsIndex.push_back(Index0); + Value *Index1 = destGEPI->getOperand(2); + increment = ConstantInt::get(Index1->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index1, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPsIndex.push_back(incAdd); + Value *newGEPIs = Builder.CreateGEP( + DestPtrOperand, ArrayRef<Value *>(GEPsIndex)); + DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); + newStoreI = Builder.CreateStore(newLoadI, newGEPIs, + MemCpyI->isVolatile()); + DEBUG(errs() << "Store: " << *newStoreI << "\n"); + } else { + llvm_unreachable("Unhandled case where dest GEPI has more " + "than 2 indices!\n"); + } + } + IItoRemove.push_back(sourceGEPI); + IItoRemove.push_back(destGEPI); + Instruction *destBitcastI = + dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); + Instruction *sourceBitcastI = + dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); + IItoRemove.push_back(destBitcastI); + IItoRemove.push_back(sourceBitcastI); + IItoRemove.push_back(MemCpyI); + } + } } + } else { + llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); + } + // llvm_unreachable("HERE!"); + } - std::string name; - if(II->getIntrinsicID() == 
Intrinsic::visc_atomic_add) - name = "atomic_add"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub) - name = "atomic_sub"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg) - name = "atomic_xchg"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min) - name = "atomic_min"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max) - name = "atomic_max"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and) - name = "atomic_and"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or) - name = "atomic_or"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor) - name = "atomic_xor"; - Type* paramTypes[] = {PtrTy, Val->getType()}; - FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false); - FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT); - - Value* Params[] = {Ptr, Val}; - CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II); - DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); - II->replaceAllUsesWith(AtomCI); - IItoRemove.push_back(II); - } - break; - default: - llvm_unreachable("Unknown VISC Intrinsic!"); - break; - } - - } - else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { - IRBuilder<> Builder(I); - Value *Source = MemCpyI->getSource(); - Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); - Value *Length = MemCpyI->getOperand(2); - DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); - DEBUG(errs() << "Source: " << *Source << "\n"); - DEBUG(errs() << "Destination: " << *Destination << "\n"); - DEBUG(errs() << "Length: " << *Length << "\n"); - - size_t memcpy_length; - unsigned int memcpy_count; - if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) { - if (CI->getBitWidth() <= 64) { - memcpy_length = CI->getSExtValue(); - DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); - Type *Source_Type = Source->getType()->getPointerElementType(); - DEBUG(errs() 
<< "Source Type : " << *Source_Type << "\n"); - memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); - DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); - if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) { - if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) { - Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); - Value *DestPtrOperand = destGEPI->getPointerOperand(); - for(int i = 0; i < memcpy_count; ++i) { - Constant *increment; - LoadInst *newLoadI; - StoreInst *newStoreI; - // First, need to increment the correct index for both source and dest - // This invluves checking to see how many indeces the GEP has - // Assume for now only 1 or 2 are the viable options. - - std::vector<Value*> GEPlIndex; - if (sourceGEPI->getNumIndices() == 1) { - Value *Index = sourceGEPI->getOperand(1); - increment = ConstantInt::get(Index->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPlIndex.push_back(incAdd); - Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex)); - DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); - newLoadI = Builder.CreateLoad(newGEPIl); - DEBUG(errs() << "Load: " << *newLoadI << "\n"); - } else { - llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n"); - } - - - std::vector<Value*> GEPsIndex; - if (destGEPI->getNumIndices() == 1) { - - } else if (destGEPI->getNumIndices() == 2) { - Value *Index0 = destGEPI->getOperand(1); - GEPsIndex.push_back(Index0); - Value *Index1 = destGEPI->getOperand(2); - increment = ConstantInt::get(Index1->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index1, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPsIndex.push_back(incAdd); - Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex)); - DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); - newStoreI 
= Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile()); - DEBUG(errs() << "Store: " << *newStoreI << "\n"); - } else { - llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n"); - } - } - IItoRemove.push_back(sourceGEPI); - IItoRemove.push_back(destGEPI); - Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); - Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); - IItoRemove.push_back(destBitcastI); - IItoRemove.push_back(sourceBitcastI); - IItoRemove.push_back(MemCpyI); - } - } - - } - } else { - llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); - } - // llvm_unreachable("HERE!"); - } - - else if(CallInst* CI = dyn_cast<CallInst>(I)) { - DEBUG(errs() << "Found a call: " << *CI << "\n"); - Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); - if(calleeF->isDeclaration()) { - // Add the declaration to kernel module - if (calleeF->getName() == "sqrtf") { - calleeF->setName(Twine("sqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } else if (calleeF->getName() == "rsqrtf") { - calleeF->setName(Twine("rsqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } - DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); - KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); - } - else { - // Check if the called function has already been cloned before. - Function *NewFunc = CloneAndReplaceCall(CI, calleeF); - // Iterate over the new function to see if it calls any other functions - // in the module. 
- for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) { - if(auto *Call = dyn_cast<CallInst>(&*i)) { - Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts()); - CloneAndReplaceCall(Call, CalledFunc); - } - } - } - //TODO: how to handle address space qualifiers in load/store - } - - } - // search for pattern where float is being casted to int and loaded/stored and change it. - DEBUG(errs() << "finding pattern for replacement!\n"); - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { - bool cont = false; - bool keepGEPI = false; - bool keepGEPI2= false; - Instruction *I = &(*i); - GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I); - - if (!GEPI) { - // did nod find pattern start, continue - continue; - } - // may have found pattern, check - DEBUG(errs() << "GEPI " << *GEPI << "\n"); - // print whatever we want for debug - Value* PtrOp = GEPI->getPointerOperand(); - Type *SrcTy = GEPI->getSourceElementType(); - unsigned GEPIaddrspace = GEPI->getAddressSpace(); - - if (SrcTy->isArrayTy()) - DEBUG(errs() << *SrcTy << " is an array type! 
" << *(SrcTy->getArrayElementType()) << "\n"); - else - DEBUG(errs() << *SrcTy << " is not an array type!\n"); - // check that source element type is float - if (SrcTy->isArrayTy()) { - if (!(SrcTy->getArrayElementType()->isFloatTy())) { - DEBUG(errs() << "GEPI type is array but not float!\n"); - continue; - } - } - else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) { - DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); - // does not fit this pattern - no float GEP instruction - continue; - } - // check that addressspace is 1 - // if (GEPIaddrspace != 1) { - // // does not fit this pattern - addrspace of pointer argument is not global - // continue; - // } - if (!(GEPI->hasOneUse())) { - // does not fit this pattern - more than one uses - //continue; - // Keep GEPI around if it has other uses - keepGEPI = true; - } - DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); - - // 1st GEPI it has one use - // assert(GEPI->hasOneUse() && "GEPI has a single use"); - - // See if it is a bitcast - BitCastInst *BitCastI; - for (User * U : GEPI->users()) { - if(Instruction *ui = dyn_cast<Instruction> (U)) { - DEBUG(errs() << "--" << *ui << "\n"); - if (isa<BitCastInst>(ui)) { - BitCastI = dyn_cast<BitCastInst>(ui); - DEBUG(errs() << "---Found bitcast as only use of GEP\n"); - break; - } - } - DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); - cont = true; - } - // for (Value::user_iterator ui = GEPI->user_begin(), - // ue = GEPI->user_end(); ui!=ue; ++ui) { - // DEBUG(errs() << "--" << *ui << "\n"); - // if (isa<BitCastInst>(*ui)) { - // BitCastI = dyn_cast<BitCastInst>(*ui); - // DEBUG(errs() << "Found bitcast as only use of GEP\n"); - // } - // } - - if (cont/*!BitCastI*/) { - continue; // not in pattern - } - - // DEBUG(errs() << *BitCastI << "\n"); - // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP. 
- Value *Op2 = BitCastI->getOperand(0); - DEBUG(errs() << "----" << *Op2 << "\n"); - // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); - // Type *OpTy = cast<Type>(Op2); - Type *OpTy = BitCastI->getDestTy(); - DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); - // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n"); - if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { - // maybe right syntax is (Type::getInt32Ty)->getPointerTo() - continue; // not in pattern - } - - DEBUG(errs() << "----Here!\n"); - // We are in GEP, bitcast. - - // user_iterator, to find the load. - - if (!(BitCastI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - } - DEBUG(errs() << "----Bitcast has one use!\n"); - // it has one use - assert(BitCastI->hasOneUse() && "BitCastI has a single use"); - LoadInst *LoadI; - for (User * U : BitCastI->users()) { - if (Instruction *ui = dyn_cast<Instruction> (U)) { - DEBUG(errs() << "-----" << *ui << "\n"); - if (isa<LoadInst>(ui)) { - LoadI = dyn_cast<LoadInst>(ui); - DEBUG(errs() << "-----Found load as only use of bitcast\n"); - break; - } - } - DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); - cont = true; - } - // for (Value::user_iterator ui = BitCastI->user_begin(), - // ue = BitCastI->user_end(); ui!=ue; ++ui) { - // if (isa<LoadInst>(*ui)) { - // LoadI = dyn_cast<LoadInst>(*ui); - // errs() << "Found load as only use of bitcast\n"; - // } - // } - - if (cont) { - continue; // not in pattern - } - - DEBUG("HERE!\n"); - // check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from - assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n"); - - // Copy user_iterator, to find the store. 
- - if (!(LoadI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - // TODO: generalize: one load can have more than one store users - } - - // it has one use - assert(LoadI->hasOneUse() && "LoadI has a single use"); - Value::user_iterator ui = LoadI->user_begin(); - // skipped loop, because is has a single use - StoreInst *StoreI = dyn_cast<StoreInst>(*ui); - if (!StoreI) { - continue; // not in pattern - } - - // Also check that the store uses the loaded value as the value operand - if (StoreI->getValueOperand() != LoadI) { - continue; - } - - DEBUG(errs() << "-------Found store instruction\n"); - - // Look for its bitcast, which is its pointer operand - Value *StPtrOp = StoreI->getPointerOperand(); - DEBUG(errs() << "-------" << *StPtrOp << "\n"); - BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); - DEBUG(errs() << "-------" << *BitCastI2 << "\n"); - if (!BitCastI2) { - continue; //not in pattern - } - - DEBUG(errs() << "-------- Found Bit Cast of store!\n" ); - // found bitcast. Look for the second GEP, its from operand. - Value *BCFromOp = BitCastI2->getOperand(0); - GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); - DEBUG(errs() << "---------- " << *GEPI2 << "\n"); - if (!GEPI2) { - continue; //not in pattern - } - - if (!(GEPI2->hasOneUse())) { - // does not fit this pattern - more than one uses - //continue; - // Keep GEPI around if it has other uses - keepGEPI2 = true; - } - DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); - - Value *PtrOp2 = GEPI2->getPointerOperand(); - - // Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above. 
- - // Assume we found pattern - if (!keepGEPI) { - IItoRemove.push_back(GEPI); - DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); - } else { - DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); - } - IItoRemove.push_back(BitCastI); - DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); - IItoRemove.push_back(LoadI); - DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); - IItoRemove.push_back(GEPI2); - DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); - IItoRemove.push_back(BitCastI2); - DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); - if (!keepGEPI2) { - IItoRemove.push_back(StoreI); - DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); - } else { - - DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n"); - } - - std::vector<Value*> GEPlIndex; - if (GEPI->hasIndices()) { - for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); - GEPlIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); - - std::vector<Value*> GEPsIndex; - if (GEPI2->hasIndices()) { - for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); - GEPsIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); - - - - // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); - GetElementPtrInst* newlGEP = - GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()), - PtrOp, // operand from 1st GEP - ArrayRef<Value*>(GEPlIndex), - Twine(), - StoreI); - DEBUG(errs() << "Adding: " << *newlGEP << "\n"); - // insert load before GEPI - LoadInst *newLoadI = - new LoadInst(Type::getFloatTy(M.getContext()), - newlGEP, // new GEP - Twine(), - LoadI->isVolatile(), - LoadI->getAlignment(), - LoadI->getOrdering(), - 
LoadI->getSyncScopeID(), - StoreI); - DEBUG(errs() << "Adding: " << *newLoadI << "\n"); - // same for GEP for store, for store operand - GetElementPtrInst* newsGEP = - GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), - PtrOp2, // operand from 2nd GEP - ArrayRef<Value*>(GEPsIndex), - Twine(), - StoreI); - DEBUG(errs() << "Adding: " << *newsGEP << "\n"); - // insert store before GEPI - StoreInst *newStoreI = - new StoreInst(newLoadI, - newsGEP, // new GEP - StoreI->isVolatile(), - StoreI->getAlignment(), - StoreI->getOrdering(), - StoreI->getSyncScopeID(), - StoreI); - DEBUG(errs() << "Adding: " << *newStoreI << "\n"); - - } - - // We need to do this explicitly: DCE pass will not remove them because we - // have assumed theworst memory behaviour for these function calls - // Traverse the vector backwards, otherwise definitions are deleted while - // their subsequent uses are still around - for (auto *I : reverse(IItoRemove)) { - DEBUG(errs() << "Erasing: " << *I << "\n"); - I->eraseFromParent(); - } - - // Removed the cloned functions from the parent module into the new module - for(auto *F : FuncToBeRemoved) { - F->removeFromParent(); //TODO: MARIA check - KernelM->getFunctionList().push_back(F); - } - - addCLMetadata(F_nvptx); - kernel->KernelFunction = F_nvptx; - errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; - DEBUG(errs() << *KernelM); - - return; -} + else if (CallInst *CI = dyn_cast<CallInst>(I)) { + DEBUG(errs() << "Found a call: " << *CI << "\n"); + Function *calleeF = + cast<Function>(CI->getCalledValue()->stripPointerCasts()); + if (calleeF->isDeclaration()) { + // Add the declaration to kernel module + if (calleeF->getName() == "sqrtf") { + calleeF->setName(Twine("sqrt")); + DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } else if (calleeF->getName() == "rsqrtf") { + calleeF->setName(Twine("rsqrt")); + DEBUG(errs() << "CaleeF: " << 
*calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF + << "\n"); + KernelM->getOrInsertFunction(calleeF->getName(), + calleeF->getFunctionType()); + } else { + // Check if the called function has already been cloned before. + Function *NewFunc = CloneAndReplaceCall(CI, calleeF); + // Iterate over the new function to see if it calls any other functions + // in the module. + for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); + i != e; ++i) { + if (auto *Call = dyn_cast<CallInst>(&*i)) { + Function *CalledFunc = + cast<Function>(Call->getCalledValue()->stripPointerCasts()); + CloneAndReplaceCall(Call, CalledFunc); + } + } + } + // TODO: how to handle address space qualifiers in load/store + } + } + // search for pattern where float is being casted to int and loaded/stored and + // change it. + DEBUG(errs() << "finding pattern for replacement!\n"); + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; + ++i) { + bool cont = false; + bool keepGEPI = false; + bool keepGEPI2 = false; + Instruction *I = &(*i); + GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I); -bool DFG2LLVM_NVPTX::runOnModule(Module &M) { - errs() << "\nDFG2LLVM_NVPTX PASS\n"; + if (!GEPI) { + // did nod find pattern start, continue + continue; + } + // may have found pattern, check + DEBUG(errs() << "GEPI " << *GEPI << "\n"); + // print whatever we want for debug + Value *PtrOp = GEPI->getPointerOperand(); + Type *SrcTy = GEPI->getSourceElementType(); + unsigned GEPIaddrspace = GEPI->getAddressSpace(); + + if (SrcTy->isArrayTy()) + DEBUG(errs() << *SrcTy << " is an array type! 
" + << *(SrcTy->getArrayElementType()) << "\n"); + else + DEBUG(errs() << *SrcTy << " is not an array type!\n"); + // check that source element type is float + if (SrcTy->isArrayTy()) { + if (!(SrcTy->getArrayElementType()->isFloatTy())) { + DEBUG(errs() << "GEPI type is array but not float!\n"); + continue; + } + } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) { + DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); + // does not fit this pattern - no float GEP instruction + continue; + } + // check that addressspace is 1 + // if (GEPIaddrspace != 1) { + // // does not fit this pattern - addrspace of pointer + // argument is not global continue; + // } + if (!(GEPI->hasOneUse())) { + // does not fit this pattern - more than one uses + // continue; + // Keep GEPI around if it has other uses + keepGEPI = true; + } + DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); + + // 1st GEPI it has one use + // assert(GEPI->hasOneUse() && "GEPI has a single use"); + + // See if it is a bitcast + BitCastInst *BitCastI; + for (User *U : GEPI->users()) { + if (Instruction *ui = dyn_cast<Instruction>(U)) { + DEBUG(errs() << "--" << *ui << "\n"); + if (isa<BitCastInst>(ui)) { + BitCastI = dyn_cast<BitCastInst>(ui); + DEBUG(errs() << "---Found bitcast as only use of GEP\n"); + break; + } + } + DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); + cont = true; + } + // for (Value::user_iterator ui = GEPI->user_begin(), + // ue = GEPI->user_end(); ui!=ue; ++ui) { + // DEBUG(errs() << "--" << *ui << "\n"); + // if (isa<BitCastInst>(*ui)) { + // BitCastI = dyn_cast<BitCastInst>(*ui); + // DEBUG(errs() << "Found bitcast as only use of GEP\n"); + // } + // } + + if (cont /*!BitCastI*/) { + continue; // not in pattern + } + + // DEBUG(errs() << *BitCastI << "\n"); + // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand + // has to be the GEP, since this is a use of the GEP. 
+ Value *Op2 = BitCastI->getOperand(0); + DEBUG(errs() << "----" << *Op2 << "\n"); + // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); + // Type *OpTy = cast<Type>(Op2); + Type *OpTy = BitCastI->getDestTy(); + DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); + // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << + // "\n"); + if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { + // maybe right syntax is (Type::getInt32Ty)->getPointerTo() + continue; // not in pattern + } + + DEBUG(errs() << "----Here!\n"); + // We are in GEP, bitcast. + + // user_iterator, to find the load. + + if (!(BitCastI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + } + DEBUG(errs() << "----Bitcast has one use!\n"); + // it has one use + assert(BitCastI->hasOneUse() && "BitCastI has a single use"); + LoadInst *LoadI; + for (User *U : BitCastI->users()) { + if (Instruction *ui = dyn_cast<Instruction>(U)) { + DEBUG(errs() << "-----" << *ui << "\n"); + if (isa<LoadInst>(ui)) { + LoadI = dyn_cast<LoadInst>(ui); + DEBUG(errs() << "-----Found load as only use of bitcast\n"); + break; + } + } + DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); + cont = true; + } + // for (Value::user_iterator ui = BitCastI->user_begin(), + // ue = BitCastI->user_end(); ui!=ue; ++ui) { + // if (isa<LoadInst>(*ui)) { + // LoadI = dyn_cast<LoadInst>(*ui); + // errs() << "Found load as only use of bitcast\n"; + // } + // } + + if (cont) { + continue; // not in pattern + } - // Get the BuildDFG Analysis Results: - // - Dataflow graph - // - Maps from i8* hansles to DFNode and DFEdge - BuildDFG &DFG = getAnalysis<BuildDFG>(); + DEBUG("HERE!\n"); + // check that we load from pointer we got from bitcast - assert - the unique + // argument must be the use we found it from + assert(LoadI->getPointerOperand() == BitCastI && + "Unexpected Load Instruction Operand\n"); - // DFInternalNode *Root = 
DFG.getRoot(); - std::vector<DFInternalNode*> Roots = DFG.getRoots(); - // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); - // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + // Copy user_iterator, to find the store. - // Visitor for Code Generation Graph Traversal - CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); + if (!(LoadI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + // TODO: generalize: one load can have more than one store users + } + + // it has one use + assert(LoadI->hasOneUse() && "LoadI has a single use"); + Value::user_iterator ui = LoadI->user_begin(); + // skipped loop, because is has a single use + StoreInst *StoreI = dyn_cast<StoreInst>(*ui); + if (!StoreI) { + continue; // not in pattern + } + + // Also check that the store uses the loaded value as the value operand + if (StoreI->getValueOperand() != LoadI) { + continue; + } + + DEBUG(errs() << "-------Found store instruction\n"); + + // Look for its bitcast, which is its pointer operand + Value *StPtrOp = StoreI->getPointerOperand(); + DEBUG(errs() << "-------" << *StPtrOp << "\n"); + BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); + DEBUG(errs() << "-------" << *BitCastI2 << "\n"); + if (!BitCastI2) { + continue; // not in pattern + } + + DEBUG(errs() << "-------- Found Bit Cast of store!\n"); + // found bitcast. Look for the second GEP, its from operand. + Value *BCFromOp = BitCastI2->getOperand(0); + GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); + DEBUG(errs() << "---------- " << *GEPI2 << "\n"); + if (!GEPI2) { + continue; // not in pattern + } + + if (!(GEPI2->hasOneUse())) { + // does not fit this pattern - more than one uses + // continue; + // Keep GEPI around if it has other uses + keepGEPI2 = true; + } + DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); + + Value *PtrOp2 = GEPI2->getPointerOperand(); + + // Found GEPI2. 
TODO: kind of confused as o what checks I need to add here, + // let's add them together- all the code for int-float type checks is + // already above. + + // Assume we found pattern + if (!keepGEPI) { + IItoRemove.push_back(GEPI); + DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); + } else { + DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); + } + IItoRemove.push_back(BitCastI); + DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); + IItoRemove.push_back(LoadI); + DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); + IItoRemove.push_back(GEPI2); + DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); + IItoRemove.push_back(BitCastI2); + DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); + if (!keepGEPI2) { + IItoRemove.push_back(StoreI); + DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); + } else { + + DEBUG(errs() << "Keeping " << *StoreI + << " since it has multiple uses!\n"); + } + + std::vector<Value *> GEPlIndex; + if (GEPI->hasIndices()) { + for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); + GEPlIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); + + std::vector<Value *> GEPsIndex; + if (GEPI2->hasIndices()) { + for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); + GEPsIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); + + // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); + GetElementPtrInst *newlGEP = GetElementPtrInst::Create( + GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()), + PtrOp, // operand from 1st GEP + ArrayRef<Value *>(GEPlIndex), Twine(), StoreI); + DEBUG(errs() << "Adding: " << *newlGEP << "\n"); + // insert load before GEPI + LoadInst *newLoadI = + new 
LoadInst(Type::getFloatTy(M.getContext()), + newlGEP, // new GEP + Twine(), LoadI->isVolatile(), LoadI->getAlignment(), + LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI); + DEBUG(errs() << "Adding: " << *newLoadI << "\n"); + // same for GEP for store, for store operand + GetElementPtrInst *newsGEP = GetElementPtrInst::Create( + GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), + PtrOp2, // operand from 2nd GEP + ArrayRef<Value *>(GEPsIndex), Twine(), StoreI); + DEBUG(errs() << "Adding: " << *newsGEP << "\n"); + // insert store before GEPI + StoreInst *newStoreI = + new StoreInst(newLoadI, + newsGEP, // new GEP + StoreI->isVolatile(), StoreI->getAlignment(), + StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI); + DEBUG(errs() << "Adding: " << *newStoreI << "\n"); + } + + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (auto *I : reverse(IItoRemove)) { + DEBUG(errs() << "Erasing: " << *I << "\n"); + I->eraseFromParent(); + } + + // Removed the cloned functions from the parent module into the new module + for (auto *F : FuncToBeRemoved) { + F->removeFromParent(); // TODO: MARIA check + KernelM->getFunctionList().push_back(F); + } + + addCLMetadata(F_nvptx); + kernel->KernelFunction = F_nvptx; + DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName() + << "\n"); + DEBUG(errs() << *KernelM); - // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode: Roots) { - // Initiate code generation for root DFNode - CGTVisitor->visit(rootNode); - } + return; +} + +bool DFG2LLVM_NVPTX::runOnModule(Module &M) { + DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n"); + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG 
= getAnalysis<BuildDFG>(); + + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode *> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = + // DFG.getHandleToDFNodeMap(); BuildDFG::HandleToDFEdge &HandleToDFEdgeMap + // = DFG.getHandleToDFEdgeMap(); + + // Visitor for Code Generation Graph Traversal + CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode : Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } - CGTVisitor->writeKernelsModule(); + CGTVisitor->writeKernelsModule(); - //TODO: Edit module epilogue to remove the VISC intrinsic declarations - delete CGTVisitor; + // TODO: Edit module epilogue to remove the HPVM intrinsic declarations + delete CGTVisitor; - return true; + return true; } std::string CGT_NVPTX::getKernelsModuleName(Module &M) { - /*SmallString<128> currentDir; - llvm::sys::fs::current_path(currentDir); - std::string fileName = getFilenameFromModule(M); - Twine output = Twine(currentDir) + "/Output/" + fileName + ""; - return output.str().append(".kernels.ll");*/ - std::string mid = M.getModuleIdentifier(); - return mid.append(".kernels.ll"); + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); } -void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) { - assert(isa<PointerType>(V->getType()) - && "Value should be of Pointer Type!"); - PointerType* OldTy = cast<PointerType>(V->getType()); - PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); - V->mutateType(NewTy); - for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { - // Change all uses producing pointer 
type in same address space to new - // addressspace. - if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { - if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { - fixValueAddrspace(*ui, addrspace); - } - } - } +void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) { + assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!"); + PointerType *OldTy = cast<PointerType>(V->getType()); + PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; + ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. + if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) { + if (PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); + } + } + } } - -std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) { - std::vector<unsigned> ConstantMemArgs; - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Argument* arg = &*ai; - std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(), - GlobalMemArgs->end(), arg->getArgNo()); - // It has to be a global memory argument to be promotable - if(pos == GlobalMemArgs->end()) - continue; - - // Check if it can/should be promoted - if(canBePromoted(arg, F)) { - errs() << "Promoting << " << arg->getName() << " to constant memory."<< "\n"; - ConstantMemArgs.push_back(arg->getArgNo()); - GlobalMemArgs->erase(pos); - } - } - return ConstantMemArgs; +std::vector<unsigned> +CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs, + Function *F) { + std::vector<unsigned> ConstantMemArgs; + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai) { + Argument *arg = &*ai; + std::vector<unsigned>::iterator pos = std::find( + GlobalMemArgs->begin(), 
GlobalMemArgs->end(), arg->getArgNo()); + // It has to be a global memory argument to be promotable + if (pos == GlobalMemArgs->end()) + continue; + + // Check if it can/should be promoted + if (canBePromoted(arg, F)) { + DEBUG(errs() << "Promoting << " << arg->getName() + << " to constant memory." + << "\n"); + ConstantMemArgs.push_back(arg->getArgNo()); + GlobalMemArgs->erase(pos); + } + } + return ConstantMemArgs; } -Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { - unsigned idx = 0; - std::vector<Type*> ArgTypes; - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Argument *arg = &*ai; - DEBUG(errs() << *arg << "\n"); - unsigned argno = arg->getArgNo(); - if ((idx < Args.size()) && (argno == Args[idx])) { - fixValueAddrspace(arg, addrspace); - idx++; - } - ArgTypes.push_back(arg->getType()); - } - FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); - - //F->mutateType(PTy); - Function* newF = cloneFunction(F, newFT, false); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - - DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); - return newF; +Function *CGT_NVPTX::changeArgAddrspace(Function *F, + std::vector<unsigned> &Args, + unsigned addrspace) { + unsigned idx = 0; + std::vector<Type *> ArgTypes; + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai) { + Argument *arg = &*ai; + DEBUG(errs() << *arg << "\n"); + unsigned argno = arg->getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(arg, addrspace); + idx++; + } + ArgTypes.push_back(arg->getType()); + } + FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); + + // F->mutateType(PTy); + Function *newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + + DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n"); + return newF; } /* Add 
metadata to module KernelM, for OpenCL kernels */ void CGT_NVPTX::addCLMetadata(Function *F) { - IRBuilder<> Builder(&*F->begin()); + IRBuilder<> Builder(&*F->begin()); + + SmallVector<Metadata *, 8> KernelMD; + KernelMD.push_back(ValueAsMetadata::get(F)); + + // TODO: There is additional metadata used by kernel files but we skip them as + // they are not mandatory. In future they might be useful to enable + // optimizations + + MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_kernels = + KernelM->getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); + + KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); + // TODO: Replace 1 with the number of the kernel. + // Add when support for multiple launces is added + KernelMD.push_back(ValueAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1))); + MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_annotations = + KernelM->getOrInsertNamedMetadata("nvvm.annotations"); + MDN_annotations->addOperand(MDNvvmAnnotationsNode); +} - SmallVector<Metadata*,8> KernelMD; - KernelMD.push_back(ValueAsMetadata::get(F)); +void CGT_NVPTX::writeKernelsModule() { - // TODO: There is additional metadata used by kernel files but we skip them as - // they are not mandatory. In future they might be useful to enable - // optimizations + // In addition to deleting all other functions, we also want to spiff it + // up a little bit. Do this now. 
+ legacy::PassManager Passes; - MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); - MDN_kernels->addOperand(MDKernelNode); + DEBUG(errs() << "Writing to File --- "); + DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n"); + std::error_code EC; + ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); + if (EC) { + DEBUG(errs() << EC.message() << '\n'); + } - KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); - // TODO: Replace 1 with the number of the kernel. - // Add when support for multiple launces is added - KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1))); - MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); - MDN_annotations->addOperand(MDNvvmAnnotationsNode); + Passes.add(createPrintModulePass(Out.os())); + Passes.run(*KernelM); + + // Declare success. + Out.keep(); } -void CGT_NVPTX::writeKernelsModule() { +Function *CGT_NVPTX::transformFunctionToVoid(Function *F) { - // In addition to deleting all other functions, we also want to spiff it - // up a little bit. Do this now. - legacy::PassManager Passes; + DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); + // FIXME: Maybe do that using the Node? 
+ StructType *FRetTy = dyn_cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); - errs() << "Writing to File --- "; - errs() << getKernelsModuleName(M).c_str() << "\n"; - std::error_code EC; - ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); - if (EC) { - errs() << EC.message() << '\n'; - } + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); - Passes.add( - createPrintModulePass(Out.os())); + std::vector<Type *> RetArgTypes; + std::vector<Argument *> RetArgs; + std::vector<Argument *> Args; + // Check for { } return struct, which means that the function returns void + if (FRetTy->isEmptyTy()) { - Passes.run(*KernelM); + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); - // Declare success. - Out.keep(); -} + // Replacing return statements with others returning void + for (auto *RI : RItoRemove) { + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + } + DEBUG(errs() << "\tChanged return statements to return void\n"); + } else { + // The struct has return values, thus needs to be converted to parameter + + // Iterate over all element types of return struct and add arguments to the + // function + for (unsigned i = 0; i < FRetTy->getNumElements(); i++) { + Argument *RetArg = + new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); + RetArgs.push_back(RetArg); + RetArgTypes.push_back(RetArg->getType()); + DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); + } -Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { - - DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); - // FIXME: Maybe do that using the Node? 
- StructType* FRetTy = dyn_cast<StructType>(F->getReturnType()); - assert(FRetTy && "Return Type must always be a struct"); - - // Keeps return statements, because we will need to replace them - std::vector<ReturnInst *> RItoRemove; - findReturnInst(F, RItoRemove); - - std::vector<Type *> RetArgTypes; - std::vector<Argument*> RetArgs; - std::vector<Argument*> Args; - // Check for { } return struct, which means that the function returns void - if (FRetTy->isEmptyTy()) { - - DEBUG(errs() << "\tFunction output struct is void\n"); - DEBUG(errs() << "\tNo parameters added\n"); - - // Replacing return statements with others returning void - for (auto *RI : RItoRemove) { - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - } - DEBUG(errs() << "\tChanged return statements to return void\n"); - } - else { - // The struct has return values, thus needs to be converted to parameter - - // Iterate over all element types of return struct and add arguments to the - // function - for (unsigned i=0; i<FRetTy->getNumElements(); i++) { - Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); - RetArgs.push_back(RetArg); - RetArgTypes.push_back(RetArg->getType()); - DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); - } - - DEBUG(errs() << "\tReplacing Return statements\n"); - // Replace return statements with extractValue and store instructions - for (auto *RI : RItoRemove) { - Value* RetVal = RI->getReturnValue(); - for(unsigned i = 0; i < RetArgs.size(); i++) { - ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), - RetArgs[i]->getName()+".val", RI); - new StoreInst(EI, RetArgs[i], RI); - } - // assert(RetVal && "Return value should not be null at this point"); - // StructType* RetType = cast<StructType>(RetVal->getType()); - // assert(RetType && "Return type is not a struct"); - - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - - } - } - DEBUG(errs() << 
"\tReplaced return statements\n"); - - // Create the argument type list with the added argument's type - std::vector<Type*> ArgTypes; - for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - for(auto *RATy: RetArgTypes) { - ArgTypes.push_back(RATy); - } - - // Creating Args vector to use in cloning! - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Args.push_back(&*ai); - } - for(auto *ai : RetArgs) { - Args.push_back(ai); - } - - // Adding new arguments to the function argument list, would not change the - // function type. We need to change the type of this function to reflect the - // added arguments - Type* VoidRetType = Type::getVoidTy(F->getContext()); - FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); - - // Change the function type - //F->mutateType(PTy); - Function* newF = cloneFunction(F, newFT, false, NULL, &Args); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - //F->eraseFromParent(); - return newF; + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (auto *RI : RItoRemove) { + Value *RetVal = RI->getReturnValue(); + for (unsigned i = 0; i < RetArgs.size(); i++) { + ExtractValueInst *EI = ExtractValueInst::Create( + RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI); + new StoreInst(EI, RetArgs[i], RI); + } + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + } + } + DEBUG(errs() << "\tReplaced return statements\n"); + + // Create the argument type list with the added argument's type + std::vector<Type *> ArgTypes; + for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + 
ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + for (auto *RATy : RetArgTypes) { + ArgTypes.push_back(RATy); + } + + // Creating Args vector to use in cloning! + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai) { + Args.push_back(&*ai); + } + for (auto *ai : RetArgs) { + Args.push_back(ai); + } + + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + Type *VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + + // Change the function type + // F->mutateType(PTy); + Function *newF = cloneFunction(F, newFT, false, NULL, &Args); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + // F->eraseFromParent(); + return newF; } /****************************************************************************** @@ -2102,314 +2138,344 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { // 1. No stores // 2. 
Loads not dependent on getNodeInstanceID itrinsic -static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) { - if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - VisitedList->push_back(V); - for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); - ui != ue; ++ui) { - Instruction* I = dyn_cast<Instruction>(*ui); - if(!I) { - // if use is not an instruction, then skip it - continue; - } - DEBUG(errs() << "\t" << *I << "\n"); - if(isa<LoadInst>(I)) { - DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); - DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); - UseList->push_back(V); - } - else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { - // found a store in use chain - DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); - return true; - } - else if(BuildDFG::isViscIntrinsic(I)) { - // If it is an atomic intrinsic, we found a store - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic") - && "Only visc atomic intrinsics can have an argument as input"); - return true; - } - else { - DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); - if(findLoadStoreUses(I, UseList, VisitedList)) - return true; - } - } - return false; +static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList, + std::vector<Value *> *VisitedList) { + if (std::find(VisitedList->begin(), VisitedList->end(), V) != + VisitedList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + VisitedList->push_back(V); + for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; + ++ui) { + Instruction *I = dyn_cast<Instruction>(*ui); + if (!I) { + // if use is not an instruction, then skip it + continue; + } + DEBUG(errs() << "\t" << *I << "\n"); + if 
(isa<LoadInst>(I)) { + DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); + DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); + UseList->push_back(V); + } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { + // found a store in use chain + DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); + return true; + } else if (BuildDFG::isHPVMIntrinsic(I)) { + // If it is an atomic intrinsic, we found a store + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + assert(II && + II->getCalledValue()->getName().startswith("llvm.hpvm.atomic") && + "Only hpvm atomic intrinsics can have an argument as input"); + return true; + } else { + DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); + if (findLoadStoreUses(I, UseList, VisitedList)) + return true; + } + } + return false; } -static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) { - if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - DependenceList->push_back(V); - // If not an instruction, then not dependent on node instance id - if(!isa<Instruction>(V) || isa<Constant>(V)) { - DEBUG(errs() << "\tStop\n"); - return false; - } - - Instruction* I = cast<Instruction>(V); - for(unsigned i = 0; i < I->getNumOperands(); i++) { - Value* operand = I->getOperand(i); - if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) { - if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x - || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y - || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { - Value* Node = II->getArgOperand(0); - IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node); - assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n"); - if(GN->getIntrinsicID() == Intrinsic::visc_getNode) { - DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n"); - return true; - 
} - } - } - if(CmpInst* CI = dyn_cast<CmpInst>(operand)) { - DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n"); - continue; - } - DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n"); - if(isDependentOnNodeInstanceID(operand, DependenceList)) { - return true; - } - } - return false; +static bool isDependentOnNodeInstanceID(Value *V, + std::vector<Value *> *DependenceList) { + if (std::find(DependenceList->begin(), DependenceList->end(), V) != + DependenceList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + DependenceList->push_back(V); + // If not an instruction, then not dependent on node instance id + if (!isa<Instruction>(V) || isa<Constant>(V)) { + DEBUG(errs() << "\tStop\n"); + return false; + } + + Instruction *I = cast<Instruction>(V); + for (unsigned i = 0; i < I->getNumOperands(); i++) { + Value *operand = I->getOperand(i); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) { + if ((II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_x || + II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_y || + II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_z)) { + Value *Node = II->getArgOperand(0); + IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node); + assert( + GN && + "NodeInstanceID operande should be node/parent node intrinsic\n"); + if (GN->getIntrinsicID() == Intrinsic::hpvm_getNode) { + DEBUG(errs() << "\tDependency found on Node instance ID: " << *II + << "\n"); + return true; + } + } + } + if (CmpInst *CI = dyn_cast<CmpInst>(operand)) { + DEBUG(errs() << "Found compare instruction: " << *CI + << "\nNot following its dependency list\n"); + continue; + } + DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n"); + if (isDependentOnNodeInstanceID(operand, DependenceList)) { + return true; + } + } + return false; } // Function to check if argument arg can be changed to a constant memory pointer -static 
bool canBePromoted(Argument* arg, Function* F) { - DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); - std::vector<Value*> UseList; - std::vector<Value*> VisitedList; - // recursively traverse use chain - // if find a store instruction return false, everything fails, cannot be - // promoted - // if find a load instruction as use, add the GEP instruction to list - bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); - if(foundStore == true) - return false; - // See that the GEP instructions are not dependent on getNodeInstanceID - // intrinsic - DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n"); - std::vector<Value*>DependenceList; - for(auto U: UseList) { - if(isDependentOnNodeInstanceID(U, &DependenceList)) - return false; - } - DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); - return true; +static bool canBePromoted(Argument *arg, Function *F) { + DEBUG(errs() << "OPT: Check if Argument " << *arg + << " can be changed to constant memory\n"); + std::vector<Value *> UseList; + std::vector<Value *> VisitedList; + // recursively traverse use chain + // if find a store instruction return false, everything fails, cannot be + // promoted + // if find a load instruction as use, add the GEP instruction to list + bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); + if (foundStore == true) + return false; + // See that the GEP instructions are not dependent on getNodeInstanceID + // intrinsic + DEBUG(errs() << foundStore + << "\tNo Store Instruction found. 
Check dependence on node " + "instance ID\n"); + std::vector<Value *> DependenceList; + for (auto U : UseList) { + if (isDependentOnNodeInstanceID(U, &DependenceList)) + return false; + } + DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); + return true; } - // Calculate execute node parameters which include, number of diemnsions for // dynamic instances of the kernel, local and global work group sizes. -static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value* - &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { - - // Assign number of dimenstions a constant value - workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); - - // If local work group size if null - if(!kernel->hasLocalWG()) { - LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); - } - else { - for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { - if(isa<Argument>(kernel->localWGSize[i])) - kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; - } - LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); - } - - for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { - if(isa<Argument>(kernel->globalWGSize[i])) - kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; - } - - // For OpenCL, global work group size is the total bumber of instances in each - // dimension. So, multiply local and global dim limits. 
- std::vector<Value*> globalWGSizeInsts; - if(kernel->hasLocalWG()) { - for (unsigned i = 0; i < kernel->gridDim; i++) { - BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); - globalWGSizeInsts.push_back(MulInst); - } - } - else { - globalWGSizeInsts = kernel->globalWGSize; - } - GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); - DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); +static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr, + Value *&GlobalWGPtr, Kernel *kernel, + ValueToValueMapTy &VMap, Instruction *IB) { + + // Assign number of dimenstions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); + + // If local work group size if null + if (!kernel->hasLocalWG()) { + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); + } else { + for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if (isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } + LocalWGPtr = + genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); + } + + for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if (isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } + + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. 
+ std::vector<Value *> globalWGSizeInsts; + if (kernel->hasLocalWG()) { + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator *MulInst = + BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], + kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + } else { + globalWGSizeInsts = kernel->globalWGSize; + } + GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); } // CodeGen for allocating space for Work Group on stack and returning a pointer // to its address -static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { - Value* WGPtr; - // Get int64_t and or ease of use - Type* Int64Ty = Type::getInt64Ty(M.getContext()); - - // Work Group type is [#dim x i64] - Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); - // Allocate space of Global work group data on stack and get pointer to - // first element. - AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB); - WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); - Value* nextDim = WGPtr; - DEBUG(errs() << *WGPtr << "\n"); - - // Iterate over the number of dimensions and store the global work group - // size in that dimension - for(unsigned i=0; i < WGSize.size(); i++) { - DEBUG(errs() << *WGSize[i] << "\n"); - assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); - - if(WGSize[i]->getType() != Int64Ty) { - // If number of dimensions are mentioned in any other integer format, - // generate code to extend it to i64. We need to use the mapped value in - // the new generated function, hence the use of VMap - // FIXME: Why are we changing the kernel WGSize vector here? - DEBUG(errs() << "Not i64. 
Zero extend required.\n"); - DEBUG(errs() << *WGSize[i] << "\n"); - CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); - DEBUG(errs() << "Bitcast done.\n"); - StoreInst* SI = new StoreInst(CI, nextDim, IB); - DEBUG(errs() << "Zero extend done.\n"); - DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); - } else { - // Store the value representing work group size in ith dimension on - // stack - StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); - - DEBUG(errs() << "\t Work group size: " << *SI << "\n"); - } - if(i+1 < WGSize.size()) { - // Move to next dimension - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, - ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), - WG->getName()+"."+Twine(i+1), - IB); - DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); - nextDim = GEP; - } - } - return WGPtr; +static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize, + ValueToValueMapTy &VMap, Instruction *IB, + const Twine &WGName) { + Value *WGPtr; + // Get int64_t and or ease of use + Type *Int64Ty = Type::getInt64Ty(M.getContext()); + + // Work Group type is [#dim x i64] + Type *WGTy = ArrayType::get(Int64Ty, WGSize.size()); + // Allocate space of Global work group data on stack and get pointer to + // first element. + AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB); + WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), + WG->getName() + ".0", IB); + Value *nextDim = WGPtr; + DEBUG(errs() << *WGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for (unsigned i = 0; i < WGSize.size(); i++) { + DEBUG(errs() << *WGSize[i] << "\n"); + assert(WGSize[i]->getType()->isIntegerTy() && + "Dimension not an integer type!"); + + if (WGSize[i]->getType() != Int64Ty) { + // If number of dimensions are mentioned in any other integer format, + // generate code to extend it to i64. 
We need to use the mapped value in + // the new generated function, hence the use of VMap + // FIXME: Why are we changing the kernel WGSize vector here? + DEBUG(errs() << "Not i64. Zero extend required.\n"); + DEBUG(errs() << *WGSize[i] << "\n"); + CastInst *CI = + BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); + DEBUG(errs() << "Bitcast done.\n"); + StoreInst *SI = new StoreInst(CI, nextDim, IB); + DEBUG(errs() << "Zero extend done.\n"); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB); + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if (i + 1 < WGSize.size()) { + // Move to next dimension + GetElementPtrInst *GEP = GetElementPtrInst::Create( + nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)), + WG->getName() + "." + Twine(i + 1), IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } + } + return WGPtr; } // Get generated PTX binary name -static std::string getPTXFilename(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - moduleID.append(".kernels.cl"); - return moduleID; +static std::string getPTXFilename(const Module &M) { + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".kernels.cl"); + return moduleID; } // Get the name of the input file from module ID -static std::string getFilenameFromModule(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - return moduleID.substr(moduleID.find_last_of("/")+1); +static std::string getFilenameFromModule(const Module &M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/") + 1); } // Changes the data layout of the Module to be compiled with NVPTX backend // TODO: Figure out when to call it, probably after duplicating the modules static void 
changeDataLayout(Module &M) { - std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; - std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; - if (TARGET_PTX == 32) - M.setDataLayout(StringRef(nvptx32_layoutStr)); - else if (TARGET_PTX == 64) - M.setDataLayout(StringRef(nvptx64_layoutStr)); - else assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setDataLayout(StringRef(nvptx32_layoutStr)); + else if (TARGET_PTX == 64) + M.setDataLayout(StringRef(nvptx64_layoutStr)); + else + assert(false && "Invalid PTX target"); - return; + return; } static void changeTargetTriple(Module &M) { - std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; - std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; + std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; + std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; - if (TARGET_PTX == 32) - M.setTargetTriple(StringRef(nvptx32_TargetTriple)); - else if (TARGET_PTX == 64) - M.setTargetTriple(StringRef(nvptx64_TargetTriple)); - else assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setTargetTriple(StringRef(nvptx32_TargetTriple)); + else if (TARGET_PTX == 64) + M.setTargetTriple(StringRef(nvptx64_TargetTriple)); + else + assert(false && "Invalid PTX target"); - return; + return; } // Helper function, populate a vector with all return statements in a function -static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { - for (auto &BB : *F) { - if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) - ReturnInstVec.push_back(RI); - } +static void findReturnInst(Function *F, + std::vector<ReturnInst *> &ReturnInstVec) { + for (auto &BB : *F) { + if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) + ReturnInstVec.push_back(RI); + } } -// Helper function, populate a vector with all 
IntrinsicID intrinsics in a function -static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) { - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - Instruction *I = &(*i); - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - if (II && II->getIntrinsicID() == IntrinsicID) { - IntrinsicInstVec.push_back(II); - } - } +// Helper function, populate a vector with all IntrinsicID intrinsics in a +// function +static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID, + std::vector<IntrinsicInst *> &IntrinsicInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == IntrinsicID) { + IntrinsicInstVec.push_back(II); + } + } } -// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op +// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic +// op static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) { - switch(ID) { - case Intrinsic::visc_atomic_add: - return AtomicRMWInst::Add; - case Intrinsic::visc_atomic_sub: - return AtomicRMWInst::Sub; - case Intrinsic::visc_atomic_min: - return AtomicRMWInst::Min; - case Intrinsic::visc_atomic_max: - return AtomicRMWInst::Max; - case Intrinsic::visc_atomic_xchg: - return AtomicRMWInst::Xchg; - case Intrinsic::visc_atomic_and: - return AtomicRMWInst::And; - case Intrinsic::visc_atomic_or: - return AtomicRMWInst::Or; - case Intrinsic::visc_atomic_xor: - return AtomicRMWInst::Xor; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch (ID) { + case Intrinsic::hpvm_atomic_add: + return AtomicRMWInst::Add; + case Intrinsic::hpvm_atomic_sub: + return AtomicRMWInst::Sub; + case Intrinsic::hpvm_atomic_min: + return AtomicRMWInst::Min; + case Intrinsic::hpvm_atomic_umin: + return AtomicRMWInst::UMin; + case Intrinsic::hpvm_atomic_max: + return 
AtomicRMWInst::Max; + case Intrinsic::hpvm_atomic_umax: + return AtomicRMWInst::UMax; + // case Intrinsic::hpvm_atomic_inc: return AtomicRMWInst::Inc; + // case Intrinsic::hpvm_atomic_dec: return AtomicRMWInst::Dec; + case Intrinsic::hpvm_atomic_xchg: + return AtomicRMWInst::Xchg; + case Intrinsic::hpvm_atomic_and: + return AtomicRMWInst::And; + case Intrinsic::hpvm_atomic_or: + return AtomicRMWInst::Or; + case Intrinsic::hpvm_atomic_xor: + return AtomicRMWInst::Xor; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } - // Helper funtion, returns the OpenCL function name, corresponding to atomic op static std::string getAtomicOpName(Intrinsic::ID ID) { - switch(ID) { - case Intrinsic::visc_atomic_add: - return "atom_add"; - case Intrinsic::visc_atomic_sub: - return "atom_sub"; - case Intrinsic::visc_atomic_min: - return "atom_min"; - case Intrinsic::visc_atomic_max: - return "atom_max"; - case Intrinsic::visc_atomic_xchg: - return "atom_xchg"; - case Intrinsic::visc_atomic_and: - return "atom_and"; - case Intrinsic::visc_atomic_or: - return "atom_or"; - case Intrinsic::visc_atomic_xor: - return "atom_xor"; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch (ID) { + case Intrinsic::hpvm_atomic_cmpxchg: + return "atom_cmpxchg"; + case Intrinsic::hpvm_atomic_add: + return "atom_add"; + case Intrinsic::hpvm_atomic_sub: + return "atom_sub"; + case Intrinsic::hpvm_atomic_min: + return "atom_min"; + case Intrinsic::hpvm_atomic_max: + return "atom_max"; + case Intrinsic::hpvm_atomic_inc: + return "atom_inc"; + case Intrinsic::hpvm_atomic_dec: + return "atom_dec"; + case Intrinsic::hpvm_atomic_xchg: + return "atom_xchg"; + case Intrinsic::hpvm_atomic_and: + return "atom_and"; + case Intrinsic::hpvm_atomic_or: + return "atom_or"; + case Intrinsic::hpvm_atomic_xor: + return "atom_xor"; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } } // End of namespace @@ -2420,4 +2486,3 @@ static 
RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx", false /* does not modify the CFG */, true /* transformation, * * not just analysis */); - diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index a0fa9fcde477018cf00af5c932512ce804105c9d..8152817d9a9dbdb9d0164ba8cb7b9a49ce2f081f 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "DFG2LLVM_X86" -#include "SupportVISC/DFG2LLVM.h" +#include "SupportHPVM/DFG2LLVM.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" @@ -25,29 +25,29 @@ using namespace llvm; using namespace builddfg; using namespace dfg2llvm; -// VISC Command line option to use timer or not -static cl::opt<bool> VISCTimer_X86("visc-timers-x86", - cl::desc("Enable visc timers")); +// HPVM Command line option to use timer or not +static cl::opt<bool> HPVMTimer_X86("hpvm-timers-x86", + cl::desc("Enable hpvm timers")); // Command line option to enable device abstraction or not static cl::opt<bool> - DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, - cl::desc("Enable visc device abstraction")); + DeviceAbstraction("hpvm-eda", cl::init(false), cl::Hidden, + cl::desc("Enable hpvm device abstraction")); namespace { // Helper Functions -static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { +static bool isHPVMCall_llvm_hpvm_policy_getVersion(Instruction *I) { if (!isa<CallInst>(I)) return false; CallInst *CI = cast<CallInst>(I); return (CI->getCalledValue()->stripPointerCasts()->getName()) - .equals("llvm_visc_policy_getVersion"); + .equals("llvm_hpvm_policy_getVersion"); } -CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { +CallInst *get_llvm_hpvm_policy_getVersion_call(Function *F) { for (inst_iterator ib = inst_begin(F), ie = inst_end(F); 
ib != ie; ++ib) { Instruction *I = &*ib; - if (isVISCCall_llvm_visc_policy_getVersion(I)) + if (isHPVMCall_llvm_hpvm_policy_getVersion(I)) return cast<CallInst>(I); } return NULL; @@ -74,27 +74,27 @@ private: // Member variables FunctionCallee malloc; - // VISC Runtime API - FunctionCallee llvm_visc_x86_launch; - FunctionCallee llvm_visc_x86_wait; - FunctionCallee llvm_visc_x86_argument_ptr; - - FunctionCallee llvm_visc_streamLaunch; - FunctionCallee llvm_visc_streamPush; - FunctionCallee llvm_visc_streamPop; - FunctionCallee llvm_visc_streamWait; - FunctionCallee llvm_visc_createBindInBuffer; - FunctionCallee llvm_visc_createBindOutBuffer; - FunctionCallee llvm_visc_createEdgeBuffer; - FunctionCallee llvm_visc_createLastInputBuffer; - FunctionCallee llvm_visc_createThread; - // Constant* llvm_visc_freeThreads; - FunctionCallee llvm_visc_bufferPush; - FunctionCallee llvm_visc_bufferPop; - FunctionCallee llvm_visc_x86_dstack_push; - FunctionCallee llvm_visc_x86_dstack_pop; - FunctionCallee llvm_visc_x86_getDimLimit; - FunctionCallee llvm_visc_x86_getDimInstance; + // HPVM Runtime API + FunctionCallee llvm_hpvm_x86_launch; + FunctionCallee llvm_hpvm_x86_wait; + FunctionCallee llvm_hpvm_x86_argument_ptr; + + FunctionCallee llvm_hpvm_streamLaunch; + FunctionCallee llvm_hpvm_streamPush; + FunctionCallee llvm_hpvm_streamPop; + FunctionCallee llvm_hpvm_streamWait; + FunctionCallee llvm_hpvm_createBindInBuffer; + FunctionCallee llvm_hpvm_createBindOutBuffer; + FunctionCallee llvm_hpvm_createEdgeBuffer; + FunctionCallee llvm_hpvm_createLastInputBuffer; + FunctionCallee llvm_hpvm_createThread; + // Constant* llvm_hpvm_freeThreads; + FunctionCallee llvm_hpvm_bufferPush; + FunctionCallee llvm_hpvm_bufferPop; + FunctionCallee llvm_hpvm_x86_dstack_push; + FunctionCallee llvm_hpvm_x86_dstack_pop; + FunctionCallee llvm_hpvm_x86_getDimLimit; + FunctionCallee llvm_hpvm_x86_getDimInstance; // Functions std::vector<IntrinsicInst *> *getUseList(Value *LI); @@ -120,7 +120,7 @@ private: 
// Virtual Functions void init() { - VISCTimer = VISCTimer_X86; + HPVMTimer = HPVMTimer_X86; TargetName = "X86"; } void initRuntimeAPI(); @@ -177,7 +177,7 @@ bool DFG2LLVM_X86::runOnModule(Module &M) { return true; } -// Initialize the VISC runtime API. This makes it easier to insert these calls +// Initialize the HPVM runtime API. This makes it easier to insert these calls void CGT_X86::initRuntimeAPI() { // Load Runtime API Module @@ -187,51 +187,51 @@ void CGT_X86::initRuntimeAPI() { assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; - Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll"; + Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/hpvm-rt/hpvm-rt.ll"; runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); if (runtimeModule == NULL) DEBUG(errs() << Err.getMessage()); else - DEBUG(errs() << "Successfully loaded visc-rt API module\n"); + DEBUG(errs() << "Successfully loaded hpvm-rt API module\n"); // Get or insert the global declarations for launch/wait functions - DECLARE(llvm_visc_x86_launch); + DECLARE(llvm_hpvm_x86_launch); DECLARE(malloc); - DECLARE(llvm_visc_x86_wait); - DECLARE(llvm_visc_x86_argument_ptr); - DECLARE(llvm_visc_streamLaunch); - DECLARE(llvm_visc_streamPush); - DECLARE(llvm_visc_streamPop); - DECLARE(llvm_visc_streamWait); - DECLARE(llvm_visc_createBindInBuffer); - DECLARE(llvm_visc_createBindOutBuffer); - DECLARE(llvm_visc_createEdgeBuffer); - DECLARE(llvm_visc_createLastInputBuffer); - DECLARE(llvm_visc_createThread); - // DECLARE(llvm_visc_freeThreads); - DECLARE(llvm_visc_bufferPush); - DECLARE(llvm_visc_bufferPop); - DECLARE(llvm_visc_x86_dstack_push); - DECLARE(llvm_visc_x86_dstack_pop); - DECLARE(llvm_visc_x86_getDimLimit); - DECLARE(llvm_visc_x86_getDimInstance); + DECLARE(llvm_hpvm_x86_wait); + DECLARE(llvm_hpvm_x86_argument_ptr); + DECLARE(llvm_hpvm_streamLaunch); + DECLARE(llvm_hpvm_streamPush); + 
DECLARE(llvm_hpvm_streamPop); + DECLARE(llvm_hpvm_streamWait); + DECLARE(llvm_hpvm_createBindInBuffer); + DECLARE(llvm_hpvm_createBindOutBuffer); + DECLARE(llvm_hpvm_createEdgeBuffer); + DECLARE(llvm_hpvm_createLastInputBuffer); + DECLARE(llvm_hpvm_createThread); + // DECLARE(llvm_hpvm_freeThreads); + DECLARE(llvm_hpvm_bufferPush); + DECLARE(llvm_hpvm_bufferPop); + DECLARE(llvm_hpvm_x86_dstack_push); + DECLARE(llvm_hpvm_x86_dstack_pop); + DECLARE(llvm_hpvm_x86_getDimLimit); + DECLARE(llvm_hpvm_x86_getDimInstance); // Get or insert timerAPI functions as well if you plan to use timers initTimerAPI(); // Insert init context in main - Function *VI = M.getFunction("llvm.visc.init"); - assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); + Function *VI = M.getFunction("llvm.hpvm.init"); + assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once"); DEBUG(errs() << "Inserting x86 timer initialization\n"); Instruction *I = cast<Instruction>(*VI->user_begin()); initializeTimerSet(I); - switchToTimer(visc_TimerID_NONE, I); + switchToTimer(hpvm_TimerID_NONE, I); // Insert code for initializing the sceduling policy FunctionCallee IP = M.getOrInsertFunction( - "llvm_visc_policy_init", - runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()); + "llvm_hpvm_policy_init", + runtimeModule->getFunction("llvm_hpvm_policy_init")->getFunctionType()); CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IPCallInst << "\n"); @@ -239,22 +239,22 @@ void CGT_X86::initRuntimeAPI() { // device status simulation if (DeviceAbstraction) { FunctionCallee ID = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_start", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_start") + "llvm_hpvm_deviceAbstraction_start", + runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_start") ->getFunctionType()); CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IDCallInst << 
"\n"); } - // Insert print instruction at visc exit - Function *VC = M.getFunction("llvm.visc.cleanup"); - assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); + // Insert print instruction at hpvm exit + Function *VC = M.getFunction("llvm.hpvm.cleanup"); + assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once"); // Insert code for clearing the sceduling policy I = cast<Instruction>(*VC->user_begin()); IP = M.getOrInsertFunction( - "llvm_visc_policy_clear", - runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()); + "llvm_hpvm_policy_clear", + runtimeModule->getFunction("llvm_hpvm_policy_clear")->getFunctionType()); IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IPCallInst << "\n"); @@ -265,8 +265,8 @@ void CGT_X86::initRuntimeAPI() { // device status simulation if (DeviceAbstraction) { FunctionCallee ID = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_end", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_end") + "llvm_hpvm_deviceAbstraction_end", + runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_end") ->getFunctionType()); CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IDCallInst << "\n"); @@ -542,7 +542,7 @@ void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args, // Call runtime to create the thread with these arguments DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n"); - // DEBUG(errs() << *llvm_visc_createThread << "\n"); + // DEBUG(errs() << *llvm_hpvm_createThread << "\n"); DEBUG(errs() << *graphID->getType() << "\n"); DEBUG(errs() << *C_Pipeline->getType() << "\n"); DEBUG(errs() << *Struct->getType() << "\n"); @@ -551,7 +551,7 @@ void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args, Struct->getName(), IB); Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI}; CallInst *CreateThread = CallInst::Create( - llvm_visc_createThread, 
ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); + llvm_hpvm_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); } Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { @@ -639,17 +639,17 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition()); Value *BindInCallArgs[] = {graphID, size, Int_ArgNo}; CI = CallInst::Create( - llvm_visc_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3), + llvm_hpvm_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3), "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI); } else if (Edge->getDestDF()->isExitNode()) { // Bind Output Edge CI = CallInst::Create( - llvm_visc_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2), + llvm_hpvm_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2), "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI); } else { // Streaming Edge CI = CallInst::Create( - llvm_visc_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2), + llvm_hpvm_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2), Edge->getSourceDF()->getFuncPointer()->getName() + "." + Edge->getDestDF()->getFuncPointer()->getName(), RI); @@ -668,7 +668,7 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); Value *CallArgs[] = {graphID, size}; CallInst *CI = CallInst::Create( - llvm_visc_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2), + llvm_hpvm_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2), "BindIn.isLastInput." 
+ child->getFuncPointer()->getName(), RI); NodeLastInputMap[child] = CI; } @@ -729,7 +729,7 @@ void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) { DEBUG(errs() << "Substitute launch intrinsic\n"); Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)}; CallInst *LaunchInst = CallInst::Create( - llvm_visc_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2), + llvm_hpvm_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2), "graph" + Root->getFuncPointer()->getName(), LI); // ReplaceInstWithInst(LI, LaunchInst); @@ -742,16 +742,16 @@ void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) { CallInst *CI; Value *PushArgs[] = {LaunchInst, II->getOperand(1)}; switch (II->getIntrinsicID()) { - case Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_streamWait, ArrayRef<Value *>(LaunchInst), + case Intrinsic::hpvm_wait: + CI = CallInst::Create(llvm_hpvm_streamWait, ArrayRef<Value *>(LaunchInst), ""); break; - case Intrinsic::visc_push: - CI = CallInst::Create(llvm_visc_streamPush, + case Intrinsic::hpvm_push: + CI = CallInst::Create(llvm_hpvm_streamPush, ArrayRef<Value *>(PushArgs, 2), ""); break; - case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_streamPop, ArrayRef<Value *>(LaunchInst), + case Intrinsic::hpvm_pop: + CI = CallInst::Create(llvm_hpvm_streamPop, ArrayRef<Value *>(LaunchInst), ""); break; default: @@ -771,7 +771,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { DEBUG(errs() << "Generating Launch Function\n"); // Get Launch Instruction IntrinsicInst *LI = Root->getInstruction(); - switchToTimer(visc_TimerID_PTHREAD_CREATE, LI); + switchToTimer(hpvm_TimerID_PTHREAD_CREATE, LI); DEBUG(errs() << "Generating Launch Function\n"); /* Now we have all the necessary global declarations necessary to generate the @@ -802,14 +802,14 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { ReturnInst *RI = ReturnInst::Create(AppFunc->getContext(), Constant::getNullValue(AppFunc->getReturnType()), BB); - 
switchToTimer(visc_TimerID_ARG_UNPACK, RI); + switchToTimer(hpvm_TimerID_ARG_UNPACK, RI); DEBUG(errs() << "Created Empty Launch Function\n"); // Find the X86 function generated for Root and // Function* RootF_X86 = Root->getGenFunc(); - Function *RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); + Function *RootF_X86 = Root->getGenFuncForTarget(hpvm::CPU_TARGET); assert(RootF_X86 && "Error: No generated CPU function for Root node\n"); - assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) && + assert(Root->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && "Error: Generated Function for Root node with no x86 wrapper\n"); // Generate a call to RootF_X86 with null parameters for now @@ -837,8 +837,8 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { CI->setArgOperand(i, elements[i]); // Add timers around Call to RootF_X86 function - switchToTimer(visc_TimerID_COMPUTATION, CI); - switchToTimer(visc_TimerID_OUTPUT_PACK, RI); + switchToTimer(hpvm_TimerID_COMPUTATION, CI); + switchToTimer(hpvm_TimerID_OUTPUT_PACK, RI); StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType()); @@ -888,7 +888,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { new StoreInst(CI, OutputAddrCast, RI); } - switchToTimer(visc_TimerID_NONE, RI); + switchToTimer(hpvm_TimerID_NONE, RI); DEBUG(errs() << "Application specific function:\n"); DEBUG(errs() << *AppFunc << "\n"); @@ -896,7 +896,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { // Substitute launch intrinsic main Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)}; CallInst *LaunchInst = CallInst::Create( - llvm_visc_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2), + llvm_hpvm_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2), "graph" + Root->getFuncPointer()->getName(), LI); // ReplaceInstWithInst(LI, LaunchInst); @@ -907,16 +907,16 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { IntrinsicInst *II = UseList->at(i); CallInst *CI; switch (II->getIntrinsicID()) { - case 
Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_x86_wait, ArrayRef<Value *>(LaunchInst), + case Intrinsic::hpvm_wait: + CI = CallInst::Create(llvm_hpvm_x86_wait, ArrayRef<Value *>(LaunchInst), ""); break; - case Intrinsic::visc_push: - CI = CallInst::Create(llvm_visc_bufferPush, ArrayRef<Value *>(LaunchInst), + case Intrinsic::hpvm_push: + CI = CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(LaunchInst), ""); break; - case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_bufferPop, ArrayRef<Value *>(LaunchInst), + case Intrinsic::hpvm_pop: + CI = CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(LaunchInst), ""); break; default: @@ -970,10 +970,10 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, Function *CF = C->getFuncPointer(); // Function* CF_X86 = C->getGenFunc(); - Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET); + Function *CF_X86 = C->getGenFuncForTarget(hpvm::CPU_TARGET); assert(CF_X86 != NULL && "Found leaf node for which code generation has not happened yet!\n"); - assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) && + assert(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && "The generated function to be called from x86 backend is not an x86 " "function\n"); DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); @@ -1040,7 +1040,7 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, CI->getArgOperand(numArgs - 6 + 2) // iZ }; - CallInst *Push = CallInst::Create(llvm_visc_x86_dstack_push, + CallInst *Push = CallInst::Create(llvm_hpvm_x86_dstack_push, ArrayRef<Value *>(args, 7), "", CI); DEBUG(errs() << "Push on stack: " << *Push << "\n"); // Insert call to runtime to pop the dim limits and instanceID from the depth @@ -1053,7 +1053,7 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, assert(NextI->getParent() == CI->getParent() && "Next Instruction should also belong to the same basic block!"); - CallInst *Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); + 
CallInst *Pop = CallInst::Create(llvm_hpvm_x86_dstack_pop, None, "", NextI); DEBUG(errs() << "Pop from stack: " << *Pop << "\n"); DEBUG(errs() << *CI->getParent()->getParent()); } @@ -1156,7 +1156,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { "streaming input edges\n"); // First read the termination condition variable islastInput CallInst *isLastInputPop = CallInst::Create( - llvm_visc_bufferPop, ArrayRef<Value *>(isLastInput), "", RI); + llvm_hpvm_bufferPop, ArrayRef<Value *>(isLastInput), "", RI); CastInst *BI = BitCastInst::CreateIntegerCast( isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false, @@ -1173,7 +1173,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { ++i) { if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { CallInst *bufferIn = - CallInst::Create(llvm_visc_bufferPop, + CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI); CastInst *BI; if (i->getType()->isPointerTy()) { @@ -1196,7 +1196,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { // DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); // CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, // C->getGenFunc()->getName()+".output", RI); - Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); + Function *CGenF = C->getGenFuncForTarget(hpvm::CPU_TARGET); DEBUG(errs() << "Type: " << *CGenF->getType() << "\n"); CallInst *CI = CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI); @@ -1222,7 +1222,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { // Push to Output buffer Value *bufferOutArgs[] = {OutputArgs[i], BI}; CallInst *bufferOut = CallInst::Create( - llvm_visc_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI); + llvm_hpvm_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI); } // Add loop around the basic block, which exits the loop if isLastInput is @@ -1236,9 +1236,9 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { CondBB = 
CondStartI->getParent(); BodyBB = CI->getParent(); Instruction *CntI = NULL; - CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF); + CallInst *GetPolicyCI = get_llvm_hpvm_policy_getVersion_call(CGenF); - // If the node function calls the visc runtime call to get policy, we update + // If the node function calls the hpvm runtime call to get policy, we update // it with the counter information. This means we need to pass an additional // argument to the generated function, that is the iteration number, and then // use it as an argument to the policy_getVersion call @@ -1255,14 +1255,14 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { } NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false); - Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false); + Function *NewCGenF = hpvmUtils::cloneFunction(CGenF, NewFT, false); // At least one (the last) argument exists (we added it) Function::arg_iterator ae = NewCGenF->arg_end(); --ae; Argument *CntArg = &*ae; CntArg->setName("iteration"); // Replace the old cpu gen func with this one - C->addGenFunc(NewCGenF, visc::CPU_TARGET, true); + C->addGenFunc(NewCGenF, hpvm::CPU_TARGET, true); // Add counter to the actual parameter list, to create the new call InputArgs.push_back(CntI); @@ -1272,7 +1272,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { // Set second operand of the policy_getVersion call to the last function // argument - GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF); + GetPolicyCI = get_llvm_hpvm_policy_getVersion_call(NewCGenF); GetPolicyCI->setArgOperand(1, CntArg); } @@ -1292,13 +1292,13 @@ void CGT_X86::codeGen(DFInternalNode *N) { // function before and nothing else needs to be done for this leaf node. 
// if(N->getGenFunc() != NULL) // return; - if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) { DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << " : skipping it\n"); return; } - assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL && "Error: Visiting a node for which code already generated\n"); // Sort children in topological order before code generation @@ -1315,7 +1315,7 @@ void CGT_X86::codeGen(DFInternalNode *N) { if (C->isDummyNode()) continue; - if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) { + if (!(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET))) { errs() << "No CPU x86 version for child node " << C->getFuncPointer()->getName() << "\n Skip code gen for parent node " @@ -1361,8 +1361,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { RI = cast<ReturnInst>(BB->getTerminator()); // Add generated function info to DFNode - // N->setGenFunc(F_X86, visc::CPU_TARGET); - N->addGenFunc(F_X86, visc::CPU_TARGET, true); + // N->setGenFunc(F_X86, hpvm::CPU_TARGET); + N->addGenFunc(F_X86, hpvm::CPU_TARGET, true); // Loop over the arguments, to create the VMap. 
dest_iterator = F_X86->arg_begin(); @@ -1445,13 +1445,13 @@ void CGT_X86::codeGen(DFInternalNode *N) { // If not, we see which version exists, check that it is in fact an x86 // function and save it as the CPU_TARGET function - // TODO: visc_id per node, so we can use this for id for policies + // TODO: hpvm_id per node, so we can use this for id for policies // For now, use node function name and change it later - Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); - Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); - bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET); DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " << N->getTag() << "\n"); @@ -1460,7 +1460,7 @@ void CGT_X86::codeGen(DFInternalNode *N) { DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n"); DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"); - if (N->getTag() == visc::None) { + if (N->getTag() == hpvm::None) { // No code is available for this node. This (usually) means that this // node is a node that // - from the accelerator backends has been mapped to an intermediate @@ -1469,24 +1469,24 @@ void CGT_X86::codeGen(DFInternalNode *N) { // take place DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node " << N->getFuncPointer()->getName() << "\n"); - } else if (viscUtils::isSingleTargetTag(N->getTag())) { + } else if (hpvmUtils::isSingleTargetTag(N->getTag())) { // There is a single version for this node according to code gen hints. // Therefore, we do not need to check the policy, we simply use the // available implementation, whichever target it is for. 
// Sanity check - to be removed TODO switch (N->getTag()) { - case visc::CPU_TARGET: - assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); - assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + case hpvm::CPU_TARGET: + assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(hpvm::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) && ""); break; - case visc::GPU_TARGET: - assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); + case hpvm::GPU_TARGET: + assert(!(N->getGenFuncForTarget(hpvm::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET) && ""); break; default: assert(false && "Unreachable: we checked that tag was single target!\n"); @@ -1499,8 +1499,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { if (DeviceAbstraction) { Function *NodeGenFunc = NULL; switch (N->getTag()) { - case visc::GPU_TARGET: - NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); + case hpvm::GPU_TARGET: + NodeGenFunc = N->getGenFuncForTarget(hpvm::GPU_TARGET); break; default: break; @@ -1512,9 +1512,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { BasicBlock *BB = &*NodeGenFunc->begin(); std::vector<Value *> Args; // TODO: add the device type as argument? 
FunctionCallee RTF = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_waitOnDeviceStatus", + "llvm_hpvm_deviceAbstraction_waitOnDeviceStatus", runtimeModule - ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus") + ->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus") ->getFunctionType()); CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); @@ -1522,17 +1522,17 @@ void CGT_X86::codeGen(DFInternalNode *N) { } Function *Ftmp = N->getGenFuncForTarget(N->getTag()); - N->removeGenFuncForTarget(visc::GPU_TARGET); - N->setTag(visc::None); - N->addGenFunc(Ftmp, visc::CPU_TARGET, true); - N->setTag(visc::CPU_TARGET); + N->removeGenFuncForTarget(hpvm::GPU_TARGET); + N->setTag(hpvm::None); + N->addGenFunc(Ftmp, hpvm::CPU_TARGET, true); + N->setTag(hpvm::CPU_TARGET); // Sanity checks - to be removed TODO - CF = N->getGenFuncForTarget(visc::CPU_TARGET); - GF = N->getGenFuncForTarget(visc::GPU_TARGET); + CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); + GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); - CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET); + GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET); DEBUG(errs() << "After editing\n"); DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " @@ -1545,11 +1545,11 @@ void CGT_X86::codeGen(DFInternalNode *N) { DEBUG(errs() << "Node Name (for policy) : " << N->getFuncPointer()->getName() << "\n"); - Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); - Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); - bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET); + bool GFx86 = 
N->hasX86GenFuncForTarget(hpvm::GPU_TARGET); // These assertions express what we can support with the current runtime. // Code generation works the same way even for other target combinations. @@ -1610,8 +1610,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { Args.push_back( ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); FunctionCallee RTF = M.getOrInsertFunction( - "llvm_visc_policy_getVersion", - runtimeModule->getFunction("llvm_visc_policy_getVersion") + "llvm_hpvm_policy_getVersion", + runtimeModule->getFunction("llvm_hpvm_policy_getVersion") ->getFunctionType()); CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent); @@ -1646,9 +1646,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { // call std::vector<Value *> Args; // TODO: add the device type as argument? FunctionCallee RTF = M.getOrInsertFunction( - "llvm_visc_deviceAbstraction_waitOnDeviceStatus", + "llvm_hpvm_deviceAbstraction_waitOnDeviceStatus", runtimeModule - ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus") + ->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus") ->getFunctionType()); CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); } @@ -1673,8 +1673,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { // Prepare arguments and function for call to wait for device runtime call // std::vector<Value *> Args; // TODO: add the device type as argument? 
// FunctionCallee RTF = - // M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", - // runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()); + // M.getOrInsertFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus", + // runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()); // CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); // } // } @@ -1684,9 +1684,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { // Now, make the node cpu gen func to be this one // Remove all other versions and update the tag - N->addGenFunc(F_wrapper, visc::CPU_TARGET, true); - N->removeGenFuncForTarget(visc::GPU_TARGET); - N->setTag(visc::CPU_TARGET); + N->addGenFunc(F_wrapper, hpvm::CPU_TARGET, true); + N->removeGenFuncForTarget(hpvm::GPU_TARGET); + N->setTag(hpvm::CPU_TARGET); // assert(false && "got to the point where we have to combine\n"); } @@ -1715,7 +1715,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // if(N->getGenFunc() != NULL) // return; - if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) { DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << " : skipping it\n"); @@ -1723,10 +1723,10 @@ void CGT_X86::codeGen(DFLeafNode *N) { << N->getFuncPointer()->getName() << "\n"); switch (N->getTag()) { - case visc::GPU_TARGET: + case hpvm::GPU_TARGET: // A leaf node should not have an x86 function for GPU // by design of DFG2LLVM_NVPTX backend - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) && ""); break; default: break; @@ -1735,7 +1735,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { return; } - assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL && "Error: Visiting a node for which code already generated\n"); std::vector<IntrinsicInst *> IItoRemove; @@ -1759,8 
+1759,8 @@ void CGT_X86::codeGen(DFLeafNode *N) { F_X86 = addIdxDimArgs(F_X86); // Add generated function info to DFNode - // N->setGenFunc(F_X86, visc::CPU_TARGET); - N->addGenFunc(F_X86, visc::CPU_TARGET, true); + // N->setGenFunc(F_X86, hpvm::CPU_TARGET); + N->addGenFunc(F_X86, hpvm::CPU_TARGET, true); // Go through the arguments, and any pointer arguments with in attribute need // to have x86_argument_ptr call to get the x86 ptr of the argument @@ -1768,7 +1768,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // Create new BB BasicBlock *EntryBB = &*F_X86->begin(); BasicBlock *BB = - BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); + BasicBlock::Create(M.getContext(), "getHPVMPtrArgs", F_X86, EntryBB); BranchInst *Terminator = BranchInst::Create(EntryBB, BB); // Insert calls for (Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); @@ -1776,7 +1776,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { if (F_X86->getAttributes().hasAttribute(ai->getArgNo() + 1, Attribute::In)) { assert(ai->getType()->isPointerTy() && - "Only pointer arguments can have visc in/out attributes "); + "Only pointer arguments can have hpvm in/out attributes "); Function::arg_iterator aiNext = ai; ++aiNext; Argument *size = &*aiNext; @@ -1786,7 +1786,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr", Terminator); Value *ArgPtrCallArgs[] = {BI, size}; - CallInst::Create(llvm_visc_x86_argument_ptr, + CallInst::Create(llvm_hpvm_x86_argument_ptr, ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator); } } @@ -1796,30 +1796,30 @@ void CGT_X86::codeGen(DFLeafNode *N) { for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { Instruction *I = &(*i); DEBUG(errs() << *I << "\n"); - // Leaf nodes should not contain VISC graph intrinsics or launch - assert(!BuildDFG::isViscLaunchIntrinsic(I) && + // Leaf nodes should not contain HPVM graph intrinsics or launch + 
assert(!BuildDFG::isHPVMLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isViscGraphIntrinsic(I) && - "VISC graph intrinsic within a leaf dataflow node!"); + assert(!BuildDFG::isHPVMGraphIntrinsic(I) && + "HPVM graph intrinsic within a leaf dataflow node!"); - if (BuildDFG::isViscQueryIntrinsic(I)) { + if (BuildDFG::isHPVMQueryIntrinsic(I)) { IntrinsicInst *II = cast<IntrinsicInst>(I); IntrinsicInst *ArgII; DFNode *ArgDFNode; /*********************************************************************** - * Handle VISC Query intrinsics * + * Handle HPVM Query intrinsics * ***********************************************************************/ switch (II->getIntrinsicID()) { - /**************************** llvm.visc.getNode() *******************/ - case Intrinsic::visc_getNode: { + /**************************** llvm.hpvm.getNode() *******************/ + case Intrinsic::hpvm_getNode: { // add mapping <intrinsic, this node> to the node-specific map Leaf_HandleToDFNodeMap[II] = N; IItoRemove.push_back(II); break; } - /************************* llvm.visc.getParentNode() ****************/ - case Intrinsic::visc_getParentNode: { + /************************* llvm.hpvm.getParentNode() ****************/ + case Intrinsic::hpvm_getParentNode: { // get the parent node of the arg node // get argument node ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); @@ -1832,8 +1832,8 @@ void CGT_X86::codeGen(DFLeafNode *N) { IItoRemove.push_back(II); break; } - /*************************** llvm.visc.getNumDims() *****************/ - case Intrinsic::visc_getNumDims: { + /*************************** llvm.hpvm.getNumDims() *****************/ + case Intrinsic::hpvm_getNumDims: { // get node from map // get the appropriate field ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); @@ -1846,10 +1846,10 @@ void CGT_X86::codeGen(DFLeafNode *N) { IItoRemove.push_back(II); break; } - /*********************** 
llvm.visc.getNodeInstanceID() **************/ - case Intrinsic::visc_getNodeInstanceID_x: - case Intrinsic::visc_getNodeInstanceID_y: - case Intrinsic::visc_getNodeInstanceID_z: { + /*********************** llvm.hpvm.getNodeInstanceID() **************/ + case Intrinsic::hpvm_getNodeInstanceID_x: + case Intrinsic::hpvm_getNodeInstanceID_y: + case Intrinsic::hpvm_getNodeInstanceID_z: { ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; @@ -1864,7 +1864,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // (dim = 1) => y // (dim = 2) => z int dim = - (int)(II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x); + (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x); assert((dim >= 0) && (dim < 3) && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic " "ID!"); @@ -1894,7 +1894,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { Value *args[] = { ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; - CallInst *CI = CallInst::Create(llvm_visc_x86_getDimInstance, + CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimInstance, ArrayRef<Value *>(args, 2), "nodeInstanceID", II); DEBUG(errs() << *II << " replaced with " << *CI << "\n"); @@ -1903,10 +1903,10 @@ void CGT_X86::codeGen(DFLeafNode *N) { } break; } - /********************** llvm.visc.getNumNodeInstances() *************/ - case Intrinsic::visc_getNumNodeInstances_x: - case Intrinsic::visc_getNumNodeInstances_y: - case Intrinsic::visc_getNumNodeInstances_z: { + /********************** llvm.hpvm.getNumNodeInstances() *************/ + case Intrinsic::hpvm_getNumNodeInstances_x: + case Intrinsic::hpvm_getNumNodeInstances_y: + case Intrinsic::hpvm_getNumNodeInstances_z: { ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; @@ -1922,7 +1922,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // (dim = 1) => y // (dim = 
2) => z int dim = - (int)(II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x); + (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x); assert((dim >= 0) && (dim < 3) && "Invalid dimension for getNumNodeInstances_[xyz]. Check " "Intrinsic ID!"); @@ -1952,7 +1952,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { Value *args[] = { ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; - CallInst *CI = CallInst::Create(llvm_visc_x86_getDimLimit, + CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimLimit, ArrayRef<Value *>(args, 2), "numNodeInstances", II); DEBUG(errs() << *II << " replaced with " << *CI << "\n"); @@ -1965,7 +1965,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { default: DEBUG(errs() << "Found unknown intrinsic with ID = " << II->getIntrinsicID() << "\n"); - assert(false && "Unknown VISC Intrinsic!"); + assert(false && "Unknown HPVM Intrinsic!"); break; } diff --git a/hpvm/lib/Transforms/GenVISC/CMakeLists.txt b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt similarity index 74% rename from hpvm/lib/Transforms/GenVISC/CMakeLists.txt rename to hpvm/lib/Transforms/GenHPVM/CMakeLists.txt index ed087f63b4933a33792d7cd773acdf8fab1ac8e3..967766e7058c1ef8bcc1414afb7ff0087e3ce188 100644 --- a/hpvm/lib/Transforms/GenVISC/CMakeLists.txt +++ b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt @@ -2,9 +2,9 @@ if(WIN32 OR CYGWIN) set(LLVM_LINK_COMPONENTS Core Support) endif() -add_llvm_library( LLVMGenVISC +add_llvm_library( LLVMGenHPVM MODULE - GenVISC.cpp + GenHPVM.cpp DEPENDS intrinsics_gen diff --git a/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp new file mode 100644 index 0000000000000000000000000000000000000000..738b39905b885aa42bc861e3a19c3bdf9c65668e --- /dev/null +++ b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp @@ -0,0 +1,894 @@ +//=== GenHPVM.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===// +// +// The LLVM Compiler Infrastructure 
+// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "genhpvm" +#include "GenHPVM/GenHPVM.h" + +#include "SupportHPVM/HPVMHint.h" +#include "SupportHPVM/HPVMUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +#define TIMER(X) \ + do { \ + if (HPVMTimer) { \ + X; \ + } \ + } while (0) + +using namespace llvm; +using namespace hpvmUtils; + +// HPVM Command line option to use timer or not +static cl::opt<bool> HPVMTimer("hpvm-timers-gen", + cl::desc("Enable GenHPVM timer")); + +namespace genhpvm { + +// Helper Functions + +static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID); +static Function *transformReturnTypeToStruct(Function *F); +static Type *getReturnTypeFromReturnInst(Function *F); + +// Check if the dummy function call is a __hpvm__node call +#define IS_HPVM_CALL(callName) \ + static bool isHPVMCall_##callName(Instruction *I) { \ + if (!isa<CallInst>(I)) \ + return false; \ + CallInst *CI = cast<CallInst>(I); \ + return (CI->getCalledValue()->stripPointerCasts()->getName()) \ + .equals("__hpvm__" #callName); \ + } + +static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID, + std::vector<Instruction *> *Erase) { + // Check if the instruction is Call Instruction + assert(isa<CallInst>(I) && "Expecting CallInst"); + CallInst *CI = cast<CallInst>(I); + DEBUG(errs() << "Found call: " << *CI << "\n"); + + // Find the correct intrinsic call + 
Module *M = CI->getParent()->getParent()->getParent(); + Function *F; + std::vector<Type *> ArgTypes; + std::vector<Value *> args; + if (Intrinsic::isOverloaded(IntrinsicID)) { + // This is an overloaded intrinsic. The types must exactly match. Get the + // argument types + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { + ArgTypes.push_back(CI->getArgOperand(i)->getType()); + args.push_back(CI->getArgOperand(i)); + } + F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes); + DEBUG(errs() << *F << "\n"); + } else { // Non-overloaded intrinsic + F = Intrinsic::getDeclaration(M, IntrinsicID); + FunctionType *FTy = F->getFunctionType(); + DEBUG(errs() << *F << "\n"); + + // Create argument list + assert(CI->getNumArgOperands() == FTy->getNumParams() && + "Number of arguments of call do not match with Intrinsic"); + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { + Value *V = CI->getArgOperand(i); + // Either the type should match or both should be of pointer type + assert((V->getType() == FTy->getParamType(i) || + (V->getType()->isPointerTy() && + FTy->getParamType(i)->isPointerTy())) && + "Dummy function call argument does not match with Intrinsic " + "argument!"); + // If the types do not match, then both must be pointer type and pointer + // cast needs to be performed + if (V->getType() != FTy->getParamType(i)) { + V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); + } + args.push_back(V); + } + } + // Insert call instruction + CallInst *Inst = CallInst::Create( + F, args, F->getReturnType()->isVoidTy() ? 
"" : CI->getName(), CI); + + DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); + + CI->replaceAllUsesWith(Inst); + // If the previous instruction needs to be erased, insert it in the vector + // Erased + if (Erase != NULL) + Erase->push_back(CI); +} + +IS_HPVM_CALL(launch) /* Exists but not required */ +IS_HPVM_CALL(edge) /* Exists but not required */ +IS_HPVM_CALL(createNodeND) +// IS_HPVM_CALL(createNode) +// IS_HPVM_CALL(createNode1D) +// IS_HPVM_CALL(createNode2D) +// IS_HPVM_CALL(createNode3D) +IS_HPVM_CALL(bindIn) +IS_HPVM_CALL(bindOut) +IS_HPVM_CALL(push) +IS_HPVM_CALL(pop) +IS_HPVM_CALL(getNode) +IS_HPVM_CALL(getParentNode) +IS_HPVM_CALL(barrier) +IS_HPVM_CALL(malloc) +IS_HPVM_CALL(return ) +IS_HPVM_CALL(getNodeInstanceID_x) +IS_HPVM_CALL(getNodeInstanceID_y) +IS_HPVM_CALL(getNodeInstanceID_z) +IS_HPVM_CALL(getNumNodeInstances_x) +IS_HPVM_CALL(getNumNodeInstances_y) +IS_HPVM_CALL(getNumNodeInstances_z) +// Atomics +IS_HPVM_CALL(atomic_cmpxchg) +IS_HPVM_CALL(atomic_add) +IS_HPVM_CALL(atomic_sub) +IS_HPVM_CALL(atomic_xchg) +IS_HPVM_CALL(atomic_inc) +IS_HPVM_CALL(atomic_dec) +IS_HPVM_CALL(atomic_min) +IS_HPVM_CALL(atomic_max) +IS_HPVM_CALL(atomic_umin) +IS_HPVM_CALL(atomic_umax) +IS_HPVM_CALL(atomic_and) +IS_HPVM_CALL(atomic_or) +IS_HPVM_CALL(atomic_xor) +// Misc Fn +IS_HPVM_CALL(floor) +IS_HPVM_CALL(rsqrt) +IS_HPVM_CALL(sqrt) +IS_HPVM_CALL(sin) +IS_HPVM_CALL(cos) + +IS_HPVM_CALL(init) +IS_HPVM_CALL(cleanup) +IS_HPVM_CALL(wait) +IS_HPVM_CALL(trackMemory) +IS_HPVM_CALL(untrackMemory) +IS_HPVM_CALL(requestMemory) +IS_HPVM_CALL(attributes) +IS_HPVM_CALL(hint) + +// Return the constant integer represented by value V +static unsigned getNumericValue(Value *V) { + assert( + isa<ConstantInt>(V) && + "Value indicating the number of arguments should be a constant integer"); + return cast<ConstantInt>(V)->getZExtValue(); +} + +// Take the __hpvm__return instruction and generate code for combining the +// values being returned into a struct and returning it. 
+// The first operand is the number of returned values +static Value *genCodeForReturn(CallInst *CI) { + LLVMContext &Ctx = CI->getContext(); + assert(isHPVMCall_return(CI) && "__hpvm__return instruction expected!"); + + // Parse the dummy function call here + assert(CI->getNumArgOperands() > 0 && + "Too few arguments for __hpvm_return call!\n"); + unsigned numRetVals = getNumericValue(CI->getArgOperand(0)); + + assert(CI->getNumArgOperands() - 1 == numRetVals && + "Too few arguments for __hpvm_return call!\n"); + DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n"); + + std::vector<Type *> ArgTypes; + for (unsigned i = 1; i < CI->getNumArgOperands(); i++) { + ArgTypes.push_back(CI->getArgOperand(i)->getType()); + } + Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName(); + StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); + + InsertValueInst *IV = InsertValueInst::Create( + UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI); + DEBUG(errs() << "Code generation for return:\n"); + DEBUG(errs() << *IV << "\n"); + + for (unsigned i = 2; i < CI->getNumArgOperands(); i++) { + IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(), + CI); + DEBUG(errs() << *IV << "\n"); + } + + return IV; +} + +// Analyse the attribute call for this function. Add the in and out +// attributes to pointer parameters. 
+static void handleHPVMAttributes(Function *F, CallInst *CI) { + DEBUG(errs() << "Kernel before adding In/Out HPVM attributes:\n" + << *F << "\n"); + // Parse the dummy function call here + unsigned offset = 0; + // Find number of In pointers + assert(CI->getNumArgOperands() > offset && + "Too few arguments for __hpvm__attributes call!"); + unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset)); + DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n"); + + for (unsigned i = offset + 1; i < offset + 1 + numInPtrs; i++) { + Value *V = CI->getArgOperand(i); + if (Argument *arg = dyn_cast<Argument>(V)) { + F->addAttribute(1 + arg->getArgNo(), Attribute::In); + } else { + DEBUG(errs() << "Invalid argument to __hpvm__attribute: " << *V << "\n"); + llvm_unreachable( + "Only pointer arguments can be passed to __hpvm__attributes call"); + } + } + // Find number of Out Pointers + offset += 1 + numInPtrs; + assert(CI->getNumArgOperands() > offset && + "Too few arguments for __hpvm__attributes call!"); + unsigned numOutPtrs = getNumericValue(CI->getOperand(offset)); + DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n"); + for (unsigned i = offset + 1; i < offset + 1 + numOutPtrs; i++) { + Value *V = CI->getArgOperand(i); + if (Argument *arg = dyn_cast<Argument>(V)) { + F->addAttribute(1 + arg->getArgNo(), Attribute::Out); + } else { + DEBUG(errs() << "Invalid argument to __hpvm__attribute: " << *V << "\n"); + llvm_unreachable( + "Only pointer arguments can be passed to __hpvm__attributes call"); + } + } + DEBUG(errs() << "Kernel after adding In/Out HPVM attributes:\n" + << *F << "\n"); +} + +// Public Functions of GenHPVM pass +bool GenHPVM::runOnModule(Module &M) { + DEBUG(errs() << "\nGENHPVM PASS\n"); + this->M = &M; + + // Load Runtime API Module + SMDiagnostic Err; + + char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); + + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine 
runtimeAPI = + llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc"; + DEBUG(errs() << llvmSrcRoot << "\n"); + + std::unique_ptr<Module> runtimeModule = + parseIRFile(runtimeAPI.str(), Err, M.getContext()); + + if (runtimeModule == NULL) { + DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n"); + assert(false && "couldn't parse runtime"); + } else + DEBUG(errs() << "Successfully loaded hpvm-rt API module\n"); + + llvm_hpvm_initializeTimerSet = M.getOrInsertFunction( + "llvm_hpvm_initializeTimerSet", + runtimeModule->getFunction("llvm_hpvm_initializeTimerSet") + ->getFunctionType()); + // DEBUG(errs() << *llvm_hpvm_initializeTimerSet); + + llvm_hpvm_switchToTimer = M.getOrInsertFunction( + "llvm_hpvm_switchToTimer", + runtimeModule->getFunction("llvm_hpvm_switchToTimer")->getFunctionType()); + // DEBUG(errs() << *llvm_hpvm_switchToTimer); + + llvm_hpvm_printTimerSet = M.getOrInsertFunction( + "llvm_hpvm_printTimerSet", + runtimeModule->getFunction("llvm_hpvm_printTimerSet")->getFunctionType()); + // DEBUG(errs() << *llvm_hpvm_printTimerSet); + + // Insert init context in main + DEBUG(errs() << "Locate __hpvm__init()\n"); + Function *VI = M.getFunction("__hpvm__init"); + assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once"); + Instruction *I = cast<Instruction>(*VI->user_begin()); + + DEBUG(errs() << "Initialize Timer Set\n"); + initializeTimerSet(I); + switchToTimer(hpvm_TimerID_NONE, I); + + // Insert print instruction at hpvm exit + DEBUG(errs() << "Locate __hpvm__cleanup()\n"); + Function *VC = M.getFunction("__hpvm__cleanup"); + assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once"); + I = cast<Instruction>(*VC->user_begin()); + printTimerSet(I); + + DEBUG(errs() << "-------- Searching for launch sites ----------\n"); + + std::vector<Instruction *> toBeErased; + std::vector<Function *> functions; + + for (auto &F : M) + functions.push_back(&F); + + // Iterate over all functions in the module + for 
(Function *f : functions) { + DEBUG(errs() << "Function: " << f->getName() << "\n"); + + // List with the required additions in the function's return type + std::vector<Type *> FRetTypes; + + enum mutateTypeCause { + mtc_None, + mtc_BIND, + mtc_RETURN, + mtc_NUM_CAUSES + } bind; + bind = mutateTypeCause::mtc_None; + + // Iterate over all the instructions in this function + for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) { + Instruction *I = &*i; // Grab pointer to Instruction + // If not a call instruction, move to next instruction + if (!isa<CallInst>(I)) + continue; + + CallInst *CI = cast<CallInst>(I); + LLVMContext &Ctx = CI->getContext(); + + if (isHPVMCall_init(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_init, &toBeErased); + } + if (isHPVMCall_cleanup(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_cleanup, &toBeErased); + } + if (isHPVMCall_wait(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_wait, &toBeErased); + } + if (isHPVMCall_trackMemory(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_trackMemory, &toBeErased); + } + if (isHPVMCall_untrackMemory(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_untrackMemory, &toBeErased); + } + if (isHPVMCall_requestMemory(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_requestMemory, &toBeErased); + } + if (isHPVMCall_hint(I)) { + assert(isa<ConstantInt>(CI->getArgOperand(0)) && + "Argument to hint must be constant integer!"); + ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0)); + + hpvm::Target t = (hpvm::Target)hint->getZExtValue(); + addHint(CI->getParent()->getParent(), t); + DEBUG(errs() << "Found hpvm hint call: " << *CI << "\n"); + toBeErased.push_back(CI); + } + if (isHPVMCall_launch(I)) { + Function *LaunchF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_launch); + DEBUG(errs() << *LaunchF << "\n"); + // Get i8* cast to function pointer + Function *graphFunc = cast<Function>(CI->getArgOperand(1)); + graphFunc = 
transformReturnTypeToStruct(graphFunc); + Constant *F = + ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + assert( + F && + "Function invoked by HPVM launch has to be define and constant."); + + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0)); + assert(Op && "HPVM launch's streaming argument is a constant value."); + Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + + auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType()); + assert(ArgTy && "HPVM launch argument should be pointer type."); + Value *Arg = CI->getArgOperand(2); + if (!ArgTy->getElementType()->isIntegerTy(8)) + Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), + Type::getInt8PtrTy(Ctx), "", CI); + Value *LaunchArgs[] = {F, Arg, isStreaming}; + CallInst *LaunchInst = CallInst::Create( + LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI); + DEBUG(errs() << "Found hpvm launch call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n"); + CI->replaceAllUsesWith(LaunchInst); + toBeErased.push_back(CI); + } + if (isHPVMCall_push(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_push, &toBeErased); + } + if (isHPVMCall_pop(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_pop, &toBeErased); + } + if (isHPVMCall_createNodeND(I)) { + assert(CI->getNumArgOperands() > 0 && + "Too few arguments for __hpvm__createNodeND call"); + unsigned numDims = getNumericValue(CI->getArgOperand(0)); + // We need as meny dimension argments are there are dimensions + assert(CI->getNumArgOperands() - 2 == numDims && + "Too few arguments for __hpvm_createNodeND call!\n"); + + Function *CreateNodeF; + switch (numDims) { + case 0: + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode); + break; + case 1: + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode1D); + break; + case 2: + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode2D); 
+ break; + case 3: + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode3D); + break; + default: + llvm_unreachable("Unsupported number of dimensions\n"); + break; + } + DEBUG(errs() << *CreateNodeF << "\n"); + DEBUG(errs() << *I << "\n"); + DEBUG(errs() << "in " << I->getParent()->getParent()->getName() + << "\n"); + + // Get i8* cast to function pointer + Function *graphFunc = cast<Function>(CI->getArgOperand(1)); + graphFunc = transformReturnTypeToStruct(graphFunc); + Constant *F = + ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + + CallInst *CreateNodeInst; + switch (numDims) { + case 0: + CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F), + graphFunc->getName() + ".node", CI); + break; + case 1: { + assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 2, expected to be i64\n"); + Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)}; + CreateNodeInst = CallInst::Create( + CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2), + graphFunc->getName() + ".node", CI); + } break; + case 2: { + assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 2, expected to be i64\n"); + assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 3, expected to be i64\n"); + Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), + CI->getArgOperand(3)}; + CreateNodeInst = CallInst::Create( + CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3), + graphFunc->getName() + ".node", CI); + } break; + case 3: { + assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 2, expected to be i64\n"); + assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 3, expected to be i64\n"); + assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 4, expected 
to be i64\n"); + Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), + CI->getArgOperand(3), + CI->getArgOperand(4)}; + CreateNodeInst = CallInst::Create( + CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4), + graphFunc->getName() + ".node", CI); + } break; + default: + llvm_unreachable( + "Impossible path: number of dimensions is 0, 1, 2, 3\n"); + break; + } + + DEBUG(errs() << "Found hpvm createNode call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n"); + CI->replaceAllUsesWith(CreateNodeInst); + toBeErased.push_back(CI); + } + + if (isHPVMCall_edge(I)) { + Function *EdgeF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createEdge); + DEBUG(errs() << *EdgeF << "\n"); + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(5)); + ConstantInt *EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); + assert(Op && EdgeTypeOp && + "Arguments of CreateEdge are not constant integers."); + Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *isAllToAll = EdgeTypeOp->isZero() ? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + isAllToAll, CI->getArgOperand(3), + CI->getArgOperand(4), isStreaming}; + CallInst *EdgeInst = CallInst::Create( + EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI); + DEBUG(errs() << "Found hpvm edge call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n"); + CI->replaceAllUsesWith(EdgeInst); + toBeErased.push_back(CI); + } + if (isHPVMCall_bindIn(I)) { + Function *BindInF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_input); + DEBUG(errs() << *BindInF << "\n"); + // Check if this is a streaming bind or not + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); + assert(Op && "Streaming argument for bind in intrinsic should be a " + "constant integer."); + Value *isStreaming = Op->isZero() ? 
ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming}; + CallInst *BindInInst = + CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI); + DEBUG(errs() << "Found hpvm bindIn call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n"); + CI->replaceAllUsesWith(BindInInst); + toBeErased.push_back(CI); + } + if (isHPVMCall_bindOut(I)) { + Function *BindOutF = + Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_output); + DEBUG(errs() << *BindOutF << "\n"); + // Check if this is a streaming bind or not + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); + assert(Op && "Streaming argument for bind out intrinsic should be a " + "constant integer."); + Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming}; + CallInst *BindOutInst = CallInst::Create( + BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI); + DEBUG(errs() << "Found hpvm bindOut call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n"); + + DEBUG(errs() << "Fixing the return type of the function\n"); + // FIXME: What if the child node function has not been visited already. + // i.e., it's return type has not been fixed. 
+ Function *F = I->getParent()->getParent(); + DEBUG(errs() << F->getName() << "\n";); + IntrinsicInst *NodeIntrinsic = + cast<IntrinsicInst>(CI->getArgOperand(0)); + assert(NodeIntrinsic && + "Instruction value in bind out is not a create node intrinsic."); + DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"); + assert( + (NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode || + NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode1D || + NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode2D || + NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode3D) && + "Instruction value in bind out is not a create node intrinsic."); + Function *ChildF = cast<Function>( + NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); + DEBUG(errs() << ChildF->getName() << "\n";); + int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue(); + int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue(); + StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType()); + + Type *ReturnType = F->getReturnType(); + DEBUG(errs() << *ReturnType << "\n";); + assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) && + "Return type should either be a struct or void type!"); + + FRetTypes.insert(FRetTypes.begin() + destpos, + ChildReturnTy->getElementType(srcpos)); + assert(((bind == mutateTypeCause::mtc_BIND) || + (bind == mutateTypeCause::mtc_None)) && + "Both bind_out and hpvm_return detected"); + bind = mutateTypeCause::mtc_BIND; + + CI->replaceAllUsesWith(BindOutInst); + toBeErased.push_back(CI); + } + if (isHPVMCall_attributes(I)) { + Function *F = CI->getParent()->getParent(); + handleHPVMAttributes(F, CI); + toBeErased.push_back(CI); + } + if (isHPVMCall_getNode(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNode, &toBeErased); + } + if (isHPVMCall_getParentNode(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getParentNode, &toBeErased); + } + if (isHPVMCall_barrier(I)) { + 
ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_barrier, &toBeErased); + } + if (isHPVMCall_malloc(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_malloc, &toBeErased); + } + if (isHPVMCall_return(I)) { + DEBUG(errs() << "Function before hpvm return processing\n" + << *I->getParent()->getParent() << "\n"); + // The operands to this call are the values to be returned by the node + Value *ReturnVal = genCodeForReturn(CI); + DEBUG(errs() << *ReturnVal << "\n"); + Type *ReturnType = ReturnVal->getType(); + assert(isa<StructType>(ReturnType) && + "Return type should be a struct type!"); + + assert(((bind == mutateTypeCause::mtc_RETURN) || + (bind == mutateTypeCause::mtc_None)) && + "Both bind_out and hpvm_return detected"); + + if (bind == mutateTypeCause::mtc_None) { + // If this is None, this is the first __hpvm__return + // instruction we have come upon. Place the return type of the + // function in the return type vector + bind = mutateTypeCause::mtc_RETURN; + StructType *ReturnStructTy = cast<StructType>(ReturnType); + for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++) + FRetTypes.push_back(ReturnStructTy->getElementType(i)); + } else { // bind == mutateTypeCause::mtc_RETURN + // This is not the first __hpvm__return + // instruction we have come upon. 
+ // Check that the return types are the same + assert((ReturnType == FRetTypes[0]) && + "Multiple returns with mismatching types"); + } + + ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal); + DEBUG(errs() << "Found hpvm return call: " << *CI << "\n"); + Instruction *oldReturn = CI->getParent()->getTerminator(); + assert(isa<ReturnInst>(oldReturn) && + "Expecting a return to be the terminator of this BB!"); + DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n"); + DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n"); + // CI->replaceAllUsesWith(RetInst); + toBeErased.push_back(CI); + ReplaceInstWithInst(oldReturn, RetInst); + DEBUG(errs() << "Function after hpvm return processing\n" + << *I->getParent()->getParent() << "\n"); + } + + if (isHPVMCall_getNodeInstanceID_x(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_x, + &toBeErased); + } + if (isHPVMCall_getNodeInstanceID_y(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_y, + &toBeErased); + } + if (isHPVMCall_getNodeInstanceID_z(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_z, + &toBeErased); + } + if (isHPVMCall_getNumNodeInstances_x(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_x, + &toBeErased); + } + if (isHPVMCall_getNumNodeInstances_y(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_y, + &toBeErased); + } + if (isHPVMCall_getNumNodeInstances_z(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_z, + &toBeErased); + } + if (isHPVMCall_atomic_add(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_add, &toBeErased); + } + if (isHPVMCall_atomic_sub(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_sub, &toBeErased); + } + if (isHPVMCall_atomic_xchg(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xchg, &toBeErased); + } + if (isHPVMCall_atomic_min(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_min, 
&toBeErased); + } + if (isHPVMCall_atomic_max(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_max, &toBeErased); + } + if (isHPVMCall_atomic_and(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_and, &toBeErased); + } + if (isHPVMCall_atomic_or(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_or, &toBeErased); + } + if (isHPVMCall_atomic_xor(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xor, &toBeErased); + } + if (isHPVMCall_sin(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased); + } + if (isHPVMCall_cos(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased); + } + } + + // Erase the __hpvm__node calls + DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n"); + for (auto I : toBeErased) { + DEBUG(errs() << *I << "\n"); + } + while (!toBeErased.empty()) { + Instruction *I = toBeErased.back(); + DEBUG(errs() << "\tErasing " << *I << "\n"); + I->eraseFromParent(); + toBeErased.pop_back(); + } + + if (bind == mutateTypeCause::mtc_BIND || + bind == mutateTypeCause::mtc_RETURN) { + DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); + // Argument type list. + std::vector<Type *> FArgTypes; + for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); + ai != ae; ++ai) { + FArgTypes.push_back(ai->getType()); + } + + // Find new return type of function + Type *NewReturnTy; + if (bind == mutateTypeCause::mtc_BIND) { + + std::vector<Type *> TyList; + for (unsigned i = 0; i < FRetTypes.size(); i++) + TyList.push_back(FRetTypes[i]); + + NewReturnTy = + StructType::create(f->getContext(), TyList, + Twine("struct.out." 
+ f->getName()).str(), true); + } else { + NewReturnTy = getReturnTypeFromReturnInst(f); + assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); + } + + FunctionType *FTy = + FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); + + // Change the function type + Function *newF = cloneFunction(f, FTy, false); + DEBUG(errs() << *newF << "\n"); + + if (bind == mutateTypeCause::mtc_BIND) { + // This is certainly an internal node, and hence just one BB with one + // return terminator instruction. Change return statement + ReturnInst *RI = + cast<ReturnInst>(newF->getEntryBlock().getTerminator()); + ReturnInst *newRI = ReturnInst::Create(newF->getContext(), + UndefValue::get(NewReturnTy)); + ReplaceInstWithInst(RI, newRI); + } + if (bind == mutateTypeCause::mtc_RETURN) { + // Nothing + } + replaceNodeFunctionInIR(*f->getParent(), f, newF); + DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); + } + } + return false; // TODO: What does returning "false" mean? +} + +// Generate Code for declaring a constant string [L x i8] and return a pointer +// to the start of it. 
+Value *GenHPVM::getStringPointer(const Twine &S, Instruction *IB, + const Twine &Name) { + Constant *SConstant = + ConstantDataArray::getString(M->getContext(), S.str(), true); + Value *SGlobal = + new GlobalVariable(*M, SConstant->getType(), true, + GlobalValue::InternalLinkage, SConstant, Name); + Value *Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); + Value *GEPArgs[] = {Zero, Zero}; + GetElementPtrInst *SPtr = GetElementPtrInst::Create( + nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB); + return SPtr; +} + +void GenHPVM::initializeTimerSet(Instruction *InsertBefore) { + Value *TimerSetAddr; + StoreInst *SI; + TIMER(TimerSet = new GlobalVariable( + *M, Type::getInt8PtrTy(M->getContext()), false, + GlobalValue::CommonLinkage, + Constant::getNullValue(Type::getInt8PtrTy(M->getContext())), + "hpvmTimerSet_GenHPVM")); + DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet + << "\n"); + // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet << + // "\n"); + + TIMER(TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet, None, "", + InsertBefore)); + DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"); + TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore)); + DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n"); +} + +void GenHPVM::switchToTimer(enum hpvm_TimerID timer, + Instruction *InsertBefore) { + Value *switchArgs[] = {TimerSet, getTimerID(*M, timer)}; + TIMER(CallInst::Create(llvm_hpvm_switchToTimer, + ArrayRef<Value *>(switchArgs, 2), "", InsertBefore)); +} + +void GenHPVM::printTimerSet(Instruction *InsertBefore) { + Value *TimerName; + TIMER(TimerName = getStringPointer("GenHPVM_Timer", InsertBefore)); + Value *printArgs[] = {TimerSet, TimerName}; + TIMER(CallInst::Create(llvm_hpvm_printTimerSet, + ArrayRef<Value *>(printArgs, 2), "", InsertBefore)); +} + +static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) { + 
return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); +} + +static Function *transformReturnTypeToStruct(Function *F) { + // Currently only works for void return types + DEBUG(errs() << "Transforming return type of function to Struct: " + << F->getName() << "\n"); + + if (isa<StructType>(F->getReturnType())) { + DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " + << *F->getReturnType() << "\n"); + return F; + } + + assert(F->getReturnType()->isVoidTy() && + "Unhandled case - Only void return type handled\n"); + + // Create the argument type list with added argument types + std::vector<Type *> ArgTypes; + for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + + StructType *RetTy = + StructType::create(F->getContext(), None, "emptyStruct", true); + FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); + + SmallVector<ReturnInst *, 8> Returns; + Function *newF = cloneFunction(F, FTy, false, &Returns); + // Replace ret void instruction with ret %RetTy undef + for (auto &RI : Returns) { + DEBUG(errs() << "Found return inst: " << *RI << "\n"); + ReturnInst *newRI = + ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy)); + ReplaceInstWithInst(RI, newRI); + } + + replaceNodeFunctionInIR(*F->getParent(), F, newF); + return newF; +} + +static Type *getReturnTypeFromReturnInst(Function *F) { + for (BasicBlock &BB : *F) { + if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) { + DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() + << "\n"); + return RI->getReturnValue()->getType(); + } + } +} + +char genhpvm::GenHPVM::ID = 0; +static RegisterPass<genhpvm::GenHPVM> + X("genhpvm", + "Pass to generate HPVM IR from LLVM IR (with dummy function calls)", + false, false); + +} // End of namespace genhpvm diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.exports b/hpvm/lib/Transforms/GenHPVM/GenHPVM.exports 
similarity index 100% rename from hpvm/lib/Transforms/GenVISC/GenVISC.exports rename to hpvm/lib/Transforms/GenHPVM/GenHPVM.exports diff --git a/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt b/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt similarity index 88% rename from hpvm/lib/Transforms/GenVISC/LLVMBuild.txt rename to hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt index 9266b2c5972984a179beba227946964182761239..94ef73ac07ca5c1ff23a05e404b0ea1f751ef36c 100644 --- a/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt +++ b/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Transforms/GenVISC/LLVMBuild.txt -------------------*- Conf -*--===; +;===- ./lib/Transforms/GenHPVM/LLVMBuild.txt -------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,5 +17,5 @@ [component_0] type = Library -name = GenVISC +name = GenHPVM parent = Transforms diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp deleted file mode 100644 index cc505415396b4a0441d5a5bfe0cf58adc945b9f8..0000000000000000000000000000000000000000 --- a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp +++ /dev/null @@ -1,866 +0,0 @@ -//=== GenVISC.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "genvisc" -#include "GenVISC/GenVISC.h" - -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/CallSite.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/IR/DerivedTypes.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCUtils.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "SupportVISC/VISCUtils.h" - - -#define TIMER(X) do { if (VISCTimer) { X; } } while (0) - -using namespace llvm; -using namespace viscUtils; - - -// VISC Command line option to use timer or not -static cl::opt<bool> -VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer")); - -namespace genvisc { - -// Helper Functions - -static inline ConstantInt* getTimerID(Module&, enum visc_TimerID); -static Function* transformReturnTypeToStruct(Function* F); -static Type* getReturnTypeFromReturnInst(Function* F); - -// Check if the dummy function call is a __visc__node call -#define IS_VISC_CALL(callName) \ - static bool isVISCCall_##callName(Instruction* I) { \ - if(!isa<CallInst>(I)) \ - return false; \ - CallInst* CI = cast<CallInst>(I); \ - return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \ - } - -static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) { - // Check if the instruction is Call Instruction - assert(isa<CallInst>(I) && "Expecting CallInst"); - CallInst* CI = cast<CallInst>(I); - DEBUG(errs() << "Found call: " << *CI << "\n"); - - // Find the correct intrinsic call - Module* M = CI->getParent()->getParent()->getParent(); - Function* F; - std::vector<Type*> ArgTypes; - 
std::vector<Value*> args; - if(Intrinsic::isOverloaded(IntrinsicID)) { - // This is an overloaded intrinsic. The types must exactly match. Get the - // argument types - for(unsigned i=0; i < CI->getNumArgOperands(); i++) { - ArgTypes.push_back(CI->getArgOperand(i)->getType()); - args.push_back(CI->getArgOperand(i)); - } - F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes); - DEBUG(errs() << *F << "\n"); - } - else { // Non-overloaded intrinsic - F = Intrinsic::getDeclaration(M, IntrinsicID); - FunctionType* FTy = F->getFunctionType(); - DEBUG(errs() << *F << "\n"); - - // Create argument list - assert(CI->getNumArgOperands() == FTy->getNumParams() - && "Number of arguments of call do not match with Intrinsic"); - for(unsigned i=0; i < CI->getNumArgOperands(); i++) { - Value* V = CI->getArgOperand(i); - // Either the type should match or both should be of pointer type - assert((V->getType() == FTy->getParamType(i) || - (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())) - && "Dummy function call argument does not match with Intrinsic argument!"); - // If the types do not match, then both must be pointer type and pointer - // cast needs to be performed - if(V->getType() != FTy->getParamType(i)) { - V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); - } - args.push_back(V); - } - } - // Insert call instruction - CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? 
"" : CI->getName(), CI); - - DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); - - CI->replaceAllUsesWith(Inst); - // If the previous instruction needs to be erased, insert it in the vector - // Erased - if(Erase != NULL) - Erase->push_back(CI); -} - -IS_VISC_CALL(launch) /* Exists but not required */ -IS_VISC_CALL(edge) /* Exists but not required */ -IS_VISC_CALL(createNodeND) -//IS_VISC_CALL(createNode) -//IS_VISC_CALL(createNode1D) -//IS_VISC_CALL(createNode2D) -//IS_VISC_CALL(createNode3D) -IS_VISC_CALL(bindIn) -IS_VISC_CALL(bindOut) -IS_VISC_CALL(push) -IS_VISC_CALL(pop) -IS_VISC_CALL(getNode) -IS_VISC_CALL(getParentNode) -IS_VISC_CALL(barrier) -IS_VISC_CALL(malloc) -IS_VISC_CALL(return) -IS_VISC_CALL(getNodeInstanceID_x) -IS_VISC_CALL(getNodeInstanceID_y) -IS_VISC_CALL(getNodeInstanceID_z) -IS_VISC_CALL(getNumNodeInstances_x) -IS_VISC_CALL(getNumNodeInstances_y) -IS_VISC_CALL(getNumNodeInstances_z) -// Atomics -IS_VISC_CALL(atomic_cmpxchg) -IS_VISC_CALL(atomic_add) -IS_VISC_CALL(atomic_sub) -IS_VISC_CALL(atomic_xchg) -IS_VISC_CALL(atomic_inc) -IS_VISC_CALL(atomic_dec) -IS_VISC_CALL(atomic_min) -IS_VISC_CALL(atomic_max) -IS_VISC_CALL(atomic_umin) -IS_VISC_CALL(atomic_umax) -IS_VISC_CALL(atomic_and) -IS_VISC_CALL(atomic_or) -IS_VISC_CALL(atomic_xor) -// Misc Fn -IS_VISC_CALL(floor) -IS_VISC_CALL(rsqrt) -IS_VISC_CALL(sqrt) -IS_VISC_CALL(sin) -IS_VISC_CALL(cos) - - -IS_VISC_CALL(init) -IS_VISC_CALL(cleanup) -IS_VISC_CALL(wait) -IS_VISC_CALL(trackMemory) -IS_VISC_CALL(untrackMemory) -IS_VISC_CALL(requestMemory) -IS_VISC_CALL(attributes) -IS_VISC_CALL(hint) - -// Return the constant integer represented by value V -static unsigned getNumericValue(Value* V) { - assert(isa<ConstantInt>(V) - && "Value indicating the number of arguments should be a constant integer"); - return cast<ConstantInt>(V)->getZExtValue(); -} - -// Take the __visc__return instruction and generate code for combining the -// values being returned into a struct and returning it. 
-// The first operand is the number of returned values -static Value* genCodeForReturn(CallInst* CI) { - LLVMContext& Ctx = CI->getContext(); - assert(isVISCCall_return(CI) - && "__visc__return instruction expected!"); - - // Parse the dummy function call here - assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n"); - unsigned numRetVals = getNumericValue(CI->getArgOperand(0)); - - assert(CI->getNumArgOperands()-1 == numRetVals && - "Too few arguments for __visc_return call!\n"); - DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n"); - - std::vector<Type*> ArgTypes; - for(unsigned i=1; i < CI->getNumArgOperands(); i++) { - ArgTypes.push_back(CI->getArgOperand(i)->getType()); - } - Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName(); - StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); - - InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy), - CI->getArgOperand(1), - 0, - "returnStruct", - CI); - DEBUG(errs() << "Code generation for return:\n"); - DEBUG(errs() << *IV << "\n"); - - for(unsigned i=2; i < CI->getNumArgOperands(); i++) { - IV = InsertValueInst::Create(IV, - CI->getArgOperand(i), - i-1, - IV->getName(), - CI); - DEBUG(errs() << *IV << "\n"); - } - - return IV; -} - -// Analyse the attribute call for this function. Add the in and out -// attributes to pointer parameters. 
-static void handleVISCAttributes(Function* F, CallInst* CI) { - DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n"); - // Parse the dummy function call here - unsigned offset = 0; - // Find number of In pointers - assert(CI->getNumArgOperands() > offset - && "Too few arguments for __visc__attributes call!"); - unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset)); - DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n"); - - for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) { - Value* V = CI->getArgOperand(i); - if(Argument* arg = dyn_cast<Argument>(V)) { - F->addAttribute(1+arg->getArgNo(), Attribute::In); - } - else { - errs() << "Invalid argument to __visc__attribute: " << *V << "\n"; - llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); - } - } - // Find number of Out Pointers - offset += 1 + numInPtrs; - assert(CI->getNumArgOperands() > offset - && "Too few arguments for __visc__attributes call!"); - unsigned numOutPtrs = getNumericValue(CI->getOperand(offset)); - DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n"); - for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) { - Value* V = CI->getArgOperand(i); - if(Argument* arg = dyn_cast<Argument>(V)) { - F->addAttribute(1+arg->getArgNo(), Attribute::Out); - } - else { - errs() << "Invalid argument to __visc__attribute: " << *V << "\n"; - llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); - } - } - DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n"); -} - -// Public Functions of GenVISC pass -bool GenVISC::runOnModule(Module &M) { - errs() << "\nGENVISC PASS\n"; - this->M = &M; - - // Load Runtime API Module - SMDiagnostic Err; - - char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); - assert(LLVM_SRC_ROOT != NULL && - "Define LLVM_SRC_ROOT environment variable!"); - - Twine llvmSrcRoot = LLVM_SRC_ROOT; - Twine runtimeAPI = llvmSrcRoot + 
"/../build/tools/hpvm/projects/visc-rt/visc-rt.bc"; - errs() << llvmSrcRoot << "\n"; - - std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - - if(runtimeModule == NULL) { - DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n"); - assert(false && "couldn't parse runtime"); - } - else - DEBUG(errs() << "Successfully loaded visc-rt API module\n"); - - llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet", - runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType()); - //DEBUG(errs() << *llvm_visc_initializeTimerSet); - - llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer", - runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType()); - // DEBUG(errs() << *llvm_visc_switchToTimer); - - llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet", - runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType()); - //DEBUG(errs() << *llvm_visc_printTimerSet); - - // Insert init context in main - DEBUG(errs() << "Locate __visc__init()\n"); - Function* VI = M.getFunction("__visc__init"); - assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); - Instruction* I = cast<Instruction>(*VI->user_begin()); - - DEBUG(errs() << "Initialize Timer Set\n"); - initializeTimerSet(I); - switchToTimer(visc_TimerID_NONE, I); - - // Insert print instruction at visc exit - DEBUG(errs() << "Locate __visc__cleanup()\n"); - Function* VC = M.getFunction("__visc__cleanup"); - assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); - I = cast<Instruction>(*VC->user_begin()); - printTimerSet(I); - - - DEBUG(errs() << "-------- Searching for launch sites ----------\n"); - - std::vector<Instruction*> toBeErased; - std::vector<Function*> functions; - - for (auto &F : M) - functions.push_back(&F); - - // Iterate over all functions in the module - for (Function *f : functions) { - DEBUG(errs() << 
"Function: " << f->getName() << "\n"); - - // List with the required additions in the function's return type - std::vector<Type*> FRetTypes; - - enum mutateTypeCause { - mtc_None, - mtc_BIND, - mtc_RETURN, - mtc_NUM_CAUSES - } bind; - bind = mutateTypeCause::mtc_None; - - // Iterate over all the instructions in this function - for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) { - Instruction* I = &*i; // Grab pointer to Instruction - // If not a call instruction, move to next instruction - if(!isa<CallInst>(I)) - continue; - - CallInst* CI = cast<CallInst>(I); - LLVMContext& Ctx = CI->getContext(); - - if(isVISCCall_init(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased); - } - if(isVISCCall_cleanup(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased); - } - if(isVISCCall_wait(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased); - } - if(isVISCCall_trackMemory(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased); - } - if(isVISCCall_untrackMemory(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased); - } - if(isVISCCall_requestMemory(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased); - } - if(isVISCCall_hint(I)) { - assert(isa<ConstantInt>(CI->getArgOperand(0)) - && "Argument to hint must be constant integer!"); - ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0)); - - visc::Target t = (visc::Target) hint->getZExtValue(); - addHint(CI->getParent()->getParent(), t); - DEBUG(errs() << "Found visc hint call: " << *CI << "\n"); - toBeErased.push_back(CI); - } - if(isVISCCall_launch(I)) { - Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch); - DEBUG(errs() << *LaunchF << "\n"); - // Get i8* cast to function pointer - Function* graphFunc = cast<Function>(CI->getArgOperand(1)); - graphFunc = transformReturnTypeToStruct(graphFunc); - Constant* F = 
ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - assert(F && "Function invoked by VISC launch has to be define and constant."); - - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0)); - assert(Op && "VISC launch's streaming argument is a constant value."); - Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - - auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType()); - assert(ArgTy && "VISC launch argument should be pointer type."); - Value *Arg = CI->getArgOperand(2); - if(!ArgTy->getElementType()->isIntegerTy(8)) - Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI); - Value* LaunchArgs[] = {F, Arg, isStreaming}; - CallInst* LaunchInst = CallInst::Create(LaunchF, - ArrayRef<Value*>(LaunchArgs, 3), - "graphID", CI); - DEBUG(errs() << "Found visc launch call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n"); - CI->replaceAllUsesWith(LaunchInst); - toBeErased.push_back(CI); - } - if(isVISCCall_push(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased); - } - if(isVISCCall_pop(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased); - } - if(isVISCCall_createNodeND(I)) { - assert(CI->getNumArgOperands() > 0 && - "Too few arguments for __visc__createNodeND call"); - unsigned numDims = getNumericValue(CI->getArgOperand(0)); - // We need as meny dimension argments are there are dimensions - assert(CI->getNumArgOperands()-2 == numDims && - "Too few arguments for __visc_createNodeND call!\n"); - - Function* CreateNodeF; - switch (numDims) { - case 0: - CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode); - break; - case 1: - CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D); - break; - case 2: - CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D); - break; - case 3: - CreateNodeF = Intrinsic::getDeclaration(&M, 
Intrinsic::visc_createNode3D); - break; - default: - llvm_unreachable("Unsupported number of dimensions\n"); - break; - } - DEBUG(errs() << *CreateNodeF << "\n"); - DEBUG(errs() << *I << "\n"); - DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n"); - - // Get i8* cast to function pointer - Function* graphFunc = cast<Function>(CI->getArgOperand(1)); - graphFunc = transformReturnTypeToStruct(graphFunc); - Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - - CallInst* CreateNodeInst; - switch (numDims) { - case 0: - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(F), - graphFunc->getName()+".node", CI); - break; - case 1: - { - assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 2, expected to be i64\n"); - Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)}; - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(CreateNodeArgs, 2), - graphFunc->getName()+".node", CI); - } - break; - case 2: - { - assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 2, expected to be i64\n"); - assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 3, expected to be i64\n"); - Value* CreateNodeArgs[] = {F, - CI->getArgOperand(2), - CI->getArgOperand(3)}; - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(CreateNodeArgs, 3), - graphFunc->getName()+".node", CI); - } - break; - case 3: - { - assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 2, expected to be i64\n"); - assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 3, expected to be i64\n"); - assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 4, expected to be i64\n"); - Value* CreateNodeArgs[] = {F, - 
CI->getArgOperand(2), - CI->getArgOperand(3), - CI->getArgOperand(4)}; - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(CreateNodeArgs, 4), - graphFunc->getName()+".node", CI); - } - break; - default: - llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n"); - break; - } - - DEBUG(errs() << "Found visc createNode call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n"); - CI->replaceAllUsesWith(CreateNodeInst); - toBeErased.push_back(CI); - } - - if(isVISCCall_edge(I)) { - Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); - DEBUG(errs() << *EdgeF << "\n"); - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5)); - ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); - assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers."); - Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4), - isStreaming - }; - CallInst* EdgeInst = CallInst::Create(EdgeF, - ArrayRef<Value*>(EdgeArgs, 6), - "output", CI); - DEBUG(errs() << "Found visc edge call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n"); - CI->replaceAllUsesWith(EdgeInst); - toBeErased.push_back(CI); - } - if(isVISCCall_bindIn(I)) { - Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); - DEBUG(errs() << *BindInF << "\n"); - // Check if this is a streaming bind or not - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind in intrinsic should be a constant integer."); - Value* isStreaming = Op->isZero()? 
ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming - }; - CallInst* BindInInst = CallInst::Create(BindInF, - ArrayRef<Value*>(BindInArgs, 4), - "", CI); - DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n"); - CI->replaceAllUsesWith(BindInInst); - toBeErased.push_back(CI); - } - if(isVISCCall_bindOut(I)) { - Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); - DEBUG(errs() << *BindOutF << "\n"); - // Check if this is a streaming bind or not - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind out intrinsic should be a constant integer."); - Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming - }; - CallInst* BindOutInst = CallInst::Create(BindOutF, - ArrayRef<Value*>(BindOutArgs, 4), - "", CI); - DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n"); - - DEBUG(errs() << "Fixing the return type of the function\n"); - // FIXME: What if the child node function has not been visited already. - // i.e., it's return type has not been fixed. 
- Function* F = I->getParent()->getParent(); - DEBUG(errs() << F->getName() << "\n";); - IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0)); - assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic."); - DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"); - assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) && - "Instruction value in bind out is not a create node intrinsic."); - Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); - DEBUG(errs() << ChildF->getName() << "\n";); - int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue(); - int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue(); - StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType()); - - Type* ReturnType = F->getReturnType(); - DEBUG(errs() << *ReturnType << "\n";); - assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) - && "Return type should either be a struct or void type!"); - - FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos)); - assert(((bind == mutateTypeCause::mtc_BIND) || - (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and visc_return detected"); - bind = mutateTypeCause::mtc_BIND; - - CI->replaceAllUsesWith(BindOutInst); - toBeErased.push_back(CI); - } - if(isVISCCall_attributes(I)) { - Function* F = CI->getParent()->getParent(); - handleVISCAttributes(F, CI); - toBeErased.push_back(CI); - } - if (isVISCCall_getNode(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNode, &toBeErased); - } - if (isVISCCall_getParentNode(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getParentNode, &toBeErased); - } - if (isVISCCall_barrier(I)) { - ReplaceCallWithIntrinsic(I, 
Intrinsic::visc_barrier, &toBeErased); - } - if (isVISCCall_malloc(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased); - } - if (isVISCCall_return(I)) { - DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n"); - // The operands to this call are the values to be returned by the node - Value* ReturnVal = genCodeForReturn(CI); - DEBUG(errs() << *ReturnVal << "\n"); - Type* ReturnType = ReturnVal->getType(); - assert(isa<StructType>(ReturnType) - && "Return type should be a struct type!"); - - assert(((bind == mutateTypeCause::mtc_RETURN) || - (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and visc_return detected"); - - if (bind == mutateTypeCause::mtc_None) { - // If this is None, this is the first __visc__return - // instruction we have come upon. Place the return type of the - // function in the return type vector - bind = mutateTypeCause::mtc_RETURN; - StructType* ReturnStructTy = cast<StructType>(ReturnType); - for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++) - FRetTypes.push_back(ReturnStructTy->getElementType(i)); - } else { // bind == mutateTypeCause::mtc_RETURN - // This is not the first __visc__return - // instruction we have come upon. 
- // Check that the return types are the same - assert((ReturnType == FRetTypes[0]) - && "Multiple returns with mismatching types"); - } - - ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal); - DEBUG(errs() << "Found visc return call: " << *CI << "\n"); - Instruction* oldReturn = CI->getParent()->getTerminator(); - assert(isa<ReturnInst>(oldReturn) - && "Expecting a return to be the terminator of this BB!"); - DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n"); - DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n"); - //CI->replaceAllUsesWith(RetInst); - toBeErased.push_back(CI); - ReplaceInstWithInst(oldReturn, RetInst); - DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n"); - } - - if (isVISCCall_getNodeInstanceID_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased); - } - if (isVISCCall_getNodeInstanceID_y(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased); - } - if (isVISCCall_getNodeInstanceID_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased); - } - if (isVISCCall_getNumNodeInstances_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased); - } - if (isVISCCall_getNumNodeInstances_y(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased); - } - if (isVISCCall_getNumNodeInstances_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased); - } - if (isVISCCall_atomic_add(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased); - } - if (isVISCCall_atomic_sub(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_sub, &toBeErased); - } - if (isVISCCall_atomic_xchg(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xchg, &toBeErased); - } - if (isVISCCall_atomic_min(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_min, &toBeErased); - 
} - if (isVISCCall_atomic_max(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_max, &toBeErased); - } - if (isVISCCall_atomic_and(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_and, &toBeErased); - } - if (isVISCCall_atomic_or(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_or, &toBeErased); - } - if (isVISCCall_atomic_xor(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased); - } - if (isVISCCall_sin(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased); - } - if (isVISCCall_cos(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased); - } - } - - // Erase the __visc__node calls - DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n"); - for(auto I: toBeErased) { - DEBUG(errs() << *I << "\n"); - } - while(!toBeErased.empty()) { - Instruction* I = toBeErased.back(); - DEBUG(errs() << "\tErasing " << *I << "\n"); - I->eraseFromParent(); - toBeErased.pop_back(); - } - - if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) { - DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); - // Argument type list. 
- std::vector<Type*> FArgTypes; - for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); - ai != ae; ++ai) { - FArgTypes.push_back(ai->getType()); - } - - // Find new return type of function - Type* NewReturnTy; - if(bind == mutateTypeCause::mtc_BIND) { - - std::vector<Type*> TyList; - for (unsigned i = 0; i < FRetTypes.size(); i++) - TyList.push_back(FRetTypes[i]); - - NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true); - } - else { - NewReturnTy = getReturnTypeFromReturnInst(f); - assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); - } - - FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); - - // Change the function type - Function* newF = cloneFunction(f, FTy, false); - DEBUG(errs() << *newF << "\n"); - - if (bind == mutateTypeCause::mtc_BIND) { - // This is certainly an internal node, and hence just one BB with one - // return terminator instruction. Change return statement - ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator()); - ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy)); - ReplaceInstWithInst(RI, newRI); - } - if (bind == mutateTypeCause::mtc_RETURN) { - // Nothing - } - replaceNodeFunctionInIR(*f->getParent(), f, newF); - DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); - } - - - } - return false; //TODO: What does returning "false" mean? -} - -// Generate Code for declaring a constant string [L x i8] and return a pointer -// to the start of it. 
-Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { - Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true); - Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true, - GlobalValue::InternalLinkage, SConstant, Name); - Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); - Value* GEPArgs[] = {Zero, Zero}; - GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal, - ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); - return SPtr; -} - -void GenVISC::initializeTimerSet(Instruction* InsertBefore) { - Value* TimerSetAddr; - StoreInst* SI; - TIMER(TimerSet = new GlobalVariable(*M, - Type::getInt8PtrTy(M->getContext()), - false, - GlobalValue::CommonLinkage, - Constant::getNullValue(Type::getInt8PtrTy(M->getContext())), - "viscTimerSet_GenVISC")); - DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n"); - //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n"); - - TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, - None, - "", - InsertBefore)); - DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"); - TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore)); - DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n"); -} - -void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) { - Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)}; - TIMER(CallInst::Create(llvm_visc_switchToTimer, - ArrayRef<Value*>(switchArgs, 2), - "", - InsertBefore)); -} - -void GenVISC::printTimerSet(Instruction* InsertBefore) { - Value* TimerName; - TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore)); - Value* printArgs[] = {TimerSet, TimerName}; - TIMER(CallInst::Create(llvm_visc_printTimerSet, - ArrayRef<Value*>(printArgs, 2), - "", - InsertBefore)); -} - -static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) { - return 
ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); -} - -static Function* transformReturnTypeToStruct(Function* F) { - // Currently only works for void return types - DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n"); - - if (isa<StructType>(F->getReturnType())) { - DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n"); - return F; - } - - assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n"); - - // Create the argument type list with added argument types - std::vector<Type*> ArgTypes; - for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - - StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true); - FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); - - SmallVector<ReturnInst*, 8> Returns; - Function* newF = cloneFunction(F, FTy, false, &Returns); - // Replace ret void instruction with ret %RetTy undef - for(auto &RI: Returns) { - DEBUG(errs() << "Found return inst: "<< *RI << "\n"); - ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy)); - ReplaceInstWithInst(RI, newRI); - } - - replaceNodeFunctionInIR(*F->getParent(), F, newF); - return newF; -} - -static Type* getReturnTypeFromReturnInst(Function* F) { - for(BasicBlock &BB: *F) { - if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) { - DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n"); - return RI->getReturnValue()->getType(); - } - } -} - - -char genvisc::GenVISC::ID = 0; -static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false); - -} // End of namespace genvisc - - diff --git a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp index 
7bd66b62c6c8cda589fe3e6c1e3711893aceaffb..fc33ebee71123d89c5f931901dd213c82a401941 100644 --- a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp +++ b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "LocalMem" -#include "SupportVISC/DFG2LLVM.h" +#include "SupportHPVM/DFG2LLVM.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" @@ -134,7 +134,7 @@ void AT_OCL::codeGen(DFLeafNode *N) { // Return pointer to property if this leaf node matches the conditions for being // an allocation node. Conditions // 1. No incoming memory pointer. No in/out attribute on a pointer argument -// 2. Uses visc malloc intrinsic to allocate memory +// 2. Uses hpvm malloc intrinsic to allocate memory // 3. Sends it out // 2. (TODO:) Whether the allocated pointer escapes the parent node AllocationNodeProperty *isAllocationNode(DFLeafNode *N) { @@ -148,18 +148,18 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) { Function *F = N->getFuncPointer(); - // Allocation node must use visc malloc intrinsic - bool usesVISCMalloc = false; + // Allocation node must use hpvm malloc intrinsic + bool usesHPVMMalloc = false; for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) { Instruction *I = &*i; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - if (II->getIntrinsicID() == Intrinsic::visc_malloc) { - usesVISCMalloc = true; + if (II->getIntrinsicID() == Intrinsic::hpvm_malloc) { + usesHPVMMalloc = true; break; } } } - if (!usesVISCMalloc) + if (!usesHPVMMalloc) return NULL; // TODO: Check if allocated pointer leaves parent node @@ -197,20 +197,20 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) { assert(OutValues[i]->getType()->isPointerTy() && "Expected outgoing edge to be of pointer type"); if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(OutValues[i])) { - if (II->getIntrinsicID() == Intrinsic::visc_malloc) { + if 
(II->getIntrinsicID() == Intrinsic::hpvm_malloc) { // Sanity check: Size passed to malloc intrinsic is same as the value // going into the next outgoing edge - DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n"); + DEBUG(errs() << "HPVM malloc size: " << *II->getArgOperand(0) << "\n"); DEBUG(errs() << "Out edge value: " << *OutValues[i + 1] << "\n"); assert(II->getArgOperand(0) == OutValues[i + 1] && - "Sanity Check Failed: VISC Malloc size argument != next " + "Sanity Check Failed: HPVM Malloc size argument != next " "outgoing edge"); ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0)); i = i + 2; continue; } } - llvm_unreachable("Expecting visc malloc intrinsic instruction!"); + llvm_unreachable("Expecting hpvm malloc intrinsic instruction!"); } return ANP; } diff --git a/hpvm/llvm_installer/llvm_installer.sh b/hpvm/llvm_installer/llvm_installer.sh index d7fcda4ac4de8c129e47cfce65264097e040d228..e072d042b79a1a3caf8003794a89b5cee2dca67a 100755 --- a/hpvm/llvm_installer/llvm_installer.sh +++ b/hpvm/llvm_installer/llvm_installer.sh @@ -179,10 +179,10 @@ echo make -j$NUM_THREADS make -j$NUM_THREADS #make install -#echo Building HPVM runtime -#HPVM_RT_DIR=$HPVM_DIR/projects/visc-rt -#cd $HPVM_RT_DIR -#make +# echo Building HPVM runtime +# HPVM_RT_DIR=$HPVM_DIR/projects/hpvm-rt +# cd $HPVM_RT_DIR +# make #cp -r $CURRENT_DIR/projects $HPVM_DIR/ #make -j$NUM_THREADS diff --git a/hpvm/llvm_patches/apply_patch.sh b/hpvm/llvm_patches/apply_patch.sh index ea86575207a4aa7b4ca138b604f7423943924b22..289e5c11e319aa16262952d2d079f986c2e987b8 100644 --- a/hpvm/llvm_patches/apply_patch.sh +++ b/hpvm/llvm_patches/apply_patch.sh @@ -1,7 +1,7 @@ #!/bin/sh ### File Copies -cp include/IR/IntrinsicsVISC.td ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsVISC.td +cp include/IR/IntrinsicsHPVM.td ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsHPVM.td ## Header File Patches diff --git a/hpvm/llvm_patches/include/IR/Attributes.td 
b/hpvm/llvm_patches/include/IR/Attributes.td index b644cdb30bbd590a8b8c238bfde15e4b451e8ea3..c6ff8ef3c6c962f5444d718ff5a7e16ce392a522 100644 --- a/hpvm/llvm_patches/include/IR/Attributes.td +++ b/hpvm/llvm_patches/include/IR/Attributes.td @@ -151,7 +151,7 @@ def ShadowCallStack : EnumAttr<"shadowcallstack">; /// Sign extended before/after call. def SExt : EnumAttr<"signext">; -/// VISC Attributes +/// HPVM Attributes /// Pointer to read only memory def In : EnumAttr<"in">; diff --git a/hpvm/llvm_patches/include/IR/Intrinsics.td b/hpvm/llvm_patches/include/IR/Intrinsics.td index 2f79964a2e381c6d4ec22a5bc3c80a9d411f9fb0..2e3f34eb1a8408371a0b516089dd970adfe9223c 100644 --- a/hpvm/llvm_patches/include/IR/Intrinsics.td +++ b/hpvm/llvm_patches/include/IR/Intrinsics.td @@ -1249,4 +1249,4 @@ include "llvm/IR/IntrinsicsBPF.td" include "llvm/IR/IntrinsicsSystemZ.td" include "llvm/IR/IntrinsicsWebAssembly.td" include "llvm/IR/IntrinsicsRISCV.td" -include "llvm/IR/IntrinsicsVISC.td" +include "llvm/IR/IntrinsicsHPVM.td" diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td b/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td new file mode 100644 index 0000000000000000000000000000000000000000..410e9c8d3345e67df9614e0d518e5e596a4368e1 --- /dev/null +++ b/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td @@ -0,0 +1,208 @@ +//===- IntrinsicsHPVM.td - Defines HPVM intrinsics ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the HPVM-specific intrinsics. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "hpvm" in { + /* All intrinsics start with "llvm.hpvm." 
+ * As we do not want the compiler to mess with these intrinsics, we assume + * worst memory behavior for all these intrinsics. + */ + + /* Initialization intrinsic - + * void llvm.hpvm.init(); + */ + def int_hpvm_init : Intrinsic<[], [], []>; + + /* Launch intrinsic - with streaming argument + * i8* llvm.hpvm.launch(i8*, ArgList*, i1); + */ + def int_hpvm_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_ptr_ty, llvm_i1_ty], []>; + + /* Push intrinsic - push data on streaming pipeline + * void llvm.hpvm.push(i8*, ArgList*); + */ + def int_hpvm_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>; + + /* Pop intrinsic - pop data from streaming pipeline + * i8* llvm.hpvm.pop(i8*); + */ + def int_hpvm_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; + + /* Cleanup intrinsic - + * void llvm.hpvm.cleanup(); + */ + def int_hpvm_cleanup : Intrinsic<[], [], []>; + + /* Wait intrinsic - + * void llvm.hpvm.wait(graphID*); + */ + def int_hpvm_wait : Intrinsic<[], [llvm_ptr_ty], []>; + + /* Track memory intrinsic - + * void llvm.hpvm.trackMemory(i8*, i64); + */ + def int_hpvm_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; + + /* Untrack memory intrinsic - + * void llvm.hpvm.untrackMemory(i8*); + */ + def int_hpvm_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>; + + /* Request memory intrinsic - + * void llvm.hpvm.requestMemory(i8*, i64); + */ + def int_hpvm_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; + + /* Create Node intrinsic - + * i8* llvm.hpvm.createNode(function*); + */ + def int_hpvm_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; + + /* Create Node 1D array intrinsic - + * i8* llvm.hpvm.createNode1D(function*, i64); + */ + def int_hpvm_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_i64_ty], []>; + + /* Create Node 2D array intrinsic - + * i8* llvm.hpvm.createNode2D(function*, i64, i64); + */ + def int_hpvm_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_i64_ty, llvm_i64_ty], []>; +
+ /* Create Node 3D array intrinsic - + * i8* llvm.hpvm.createNode3D(function*, i64, i64, i64); + */ + def int_hpvm_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], + []>; + + /* Create dataflow edge intrinsic - + * i8* llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1); + */ + def int_hpvm_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty, + llvm_i1_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i1_ty], + []>; + + /* Create bind input intrinsic - + * void llvm.hpvm.bind.input(i8*, i32, i32, i1); + */ + def int_hpvm_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i1_ty], []>; + + /* Create bind output intrinsic - + * void llvm.hpvm.bind.output(i8*, i32, i32, i1); + */ + def int_hpvm_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i1_ty], []>; + + /* Find associated dataflow node intrinsic - + * i8* llvm.hpvm.getNode(); + */ + def int_hpvm_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; + + /* Find parent dataflow node intrinsic - + * i8* llvm.hpvm.getParentNode(i8*); + */ + def int_hpvm_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>; + + /* Find the number of dimensions of a dataflow node intrinsic - + * i32 llvm.hpvm.getNumDims(i8*); + */ + def int_hpvm_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; + + /* Find the unique identifier of a dataflow node (with respect to its parent + * node) in the specified dimension intrinsic - + */ + + /* i64 llvm.hpvm.getNodeInstanceID.[xyz](i8*); + */ + def int_hpvm_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_hpvm_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_hpvm_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + /* Find the number of instances of a dataflow node in the specified dimension + * intrinsic - + */ + + /* i64 llvm.hpvm.getNumNodeInstances.[xyz](i8*); + */ +
def int_hpvm_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_hpvm_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_hpvm_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + /* Local Barrier + * void llvm.hpvm.barrier(); + */ + def int_hpvm_barrier : Intrinsic<[], [], []>; + + /* Memory allocation inside the graph + * i8* llvm.hpvm.malloc(i64); + */ + def int_hpvm_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>; + + /* Find the vector length supported by target architecture + * intrinsic - + * i32 llvm.hpvm.getVectorLength(); + */ + def int_hpvm_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>; + + /* ============ Atomic intrinsics ============= */ + // Atomic arithmetic operations + + /* i32 llvm.hpvm.atomic.add(i32*, i32)*/ + def int_hpvm_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.hpvm.atomic.sub(i32*, i32)*/ + def int_hpvm_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.hpvm.atomic.xchg(i32*, i32)*/ + def int_hpvm_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.hpvm.atomic.min(i32*, i32)*/ + def int_hpvm_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.hpvm.atomic.max(i32*, i32)*/ + def int_hpvm_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + // Atomic bitwise operations + + /* i32 llvm.hpvm.atomic.and(i32*, i32)*/ + def int_hpvm_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.hpvm.atomic.or(i32*, i32)*/ + def int_hpvm_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.hpvm.atomic.xor(i32*, i32)*/ + def int_hpvm_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + +} diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td
b/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td deleted file mode 100644 index d5330175d86c9576394c9363a4ba30fd651f19e8..0000000000000000000000000000000000000000 --- a/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td +++ /dev/null @@ -1,208 +0,0 @@ -//===- IntrinsicsVISC.td - Defines VISC intrinsics ---------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the VISC-specific intrinsics. -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "visc" in { - /* All intrinsics start with "llvm.visc." - * As we do not want the compiler to mess with these intrinsics, we assume - * worst memory behavior for all these intrinsics. - */ - - /* Initialization intrinsic - - * i8* llvm.visc.setup(function*); - */ - def int_visc_init : Intrinsic<[], [], []>; - - /* Launch intrinsic - with streaming argument - * i8* llvm.visc.launch(i8*, ArgList*, i1); - */ - def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_ptr_ty, llvm_i1_ty], []>; - - /* Push intrinsic - push data on streaming pipeline - * void llvm.visc.push(i8*, ArgList*); - */ - def int_visc_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>; - - /* Pop intrinsic - pop data from streaming pipeline - * i8* llvm.visc.pop(i8*); - */ - def int_visc_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; - - /* Cleanup intrinsic - - * void llvm.visc.cleanup(i8*); - */ - def int_visc_cleanup : Intrinsic<[], [], []>; - - /* Wait intrinsic - - * void llvm.visc.wait(graphID*); - */ - def int_visc_wait : Intrinsic<[], [llvm_ptr_ty], []>; - - /* Track memory intrinsic - - * void llvm.visc.trackMemory(i8*, i64); - */ - def int_visc_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; - - /* Track memory intrinsic - - 
* void llvm.visc.untrackMemory(i8*); - */ - def int_visc_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>; - - /* Request memory intrinsic - - * void llvm.visc.requestMemory(i8*, i64); - */ - def int_visc_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; - - /* Create Node intrinsic - - * i8* llvm.visc.createNode(function*); - */ - def int_visc_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; - - /* Create Node 1D array intrinsic - - * i8* llvm.visc.createNode1D(function*, i64); - */ - def int_visc_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_i64_ty], []>; - - /* Create Node 2D array intrinsic - - * i8* llvm.visc.createNode2D(function*, i64, i64); - */ - def int_visc_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_i64_ty, llvm_i64_ty], []>; - - /* Create Node 3D array intrinsic - - * i8* llvm.visc.createNode2D(function*, i64, i64, i64); - */ - def int_visc_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], - []>; - - /* Create dataflow edge intrinsic - - * i8* llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1); - */ - def int_visc_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty, - llvm_i1_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i1_ty], - []>; - - /* Create bind input intrinsic - - * void llvm.visc.bind.input(i8*, i32, i32); - */ - def int_visc_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i1_ty], []>; - - /* Create bind output intrinsic - - * void llvm.visc.bind.output(i8*, i32, i32); - */ - def int_visc_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i1_ty], []>; - - /* Find associated dataflow node intrinsic - - * i8* llvm.visc.getNode(); - */ - def int_visc_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; - - /* Find parent dataflow node intrinsic - - * i8* llvm.visc.getParentNode(i8*); - */ - def int_visc_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>; - - /* Find the number 
of dimensions of a dataflow node intrinsic - - * i32 llvm.visc.getNumDims(i8*); - */ - def int_visc_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; - - /* Find the unique indentifier of a dataflow node (with respect to his parent - * node) in the specified dimension intrinsic - - */ - - /* i64 llvm.visc.getNodeInstanceID.[xyz](i8*); - */ - def int_visc_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def int_visc_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def int_visc_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - /* Find the number of instances of a dataflow node in the specified dimension - * intrinsic - - */ - - /* i64 llvm.visc.getNumNodeInstances.[xyz](i8*); - */ - def int_visc_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def int_visc_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def int_visc_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - /* Local Barrier - * void llvm.visc.barrier(); - */ - def int_visc_barrier : Intrinsic<[], [], []>; - - /* Memory allocation inside the graph - * i8* llvm.visc.malloc(); - */ - def int_visc_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>; - - /* Find the vector length supported by target architecture - * intrinsic - - * i32 llvm.visc.getVectorLength(); - */ - def int_visc_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>; - - /* ============ Atomic intrinsics ============= */ - // Atomic arithmetic operations - - /* i32 llvm.visc.atomic.add(i32*, i32)*/ - def int_visc_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.visc.atomic.sub(i32*, i32)*/ - def int_visc_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.visc.atomic.xchg(i32*, i32)*/ - def int_visc_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, 
llvm_i32_ty], - []>; - - /* i32 llvm.visc.atomic.min(i32*, i32)*/ - def int_visc_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.visc.atomic.maxi32*, i32)*/ - def int_visc_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - // Atomic bitwise operations - - /* i32 llvm.visc.atomic.and(i32*, i32)*/ - def int_visc_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.visc.atomic.or(i32*, i32)*/ - def int_visc_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.visc.atomic.xor(i32*, i32)*/ - def int_visc_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - -} diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp index a924405a2cac85ccd2e5e903a1ee1abb52774566..2c54392f8020ac7334117f1343214d085dbd6b84 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp +++ b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp @@ -855,7 +855,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(bit); KEYWORD(varFlags); - // VISC parameter attributes + // HPVM parameter attributes KEYWORD(in); KEYWORD(out); KEYWORD(inout); diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp index f5ce44e2a920405f7e3790fcb1d9eb7fba28d636..7446ff1e32dd79a18fd678446af56e6d193468ad 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp +++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp @@ -1470,7 +1470,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_swiftself: case lltok::kw_immarg: - // VISC Parameter only attributes + // HPVM Parameter only attributes case lltok::kw_in: case lltok::kw_out: case lltok::kw_inout: @@ -1808,7 +1808,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { B.addAttribute(Attribute::ImmArg); break; - // VISC parameter attributes + // HPVM parameter attributes case lltok::kw_in: 
B.addAttribute(Attribute::In); break; @@ -1927,7 +1927,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_swiftself: case lltok::kw_immarg: - // VISC Parameter only attributes + // HPVM Parameter only attributes case lltok::kw_in: case lltok::kw_out: case lltok::kw_inout: diff --git a/hpvm/llvm_patches/lib/AsmParser/LLToken.h b/hpvm/llvm_patches/lib/AsmParser/LLToken.h index 7f9816965b2a21ae3d23873ca789a22481b575fa..cb0479b41c3b9e68d9697cd9d8adce4c80fa5c25 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLToken.h +++ b/hpvm/llvm_patches/lib/AsmParser/LLToken.h @@ -351,7 +351,7 @@ enum Kind { kw_insertvalue, kw_blockaddress, - // VISC parameter attributes + // HPVM parameter attributes kw_in, kw_out, kw_inout, diff --git a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp index 7eb289d5872713ef826174b1e691c6440d4dd43e..a1e64472850911013250976312a8dd7d8b879c98 100644 --- a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1395,7 +1395,7 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) { case Attribute::NoFree: return 1ULL << 63; - // VISC Attributes + // HPVM Attributes case Attribute::In: return 3ULL << 0; case Attribute::Out: diff --git a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp index 55e7415efbea2b37d85f20b1d123ce9a80efe67e..fd671c397583fad6ec8a9998635705417f59eed1 100644 --- a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -773,7 +773,7 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { case Attribute::SanitizeMemTag: return bitc::ATTR_KIND_SANITIZE_MEMTAG; - // VISC Attributes + // HPVM Attributes case Attribute::In: return bitc::ATTR_KIND_IN; case Attribute::Out: diff --git a/hpvm/llvm_patches/lib/IR/Attributes.cpp 
b/hpvm/llvm_patches/lib/IR/Attributes.cpp index 3cc95b3102fdf6c7062fffe1f9486cfa094bba9b..29c47a9e1107524278dcc57c188b320821ba7d86 100644 --- a/hpvm/llvm_patches/lib/IR/Attributes.cpp +++ b/hpvm/llvm_patches/lib/IR/Attributes.cpp @@ -404,7 +404,7 @@ std::string Attribute::getAsString(bool InAttrGrp) const { if (hasAttribute(Attribute::ImmArg)) return "immarg"; - // VISC attributes for arguments + // HPVM attributes for arguments if (hasAttribute(Attribute::In)) return "in"; if (hasAttribute(Attribute::Out)) diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..be7f69c4bfa7623c093bd5e913af1de3dbcf951c --- /dev/null +++ b/hpvm/projects/hpvm-rt/CMakeLists.txt @@ -0,0 +1,22 @@ +add_definitions(-DNUM_CORES=8) + +SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang) +SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++) + +add_llvm_library(hpvm-rt.ll hpvm-rt.cpp + + DEPENDS + clang + llvm-dis + ) + + +target_compile_options(hpvm-rt.ll PUBLIC -flto ) +target_compile_options(hpvm-rt.ll PUBLIC -std=c++11) + +add_custom_target(hpvm-rt.cpp.o ALL + COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libhpvm-rt.ll.a + COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc + COMMAND ${CMAKE_BINARY_DIR}/bin/llvm-dis ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc) + +add_dependencies(hpvm-rt.cpp.o hpvm-rt.ll) diff --git a/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt b/hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt similarity index 100% rename from hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt rename to hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt diff --git a/hpvm/projects/visc-rt/device_abstraction.h b/hpvm/projects/hpvm-rt/device_abstraction.h similarity index 96% rename from hpvm/projects/visc-rt/device_abstraction.h rename to hpvm/projects/hpvm-rt/device_abstraction.h 
index 7e77d100deb6b23b6ed9ca994796cd1cb108b0d4..4948502ce8ae47cbb7e37c1372fcd81813486e15 100644 --- a/hpvm/projects/visc-rt/device_abstraction.h +++ b/hpvm/projects/hpvm-rt/device_abstraction.h @@ -27,7 +27,7 @@ void initializeDeviceStatusIntervals() { unsigned sz = 0; unsigned tmp = 0; - const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/" + const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/hpvm-rt/" "deviceStatusSwitchIntervals.txt"; std::ifstream infile; infile.open(fn); diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/hpvm-rt/hpvm-rt.cpp similarity index 82% rename from hpvm/projects/visc-rt/visc-rt.cpp rename to hpvm/projects/hpvm-rt/hpvm-rt.cpp index 53d3b516f22b59857b1a17aecba32a6b723998f0..ec2534cf43f43f971140de741c7a04f79613e982 100644 --- a/hpvm/projects/visc-rt/visc-rt.cpp +++ b/hpvm/projects/hpvm-rt/hpvm-rt.cpp @@ -13,7 +13,7 @@ #if _POSIX_VERSION >= 200112L #include <sys/time.h> #endif -#include "visc-rt.h" +#include "hpvm-rt.h" #ifndef DEBUG_BUILD #define DEBUG(s) \ @@ -59,7 +59,7 @@ vector<DFGDepth> DStack; pthread_mutex_t ocl_mtx; #define NUM_TESTS 1 -visc_TimerSet kernel_timer; +hpvm_TimerSet kernel_timer; static inline void checkErr(cl_int err, cl_int success, const char *name) { if (err != success) { @@ -70,7 +70,7 @@ static inline void checkErr(cl_int err, cl_int success, const char *name) { } /************************* Policies *************************************/ -void llvm_visc_policy_init() { +void llvm_hpvm_policy_init() { cout << "Initializing policy object ...\n"; // policy = new NodePolicy(); // policy = new IterationPolicy(); @@ -80,19 +80,19 @@ void llvm_visc_policy_init() { cout << "DONE: Initializing policy object.\n"; } -void llvm_visc_policy_clear() { +void llvm_hpvm_policy_clear() { if (policy) free(policy); } -int llvm_visc_policy_getVersion(const char *name, int64_t i) { +int llvm_hpvm_policy_getVersion(const char *name, int64_t i) { return policy->getVersion(name, i); } 
/******************** Device Abstraction ********************************/ std::thread deviceStatusThread; -void llvm_visc_deviceAbstraction_start() { +void llvm_hpvm_deviceAbstraction_start() { cout << "Starting device status simulation ...\n"; // Initialize vector with points where ti switch device status initializeDeviceStatusIntervals(); @@ -102,7 +102,7 @@ void llvm_visc_deviceAbstraction_start() { return; } -void llvm_visc_deviceAbstraction_end() { +void llvm_hpvm_deviceAbstraction_end() { cout << "Ending device status simulation thread ...\n"; // Set the variable that allows the thread to know that execution has ended executionEnd = true; @@ -112,7 +112,7 @@ void llvm_visc_deviceAbstraction_end() { return; } -void llvm_visc_deviceAbstraction_waitOnDeviceStatus() { +void llvm_hpvm_deviceAbstraction_waitOnDeviceStatus() { while (!deviceStatus) { }; return; @@ -120,7 +120,7 @@ void llvm_visc_deviceAbstraction_waitOnDeviceStatus() { /************************* Depth Stack Routines ***************************/ -void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, +void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY, uint64_t iY, uint64_t limitZ, uint64_t iZ) { DEBUG(cout << "Pushing node information on stack:\n"); @@ -134,7 +134,7 @@ void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_x86_dstack_pop() { +void llvm_hpvm_x86_dstack_pop() { DEBUG(cout << "Popping from depth stack\n"); pthread_mutex_lock(&ocl_mtx); DStack.pop_back(); @@ -142,7 +142,7 @@ void llvm_visc_x86_dstack_pop() { pthread_mutex_unlock(&ocl_mtx); } -uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) { +uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim) { DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level << flush << "\n"); pthread_mutex_lock(&ocl_mtx); @@ -154,7 +154,7 @@ uint64_t llvm_visc_x86_getDimLimit(unsigned 
level, unsigned dim) { return result; } -uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) { +uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim) { DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level << flush << "\n"); pthread_mutex_lock(&ocl_mtx); @@ -168,7 +168,7 @@ uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) { /********************** Memory Tracking Routines **************************/ -void llvm_visc_track_mem(void *ptr, size_t size) { +void llvm_hpvm_track_mem(void *ptr, size_t size) { DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); if (MTE != NULL) { @@ -180,7 +180,7 @@ void llvm_visc_track_mem(void *ptr, size_t size) { DEBUG(MTracker.print()); } -void llvm_visc_untrack_mem(void *ptr) { +void llvm_hpvm_untrack_mem(void *ptr) { DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); if (MTE == NULL) { @@ -195,7 +195,7 @@ void llvm_visc_untrack_mem(void *ptr) { DEBUG(MTracker.print()); } -static void *llvm_visc_ocl_request_mem(void *ptr, size_t size, +static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size, DFNodeContext_OCL *Context, bool isInput, bool isOutput) { pthread_mutex_lock(&ocl_mtx); @@ -233,7 +233,7 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size, else clFlags = CL_MEM_READ_ONLY; - visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); + hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY); // pthread_mutex_lock(&ocl_mtx); cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode); @@ -249,7 +249,7 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size, checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device"); } - visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); + hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); DEBUG(cout << " done\n"); MTE->update(MemTrackerEntry::DEVICE, 
(void *)d_input, Context); DEBUG(cout << "Updated Table\n"); @@ -258,11 +258,11 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size, return d_input; } -void *llvm_visc_x86_argument_ptr(void *ptr, size_t size) { - return llvm_visc_request_mem(ptr, size); +void *llvm_hpvm_x86_argument_ptr(void *ptr, size_t size) { + return llvm_hpvm_request_mem(ptr, size); } -void *llvm_visc_request_mem(void *ptr, size_t size) { +void *llvm_hpvm_request_mem(void *ptr, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); @@ -283,13 +283,13 @@ void *llvm_visc_request_mem(void *ptr, size_t size) { DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); DEBUG(cout << "\tCopying ..."); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); + hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY); // pthread_mutex_lock(&ocl_mtx); cl_int errcode = clEnqueueReadBuffer( ((DFNodeContext_OCL *)MTE->getContext())->clCommandQue, (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL); // pthread_mutex_unlock(&ocl_mtx); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); + hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); DEBUG(cout << " done\n"); checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output"); DEBUG(cout << "Free mem object on device\n"); @@ -303,25 +303,25 @@ void *llvm_visc_request_mem(void *ptr, size_t size) { /*************************** Timer Routines **********************************/ -static int is_async(enum visc_TimerID timer) { - return (timer == visc_TimerID_KERNEL) || (timer == visc_TimerID_COPY_ASYNC); +static int is_async(enum hpvm_TimerID timer) { + return (timer == hpvm_TimerID_KERNEL) || (timer == hpvm_TimerID_COPY_ASYNC); } -static int is_blocking(enum visc_TimerID timer) { - return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE); +static int is_blocking(enum hpvm_TimerID timer) { + 
return (timer == hpvm_TimerID_COPY) || (timer == hpvm_TimerID_NONE); } -#define INVALID_TIMERID visc_TimerID_LAST +#define INVALID_TIMERID hpvm_TimerID_LAST -static int asyncs_outstanding(struct visc_TimerSet *timers) { +static int asyncs_outstanding(struct hpvm_TimerSet *timers) { return (timers->async_markers != NULL) && (timers->async_markers->timerID != INVALID_TIMERID); } -static struct visc_async_time_marker_list * -get_last_async(struct visc_TimerSet *timers) { +static struct hpvm_async_time_marker_list * +get_last_async(struct hpvm_TimerSet *timers) { /* Find the last event recorded thus far */ - struct visc_async_time_marker_list *last_event = timers->async_markers; + struct hpvm_async_time_marker_list *last_event = timers->async_markers; if (last_event != NULL && last_event->timerID != INVALID_TIMERID) { while (last_event->next != NULL && last_event->next->timerID != INVALID_TIMERID) @@ -331,17 +331,17 @@ get_last_async(struct visc_TimerSet *timers) { return NULL; } -static void insert_marker(struct visc_TimerSet *tset, enum visc_TimerID timer) { +static void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct visc_async_time_marker_list **new_event = &(tset->async_markers); + struct hpvm_async_time_marker_list **new_event = &(tset->async_markers); while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } if (*new_event == NULL) { - *new_event = (struct visc_async_time_marker_list *)malloc( - sizeof(struct visc_async_time_marker_list)); + *new_event = (struct hpvm_async_time_marker_list *)malloc( + sizeof(struct hpvm_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* // I don't think this is needed at all. 
I believe clEnqueueMarker 'creates' @@ -372,18 +372,18 @@ Event Status!\n"); } } -static void insert_submarker(struct visc_TimerSet *tset, char *label, - enum visc_TimerID timer) { +static void insert_submarker(struct hpvm_TimerSet *tset, char *label, + enum hpvm_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct visc_async_time_marker_list **new_event = &(tset->async_markers); + struct hpvm_async_time_marker_list **new_event = &(tset->async_markers); while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } if (*new_event == NULL) { - *new_event = (struct visc_async_time_marker_list *)malloc( - sizeof(struct visc_async_time_marker_list)); + *new_event = (struct hpvm_async_time_marker_list *)malloc( + sizeof(struct hpvm_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) @@ -414,10 +414,10 @@ Event Status!\n"); } /* Assumes that all recorded events have completed */ -static visc_Timestamp record_async_times(struct visc_TimerSet *tset) { - struct visc_async_time_marker_list *next_interval = NULL; - struct visc_async_time_marker_list *last_marker = get_last_async(tset); - visc_Timestamp total_async_time = 0; +static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) { + struct hpvm_async_time_marker_list *next_interval = NULL; + struct hpvm_async_time_marker_list *last_marker = get_last_async(tset); + hpvm_Timestamp total_async_time = 0; for (next_interval = tset->async_markers; next_interval != last_marker; next_interval = next_interval->next) { @@ -439,11 +439,11 @@ static visc_Timestamp record_async_times(struct visc_TimerSet *tset) { ciErrNum); } - visc_Timestamp interval = - (visc_Timestamp)(((double)(command_end - command_start))); + hpvm_Timestamp interval = + (hpvm_Timestamp)(((double)(command_end - command_start))); tset->timers[next_interval->timerID].elapsed += interval; if (next_interval->label != NULL) { - struct 
visc_SubTimer *subtimer = + struct hpvm_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; while (subtimer != NULL) { if (strcmp(subtimer->label, next_interval->label) == 0) { @@ -463,8 +463,8 @@ static visc_Timestamp record_async_times(struct visc_TimerSet *tset) { return total_async_time; } -static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start, - visc_Timestamp end) { +static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start, + hpvm_Timestamp end) { #if _POSIX_VERSION >= 200112L *accum += end - start; #else @@ -473,33 +473,33 @@ static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start, } #if _POSIX_VERSION >= 200112L -static visc_Timestamp get_time() { +static hpvm_Timestamp get_time() { struct timespec tv; clock_gettime(CLOCK_MONOTONIC, &tv); - return (visc_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); + return (hpvm_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); } #else #error "no supported time libraries are available on this platform" #endif -void visc_ResetTimer(struct visc_Timer *timer) { - timer->state = visc_Timer_STOPPED; +void hpvm_ResetTimer(struct hpvm_Timer *timer) { + timer->state = hpvm_Timer_STOPPED; #if _POSIX_VERSION >= 200112L timer->elapsed = 0; #else -#error "visc_ResetTimer: not implemented for this system" +#error "hpvm_ResetTimer: not implemented for this system" #endif } -void visc_StartTimer(struct visc_Timer *timer) { - if (timer->state != visc_Timer_STOPPED) { +void hpvm_StartTimer(struct hpvm_Timer *timer) { + if (timer->state != hpvm_Timer_STOPPED) { // FIXME: Removing warning statement to avoid printing this error // fputs("Ignoring attempt to start a running timer\n", stderr); return; } - timer->state = visc_Timer_RUNNING; + timer->state = hpvm_Timer_RUNNING; #if _POSIX_VERSION >= 200112L { @@ -508,19 +508,19 @@ void visc_StartTimer(struct visc_Timer *timer) { timer->init = tv.tv_sec * BILLION + tv.tv_nsec; } #else -#error "visc_StartTimer: not 
implemented for this system" +#error "hpvm_StartTimer: not implemented for this system" #endif } -void visc_StartTimerAndSubTimer(struct visc_Timer *timer, - struct visc_Timer *subtimer) { +void hpvm_StartTimerAndSubTimer(struct hpvm_Timer *timer, + struct hpvm_Timer *subtimer) { unsigned int numNotStopped = 0x3; // 11 - if (timer->state != visc_Timer_STOPPED) { + if (timer->state != hpvm_Timer_STOPPED) { fputs("Warning: Timer was not stopped\n", stderr); numNotStopped &= 0x1; // Zero out 2^1 } - if (subtimer->state != visc_Timer_STOPPED) { + if (subtimer->state != hpvm_Timer_STOPPED) { fputs("Warning: Subtimer was not stopped\n", stderr); numNotStopped &= 0x2; // Zero out 2^0 } @@ -529,8 +529,8 @@ void visc_StartTimerAndSubTimer(struct visc_Timer *timer, return; } - timer->state = visc_Timer_RUNNING; - subtimer->state = visc_Timer_RUNNING; + timer->state = hpvm_Timer_RUNNING; + subtimer->state = hpvm_Timer_RUNNING; #if _POSIX_VERSION >= 200112L { @@ -546,19 +546,19 @@ void visc_StartTimerAndSubTimer(struct visc_Timer *timer, } } #else -#error "visc_StartTimer: not implemented for this system" +#error "hpvm_StartTimer: not implemented for this system" #endif } -void visc_StopTimer(struct visc_Timer *timer) { - visc_Timestamp fini; +void hpvm_StopTimer(struct hpvm_Timer *timer) { + hpvm_Timestamp fini; - if (timer->state != visc_Timer_RUNNING) { + if (timer->state != hpvm_Timer_RUNNING) { // fputs("Ignoring attempt to stop a stopped timer\n", stderr); return; } - timer->state = visc_Timer_STOPPED; + timer->state = hpvm_Timer_STOPPED; #if _POSIX_VERSION >= 200112L { @@ -567,24 +567,24 @@ void visc_StopTimer(struct visc_Timer *timer) { fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -#error "visc_StopTimer: not implemented for this system" +#error "hpvm_StopTimer: not implemented for this system" #endif accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; } -void visc_StopTimerAndSubTimer(struct visc_Timer *timer, - struct visc_Timer *subtimer) { 
+void hpvm_StopTimerAndSubTimer(struct hpvm_Timer *timer, + struct hpvm_Timer *subtimer) { - visc_Timestamp fini; + hpvm_Timestamp fini; unsigned int numNotRunning = 0x3; // 11 - if (timer->state != visc_Timer_RUNNING) { + if (timer->state != hpvm_Timer_RUNNING) { fputs("Warning: Timer was not running\n", stderr); numNotRunning &= 0x1; // Zero out 2^1 } - if (subtimer->state != visc_Timer_RUNNING) { + if (subtimer->state != hpvm_Timer_RUNNING) { fputs("Warning: Subtimer was not running\n", stderr); numNotRunning &= 0x2; // Zero out 2^0 } @@ -593,8 +593,8 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, return; } - timer->state = visc_Timer_STOPPED; - subtimer->state = visc_Timer_STOPPED; + timer->state = hpvm_Timer_STOPPED; + subtimer->state = hpvm_Timer_STOPPED; #if _POSIX_VERSION >= 200112L { @@ -603,7 +603,7 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -#error "visc_StopTimer: not implemented for this system" +#error "hpvm_StopTimer: not implemented for this system" #endif if (numNotRunning & 0x2) { @@ -618,59 +618,59 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, } /* Get the elapsed time in seconds. 
*/ -double visc_GetElapsedTime(struct visc_Timer *timer) { +double hpvm_GetElapsedTime(struct hpvm_Timer *timer) { double ret; - if (timer->state != visc_Timer_STOPPED) { + if (timer->state != hpvm_Timer_STOPPED) { fputs("Elapsed time from a running timer is inaccurate\n", stderr); } #if _POSIX_VERSION >= 200112L ret = timer->elapsed / 1e9; #else -#error "visc_GetElapsedTime: not implemented for this system" +#error "hpvm_GetElapsedTime: not implemented for this system" #endif return ret; } -void visc_InitializeTimerSet(struct visc_TimerSet *timers) { +void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers) { int n; timers->wall_begin = get_time(); - timers->current = visc_TimerID_NONE; + timers->current = hpvm_TimerID_NONE; timers->async_markers = NULL; - for (n = 0; n < visc_TimerID_LAST; n++) { - visc_ResetTimer(&timers->timers[n]); + for (n = 0; n < hpvm_TimerID_LAST; n++) { + hpvm_ResetTimer(&timers->timers[n]); timers->sub_timer_list[n] = NULL; } } -void visc_AddSubTimer(struct visc_TimerSet *timers, char *label, - enum visc_TimerID visc_Category) { +void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label, + enum hpvm_TimerID hpvm_Category) { - struct visc_SubTimer *subtimer = - (struct visc_SubTimer *)malloc(sizeof(struct visc_SubTimer)); + struct hpvm_SubTimer *subtimer = + (struct hpvm_SubTimer *)malloc(sizeof(struct hpvm_SubTimer)); int len = strlen(label); subtimer->label = (char *)malloc(sizeof(char) * (len + 1)); sprintf(subtimer->label, "%s", label); - visc_ResetTimer(&subtimer->timer); + hpvm_ResetTimer(&subtimer->timer); subtimer->next = NULL; - struct visc_SubTimerList *subtimerlist = - timers->sub_timer_list[visc_Category]; + struct hpvm_SubTimerList *subtimerlist = + timers->sub_timer_list[hpvm_Category]; if (subtimerlist == NULL) { subtimerlist = - (struct visc_SubTimerList *)calloc(1, sizeof(struct visc_SubTimerList)); + (struct hpvm_SubTimerList *)calloc(1, sizeof(struct hpvm_SubTimerList)); subtimerlist->subtimer_list = subtimer; - 
timers->sub_timer_list[visc_Category] = subtimerlist; + timers->sub_timer_list[hpvm_Category] = subtimerlist; } else { // Append to list - struct visc_SubTimer *element = subtimerlist->subtimer_list; + struct hpvm_SubTimer *element = subtimerlist->subtimer_list; while (element->next != NULL) { element = element->next; } @@ -678,37 +678,37 @@ void visc_AddSubTimer(struct visc_TimerSet *timers, char *label, } } -void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) { +void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) { // cerr << "Switch to timer: " << timer << flush << "\n"; /* Stop the currently running timer */ - if (timers->current != visc_TimerID_NONE) { - struct visc_SubTimerList *subtimerlist = + if (timers->current != hpvm_TimerID_NONE) { + struct hpvm_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct visc_SubTimer *currSubTimer = + struct hpvm_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; if (!is_async(timers->current)) { if (timers->current != timer) { if (currSubTimer != NULL) { - visc_StopTimerAndSubTimer(&timers->timers[timers->current], + hpvm_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); } else { - visc_StopTimer(&timers->timers[timers->current]); + hpvm_StopTimer(&timers->timers[timers->current]); } } else { if (currSubTimer != NULL) { - visc_StopTimer(&currSubTimer->timer); + hpvm_StopTimer(&currSubTimer->timer); } } } else { insert_marker(timers, timer); if (!is_async(timer)) { // if switching to async too, keep driver going - visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); + hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]); } } } - visc_Timestamp currentTime = get_time(); + hpvm_Timestamp currentTime = get_time(); /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next @@ -716,7 +716,7 @@ void visc_SwitchToTimer(struct 
visc_TimerSet *timers, enum visc_TimerID timer) { if (asyncs_outstanding(timers) && (!is_async(timers->current) || is_blocking(timer))) { - struct visc_async_time_marker_list *last_event = get_last_async(timers); + struct hpvm_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; @@ -736,7 +736,7 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) { // timer to switch to is COPY or NONE if (async_done != CL_COMPLETE) { - accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), + accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed), timers->async_begin, currentTime); } @@ -746,14 +746,14 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) { fprintf(stderr, "Error Waiting for Events!\n"); } - visc_Timestamp total_async_time = record_async_times(timers); + hpvm_Timestamp total_async_time = record_async_times(timers); /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ if (async_done == CL_COMPLETE) { // fprintf(stderr, "Async_done: total_async_type = %lld\n", // total_async_time); - timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; + timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time; } } else @@ -763,15 +763,15 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) { if (async_done == CL_COMPLETE) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - timers->timers[visc_TimerID_OVERLAP].elapsed += + timers->timers[hpvm_TimerID_OVERLAP].elapsed += record_async_times(timers); } } /* Start the new timer */ - if (timer != visc_TimerID_NONE) { + if (timer != hpvm_TimerID_NONE) { if (!is_async(timer)) { - visc_StartTimer(&timers->timers[timer]); + hpvm_StartTimer(&timers->timers[timer]); } else { // toSwitchTo Is Async (KERNEL/COPY_ASYNC) if 
(!asyncs_outstanding(timers)) { @@ -785,48 +785,48 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) { * so we can rename that marker as the beginning of this async * operation */ - struct visc_async_time_marker_list *last_event = get_last_async(timers); + struct hpvm_async_time_marker_list *last_event = get_last_async(timers); last_event->label = NULL; last_event->timerID = timer; } if (!is_async(timers->current)) { - visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); + hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]); } } } timers->current = timer; } -void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, - enum visc_TimerID category) { - struct visc_SubTimerList *subtimerlist = +void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, + enum hpvm_TimerID category) { + struct hpvm_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct visc_SubTimer *curr = + struct hpvm_SubTimer *curr = (subtimerlist != NULL) ? 
subtimerlist->current : NULL; - if (timers->current != visc_TimerID_NONE) { + if (timers->current != hpvm_TimerID_NONE) { if (!is_async(timers->current)) { if (timers->current != category) { if (curr != NULL) { - visc_StopTimerAndSubTimer(&timers->timers[timers->current], + hpvm_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); } else { - visc_StopTimer(&timers->timers[timers->current]); + hpvm_StopTimer(&timers->timers[timers->current]); } } else { if (curr != NULL) { - visc_StopTimer(&curr->timer); + hpvm_StopTimer(&curr->timer); } } } else { insert_submarker(timers, label, category); if (!is_async(category)) { // if switching to async too, keep driver going - visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); + hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]); } } } - visc_Timestamp currentTime = get_time(); + hpvm_Timestamp currentTime = get_time(); /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next @@ -834,7 +834,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, if (asyncs_outstanding(timers) && (!is_async(timers->current) || is_blocking(category))) { - struct visc_async_time_marker_list *last_event = get_last_async(timers); + struct hpvm_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; @@ -858,7 +858,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, // because everything is being stopped to wait for synchronization it // seems that the extra sync wall time isn't being recorded anywhere if (async_done != CL_COMPLETE) - accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), + accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed), timers->async_begin, currentTime); /* Wait on async operation completion */ @@ -866,7 +866,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, if (ciErrNum != CL_SUCCESS) { 
fprintf(stderr, "Error Waiting for Events!\n"); } - visc_Timestamp total_async_time = record_async_times(timers); + hpvm_Timestamp total_async_time = record_async_times(timers); /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ @@ -874,7 +874,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, // into OVERLAP the immediately preceding EventSynchronize theoretically // didn't have any effect since it was already completed. if (async_done == CL_COMPLETE /*cudaSuccess*/) - timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; + timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time; } else /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ @@ -883,14 +883,14 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, if (async_done == CL_COMPLETE /*cudaSuccess*/) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - timers->timers[visc_TimerID_OVERLAP].elapsed += + timers->timers[hpvm_TimerID_OVERLAP].elapsed += record_async_times(timers); } // else, this isn't blocking, so just check the next time around } subtimerlist = timers->sub_timer_list[category]; - struct visc_SubTimer *subtimer = NULL; + struct hpvm_SubTimer *subtimer = NULL; if (label != NULL) { subtimer = subtimerlist->subtimer_list; @@ -904,18 +904,18 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, } /* Start the new timer */ - if (category != visc_TimerID_NONE) { + if (category != hpvm_TimerID_NONE) { if (!is_async(category)) { if (subtimerlist != NULL) { subtimerlist->current = subtimer; } if (category != timers->current && subtimer != NULL) { - visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + hpvm_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); } else if (subtimer != NULL) { - visc_StartTimer(&subtimer->timer); + 
hpvm_StartTimer(&subtimer->timer); } else { - visc_StartTimer(&timers->timers[category]); + hpvm_StartTimer(&timers->timers[category]); } } else { if (subtimerlist != NULL) { @@ -933,7 +933,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, * so we can rename that marker as the beginning of this async * operation */ - struct visc_async_time_marker_list *last_event = get_last_async(timers); + struct hpvm_async_time_marker_list *last_event = get_last_async(timers); last_event->timerID = category; last_event->label = label; } // else, marker for switchToThis was already inserted @@ -941,7 +941,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, // toSwitchto is already asynchronous, but if current/prev state is async // too, then DRIVER is already running if (!is_async(timers->current)) { - visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); + hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]); } } } @@ -949,11 +949,11 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, timers->current = category; } -void visc_PrintTimerSet(struct visc_TimerSet *timers) { - visc_Timestamp wall_end = get_time(); +void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) { + hpvm_Timestamp wall_end = get_time(); - struct visc_Timer *t = timers->timers; - struct visc_SubTimer *sub = NULL; + struct hpvm_Timer *t = timers->timers; + struct hpvm_SubTimer *sub = NULL; int maxSubLength; @@ -970,13 +970,13 @@ void visc_PrintTimerSet(struct visc_TimerSet *timers) { const int maxCategoryLength = 20; int i; - for (i = 1; i < visc_TimerID_LAST; + for (i = 1; i < hpvm_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format - if (visc_GetElapsedTime(&t[i]) != 0 || true) { + if (hpvm_GetElapsedTime(&t[i]) != 0 || true) { // Print Category Timer printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1], - visc_GetElapsedTime(&t[i])); + hpvm_GetElapsedTime(&t[i])); if (timers->sub_timer_list[i] != NULL) { sub = 
timers->sub_timer_list[i]->subtimer_list; @@ -999,24 +999,24 @@ void visc_PrintTimerSet(struct visc_TimerSet *timers) { // Print SubTimers while (sub != NULL) { printf(" -%-*s: %.9f\n", maxSubLength, sub->label, - visc_GetElapsedTime(&sub->timer)); + hpvm_GetElapsedTime(&sub->timer)); sub = sub->next; } } } } - if (visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0) + if (hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP]) != 0) printf("CPU/Kernel Overlap: %.9f\n", - visc_GetElapsedTime(&t[visc_TimerID_OVERLAP])); + hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP])); float walltime = (wall_end - timers->wall_begin) / 1e9; printf("Timer Wall Time: %.9f\n", walltime); } -void visc_DestroyTimerSet(struct visc_TimerSet *timers) { +void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) { /* clean up all of the async event markers */ - struct visc_async_time_marker_list *event = timers->async_markers; + struct hpvm_async_time_marker_list *event = timers->async_markers; while (event != NULL) { cl_int ciErrNum = CL_SUCCESS; @@ -1031,7 +1031,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) { } free((event)->marker); - struct visc_async_time_marker_list *next = ((event)->next); + struct hpvm_async_time_marker_list *next = ((event)->next); free(event); @@ -1040,10 +1040,10 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) { } int i = 0; - for (i = 0; i < visc_TimerID_LAST; ++i) { + for (i = 0; i < hpvm_TimerID_LAST; ++i) { if (timers->sub_timer_list[i] != NULL) { - struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; - struct visc_SubTimer *prev = NULL; + struct hpvm_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct hpvm_SubTimer *prev = NULL; while (subtimer != NULL) { free(subtimer->label); prev = subtimer; @@ -1059,7 +1059,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) { #define BUFFER_SIZE 1 // Launch API for a streaming dataflow graph -void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), 
void *args) { +void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) { DFNodeContext_X86 *Context = (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86)); @@ -1081,7 +1081,7 @@ void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) { } // Push API for a streaming dataflow graph -void llvm_visc_streamPush(void *graphID, void *args) { +void llvm_hpvm_streamPush(void *graphID, void *args) { DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n"); DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; @@ -1094,17 +1094,17 @@ void llvm_visc_streamPush(void *graphID, void *args) { if (Ctx->BindInSourcePort->at(j) == i) { // Push to all bind buffers connected to parent node at this port // DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); - llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element); + llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(j), element); } } } // Push 0 in isLastInput buffers of all child nodes for (CircularBuffer<uint64_t> *buffer : *(Ctx->isLastInputBuffers)) - llvm_visc_bufferPush(buffer, 0); + llvm_hpvm_bufferPush(buffer, 0); } // Pop API for a streaming dataflow graph -void *llvm_visc_streamPop(void *graphID) { +void *llvm_hpvm_streamPop(void *graphID) { DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; unsigned totalBytes = 0; @@ -1113,7 +1113,7 @@ void *llvm_visc_streamPop(void *graphID) { void *output = malloc(totalBytes); unsigned offset = 0; for (unsigned i = 0; i < Ctx->BindOutputBuffers->size(); i++) { - uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i)); + uint64_t element = llvm_hpvm_bufferPop(Ctx->BindOutputBuffers->at(i)); // DEBUG(cout << "\tPopped Value " << element << " from buffer\n"); memcpy((char *)output + offset, &element, Ctx->BindOutSizes->at(i)); offset += Ctx->BindOutSizes->at(i); @@ -1122,24 +1122,24 @@ void *llvm_visc_streamPop(void 
*graphID) { } // Wait API for a streaming dataflow graph -void llvm_visc_streamWait(void *graphID) { +void llvm_hpvm_streamWait(void *graphID) { DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; // Push garbage to all other input buffers for (unsigned i = 0; i < Ctx->BindInputBuffers->size(); i++) { uint64_t element = 0; // DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); - llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element); + llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(i), element); } // Push 1 in isLastInput buffers of all child nodes for (unsigned i = 0; i < Ctx->isLastInputBuffers->size(); i++) - llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1); + llvm_hpvm_bufferPush(Ctx->isLastInputBuffers->at(i), 1); - llvm_visc_freeThreads(graphID); + llvm_hpvm_freeThreads(graphID); } // Create a buffer and return the bufferID -void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size, +void *llvm_hpvm_createBindInBuffer(void *graphID, uint64_t size, unsigned inArgPort) { DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); @@ -1154,7 +1154,7 @@ void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size, return bufferID; } -void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) { +void *llvm_hpvm_createBindOutBuffer(void *graphID, uint64_t size) { DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; @@ -1166,7 +1166,7 @@ void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) { Context->BindOutSizes->push_back(size); return bufferID; } -void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) { +void *llvm_hpvm_createEdgeBuffer(void *graphID, uint64_t size) { DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); DFNodeContext_X86 *Context = 
(DFNodeContext_X86 *)graphID; @@ -1179,7 +1179,7 @@ void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) { return bufferID; } -void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) { +void *llvm_hpvm_createLastInputBuffer(void *graphID, uint64_t size) { DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; @@ -1192,7 +1192,7 @@ void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) { } // Free buffers -void llvm_visc_freeBuffers(void *graphID) { +void llvm_hpvm_freeBuffers(void *graphID) { DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; for (CircularBuffer<uint64_t> *bufferID : *(Context->BindInputBuffers)) @@ -1206,19 +1206,19 @@ void llvm_visc_freeBuffers(void *graphID) { } // Pop an element from the buffer -uint64_t llvm_visc_bufferPop(void *bufferID) { +uint64_t llvm_hpvm_bufferPop(void *bufferID) { CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID; return buffer->pop(); } // Push an element into the buffer -void llvm_visc_bufferPush(void *bufferID, uint64_t element) { +void llvm_hpvm_bufferPush(void *bufferID, uint64_t element) { CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID; buffer->push(element); } // Create a thread -void llvm_visc_createThread(void *graphID, void *(*Func)(void *), +void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *), void *arguments) { DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n"); @@ -1232,7 +1232,7 @@ void llvm_visc_createThread(void *graphID, void *(*Func)(void *), } // Wait for thread to finish -void llvm_visc_freeThreads(void *graphID) { +void llvm_hpvm_freeThreads(void *graphID) { DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Ctx = 
(DFNodeContext_X86 *)graphID; for (pthread_t thread : *(Ctx->threads)) @@ -1241,7 +1241,7 @@ void llvm_visc_freeThreads(void *graphID) { /************************ OPENCL & PTHREAD API ********************************/ -void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) { +void *llvm_hpvm_x86_launch(void *(*rootFunc)(void *), void *arguments) { DFNodeContext_X86 *Context = (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86)); // int err; @@ -1252,7 +1252,7 @@ void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) { return Context; } -void llvm_visc_x86_wait(void *graphID) { +void llvm_hpvm_x86_wait(void *graphID) { DEBUG(cout << "Waiting for pthread to finish ...\n"); // DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; // pthread_join(Context->threadID, NULL); @@ -1260,9 +1260,9 @@ void llvm_visc_x86_wait(void *graphID) { DEBUG(cout << "\t... pthread Done!\n"); } -void *llvm_visc_ocl_initContext(enum visc::Target T) { +void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) { pthread_mutex_lock(&ocl_mtx); - DEBUG(std::string Target = T == visc::GPU_TARGET ? "GPU" : "SPIR"); + DEBUG(std::string Target = T == hpvm::GPU_TARGET ? "GPU" : "SPIR"); DEBUG(cout << "Initializing Context for " << Target << " device\n"); cl_uint numPlatforms; cl_int errcode; @@ -1299,10 +1299,10 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) { // assert(numPlatforms >= 2 && "Expecting two OpenCL platforms"); // Choose second one which is X86 AVX cl_context_properties properties[] = { - CL_CONTEXT_PLATFORM, (long)platforms[T == visc::GPU_TARGET ? 0 : 1], 0}; + CL_CONTEXT_PLATFORM, (long)platforms[T == hpvm::GPU_TARGET ? 0 : 1], 0}; globalOCLContext = clCreateContextFromType( properties, - T == visc::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL, + T == hpvm::GPU_TARGET ? 
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL, NULL, &errcode); // get the list of OCL devices associated with context size_t dataBytes; @@ -1314,7 +1314,7 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) { errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); checkErr(errcode, CL_SUCCESS, "Failure to get context info"); - if (false && T == visc::SPIR_TARGET) { + if (false && T == hpvm::SPIR_TARGET) { cl_device_partition_property props[4]; props[0] = CL_DEVICE_PARTITION_BY_COUNTS; props[1] = NUM_CORES; @@ -1340,13 +1340,13 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) { checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); DEBUG(cout << "Initialize Kernel Timer\n"); - visc_InitializeTimerSet(&kernel_timer); + hpvm_InitializeTimerSet(&kernel_timer); pthread_mutex_unlock(&ocl_mtx); return globalOCLContext; } -void llvm_visc_ocl_clearContext(void *graphID) { +void llvm_hpvm_ocl_clearContext(void *graphID) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Clear Context\n"); DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; @@ -1359,12 +1359,12 @@ void llvm_visc_ocl_clearContext(void *graphID) { // DEBUG(cout << "Released context at: " << globalOCLContext); free(Context); DEBUG(cout << "Done with OCL kernel\n"); - cout << "Printing VISC Timer: KernelTimer\n"; - visc_PrintTimerSet(&kernel_timer); + cout << "Printing HPVM Timer: KernelTimer\n"; + hpvm_PrintTimerSet(&kernel_timer); pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) { +void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Shared Memory Input:"); DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size @@ -1379,7 +1379,7 @@ void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) { pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_ocl_argument_scalar(void *graphID, 
void *input, int arg_index, +void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Scalar Input:"); @@ -1395,7 +1395,7 @@ void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index, pthread_mutex_unlock(&ocl_mtx); } -void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index, +void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index, size_t size, bool isInput, bool isOutput) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Pointer Input:"); @@ -1409,7 +1409,7 @@ void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index, pthread_mutex_unlock(&ocl_mtx); // Check with runtime the location of this memory - cl_mem d_input = (cl_mem)llvm_visc_ocl_request_mem(input, size, Context, + cl_mem d_input = (cl_mem)llvm_hpvm_ocl_request_mem(input, size, Context, isInput, isOutput); pthread_mutex_lock(&ocl_mtx); @@ -1424,7 +1424,7 @@ void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index, return d_input; } -void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) { +void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set device memory for Output Struct:"); DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size @@ -1446,13 +1446,13 @@ void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) { return d_output; } -void llvm_visc_ocl_free(void *ptr) { +void llvm_hpvm_ocl_free(void *ptr) { // DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n"); // cl_mem d_ptr = (cl_mem) ptr; // clReleaseMemObject(d_ptr); } -void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output, +void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Get Output:\n"); @@ -1471,7 +1471,7 @@ void 
*llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output, return h_output; } -void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim, +void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim, const size_t *localWorkSize, const size_t *globalWorkSize) { pthread_mutex_lock(&ocl_mtx); @@ -1517,7 +1517,7 @@ void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim, // pthread_mutex_lock(&ocl_mtx); clFinish(Context->clCommandQue); // pthread_mutex_unlock(&ocl_mtx); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION); + hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COMPUTATION); // for(int i=0 ;i < NUM_TESTS; i++) { // cout << "Iteration = " << i << flush << "\n"; // pthread_mutex_lock(&ocl_mtx); @@ -1530,7 +1530,7 @@ void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim, // pthread_mutex_lock(&ocl_mtx); clFinish(Context->clCommandQue); // pthread_mutex_unlock(&ocl_mtx); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); + hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); pthread_mutex_unlock(&ocl_mtx); return event; @@ -1579,7 +1579,7 @@ static char *LoadProgSource(const char *Filename, size_t *szFinalLength) { return cSourceString; } -void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) { +void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Launch OCL Kernel\n"); // Initialize OpenCL @@ -1649,7 +1649,7 @@ void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) { return Context; } -void llvm_visc_ocl_wait(void *graphID) { +void llvm_hpvm_ocl_wait(void *graphID) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Wait\n"); DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; @@ -1659,27 +1659,27 @@ void llvm_visc_ocl_wait(void *graphID) { pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID timer) { +void llvm_hpvm_switchToTimer(void **timerSet, enum 
hpvm_TimerID timer) { // cout << "Switching to timer " << timer << flush << "\n"; pthread_mutex_lock(&ocl_mtx); - // visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer); + // hpvm_SwitchToTimer((hpvm_TimerSet*)(*timerSet), timer); pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_printTimerSet(void **timerSet, char *timerName) { +void llvm_hpvm_printTimerSet(void **timerSet, char *timerName) { pthread_mutex_lock(&ocl_mtx); - cout << "Printing VISC Timer: "; + cout << "Printing HPVM Timer: "; if (timerName != NULL) cout << timerName << flush << "\n"; else cout << "Anonymous\n"; - visc_PrintTimerSet((visc_TimerSet *)(*timerSet)); + hpvm_PrintTimerSet((hpvm_TimerSet *)(*timerSet)); pthread_mutex_unlock(&ocl_mtx); } -void *llvm_visc_initializeTimerSet() { +void *llvm_hpvm_initializeTimerSet() { pthread_mutex_lock(&ocl_mtx); - visc_TimerSet *TS = (visc_TimerSet *)malloc(sizeof(visc_TimerSet)); - visc_InitializeTimerSet(TS); + hpvm_TimerSet *TS = (hpvm_TimerSet *)malloc(sizeof(hpvm_TimerSet)); + hpvm_InitializeTimerSet(TS); pthread_mutex_unlock(&ocl_mtx); return TS; } diff --git a/hpvm/projects/visc-rt/visc-rt.h b/hpvm/projects/hpvm-rt/hpvm-rt.h similarity index 72% rename from hpvm/projects/visc-rt/visc-rt.h rename to hpvm/projects/hpvm-rt/hpvm-rt.h index 3ad315768bf90584a68c1d620ac68936e62a17f0..2b6dafba96d27e4a05c040c77565fbb62ea0e68f 100644 --- a/hpvm/projects/visc-rt/visc-rt.h +++ b/hpvm/projects/hpvm-rt/hpvm-rt.h @@ -2,8 +2,8 @@ * * (c) 2010 The Board of Trustees of the University of Illinois. 
*/ -#ifndef VISC_RT_HEADER -#define VISC_RT_HEADER +#ifndef HPVM_RT_HEADER +#define HPVM_RT_HEADER #include <ctime> #include <iostream> @@ -13,8 +13,8 @@ #include <vector> //#include <condition_variable> -#include "../../include/SupportVISC/VISCHint.h" -#include "../../include/SupportVISC/VISCTimer.h" +#include "../../include/SupportHPVM/HPVMHint.h" +#include "../../include/SupportHPVM/HPVMTimer.h" #include "device_abstraction.h" #include "policy.h" @@ -31,14 +31,14 @@ extern "C" { /************************* Policies *************************************/ -void llvm_visc_policy_init(); -void llvm_visc_policy_clear(); -int llvm_visc_policy_getVersion(const char *, int64_t); +void llvm_hpvm_policy_init(); +void llvm_hpvm_policy_clear(); +int llvm_hpvm_policy_getVersion(const char *, int64_t); /******************** Device Abstraction ********************************/ -void llvm_visc_deviceAbstraction_start(); -void llvm_visc_deviceAbstraction_end(); -void llvm_visc_deviceAbstraction_waitOnDeviceStatus(); +void llvm_hpvm_deviceAbstraction_start(); +void llvm_hpvm_deviceAbstraction_end(); +void llvm_hpvm_deviceAbstraction_waitOnDeviceStatus(); /********************* DFG Depth Stack **********************************/ class DFGDepth { @@ -77,12 +77,12 @@ public: unsigned getNumDim() const { return numDim; } }; -void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0, +void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0, uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0); -void llvm_visc_x86_dstack_pop(); -uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim); -uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim); +void llvm_hpvm_x86_dstack_pop(); +uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim); +uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim); /********************* Memory Tracker **********************************/ class 
MemTrackerEntry { @@ -156,32 +156,32 @@ public: } }; -void llvm_visc_track_mem(void *, size_t); -void llvm_visc_untrack_mem(void *); -void *llvm_visc_request_mem(void *, size_t); +void llvm_hpvm_track_mem(void *, size_t); +void llvm_hpvm_untrack_mem(void *); +void *llvm_hpvm_request_mem(void *, size_t); /*********************** OPENCL & PTHREAD API **************************/ -void *llvm_visc_x86_launch(void *(void *), void *); -void llvm_visc_x86_wait(void *); -void *llvm_visc_ocl_initContext(enum visc::Target); - -void *llvm_visc_x86_argument_ptr(void *, size_t); - -void llvm_visc_ocl_clearContext(void *); -void llvm_visc_ocl_argument_shared(void *, int, size_t); -void llvm_visc_ocl_argument_scalar(void *, void *, int, size_t); -void *llvm_visc_ocl_argument_ptr(void *, void *, int, size_t, bool, bool); -void *llvm_visc_ocl_output_ptr(void *, int, size_t); -void llvm_visc_ocl_free(void *); -void *llvm_visc_ocl_getOutput(void *, void *, void *, size_t); -void *llvm_visc_ocl_executeNode(void *, unsigned, const size_t *, +void *llvm_hpvm_x86_launch(void *(void *), void *); +void llvm_hpvm_x86_wait(void *); +void *llvm_hpvm_ocl_initContext(enum hpvm::Target); + +void *llvm_hpvm_x86_argument_ptr(void *, size_t); + +void llvm_hpvm_ocl_clearContext(void *); +void llvm_hpvm_ocl_argument_shared(void *, int, size_t); +void llvm_hpvm_ocl_argument_scalar(void *, void *, int, size_t); +void *llvm_hpvm_ocl_argument_ptr(void *, void *, int, size_t, bool, bool); +void *llvm_hpvm_ocl_output_ptr(void *, int, size_t); +void llvm_hpvm_ocl_free(void *); +void *llvm_hpvm_ocl_getOutput(void *, void *, void *, size_t); +void *llvm_hpvm_ocl_executeNode(void *, unsigned, const size_t *, const size_t *); -void *llvm_visc_ocl_launch(const char *, const char *); -void llvm_visc_ocl_wait(void *); +void *llvm_hpvm_ocl_launch(const char *, const char *); +void llvm_hpvm_ocl_wait(void *); -void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID); -void llvm_visc_printTimerSet(void 
**timerSet, char *timerName = NULL); -void *llvm_visc_initializeTimerSet(); +void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID); +void llvm_hpvm_printTimerSet(void **timerSet, char *timerName = NULL); +void *llvm_hpvm_initializeTimerSet(); } /*************************** Pipeline API ******************************/ @@ -262,30 +262,30 @@ template <class ElementType> ElementType CircularBuffer<ElementType>::pop() { extern "C" { // Functions to push and pop values from pipeline buffers -uint64_t llvm_visc_bufferPop(void *); -void llvm_visc_bufferPush(void *, uint64_t); +uint64_t llvm_hpvm_bufferPop(void *); +void llvm_hpvm_bufferPush(void *, uint64_t); // Functions to create and destroy buffers -void *llvm_visc_createBindInBuffer(void *, uint64_t, unsigned); -void *llvm_visc_createBindOutBuffer(void *, uint64_t); -void *llvm_visc_createEdgeBuffer(void *, uint64_t); -void *llvm_visc_createLastInputBuffer(void *, uint64_t); +void *llvm_hpvm_createBindInBuffer(void *, uint64_t, unsigned); +void *llvm_hpvm_createBindOutBuffer(void *, uint64_t); +void *llvm_hpvm_createEdgeBuffer(void *, uint64_t); +void *llvm_hpvm_createLastInputBuffer(void *, uint64_t); -void llvm_visc_freeBuffers(void *); +void llvm_hpvm_freeBuffers(void *); // Functions to create and destroy threads -void llvm_visc_createThread(void *graphID, void *(*Func)(void *), void *); -void llvm_visc_freeThreads(void *); +void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *), void *); +void llvm_hpvm_freeThreads(void *); // Launch API for a streaming graph. 
// Arguments: // (1) Launch Function: void* (void*, void*) // (2) Push Function: void (void*, std::vector<uint64_t>**, unsgined) // (3) Pop Function: void* (std::vector<uint64_t>**, unsigned) -void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *); -void llvm_visc_streamPush(void *graphID, void *args); -void *llvm_visc_streamPop(void *graphID); -void llvm_visc_streamWait(void *graphID); +void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *); +void llvm_hpvm_streamPush(void *graphID, void *args); +void *llvm_hpvm_streamPop(void *graphID); +void llvm_hpvm_streamWait(void *graphID); } -#endif // VISC_RT_HEADER +#endif // HPVM_RT_HEADER diff --git a/hpvm/projects/visc-rt/makefile b/hpvm/projects/hpvm-rt/makefile similarity index 97% rename from hpvm/projects/visc-rt/makefile rename to hpvm/projects/hpvm-rt/makefile index adcc6323356d2537eca6ed653cad6d17a1d1ef0e..927e26e254a2b2f980fed8efd8858935e9f3cbdf 100644 --- a/hpvm/projects/visc-rt/makefile +++ b/hpvm/projects/hpvm-rt/makefile @@ -9,7 +9,7 @@ ifeq ($(NUM_CORES),) endif CPP_FLAGS = -I$(LLVM_SRC_ROOT)/include -I$(LLVM_BUILD_ROOT)/include -I$(CUDA_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS -TARGET:=visc-rt +TARGET:=hpvm-rt LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++ diff --git a/hpvm/projects/visc-rt/policy.h b/hpvm/projects/hpvm-rt/policy.h similarity index 100% rename from hpvm/projects/visc-rt/policy.h rename to hpvm/projects/hpvm-rt/policy.h diff --git a/hpvm/projects/visc-rt/CMakeLists.txt b/hpvm/projects/visc-rt/CMakeLists.txt deleted file mode 100644 index 5b9449bf2d00ac7a03c085cc1418a95e032d01b7..0000000000000000000000000000000000000000 --- a/hpvm/projects/visc-rt/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -add_definitions(-DNUM_CORES=8) - -SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang) -SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++) - -add_llvm_library(visc-rt.ll visc-rt.cpp - - DEPENDS - clang - 
llvm-dis - ) - - -target_compile_options(visc-rt.ll PUBLIC -flto ) -target_compile_options(visc-rt.ll PUBLIC -std=c++11) - -add_custom_target(visc-rt.cpp.o ALL - COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libvisc-rt.ll.a - COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.bc - COMMAND ${CMAKE_BINARY_DIR}/bin/llvm-dis ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.bc) - -add_dependencies(visc-rt.cpp.o visc-rt.ll) diff --git a/hpvm/test/CTestSuite/Makefile b/hpvm/test/CTestSuite/Makefile index 226a83287d743360d9cd64a7c57e864871829b0b..1169e4e896a861975ac0562ebff8b208828bbf89 100644 --- a/hpvm/test/CTestSuite/Makefile +++ b/hpvm/test/CTestSuite/Makefile @@ -9,7 +9,7 @@ LLVM_CC:=$(LLVM_INSTALL)/bin/clang LLVM_OPT:=$(LLVM_INSTALL)/bin/opt BUILD_DIR:=build -all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll) +all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -17,10 +17,10 @@ $(BUILD_DIR): $(HOST:%=$(BUILD_DIR)/%.ll):$(BUILD_DIR)/%.ll:%.c $(LLVM_CC) -S -emit-llvm $< -O3 -o $@ -$(HOST:%=$(BUILD_DIR)/%.visc.ll):$(BUILD_DIR)/%.visc.ll:$(BUILD_DIR)/%.ll - $(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenVISC.so -genvisc -globaldce $< -S -o $@ +$(HOST:%=$(BUILD_DIR)/%.hpvm.ll):$(BUILD_DIR)/%.hpvm.ll:$(BUILD_DIR)/%.ll + $(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenHPVM.so -genhpvm -globaldce $< -S -o $@ @cat RUN.script $@ > $@.tmp @mv $@.tmp $@ clean : - rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll.kernels.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll.nvptx.s) $(BUILD_DIR)/DataflowGraph.dot* + rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.kernels.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.nvptx.s) $(BUILD_DIR)/DataflowGraph.dot* diff --git a/hpvm/test/CTestSuite/RUN.script 
b/hpvm/test/CTestSuite/RUN.script index 10bf667818824719af2e041fc6b2dc3e449d9158..23fa1694ebf4b7448c731327b96b949c0509b62e 100644 --- a/hpvm/test/CTestSuite/RUN.script +++ b/hpvm/test/CTestSuite/RUN.script @@ -1,6 +1,6 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s ; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc ; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll %llvm_src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin ; RUN: %t.bin diff --git a/hpvm/test/CTestSuite/gemm.c b/hpvm/test/CTestSuite/gemm.c index d0a69ba25c27fb65ea549023deed2dfb0197b882..eb0a3c5e9204d9621c4a15ae7f07ef5158ac1d07 100644 --- a/hpvm/test/CTestSuite/gemm.c +++ b/hpvm/test/CTestSuite/gemm.c @@ -54,14 +54,14 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) { return 1; // Success } -// Dummy visc node execution call -// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), +// Dummy hpvm node execution call +// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned), // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* // outputs); void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) { - __visc__attributes(2, A, B, 1, C); + __hpvm__attributes(2, A, B, 1, C); // printf("Entered function\n"); int tx = get_local_id(0); // 2D Global Thread ID x int ty = get_local_id(1); // 2D Global Thread ID y @@ -130,10 +130,10 @@ int main(int argc, char **argv) { // Compute using OpenCL // matrixMul(h_A, h_B, h_C, WA, WB); - //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); - unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, 
h_A, bytes_A, h_B, + //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); + unsigned graphMM = __hpvm__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0); - __visc__wait(graphMM); + __hpvm__wait(graphMM); if (checkResults(h_A, h_B, h_C)) printf("\nPass!\n"); else diff --git a/hpvm/test/CTestSuite/gemm_2.c b/hpvm/test/CTestSuite/gemm_2.c index bd7ab27fc0160275442d23faf507851b7c2369f7..df4555936316703cfccd4048f2ade4e28592e53a 100644 --- a/hpvm/test/CTestSuite/gemm_2.c +++ b/hpvm/test/CTestSuite/gemm_2.c @@ -54,13 +54,13 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) { return 1; // Success } -// Dummy visc node execution call -// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), +// Dummy hpvm node execution call +// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned), // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* // outputs); void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) { - __visc__attributes(2, A, B, 1, C); + __hpvm__attributes(2, A, B, 1, C); // printf("Entered function\n"); int tx = get_global_id(0); // 2D Global Thread ID x @@ -130,11 +130,11 @@ int main(int argc, char **argv) { // Compute using OpenCL // matrixMul(h_A, h_B, h_C, WA, WB); - //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); + //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); unsigned graphMM = - __visc__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A, + __hpvm__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0); - __visc__wait(graphMM); + __hpvm__wait(graphMM); if (checkResults(h_A, h_B, h_C)) printf("\nPass!\n"); else diff --git a/hpvm/test/hpvm-cava/.gitignore b/hpvm/test/hpvm-cava/.gitignore index 2fc1b235647962ac761edda7dfbda4499cbcd4f0..f08b880bf9b4b8171e9fb878bea3a6d266a1f9c0 100644 --- a/hpvm/test/hpvm-cava/.gitignore +++ 
b/hpvm/test/hpvm-cava/.gitignore @@ -1,5 +1,5 @@ build/ -cava-visc +cava-hpvm Makefile.config example-face/*.bin diff --git a/hpvm/test/hpvm-cava/Makefile b/hpvm/test/hpvm-cava/Makefile index 62219a1cb0a92d1ca0d5bc661645b4c8251a24b8..7530477f3d73ef7b641f3d4b39fda4f50b201d0f 100644 --- a/hpvm/test/hpvm-cava/Makefile +++ b/hpvm/test/hpvm-cava/Makefile @@ -26,21 +26,21 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR) -INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include +INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include ifneq ($(CONFUSE_ROOT),) INCLUDES += -I$(CONFUSE_ROOT)/include LFLAGS += -L$(CONFUSE_ROOT)/lib endif -EXE = cava-visc-$(VERSION)-$(TARGET) +EXE = cava-hpvm-$(VERSION)-$(TARGET) LFLAGS += -pthread ## BEGIN HPVM MAKEFILE -LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP = $(EXE) APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3 @@ -52,23 +52,23 @@ OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS) CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) -VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt +HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt -VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll +HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll -TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce +TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce ifeq ($(TARGET),seq) DEVICE = CPU_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG - VISC_OPTFLAGS += -visc-timers-x86 + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load 
LLVMClearDFG.so -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS += -hpvm-timers-x86 else DEVICE = GPU_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG - VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx endif - TESTGEN_OPTFLAGS += -visc-timers-gen + TESTGEN_OPTFLAGS += -hpvm-timers-gen CFLAGS += -DDEVICE=$(DEVICE) CXXFLAGS += -DDEVICE=$(DEVICE) @@ -79,7 +79,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1)) .PRECIOUS: $(BUILD_DIR)/%.ll OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) -TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) +TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS)) KERNEL = $(TEST_OBJS).kernels.ll ifeq ($(TARGET),seq) @@ -107,14 +107,14 @@ $(KERNEL_OCL) : $(KERNEL) $(EXE) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ -$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) +$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp +$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp make -C $(LLVM_LIB_PATH) -$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) - $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) +$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS) + $(OPT) -debug $(HPVM_OPTFLAGS) -S $< -o $(HOST) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -125,7 +125,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c $(CC) $(CFLAGS) -emit-llvm -S -o $@ $< -$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll - $(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@ +$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll + $(OPT) -debug-only=genhpvm $(TESTGEN_OPTFLAGS) $< -S -o $@ ## END HPVM MAKEFILE diff --git 
a/hpvm/test/hpvm-cava/Makefile.config.example b/hpvm/test/hpvm-cava/Makefile.config.example index 269f0b7df273c958f0cd20a0f935716a329e00ae..8cbe04af784fa1e030ed0bce07176b081980649d 100644 --- a/hpvm/test/hpvm-cava/Makefile.config.example +++ b/hpvm/test/hpvm-cava/Makefile.config.example @@ -4,20 +4,20 @@ OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib # NOTE: You may need to configure this based on your root path. -VISC_SRC_ROOT=$(LLVM_SRC_ROOT) +HPVM_SRC_ROOT=$(LLVM_SRC_ROOT) -VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build -CC = $(VISC_BUILD_DIR)/bin/clang -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include +HPVM_BUILD_DIR =$(HPVM_SRC_ROOT)/build +CC = $(HPVM_BUILD_DIR)/bin/clang +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -CXX = $(VISC_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include +CXX = $(HPVM_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -LINKER = $(VISC_BUILD_DIR)/bin/clang++ +LINKER = $(HPVM_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH) -LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib -LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin +LLVM_LIB_PATH = $(HPVM_BUILD_DIR)/lib +LLVM_BIN_PATH = $(HPVM_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/hpvm-cava/README.md b/hpvm/test/hpvm-cava/README.md index 890b629d172a2f53bf77d6d52bda27637c71afeb..1106c4781b285c47d59548d47e5cd03f09063b28 100644 --- a/hpvm/test/hpvm-cava/README.md +++ b/hpvm/test/hpvm-cava/README.md @@ -12,7 +12,7 @@ See the original camera/vision pipeline repo (repo: `yaoyuannnn/cava`) for detai After building HPVM, the following steps are required to build and run the camera pipeline: 1. Build with `make TARGET=seq` for CPU and `make TARGET=gpu` for gpu. -2. 
Run with `./cava-visc-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`. +2. Run with `./cava-hpvm-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`. * `<Target>` can be either `seq` or `gpu` depending on what target is used to build. * This processes the raw image `example-tulip-small/raw_tulip-small.bin`. Note that raw images are different from bitmaps, so you might need to obtain them using special software. * This generates: `tulip-small.bin` and `tulip-small-<stage>.bin` where `<stage>` represents the stage of the pipeline. diff --git a/hpvm/test/hpvm-cava/src/main.c b/hpvm/test/hpvm-cava/src/main.c index e43bbb4f25c4c97c9907ebae37251c854860c3b5..4188c9e86045de9d3d6d2688b0ebc48dc3152004 100644 --- a/hpvm/test/hpvm-cava/src/main.c +++ b/hpvm/test/hpvm-cava/src/main.c @@ -1,136 +1,154 @@ +#include "utility.h" #include <argp.h> +#include <assert.h> +#include <math.h> #include <stdio.h> #include <stdlib.h> -#include <assert.h> #include <string.h> -#include <math.h> -#include "utility.h" #include "cam_pipe_utility.h" -#include "pipe_stages.h" #include "load_cam_model.h" +#include "pipe_stages.h" -#include "visc.h" +#include "hpvm.h" int NUM_TEST_CASES; int NUM_CLASSES; int INPUT_DIM; int NUM_WORKER_THREADS; +// Type of struct holding the return value from the last node. 
+struct RetStruct { + size_t bytesRet; +}; + // Type of struct that is used to pass arguments to the HPVM dataflow graph // using the hpvm launch operation typedef struct __attribute__((__packed__)) { - uint8_t *input; size_t bytes_input; - uint8_t *result; size_t bytes_result; - float *input_scaled; size_t bytes_input_scaled; - float *result_scaled; size_t bytes_result_scaled; - float *demosaic_out; size_t bytes_demosaic_out; - float *denoise_out; size_t bytes_denoise_out; - float *transform_out; size_t bytes_transform_out; - float *gamut_out;size_t bytes_gamut_out; - float *TsTw; size_t bytes_TsTw; - float *ctrl_pts; size_t bytes_ctrl_pts; - float *weights; size_t bytes_weights; - float*coefs; size_t bytes_coefs; - float *l2_dist; size_t bytes_l2_dist; - float *tone_map; size_t bytes_tone_map; - size_t row_size; size_t col_size; -} -RootIn; + uint8_t *input; + size_t bytes_input; + uint8_t *result; + size_t bytes_result; + float *input_scaled; + size_t bytes_input_scaled; + float *result_scaled; + size_t bytes_result_scaled; + float *demosaic_out; + size_t bytes_demosaic_out; + float *denoise_out; + size_t bytes_denoise_out; + float *transform_out; + size_t bytes_transform_out; + float *gamut_out; + size_t bytes_gamut_out; + float *TsTw; + size_t bytes_TsTw; + float *ctrl_pts; + size_t bytes_ctrl_pts; + float *weights; + size_t bytes_weights; + float *coefs; + size_t bytes_coefs; + float *l2_dist; + size_t bytes_l2_dist; + float *tone_map; + size_t bytes_tone_map; + int row_size; + int col_size; + struct RetStruct ret; // Instance of RetStruct holding the return value. 
+} RootIn; typedef enum _argnum { - RAW_IMAGE_BIN, - OUTPUT_IMAGE_BIN, - NUM_REQUIRED_ARGS, - DATA_FILE = NUM_REQUIRED_ARGS, - NUM_ARGS, + RAW_IMAGE_BIN, + OUTPUT_IMAGE_BIN, + NUM_REQUIRED_ARGS, + DATA_FILE = NUM_REQUIRED_ARGS, + NUM_ARGS, } argnum; typedef struct _arguments { - char* args[NUM_ARGS]; - int num_inputs; - int num_threads; + char *args[NUM_ARGS]; + int num_inputs; + int num_threads; } arguments; static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n"; static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary"; static struct argp_option options[] = { - { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 }, - { "data-file", 'f', "F", 0, - "File to read data and weights from (if data-init-mode == READ_FILE or " - "save-params is true). *.txt files are decoded as text files, while " - "*.bin files are decoded as binary files." }, + {"num-inputs", 'n', "N", 0, "Number of input images"}, + {0}, + {"data-file", 'f', "F", 0, + "File to read data and weights from (if data-init-mode == READ_FILE or " + "save-params is true). *.txt files are decoded as text files, while " + "*.bin files are decoded as binary files."}, }; -static error_t parse_opt(int key, char* arg, struct argp_state* state) { - arguments* args = (arguments*)(state->input); - switch (key) { - case 'n': { - args->num_inputs = strtol(arg, NULL, 10); - break; - } - case 'f': { - args->args[DATA_FILE] = arg; - break; - } - case 't': { - args->num_threads = strtol(arg, NULL, 10); - break; - } - case ARGP_KEY_ARG: { - if (state->arg_num >= NUM_REQUIRED_ARGS) - argp_usage(state); - args->args[state->arg_num] = arg; - break; - } - case ARGP_KEY_END: { - if (state->arg_num < NUM_REQUIRED_ARGS) { - fprintf(stderr, - "Not enough arguments! 
Got %d, require %d.\n", - state->arg_num, - NUM_REQUIRED_ARGS); - argp_usage(state); - } - break; - } - default: - return ARGP_ERR_UNKNOWN; +static error_t parse_opt(int key, char *arg, struct argp_state *state) { + arguments *args = (arguments *)(state->input); + switch (key) { + case 'n': { + args->num_inputs = strtol(arg, NULL, 10); + break; + } + case 'f': { + args->args[DATA_FILE] = arg; + break; + } + case 't': { + args->num_threads = strtol(arg, NULL, 10); + break; + } + case ARGP_KEY_ARG: { + if (state->arg_num >= NUM_REQUIRED_ARGS) + argp_usage(state); + args->args[state->arg_num] = arg; + break; + } + case ARGP_KEY_END: { + if (state->arg_num < NUM_REQUIRED_ARGS) { + fprintf(stderr, "Not enough arguments! Got %d, require %d.\n", + state->arg_num, NUM_REQUIRED_ARGS); + argp_usage(state); } - return 0; + break; + } + default: + return ARGP_ERR_UNKNOWN; + } + return 0; } -void set_default_args(arguments* args) { - args->num_inputs = 1; - args->num_threads = 0; - for (int i = 0; i < NUM_ARGS; i++) { - args->args[i] = NULL; - } +void set_default_args(arguments *args) { + args->num_inputs = 1; + args->num_threads = 0; + for (int i = 0; i < NUM_ARGS; i++) { + args->args[i] = NULL; + } } -static struct argp parser = { options, parse_opt, args_doc, prog_doc }; +static struct argp parser = {options, parse_opt, args_doc, prog_doc}; // Helper function for printing intermediate results -void descale_cpu(float *input, size_t bytes_input, - uint8_t *output, size_t bytes_result, - size_t row_size, size_t col_size) { - +void descale_cpu(float *input, size_t bytes_input, uint8_t *output, + size_t bytes_result, size_t row_size, size_t col_size) { + for (int chan = 0; chan < CHAN_SIZE; chan++) for (int row = 0; row < row_size; row++) for (int col = 0; col < col_size; col++) { - int index = (chan*row_size + row) * col_size + col; + int index = (chan * row_size + row) * col_size + col; output[index] = min(max(input[index] * 255, 0), 255); } } static void sort(float arr[], int 
n) { - int i, j; - for (i = 0; i < n - 1; i++) - for (j = 0; j < n - i - 1; j++) - if (arr[j] > arr[j + 1]) { - float temp = arr[j]; - arr[j] = arr[j + 1]; - arr[j + 1] = temp; - } + int i, j; + for (i = 0; i < n - 1; i++) + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } } /**************************************************************/ @@ -140,256 +158,259 @@ static void sort(float arr[], int n) { // In this benchmark, no use of HPVM query intrinsics in the leaf node functions // Leaf HPVM node function for scale -void scale_fxp(uint8_t *input, size_t bytes_input, - float *output, size_t bytes_output, - size_t row_size, size_t col_size) { +void scale_fxp(uint8_t *input, size_t bytes_input, float *output, + size_t bytes_output, size_t row_size, size_t col_size) { - //Specifies compilation target for current node - __visc__hint(CPU_TARGET); + // Specifies compilation target for current node + __hpvm__hint(CPU_TARGET); // Specifies pointer arguments that will be used as "in" and "out" arguments // - count of "in" arguments // - list of "in" argument , and similar for "out" - __visc__attributes(2, input, output, 1, output); - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); + __hpvm__attributes(2, input, output, 1, output); + void *thisNode = __hpvm__getNode(); + int row = __hpvm__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++){ - int index = (chan*row_size + row) * col_size + col; - output[index] = input[index] * 1.0 / 255; - } - __visc__return(1, bytes_output); + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + output[index] = input[index] * 1.0 / 255; + } + __hpvm__return(1, bytes_output); } // Leaf HPVM node function for descale 
-void descale_fxp(float *input, size_t bytes_input, - uint8_t *output, size_t bytes_result, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(2, input, output, 1, output); - +void descale_fxp(float *input, size_t bytes_input, uint8_t *output, + size_t bytes_result, size_t row_size, size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(2, input, output, 1, output); + for (int chan = 0; chan < CHAN_SIZE; chan++) for (int row = 0; row < row_size; row++) for (int col = 0; col < col_size; col++) { - int index = (chan*row_size + row) * col_size + col; + int index = (chan * row_size + row) * col_size + col; output[index] = min(max(input[index] * 255, 0), 255); } - __visc__return(1, bytes_result); + __hpvm__return(1, bytes_result); } // Leaf HPVM node function for demosaicing -void demosaic_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { - __visc__hint(DEVICE); - __visc__attributes(2, input, result, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); -// for (int row = 1; row < row_size - 1; row++) - for (int col = 1; col < col_size - 1; col++) { - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - if (row % 2 == 0 && col % 2 == 0) { - // Green pixel - // Getting the R values - float R1 = input[index_0 - 1]; - float R2 = input[index_0 + 1]; - // Getting the B values - float B1 = input[index_2 - col_size]; - float B2 = input[index_2 + col_size]; - // R - result[index_0] = (R1 + R2) / 2; - // G - result[index_1] = input[index_1] * 2; - // B - result[index_2] = (B1 + B2) / 2; - } else if (row % 2 == 0 && col % 2 == 1) { - // Red pixel - // Getting the G values - float G1 = input[index_1 - col_size]; - float G2 = input[index_1 + col_size]; - float G3 = input[index_1 - 1]; - float G4 = input[index_1 + 
1]; - // Getting the B values - float B1 = input[index_2 - col_size - 1]; - float B2 = input[index_2 - col_size + 1]; - float B3 = input[index_2 + col_size - 1]; - float B4 = input[index_2 + col_size + 1]; - // R - result[index_0] = input[index_0]; - // G - result[index_1] = (G1 + G2 + G3 + G4) / 2; - // B (center pixel) - result[index_2] = (B1 + B2 + B3 + B4) / 4; - } else if (row % 2 == 1 && col % 2 == 0) { - // Blue pixel - // Getting the R values - float R1 = input[index_0 - col_size - 1]; - float R2 = input[index_0 + col_size - 1]; - float R3 = input[index_0 - col_size + 1]; - float R4 = input[index_0 + col_size + 1]; - // Getting the G values - float G1 = input[index_1 - col_size]; - float G2 = input[index_1 + col_size]; - float G3 = input[index_1 - 1]; - float G4 = input[index_1 + 1]; - // R - result[index_0] = (R1 + R2 + R3 + R4) / 4; - // G - result[index_1] = (G1 + G2 + G3 + G4) / 2; - // B - result[index_2] = input[index_2]; - } else { - // Bottom Green pixel - // Getting the R values - float R1 = input[index_0 - col_size]; - float R2 = input[index_0 + col_size]; - // Getting the B values - float B1 = input[index_2 - 1]; - float B2 = input[index_2 + 1]; - // R - result[index_0] = (R1 + R2) / 2; - // G - result[index_1] = input[index_1] * 2; - // B - result[index_2] = (B1 + B2) / 2; - } - } - __visc__return(1, bytes_result); +void demosaic_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size) { + __hpvm__hint(DEVICE); + __hpvm__attributes(2, input, result, 1, result); + + void *thisNode = __hpvm__getNode(); + int row = __hpvm__getNodeInstanceID_x(thisNode); + // for (int row = 1; row < row_size - 1; row++) + for (int col = 1; col < col_size - 1; col++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + 
float R1 = input[index_0 - 1]; + float R2 = input[index_0 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size]; + float B2 = input[index_2 + col_size]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 2 == 1) { + // Red pixel + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size - 1]; + float B2 = input[index_2 - col_size + 1]; + float B3 = input[index_2 + col_size - 1]; + float B4 = input[index_2 + col_size + 1]; + // R + result[index_0] = input[index_0]; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + result[index_2] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = input[index_0 - col_size - 1]; + float R2 = input[index_0 + col_size - 1]; + float R3 = input[index_0 - col_size + 1]; + float R4 = input[index_0 + col_size + 1]; + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // R + result[index_0] = (R1 + R2 + R3 + R4) / 4; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B + result[index_2] = input[index_2]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = input[index_0 - col_size]; + float R2 = input[index_0 + col_size]; + // Getting the B values + float B1 = input[index_2 - 1]; + float B2 = input[index_2 + 1]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } + } + __hpvm__return(1, bytes_result); } // Leaf HPVM node function for denoise -void denoise_fxp(float *input, size_t bytes_input, - float *result, 
size_t bytes_result, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(2, input, result, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); +void denoise_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(2, input, result, 1, result); + + void *thisNode = __hpvm__getNode(); + int row = __hpvm__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) - if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { - float filter[9]; - for (int i = -1; i < 2; i++) - for (int j = -1; j < 2; j++) { - int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1; - filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)]; - } - sort(filter, 9); - result[(chan * row_size + row) * col_size + col] = filter[4]; - } else { - result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col]; - } - __visc__return(1, bytes_result); + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) + if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { + float filter[9]; + for (int i = -1; i < 2; i++) + for (int j = -1; j < 2; j++) { + int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1; + filter[index] = + input[(chan * row_size + (i + row)) * col_size + (j + col)]; + } + sort(filter, 9); + result[(chan * row_size + row) * col_size + col] = filter[4]; + } else { + result[(chan * row_size + row) * col_size + col] = + input[(chan * row_size + row) * col_size + col]; + } + __hpvm__return(1, bytes_result); } // Leaf HPVM node function, for color map and white balance transform -void transform_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float 
*TsTw_tran, size_t bytes_TsTw, +void transform_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw, size_t row_size, size_t col_size) { - __visc__hint(DEVICE); - __visc__attributes(3, input, result, TsTw_tran, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); + __hpvm__hint(DEVICE); + __hpvm__attributes(3, input, result, TsTw_tran, 1, result); + + void *thisNode = __hpvm__getNode(); + int row = __hpvm__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - int index_2d_0 = 0 * CHAN_SIZE + chan; - int index_2d_1 = 1 * CHAN_SIZE + chan; - int index_2d_2 = 2 * CHAN_SIZE + chan; - result[index] = - max(input[index_0] * TsTw_tran[index_2d_0] + - input[index_1] * TsTw_tran[index_2d_1] + - input[index_2] * TsTw_tran[index_2d_2], - 0); - } - __visc__return(1, bytes_result); + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + int index_2d_0 = 0 * CHAN_SIZE + chan; + int index_2d_1 = 1 * CHAN_SIZE + chan; + int index_2d_2 = 2 * CHAN_SIZE + chan; + result[index] = max(input[index_0] * TsTw_tran[index_2d_0] + + input[index_1] * TsTw_tran[index_2d_1] + + input[index_2] * TsTw_tran[index_2d_2], + 0); + } + __hpvm__return(1, bytes_result); } // Leaf HPVM node function, for gamut mapping -void gamut_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float 
*ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, - float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, +void gamut_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, float *coefs, + size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist, size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist); - - // First, get the L2 norm from every pixel to the control points, - // Then, sum it and weight it. Finally, add the bias. - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - float chan_val_0 = 0.0; - float chan_val_1 = 0.0; - float chan_val_2 = 0.0; - for (int cp = 0; cp < 3702; cp++) { - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); - float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]); - float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); - float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); - float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); - float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]); - float val = val1 * val2 + val3 * val4 + val5 * val6; - float sqrt_val = sqrt(val); - chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0]; - chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1]; - chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2]; - } - chan_val_0 += coefs[0 * CHAN_SIZE + 0] + - coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + 
col]; - chan_val_1 += coefs[0 * CHAN_SIZE + 1] + - coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col]; - chan_val_2 += coefs[0 * CHAN_SIZE + 2] + - coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col]; - result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0); - result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0); - result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0); + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, + result, l2_dist); + + // First, get the L2 norm from every pixel to the control points, + // Then, sum it and weight it. Finally, add the bias. 
+ void *thisNode = __hpvm__getNode(); + int row = __hpvm__getNodeInstanceID_x(thisNode); + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + float chan_val_0 = 0.0; + float chan_val_1 = 0.0; + float chan_val_2 = 0.0; + for (int cp = 0; cp < 3702; cp++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val = val1 * val2 + val3 * val4 + val5 * val6; + float sqrt_val = sqrt(val); + chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0]; + chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1]; + chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2]; } - __visc__return(1, bytes_result); + chan_val_0 += + coefs[0 * CHAN_SIZE + 0] + + coefs[1 * CHAN_SIZE + 0] * + input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 0] * + input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col]; + chan_val_1 += + coefs[0 * CHAN_SIZE + 1] + + coefs[1 * CHAN_SIZE + 1] * + input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 1] * + input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col]; + chan_val_2 += + coefs[0 * CHAN_SIZE + 2] + + coefs[1 * CHAN_SIZE + 2] * + input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 2] * + input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col]; + result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0); + result[(1 * 
row_size + row) * col_size + col] = max(chan_val_1, 0); + result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0); + } + __hpvm__return(1, bytes_result); } // HPVM leaf node function, for tone mapping -void tone_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *tone_map, size_t bytes_tone_map, +void tone_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *tone_map, size_t bytes_tone_map, size_t row_size, size_t col_size) { - __visc__hint(DEVICE); - __visc__attributes(3, input, result, tone_map, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); + __hpvm__hint(DEVICE); + __hpvm__attributes(3, input, result, tone_map, 1, result); + + void *thisNode = __hpvm__getNode(); + int row = __hpvm__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; - uint8_t x = input[index] * 255; - result[index] = tone_map[x * CHAN_SIZE + chan]; - } - __visc__return(1, bytes_result); + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + uint8_t x = input[index] * 255; + result[index] = tone_map[x * CHAN_SIZE + chan]; + } + __hpvm__return(1, bytes_result); } /********************************************************************/ @@ -400,185 +421,184 @@ void tone_map_fxp(float *input, size_t bytes_input, // requirement for the FPGA backend . The CPU backend also supports this, // so it does not cause a portability issue. 
-void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(2, input, result, 1, result); +void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(2, input, result, 1, result); // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic // instance (last argument) associated with node function scale_fxp - void *ScaleNode = __visc__createNodeND(1, scale_fxp, row_size); + void *ScaleNode = __hpvm__createNodeND(1, scale_fxp, row_size); // Binds inputs of current node with specified node // - destination node // - argument position in argument list of function of source node // - argument position in argument list of function of destination node // - streaming (1) or non-streaming (0) - __visc__bindIn(ScaleNode, 0, 0, 0); // bind input - __visc__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(ScaleNode, 2, 2, 0); // bind result - __visc__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(ScaleNode, 4, 4, 0); // bind row_size - __visc__bindIn(ScaleNode, 5, 5, 0); // bind col_size + __hpvm__bindIn(ScaleNode, 0, 0, 0); // bind input + __hpvm__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input + __hpvm__bindIn(ScaleNode, 2, 2, 0); // bind result + __hpvm__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result + __hpvm__bindIn(ScaleNode, 4, 4, 0); // bind row_size + __hpvm__bindIn(ScaleNode, 5, 5, 0); // bind col_size // Similar to bindIn, but for the output. Output of a node is a struct, and // we consider the fields in increasing ordering. 
- __visc__bindOut(ScaleNode, 0, 0, 0); + __hpvm__bindOut(ScaleNode, 0, 0, 0); } -void descale_fxp_wrapper(float *input, size_t bytes_input, - uint8_t *result, size_t bytes_result, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(2, input, result, 1, result); - void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size); - __visc__bindIn(DescaleNode, 0, 0, 0); // bind input - __visc__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(DescaleNode, 2, 2, 0); // bind result - __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size - __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size - - __visc__bindOut(DescaleNode, 0, 0, 0); +void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result, + size_t bytes_result, size_t row_size, + size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(2, input, result, 1, result); + void *DescaleNode = __hpvm__createNodeND(1, descale_fxp, row_size); + __hpvm__bindIn(DescaleNode, 0, 0, 0); // bind input + __hpvm__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input + __hpvm__bindIn(DescaleNode, 2, 2, 0); // bind result + __hpvm__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result + __hpvm__bindIn(DescaleNode, 4, 4, 0); // bind row_size + __hpvm__bindIn(DescaleNode, 5, 5, 0); // bind col_size + + __hpvm__bindOut(DescaleNode, 0, 0, 0); } -void demosaic_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(2, input, result, 1, result); - void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size); - __visc__bindIn(DemosaicNode, 0, 0, 0); // bind input - __visc__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(DemosaicNode, 2, 2, 0); // bind result - __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(DemosaicNode, 4, 4, 0); // 
bind row_size - __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size - - __visc__bindOut(DemosaicNode, 0, 0, 0); +void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, + size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(2, input, result, 1, result); + void *DemosaicNode = __hpvm__createNodeND(1, demosaic_fxp, row_size); + __hpvm__bindIn(DemosaicNode, 0, 0, 0); // bind input + __hpvm__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input + __hpvm__bindIn(DemosaicNode, 2, 2, 0); // bind result + __hpvm__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result + __hpvm__bindIn(DemosaicNode, 4, 4, 0); // bind row_size + __hpvm__bindIn(DemosaicNode, 5, 5, 0); // bind col_size + + __hpvm__bindOut(DemosaicNode, 0, 0, 0); } -void denoise_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(2, input, result, 1, result); - void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size); - __visc__bindIn(DenoiseNode, 0, 0, 0); // bind input - __visc__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(DenoiseNode, 2, 2, 0); // bind result - __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size - __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size - - __visc__bindOut(DenoiseNode, 0, 0, 0); +void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, + size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(2, input, result, 1, result); + void *DenoiseNode = __hpvm__createNodeND(1, denoise_fxp, row_size); + __hpvm__bindIn(DenoiseNode, 0, 0, 0); // bind input + __hpvm__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input + __hpvm__bindIn(DenoiseNode, 2, 2, 0); // bind result + __hpvm__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result + 
__hpvm__bindIn(DenoiseNode, 4, 4, 0); // bind row_size + __hpvm__bindIn(DenoiseNode, 5, 5, 0); // bind col_size + + __hpvm__bindOut(DenoiseNode, 0, 0, 0); } -void transform_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *TsTw_tran, size_t bytes_TsTw, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(3, input, result, TsTw_tran, 1, result); - void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size); - __visc__bindIn(TransformNode, 0, 0, 0); // bind input - __visc__bindIn(TransformNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(TransformNode, 2, 2, 0); // bind result - __visc__bindIn(TransformNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(TransformNode, 4, 4, 0); // bind tstw - __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw - __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size - __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size - - __visc__bindOut(TransformNode, 0, 0, 0); +void transform_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *TsTw_tran, + size_t bytes_TsTw, size_t row_size, + size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(3, input, result, TsTw_tran, 1, result); + void *TransformNode = __hpvm__createNodeND(1, transform_fxp, row_size); + __hpvm__bindIn(TransformNode, 0, 0, 0); // bind input + __hpvm__bindIn(TransformNode, 1, 1, 0); // bind bytes_input + __hpvm__bindIn(TransformNode, 2, 2, 0); // bind result + __hpvm__bindIn(TransformNode, 3, 3, 0); // bind bytes_result + __hpvm__bindIn(TransformNode, 4, 4, 0); // bind tstw + __hpvm__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw + __hpvm__bindIn(TransformNode, 6, 6, 0); // bind row_size + __hpvm__bindIn(TransformNode, 7, 7, 0); // bind col_size + + __hpvm__bindOut(TransformNode, 0, 0, 0); } -void gamut_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *ctrl_pts, size_t 
bytes_ctrl_pts, - float *weights, size_t bytes_weights, - float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, - size_t row_size, size_t col_size) { - __visc__hint(CPU_TARGET); - __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); - void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size); - __visc__bindIn(GamutNode, 0, 0, 0); // bind input - __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(GamutNode, 2, 2, 0); // bind result - __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts - __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts - __visc__bindIn(GamutNode, 6, 6, 0); // bind weights - __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights - __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs - __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs - __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist - __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist - __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size - __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size - - __visc__bindOut(GamutNode, 0, 0, 0); +void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *ctrl_pts, + size_t bytes_ctrl_pts, float *weights, + size_t bytes_weights, float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, size_t row_size, + size_t col_size) { + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, + result); + void *GamutNode = __hpvm__createNodeND(1, gamut_map_fxp, row_size); + __hpvm__bindIn(GamutNode, 0, 0, 0); // bind input + __hpvm__bindIn(GamutNode, 1, 1, 0); // bind bytes_input + __hpvm__bindIn(GamutNode, 2, 2, 0); // bind result + __hpvm__bindIn(GamutNode, 3, 3, 0); // bind bytes_result + __hpvm__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts + __hpvm__bindIn(GamutNode, 5, 5, 0); // bind 
bytes_ctrl_pts + __hpvm__bindIn(GamutNode, 6, 6, 0); // bind weights + __hpvm__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights + __hpvm__bindIn(GamutNode, 8, 8, 0); // bind coefs + __hpvm__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs + __hpvm__bindIn(GamutNode, 10, 10, 0); // bind l2_dist + __hpvm__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist + __hpvm__bindIn(GamutNode, 12, 12, 0); // bind row_size + __hpvm__bindIn(GamutNode, 13, 13, 0); // bind col_size + + __hpvm__bindOut(GamutNode, 0, 0, 0); } -void tone_map_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *tone_map, size_t bytes_tone_map, - size_t row_size, size_t col_size) { - - __visc__hint(CPU_TARGET); - __visc__attributes(3, input, result, tone_map, 1, result); - void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, row_size); - __visc__bindIn(ToneMapNode, 0, 0, 0); // bind input - __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result - __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map - __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map - __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size - __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size - - __visc__bindOut(ToneMapNode, 0, 0, 0); +void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *tone_map, + size_t bytes_tone_map, size_t row_size, + size_t col_size) { + + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(3, input, result, tone_map, 1, result); + void *ToneMapNode = __hpvm__createNodeND(1, tone_map_fxp, row_size); + __hpvm__bindIn(ToneMapNode, 0, 0, 0); // bind input + __hpvm__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input + __hpvm__bindIn(ToneMapNode, 2, 2, 0); // bind result + __hpvm__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result + __hpvm__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map + 
__hpvm__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map + __hpvm__bindIn(ToneMapNode, 6, 6, 0); // bind row_size + __hpvm__bindIn(ToneMapNode, 7, 7, 0); // bind col_size + + __hpvm__bindOut(ToneMapNode, 0, 0, 0); } - /*** ROOT Node - Top Level of the Graph Hierarchy ***/ -void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, - /*2*/ uint8_t *result, /*3*/ size_t bytes_result, - /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled, - /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled, - /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out, - /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out, - /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out, - /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out, - /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw, - /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts, - /*20*/ float *weights, /*21*/ size_t bytes_weights, - /*22*/ float*coefs, /*23*/ size_t bytes_coefs, - /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist, - /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map, - /*28*/ size_t row_size, /*29*/ size_t col_size) { - - //Specifies compilation target for current node - __visc__hint(CPU_TARGET); +void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, + /*2*/ uint8_t *result, /*3*/ size_t bytes_result, + /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled, + /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled, + /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out, + /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out, + /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out, + /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out, + /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw, + /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts, + /*20*/ float *weights, /*21*/ size_t bytes_weights, + /*22*/ float *coefs, /*23*/ size_t bytes_coefs, + /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist, + /*26*/ float *tone_map, 
/*27*/ size_t bytes_tone_map, + /*28*/ size_t row_size, /*29*/ size_t col_size) { + + // Specifies compilation target for current node + __hpvm__hint(CPU_TARGET); // Specifies pointer arguments that will be used as "in" and "out" arguments // - count of "in" arguments // - list of "in" argument , and similar for "out" - __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, - transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, - 5, result, demosaic_out, denoise_out, transform_out, gamut_out); + __hpvm__attributes(14, input, result, input_scaled, result_scaled, + demosaic_out, denoise_out, transform_out, gamut_out, TsTw, + ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result, + demosaic_out, denoise_out, transform_out, gamut_out); // Create an 0D (specified by 1st argument) HPVM node - so a single node // associated with node function ---_fxp_wrapper - void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper); - void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper); - void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper); - void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper); - void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper); - void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper); - void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper); - + void *ScNode = __hpvm__createNodeND(0, scale_fxp_wrapper); + void *DmNode = __hpvm__createNodeND(0, demosaic_fxp_wrapper); + void *DnNode = __hpvm__createNodeND(0, denoise_fxp_wrapper); + void *TrNode = __hpvm__createNodeND(0, transform_fxp_wrapper); + void *GmNode = __hpvm__createNodeND(0, gamut_fxp_wrapper); + void *TnNode = __hpvm__createNodeND(0, tone_map_fxp_wrapper); + void *DsNode = __hpvm__createNodeND(0, descale_fxp_wrapper); + // BindIn binds inputs of current node with specified node // - destination node // - argument position in argument list of function of source node @@ -592,268 +612,283 @@ void 
CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, // - destination position (in argument list of destination node) // - streaming (1) or non-streaming (0) - // scale_fxp inputs - __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input - __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input - __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result - __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result - __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size - __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size - - // demosaic_fxp inputs - __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input - __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input - __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result - __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result - __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size - __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size - - // denoise_fxp inputs - __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input - __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input - __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result - __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result - __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size - __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size - - // transform_fxp inputs - __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input - __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input - __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result - __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result - __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann - __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> 
TrNode:bytes_TsTw - __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size - __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size - - // gamut_fxp inputs - __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input - __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input - __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result - __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result - __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts - __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts - __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights - __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights - __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs - __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs - __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist - __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist - __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size - __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size - - // tone_map_fxp inputs - __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input - __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input - __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result - __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result - __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map - __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map - __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size - __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size - - // descale_fxp inputs - __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input - __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input - __visc__bindIn(DsNode, 
2, 2, 0); // result -> DsNode:result - __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result - __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size - __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size + // scale_fxp inputs + __hpvm__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input + __hpvm__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input + __hpvm__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result + __hpvm__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result + __hpvm__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size + __hpvm__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size + + // demosaic_fxp inputs + __hpvm__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input + __hpvm__edge(ScNode, DmNode, 1, 0, 1, + 0); // SCNode:bytes_result -> DmNode:bytes_input + __hpvm__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result + __hpvm__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result + __hpvm__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size + __hpvm__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size + + // denoise_fxp inputs + __hpvm__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input + __hpvm__edge(DmNode, DnNode, 1, 0, 1, + 0); // DMNode:bytes_result -> DnNode:bytes_input + __hpvm__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result + __hpvm__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result + __hpvm__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size + __hpvm__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size + + // transform_fxp inputs + __hpvm__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input + __hpvm__edge(DnNode, TrNode, 1, 0, 1, + 0); // DnNode:bytes_result -> TrNode:bytes_input + __hpvm__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result + __hpvm__bindIn(TrNode, 13, 3, + 0); // bytes_result_scaled -> TrNode:bytes_result + __hpvm__bindIn(TrNode, 16, 4, 0); // TsTw 
-> TrNode:TsTw_trann + __hpvm__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw + __hpvm__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size + __hpvm__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size + + // gamut_fxp inputs + __hpvm__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input + __hpvm__edge(TrNode, GmNode, 1, 0, 1, + 0); // TrNode:bytes_result -> GmNode:bytes_input + __hpvm__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result + __hpvm__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result + __hpvm__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts + __hpvm__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts + __hpvm__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights + __hpvm__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights + __hpvm__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs + __hpvm__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs + __hpvm__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist + __hpvm__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist + __hpvm__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size + __hpvm__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size + + // tone_map_fxp inputs + __hpvm__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input + __hpvm__edge(GmNode, TnNode, 1, 0, 1, + 0); // GmNode:bytes_result -> TnNode:bytes_input + __hpvm__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result + __hpvm__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result + __hpvm__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map + __hpvm__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map + __hpvm__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size + __hpvm__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size + + // descale_fxp inputs + __hpvm__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input + __hpvm__edge(TnNode, DsNode, 1, 
0, 1, + 0); // TnNode:bytes_result -> DsNode:bytes_input + __hpvm__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result + __hpvm__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result + __hpvm__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size + __hpvm__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size // Similar to bindIn, but for the output. Output of a node is a struct, and // we consider the fields in increasing ordering. - __visc__bindOut(DsNode, 0, 0, 0); - + __hpvm__bindOut(DsNode, 0, 0, 0); } -int main(int argc, char* argv[]) { - // Parse the arguments. - arguments args; - set_default_args(&args); - argp_parse(&parser, argc, argv, 0, 0, &args); - - // Read a raw image. - // NOTE: We deliberately perform this file I/O outside of the kernel. - printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]); - size_t row_size, col_size; - uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size); - - printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE); - - // Allocate a buffer for storing the output image data. - // (This is currently the same size as the input image data.) 
- size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE; - size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE; - uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image); - - __visc__init(); - - /////////////////////////////////////////////////////////////// - // Camera Model Parameters - /////////////////////////////////////////////////////////////// - // Path to the camera model to be used -// char cam_model_path[100]; -// char cam_model_path = "cam_models/NikonD7000/"; - // White balance index (select white balance from transform file) - // The first white balance in the file has a wb_index of 1 - // For more information on model format see the readme - int wb_index = 6; - - // Number of control points - int num_ctrl_pts = 3702; - uint8_t *input, *result; - float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out; - float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist; - - TsTw = get_TsTw("cam_models/NikonD7000/", wb_index); - float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE); - free(TsTw); - TsTw = trans; - ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts); - weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts); - coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts); - tone_map = get_tone_map("cam_models/NikonD7000/"); - - input_scaled = (float*) malloc_aligned(bytes_fimage); - result_scaled = (float*) malloc_aligned(bytes_fimage); - demosaic_out = (float*) malloc_aligned(bytes_fimage); - denoise_out = (float*) malloc_aligned(bytes_fimage); - transform_out = (float*) malloc_aligned(bytes_fimage); - gamut_out = (float*) malloc_aligned(bytes_fimage); - l2_dist = 
(float*) malloc_aligned(sizeof(float) * num_ctrl_pts); - - // This is host_input in cam_pipe() - input = (uint8_t*) malloc_aligned(bytes_image); - convert_hwc_to_chw(image_in, row_size, col_size, &input); - - // This is host_result in cam_pipe() - result = (uint8_t*) malloc_aligned(bytes_image); - - // Allocate struct to pass DFG inputs - RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn)); - - // Set up HPVM DFG inputs in the rootArgs struct. - rootArgs->input = input; - rootArgs->bytes_input = bytes_image; - - rootArgs->result = result; - rootArgs->bytes_result = bytes_image; - - rootArgs->input_scaled = input_scaled; - rootArgs->bytes_input_scaled = bytes_fimage; - - rootArgs->result_scaled = result_scaled; - rootArgs->bytes_result_scaled = bytes_fimage; - - rootArgs->demosaic_out = demosaic_out; - rootArgs->bytes_demosaic_out = bytes_fimage; - - rootArgs->denoise_out = denoise_out; - rootArgs->bytes_denoise_out = bytes_fimage; - - rootArgs->transform_out = transform_out; - rootArgs->bytes_transform_out = bytes_fimage; - - rootArgs->gamut_out = gamut_out; - rootArgs->bytes_gamut_out = bytes_fimage; - - rootArgs->TsTw = TsTw; - rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float); - - rootArgs->ctrl_pts = ctrl_pts; - rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float); - - rootArgs->weights = weights; - rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float); - - rootArgs->coefs = coefs; - rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float); - - rootArgs->tone_map = tone_map; - rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float); - - rootArgs->l2_dist = l2_dist; - rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float); - - rootArgs->row_size = row_size; - rootArgs->col_size = col_size; - - // Memory tracking is required for pointer arguments. - // Nodes can be scheduled on different targets, and - // dataflow edge implementation needs to request data. 
- // The pair (pointer, size) is inserted in memory tracker using this call - llvm_visc_track_mem(input, bytes_image); - llvm_visc_track_mem(result, bytes_image); - llvm_visc_track_mem(input_scaled, bytes_fimage); - llvm_visc_track_mem(result_scaled, bytes_fimage); - llvm_visc_track_mem(demosaic_out, bytes_fimage); - llvm_visc_track_mem(denoise_out, bytes_fimage); - llvm_visc_track_mem(transform_out, bytes_fimage); - llvm_visc_track_mem(gamut_out, bytes_fimage); - llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float)); - llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float)); - - printf("\n\nLaunching CAVA pipeline!\n"); - - void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs); - __visc__wait(camPipeDFG); - - printf("\n\nPipeline execution completed!\n"); - printf("\n\nRequesting memory!\n"); - - // Request data from graph. 
- llvm_visc_request_mem(result, bytes_image); - llvm_visc_request_mem(demosaic_out, bytes_fimage); - llvm_visc_request_mem(denoise_out, bytes_fimage); - llvm_visc_request_mem(transform_out, bytes_fimage); - llvm_visc_request_mem(gamut_out, bytes_fimage); - printf("\n\nDone requesting memory!\n"); - - - uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - - descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size); - descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size); - descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size); - descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size); - - convert_chw_to_hwc(result, row_size, col_size, &image_out); - convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut); - convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic); - convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise); - convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform); - - - // Remove tracked pointers. - llvm_visc_untrack_mem(input); - llvm_visc_untrack_mem(result); - llvm_visc_untrack_mem(input_scaled); - llvm_visc_untrack_mem(result_scaled); - llvm_visc_untrack_mem(demosaic_out); - llvm_visc_untrack_mem(denoise_out); - llvm_visc_untrack_mem(transform_out); - llvm_visc_untrack_mem(gamut_out); - - llvm_visc_untrack_mem(TsTw); - llvm_visc_untrack_mem(ctrl_pts); - llvm_visc_untrack_mem(weights); - llvm_visc_untrack_mem(coefs); - llvm_visc_untrack_mem(tone_map); - llvm_visc_untrack_mem(l2_dist); - - // Output the image. 
- // NOTE: We deliberately perform this file I/O outside of the kernel. +int main(int argc, char *argv[]) { + // Parse the arguments. + arguments args; + set_default_args(&args); + argp_parse(&parser, argc, argv, 0, 0, &args); + + // Read a raw image. + // NOTE: We deliberately perform this file I/O outside of the kernel. + printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]); + size_t row_size, col_size; + uint8_t *image_in = + read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size); + + printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE); + + // Allocate a buffer for storing the output image data. + // (This is currently the same size as the input image data.) + size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE; + size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE; + uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image); + + __hpvm__init(); + + /////////////////////////////////////////////////////////////// + // Camera Model Parameters + /////////////////////////////////////////////////////////////// + // Path to the camera model to be used + // char cam_model_path[100]; + // char cam_model_path = "cam_models/NikonD7000/"; + // White balance index (select white balance from transform file) + // The first white balance in the file has a wb_index of 1 + // For more information on model format see the readme + int wb_index = 6; + + // Number of control points + int num_ctrl_pts = 3702; + uint8_t *input, *result; + float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, + *transform_out, *gamut_out; + float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist; + + TsTw = 
get_TsTw("cam_models/NikonD7000/", wb_index); + float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE); + free(TsTw); + TsTw = trans; + ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts); + weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts); + coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts); + tone_map = get_tone_map("cam_models/NikonD7000/"); + + input_scaled = (float *)malloc_aligned(bytes_fimage); + result_scaled = (float *)malloc_aligned(bytes_fimage); + demosaic_out = (float *)malloc_aligned(bytes_fimage); + denoise_out = (float *)malloc_aligned(bytes_fimage); + transform_out = (float *)malloc_aligned(bytes_fimage); + gamut_out = (float *)malloc_aligned(bytes_fimage); + l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts); + + // This is host_input in cam_pipe() + input = (uint8_t *)malloc_aligned(bytes_image); + convert_hwc_to_chw(image_in, row_size, col_size, &input); + + // This is host_result in cam_pipe() + result = (uint8_t *)malloc_aligned(bytes_image); + + // Allocate struct to pass DFG inputs + RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn)); + + // Set up HPVM DFG inputs in the rootArgs struct. 
+ rootArgs->input = input; + rootArgs->bytes_input = bytes_image; + + rootArgs->result = result; + rootArgs->bytes_result = bytes_image; + + rootArgs->input_scaled = input_scaled; + rootArgs->bytes_input_scaled = bytes_fimage; + + rootArgs->result_scaled = result_scaled; + rootArgs->bytes_result_scaled = bytes_fimage; + + rootArgs->demosaic_out = demosaic_out; + rootArgs->bytes_demosaic_out = bytes_fimage; + + rootArgs->denoise_out = denoise_out; + rootArgs->bytes_denoise_out = bytes_fimage; + + rootArgs->transform_out = transform_out; + rootArgs->bytes_transform_out = bytes_fimage; + + rootArgs->gamut_out = gamut_out; + rootArgs->bytes_gamut_out = bytes_fimage; + + rootArgs->TsTw = TsTw; + rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float); + + rootArgs->ctrl_pts = ctrl_pts; + rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->weights = weights; + rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->coefs = coefs; + rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float); + + rootArgs->tone_map = tone_map; + rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float); + + rootArgs->l2_dist = l2_dist; + rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float); + + rootArgs->row_size = row_size; + rootArgs->col_size = col_size; + + // Memory tracking is required for pointer arguments. + // Nodes can be scheduled on different targets, and + // dataflow edge implementation needs to request data. 
+ // The pair (pointer, size) is inserted in memory tracker using this call + llvm_hpvm_track_mem(input, bytes_image); + llvm_hpvm_track_mem(result, bytes_image); + llvm_hpvm_track_mem(input_scaled, bytes_fimage); + llvm_hpvm_track_mem(result_scaled, bytes_fimage); + llvm_hpvm_track_mem(demosaic_out, bytes_fimage); + llvm_hpvm_track_mem(denoise_out, bytes_fimage); + llvm_hpvm_track_mem(transform_out, bytes_fimage); + llvm_hpvm_track_mem(gamut_out, bytes_fimage); + llvm_hpvm_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); + llvm_hpvm_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_hpvm_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_hpvm_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float)); + llvm_hpvm_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float)); + llvm_hpvm_track_mem(l2_dist, num_ctrl_pts * sizeof(float)); + + printf("\n\nLaunching CAVA pipeline!\n"); + + void *camPipeDFG = __hpvm__launch(0, CamPipeRoot, (void *)rootArgs); + __hpvm__wait(camPipeDFG); + + printf("\n\nPipeline execution completed!\n"); + printf("Pipeline final stage returned %lu; should be %lu\n", + rootArgs->ret.bytesRet, bytes_image); + printf("\n\nRequesting memory!\n"); + + // Request data from graph. 
+ llvm_hpvm_request_mem(result, bytes_image); + llvm_hpvm_request_mem(demosaic_out, bytes_fimage); + llvm_hpvm_request_mem(denoise_out, bytes_fimage); + llvm_hpvm_request_mem(transform_out, bytes_fimage); + llvm_hpvm_request_mem(gamut_out, bytes_fimage); + printf("\n\nDone requesting memory!\n"); + + uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + + descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, + row_size, col_size); + descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, + row_size, col_size); + descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, + row_size, col_size); + descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, + row_size, col_size); + + convert_chw_to_hwc(result, row_size, col_size, &image_out); + convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut); + convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, + &image_out_demosaic); + convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, + &image_out_denoise); + convert_chw_to_hwc(transform_out_descaled, row_size, col_size, + &image_out_transform); + + // Remove tracked pointers. + llvm_hpvm_untrack_mem(input); + llvm_hpvm_untrack_mem(result); + llvm_hpvm_untrack_mem(input_scaled); + llvm_hpvm_untrack_mem(result_scaled); + llvm_hpvm_untrack_mem(demosaic_out); + llvm_hpvm_untrack_mem(denoise_out); + llvm_hpvm_untrack_mem(transform_out); + llvm_hpvm_untrack_mem(gamut_out); + + llvm_hpvm_untrack_mem(TsTw); + llvm_hpvm_untrack_mem(ctrl_pts); + llvm_hpvm_untrack_mem(weights); + llvm_hpvm_untrack_mem(coefs); + llvm_hpvm_untrack_mem(tone_map); + llvm_hpvm_untrack_mem(l2_dist); + + // Output the image. 
+ // NOTE: We deliberately perform this file I/O outside of the kernel. char str[50], base_str[50]; strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]); strcpy(str, base_str); @@ -877,8 +912,7 @@ int main(int argc, char* argv[]) { printf("Writing output image to %s\n", str); write_image_to_binary(str, image_out_transform, row_size, col_size); - __visc__cleanup(); + __hpvm__cleanup(); - return 0; + return 0; } - diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.c b/hpvm/test/hpvm-cava/src/pipe_stages.c index 2ebedec936915b5e7f11881c5001c84b6db26474..05bb06697fa8df130aa0d0d324f9bc39bc575fb2 100644 --- a/hpvm/test/hpvm-cava/src/pipe_stages.c +++ b/hpvm/test/hpvm-cava/src/pipe_stages.c @@ -1,172 +1,169 @@ -#include <stdio.h> -#include <math.h> #include "pipe_stages.h" #include "cam_pipe_utility.h" +#include <math.h> +#include <stdio.h> + +// void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) { +void scale_fxp(uint8_t *input, size_t bytes_input, float *output, + size_t bytes_output, int row_size, int col_size) { + __hpvm__hint(DEVICE); + __hpvm__attributes(2, input, output, 1, output); -//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) { -void scale_fxp(uint8_t *input, size_t bytes_input, - float *output, size_t bytes_output, - int row_size, int col_size) { - __visc__hint(DEVICE); - __visc__attributes(2, input, output, 1, output); - ARRAY_3D(uint8_t, _input, input, row_size, col_size); ARRAY_3D(float, _output, output, row_size, col_size); - sl_chan: +sl_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - sl_row: + sl_row: for (int row = 0; row < row_size; row++) - sl_col: + sl_col: for (int col = 0; col < col_size; col++) _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255; - __visc__return(1, bytes_output); + __hpvm__return(1, bytes_output); } -//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) { -void descale_fxp(float *input, size_t bytes_input, - uint8_t *output, size_t bytes_result, - 
int row_size, int col_size) { - __visc__hint(DEVICE); - __visc__attributes(2, input, output, 1, output); - +// void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) { +void descale_fxp(float *input, size_t bytes_input, uint8_t *output, + size_t bytes_result, int row_size, int col_size) { + __hpvm__hint(DEVICE); + __hpvm__attributes(2, input, output, 1, output); + ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(uint8_t, _output, output, row_size, col_size); - dsl_chan: +dsl_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - dsl_row: + dsl_row: for (int row = 0; row < row_size; row++) - dsl_col: + dsl_col: for (int col = 0; col < col_size; col++) - _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255); + _output[chan][row][col] = + min(max(_input[chan][row][col] * 255, 0), 255); - __visc__return(1, bytes_output); + __hpvm__return(1, bytes_output); } // Demosaicing stage // G R // B G -//void demosaic_fxp(float *input, int row_size, int col_size, float *result) { -void demosaic_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - int row_size, int col_size) { - __visc__hint(DEVICE); - __visc__attributes(2, input, result, 1, result); - +// void demosaic_fxp(float *input, int row_size, int col_size, float *result) { +void demosaic_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, int row_size, int col_size) { + __hpvm__hint(DEVICE); + __hpvm__attributes(2, input, result, 1, result); + printf("Demosaicing.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); - dm_row: +dm_row: for (int row = 1; row < row_size - 1; row++) - dm_col: + dm_col: for (int col = 1; col < col_size - 1; col++) - if (row % 2 == 0 && col % 2 == 0) { - // Green pixel - // Getting the R values - float R1 = _input[0][row][col - 1]; - float R2 = _input[0][row][col + 1]; - // Getting the B values - float B1 = _input[2][row - 1][col]; - float 
B2 = _input[2][row + 1][col]; - // R - _result[0][row][col] = (R1 + R2) / 2; - // G - _result[1][row][col] = _input[1][row][col] * 2; - // B - _result[2][row][col] = (B1 + B2) / 2; - } else if (row % 2 == 0 && col % 2 == 1) { - // Red pixel - // Getting the G values - float G1 = _input[1][row - 1][col]; - float G2 = _input[1][row + 1][col]; - float G3 = _input[1][row][col - 1]; - float G4 = _input[1][row][col + 1]; - // Getting the B values - float B1 = _input[2][row - 1][col - 1]; - float B2 = _input[2][row - 1][col + 1]; - float B3 = _input[2][row + 1][col - 1]; - float B4 = _input[2][row + 1][col + 1]; - // R - _result[0][row][col] = _input[0][row][col]; - // G - _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; - // B (center pixel) - _result[2][row][col] = (B1 + B2 + B3 + B4) / 4; - } else if (row % 2 == 1 && col % 2 == 0) { - // Blue pixel - // Getting the R values - float R1 = _input[0][row - 1][col - 1]; - float R2 = _input[0][row + 1][col - 1]; - float R3 = _input[0][row - 1][col + 1]; - float R4 = _input[0][row + 1][col + 1]; - // Getting the G values - float G1 = _input[1][row - 1][col]; - float G2 = _input[1][row + 1][col]; - float G3 = _input[1][row][col - 1]; - float G4 = _input[1][row][col + 1]; - // R - _result[0][row][col] = (R1 + R2 + R3 + R4) / 4; - // G - _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; - // B - _result[2][row][col] = _input[2][row][col]; - } else { - // Bottom Green pixel - // Getting the R values - float R1 = _input[0][row - 1][col]; - float R2 = _input[0][row + 1][col]; - // Getting the B values - float B1 = _input[2][row][col - 1]; - float B2 = _input[2][row][col + 1]; - // R - _result[0][row][col] = (R1 + R2) / 2; - // G - _result[1][row][col] = _input[1][row][col] * 2; - // B - _result[2][row][col] = (B1 + B2) / 2; - } + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + float R1 = _input[0][row][col - 1]; + float R2 = _input[0][row][col + 1]; + // Getting the B values + float B1 = _input[2][row 
- 1][col]; + float B2 = _input[2][row + 1][col]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 2 == 1) { + // Red pixel + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // Getting the B values + float B1 = _input[2][row - 1][col - 1]; + float B2 = _input[2][row - 1][col + 1]; + float B3 = _input[2][row + 1][col - 1]; + float B4 = _input[2][row + 1][col + 1]; + // R + _result[0][row][col] = _input[0][row][col]; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + _result[2][row][col] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = _input[0][row - 1][col - 1]; + float R2 = _input[0][row + 1][col - 1]; + float R3 = _input[0][row - 1][col + 1]; + float R4 = _input[0][row + 1][col + 1]; + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2 + R3 + R4) / 4; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B + _result[2][row][col] = _input[2][row][col]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = _input[0][row - 1][col]; + float R2 = _input[0][row + 1][col]; + // Getting the B values + float B1 = _input[2][row][col - 1]; + float B2 = _input[2][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } - __visc__return(1, bytes_result); + __hpvm__return(1, bytes_result); } static void sort(float arr[], int n) { - int i, j; - dn_sort_i: - for (i = 0; i < n - 1; i++) - dn_sort_j: - for (j = 
0; j < n - i - 1; j++) - if (arr[j] > arr[j + 1]) { - float temp = arr[j]; - arr[j] = arr[j + 1]; - arr[j + 1] = temp; - } + int i, j; +dn_sort_i: + for (i = 0; i < n - 1; i++) + dn_sort_j: + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } } // Simple denoise -//void denoise_fxp(float *input, int row_size, int col_size, float *result) { -void denoise_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - int row_size, int col_size) { - __visc__hint(DEVICE); - __visc__attributes(2, input, result, 1, result); - +// void denoise_fxp(float *input, int row_size, int col_size, float *result) { +void denoise_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, int row_size, int col_size) { + __hpvm__hint(DEVICE); + __hpvm__attributes(2, input, result, 1, result); + printf("Denoising.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); - dn_chan: +dn_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - dn_row: + dn_row: for (int row = 0; row < row_size; row++) - dn_col: + dn_col: for (int col = 0; col < col_size; col++) if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { float filter[9]; - dn_slide_row: - for (int i = row-1; i < row+2; i++) - dn_slide_col: - for (int j = col-1; j < col+2; j++) { + dn_slide_row: + for (int i = row - 1; i < row + 2; i++) + dn_slide_col: + for (int j = col - 1; j < col + 2; j++) { int index = (i - row + 1) * 3 + j - col + 1; filter[index] = _input[chan][i][j]; } @@ -175,53 +172,52 @@ void denoise_fxp(float *input, size_t bytes_input, } else { _result[chan][row][col] = _input[chan][row][col]; } - __visc__return(1, bytes_result); + __hpvm__return(1, bytes_result); } // Color map and white balance transform -//void transform_fxp(float *input, int row_size, int col_size, float *result, +// void transform_fxp(float *input, int row_size, int 
col_size, float *result, // float *TsTw_tran) { -void transform_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *TsTw_tran, size_t bytes_TsTw, +void transform_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw, int row_size, int col_size) { - __visc__hint(DEVICE); - __visc__attributes(3, input, result, TsTw_tran, 1, result); - + __hpvm__hint(DEVICE); + __hpvm__attributes(3, input, result, TsTw_tran, 1, result); + printf("Color mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3); - tr_chan: +tr_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - tr_row: + tr_row: for (int row = 0; row < row_size; row++) - tr_col: + tr_col: for (int col = 0; col < col_size; col++) _result[chan][row][col] = max(_input[0][row][col] * _TsTw_tran[0][chan] + _input[1][row][col] * _TsTw_tran[1][chan] + _input[2][row][col] * _TsTw_tran[2][chan], 0); - __visc__return(1, bytes_result); + __hpvm__return(1, bytes_result); } // // Weighted radial basis function for gamut mapping // -//void gamut_map_fxp(float *input, int row_size, int col_size, float *result, -// float *ctrl_pts, float *weights, float *coefs, float *l2_dist) { -void gamut_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, - float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, +// void gamut_map_fxp(float *input, int row_size, int col_size, float *result, +// float *ctrl_pts, float *weights, float *coefs, float +// *l2_dist) { +void gamut_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, float *coefs, + size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist, int row_size, int col_size) 
{ - __visc__hint(DEVICE); - __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); - + __hpvm__hint(DEVICE); + __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, + result); + printf("Gamut mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); @@ -229,26 +225,25 @@ void gamut_map_fxp(float *input, size_t bytes_input, ARRAY_2D(float, _weights, weights, 3); ARRAY_2D(float, _coefs, coefs, 3); - // First, get the L2 norm from every pixel to the control points, - // Then, sum it and weight it. Finally, add the bias. - gm_rbf_row: +// First, get the L2 norm from every pixel to the control points, +// Then, sum it and weight it. Finally, add the bias. +gm_rbf_row: for (int row = 0; row < row_size; row++) - gm_rbf_col: + gm_rbf_col: for (int col = 0; col < col_size; col++) { - gm_rbf_cp0: + gm_rbf_cp0: for (int cp = 0; cp < num_ctrl_pts; cp++) { - l2_dist[cp] = - sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) * - (_input[0][row][col] - _ctrl_pts[cp][0]) + - (_input[1][row][col] - _ctrl_pts[cp][1]) * - (_input[1][row][col] - _ctrl_pts[cp][1]) + - (_input[2][row][col] - _ctrl_pts[cp][2]) * - (_input[2][row][col] - _ctrl_pts[cp][2])); + l2_dist[cp] = sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) * + (_input[0][row][col] - _ctrl_pts[cp][0]) + + (_input[1][row][col] - _ctrl_pts[cp][1]) * + (_input[1][row][col] - _ctrl_pts[cp][1]) + + (_input[2][row][col] - _ctrl_pts[cp][2]) * + (_input[2][row][col] - _ctrl_pts[cp][2])); } - gm_rbf_chan: + gm_rbf_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) { float chan_val = 0.0; - gm_rbf_cp1: + gm_rbf_cp1: for (int cp = 0; cp < num_ctrl_pts; cp++) { chan_val += l2_dist[cp] * _weights[cp][chan]; } @@ -259,32 +254,31 @@ void gamut_map_fxp(float *input, size_t bytes_input, _result[chan][row][col] = max(chan_val, 0); } } - __visc__return(1, bytes_result); + __hpvm__return(1, bytes_result); } // Tone mapping -//void 
tone_map_fxp(float *input, int row_size, int col_size, float *tone_map, +// void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map, // float *result) { -void tone_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *tone_map, size_t bytes_tone_map, +void tone_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *tone_map, size_t bytes_tone_map, int row_size, int col_size) { - __visc__hint(DEVICE); - __visc__attributes(3, input, result, tone_map, 1, result); - + __hpvm__hint(DEVICE); + __hpvm__attributes(3, input, result, tone_map, 1, result); + printf("Tone mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); ARRAY_2D(float, _tone_map, tone_map, 3); - tm_chan: +tm_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - tm_row: + tm_row: for (int row = 0; row < row_size; row++) - tm_col: + tm_col: for (int col = 0; col < col_size; col++) { uint8_t x = _input[chan][row][col] * 255; _result[chan][row][col] = _tone_map[x][chan]; } - __visc__return(1, bytes_result); + __hpvm__return(1, bytes_result); } diff --git a/hpvm/test/include/hpvm.h b/hpvm/test/include/hpvm.h new file mode 100644 index 0000000000000000000000000000000000000000..1e31c98946f00e32d84933fe4bfd443e65cb92a9 --- /dev/null +++ b/hpvm/test/include/hpvm.h @@ -0,0 +1,73 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef DEVICE +#define DEVICE GPU_TARGET +#endif + +#include "../../include/SupportHPVM/HPVMHint.h" + +#ifndef __cplusplus +#define noexcept +#endif + +#ifdef __cplusplus +extern "C" { +void __hpvm__hint(hpvm::Target) noexcept; +#else +void __hpvm__hint(enum Target) noexcept; +#endif + +void 
*__hpvm__createNodeND(unsigned, ...) noexcept; +void __hpvm__return(unsigned, ...) noexcept; + +void __hpvm__attributes(unsigned, ...) noexcept; +void __hpvm__init() noexcept; +void __hpvm__cleanup() noexcept; + +void __hpvm__bindIn(void *, unsigned, unsigned, unsigned) noexcept; +void __hpvm__bindOut(void *, unsigned, unsigned, unsigned) noexcept; +void *__hpvm__edge(void *, void *, unsigned, unsigned, unsigned, + unsigned) noexcept; + +void __hpvm__push(void *, void *) noexcept; +void *__hpvm__pop(void *) noexcept; +void *__hpvm__launch(unsigned, ...) noexcept; +void __hpvm__wait(void *) noexcept; + +void *__hpvm__getNode() noexcept; +void *__hpvm__getParentNode(void *) noexcept; +void __hpvm__barrier() noexcept; +void *__hpvm__malloc(long) noexcept; +long __hpvm__getNodeInstanceID_x(void *) noexcept; +long __hpvm__getNodeInstanceID_y(void *) noexcept; +long __hpvm__getNodeInstanceID_z(void *) noexcept; +long __hpvm__getNumNodeInstances_x(void *) noexcept; +long __hpvm__getNumNodeInstances_y(void *) noexcept; +long __hpvm__getNumNodeInstances_z(void *) noexcept; + +// Atomic +// signed int +int __hpvm__atomic_add(int *, int) noexcept; +int __hpvm__atomic_sub(int *, int) noexcept; +int __hpvm__atomic_xchg(int *, int) noexcept; +int __hpvm__atomic_inc(int *) noexcept; +int __hpvm__atomic_dec(int *) noexcept; +int __hpvm__atomic_min(int *, int) noexcept; +int __hpvm__atomic_max(int *, int) noexcept; +int __hpvm__atomic_and(int *, int) noexcept; +int __hpvm__atomic_or(int *, int) noexcept; +int __hpvm__atomic_xor(int *, int) noexcept; + +void llvm_hpvm_track_mem(void *, size_t) noexcept; +void llvm_hpvm_untrack_mem(void *) noexcept; +void llvm_hpvm_request_mem(void *, size_t) noexcept; + +#ifdef __cplusplus +} +#endif diff --git a/hpvm/test/include/visc.h b/hpvm/test/include/visc.h deleted file mode 100644 index 18b29500261362be66ea23feecf9a5f85ac68005..0000000000000000000000000000000000000000 --- a/hpvm/test/include/visc.h +++ /dev/null @@ -1,73 +0,0 @@ 
-/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef DEVICE -#define DEVICE GPU_TARGET -#endif - -#include "../../include/SupportVISC/VISCHint.h" - -#ifndef __cplusplus -#define noexcept -#endif - -#ifdef __cplusplus -extern "C" { -void __visc__hint(visc::Target) noexcept; -#else -void __visc__hint(enum Target) noexcept; -#endif - -void *__visc__createNodeND(unsigned, ...) noexcept; -void __visc__return(unsigned, ...) noexcept; - -void __visc__attributes(unsigned, ...) noexcept; -void __visc__init() noexcept; -void __visc__cleanup() noexcept; - -void __visc__bindIn(void *, unsigned, unsigned, unsigned) noexcept; -void __visc__bindOut(void *, unsigned, unsigned, unsigned) noexcept; -void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, - unsigned) noexcept; - -void __visc__push(void *, void *) noexcept; -void *__visc__pop(void *) noexcept; -void *__visc__launch(unsigned, ...) 
noexcept; -void __visc__wait(void *) noexcept; - -void *__visc__getNode() noexcept; -void *__visc__getParentNode(void *) noexcept; -void __visc__barrier() noexcept; -void *__visc__malloc(long) noexcept; -long __visc__getNodeInstanceID_x(void *) noexcept; -long __visc__getNodeInstanceID_y(void *) noexcept; -long __visc__getNodeInstanceID_z(void *) noexcept; -long __visc__getNumNodeInstances_x(void *) noexcept; -long __visc__getNumNodeInstances_y(void *) noexcept; -long __visc__getNumNodeInstances_z(void *) noexcept; - -// Atomic -// signed int -int __visc__atomic_add(int *, int) noexcept; -int __visc__atomic_sub(int *, int) noexcept; -int __visc__atomic_xchg(int *, int) noexcept; -int __visc__atomic_inc(int *) noexcept; -int __visc__atomic_dec(int *) noexcept; -int __visc__atomic_min(int *, int) noexcept; -int __visc__atomic_max(int *, int) noexcept; -int __visc__atomic_and(int *, int) noexcept; -int __visc__atomic_or(int *, int) noexcept; -int __visc__atomic_xor(int *, int) noexcept; - -void llvm_visc_track_mem(void *, size_t) noexcept; -void llvm_visc_untrack_mem(void *) noexcept; -void llvm_visc_request_mem(void *, size_t) noexcept; - -#ifdef __cplusplus -} -#endif diff --git a/hpvm/test/parboil/RUN.parboil.script b/hpvm/test/parboil/RUN.parboil.script index 7f8c01ede7bacdccc546f2a68935eb91db64afd6..5cedcf480dbcd357599710acdf27c274ec7c4ccf 100644 --- a/hpvm/test/parboil/RUN.parboil.script +++ b/hpvm/test/parboil/RUN.parboil.script @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s ; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc ; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll parboil.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll %llvm_src/projects/hpvm-rt/hpvm-rt.ll parboil.ll -S -o 
%t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin diff --git a/hpvm/test/parboil/benchmarks/bfs/Makefile b/hpvm/test/parboil/benchmarks/bfs/Makefile index cc6db678298c4c66312248cc4f7a2df0bd134d3f..e40a8484a3c7b40919b07fd7c30ab512c01741d8 100644 --- a/hpvm/test/parboil/benchmarks/bfs/Makefile +++ b/hpvm/test/parboil/benchmarks/bfs/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = bfs -# Default compile visc +# Default compile hpvm ifeq ($(VERSION),) - VERSION = visc + VERSION = hpvm endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile similarity index 81% rename from hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile rename to hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile index a459707110b6f281e8b1c8fc1cf21f888dffe95e..27cde148f75502914d12a77448d358cbea2f17ab 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile +++ b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS= -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/config.h similarity index 100% rename from hpvm/test/parboil/benchmarks/bfs/src/visc/config.h rename to hpvm/test/parboil/benchmarks/bfs/src/hpvm/config.h diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp similarity index 70% rename from hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp rename to hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp index 9491218e5e93d39fc1bda4fac3c14770ee48645b..0fa9a60df8e631f4684c58c26fdafc498a06295b 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp @@ -26,11 +26,11 @@ */ #include "config.h" #include "parboil.h" +#include <hpvm.h> #include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <visc.h> /********** Define colors for BFS @@ -113,11 +113,11 @@ void packData(RootIn *args, int *q1, size_t bytesq1, int *q2, size_t bytesq2, void Allocation(long block) { // Memory shared between threadblocks - void *local_q_tail = __visc__malloc(sizeof(int)); - void *local_q = __visc__malloc(LOCAL_MEM_SIZE * sizeof(int)); - void *shift = __visc__malloc(sizeof(int)); + void *local_q_tail = __hpvm__malloc(sizeof(int)); + void *local_q = __hpvm__malloc(LOCAL_MEM_SIZE * sizeof(int)); + void *shift = __hpvm__malloc(sizeof(int)); - __visc__return(6, local_q_tail, sizeof(int), local_q, + __hpvm__return(6, local_q_tail, sizeof(int), local_q, LOCAL_MEM_SIZE * sizeof(int), shift, sizeof(int)); } @@ -133,21 +133,21 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2, int *local_q_tail, size_t byteslocal_q_tail, int *local_q, size_t byteslocal_q, int *shift, size_t bytesshift) { - 
__visc__hint(visc::DEVICE); - __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int dimx = __visc__getNumNodeInstances_x(thisNode); + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); + int lx = __hpvm__getNodeInstanceID_x(thisNode); + int gx = __hpvm__getNodeInstanceID_x(parentNode); + int dimx = __hpvm__getNumNodeInstances_x(thisNode); if (lx == 0) { *local_q_tail = 0; // initialize the tail of w-queue } - __visc__barrier(); + __hpvm__barrier(); // first, propagate and add the new frontier elements into w-queues // int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0); @@ -170,16 +170,16 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2, int cost = cur_edge.y; cost += cur_cost; - int orig_cost = __visc__atomic_min(&g_cost[id], cost); + int orig_cost = __hpvm__atomic_min(&g_cost[id], cost); if (orig_cost > cost) { // the node should be visited if (g_color[id] > UP_LIMIT) { - int old_color = __visc__atomic_xchg(&g_color[id], gray_shade); + int old_color = __hpvm__atomic_xchg(&g_color[id], gray_shade); // this guarantees that only one thread will push this node // into a queue if (old_color != gray_shade) { // atomic operation guarantees the correctness // even if multiple warps are executing simultaneously - int index = __visc__atomic_add(local_q_tail, 1); + int index = __hpvm__atomic_add(local_q_tail, 1); local_q[index] = id; } } @@ -187,16 +187,16 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2, } } - __visc__barrier(); + __hpvm__barrier(); if (lx == 0) { int tot_sum = *local_q_tail; // the offset or "shift" of the 
block-level queue within the grid-level // queue is determined by atomic operation - *shift = __visc__atomic_add(tail, tot_sum); + *shift = __hpvm__atomic_add(tail, tot_sum); } - __visc__barrier(); + __hpvm__barrier(); // shift within a w-queue int local_shift = lx; @@ -220,41 +220,41 @@ void BlockingBFS(int *q1, size_t bytesq1, int *q2, size_t bytesq2, // ideally be placed in local memory int *local_q_tail, size_t byteslocal_q_tail, int *local_q, size_t byteslocal_q, int *shift, size_t bytesshift) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *AllocationNode = __visc__createNodeND(0, Allocation); - void *BFSLeafNode = __visc__createNodeND(1, BFSLeaf, block); + void *AllocationNode = __hpvm__createNodeND(0, Allocation); + void *BFSLeafNode = __hpvm__createNodeND(1, BFSLeaf, block); // Bind edges - __visc__bindIn(AllocationNode, 17, 0, 0); // Bind block - __visc__bindIn(BFSLeafNode, 0, 0, 0); // Bind q1 - __visc__bindIn(BFSLeafNode, 1, 1, 0); // Bind bytes_q1 - __visc__bindIn(BFSLeafNode, 2, 2, 0); // Bind q2 - __visc__bindIn(BFSLeafNode, 3, 3, 0); // Bind bytes_q2 - __visc__bindIn(BFSLeafNode, 4, 4, 0); // Bind graph_nodes - __visc__bindIn(BFSLeafNode, 5, 5, 0); // Bind bytes_graph_nodes - __visc__bindIn(BFSLeafNode, 6, 6, 0); // Bind graph_edges - __visc__bindIn(BFSLeafNode, 7, 7, 0); // Bind bytes_graph_edges - __visc__bindIn(BFSLeafNode, 8, 8, 0); // Bind color - __visc__bindIn(BFSLeafNode, 9, 9, 0); // Bind bytes_color - __visc__bindIn(BFSLeafNode, 10, 10, 0); // Bind cost - __visc__bindIn(BFSLeafNode, 11, 11, 0); // Bind bytes_cost - __visc__bindIn(BFSLeafNode, 12, 12, 0); // Bind tail - __visc__bindIn(BFSLeafNode, 13, 13, 0); // Bind bytes_tail - __visc__bindIn(BFSLeafNode, 14, 14, 0); // Bind no_of_nodes - __visc__bindIn(BFSLeafNode, 15, 
15, 0); // Bind gray_shade - __visc__bindIn(BFSLeafNode, 16, 16, 0); // Bind k + __hpvm__bindIn(AllocationNode, 17, 0, 0); // Bind block + __hpvm__bindIn(BFSLeafNode, 0, 0, 0); // Bind q1 + __hpvm__bindIn(BFSLeafNode, 1, 1, 0); // Bind bytes_q1 + __hpvm__bindIn(BFSLeafNode, 2, 2, 0); // Bind q2 + __hpvm__bindIn(BFSLeafNode, 3, 3, 0); // Bind bytes_q2 + __hpvm__bindIn(BFSLeafNode, 4, 4, 0); // Bind graph_nodes + __hpvm__bindIn(BFSLeafNode, 5, 5, 0); // Bind bytes_graph_nodes + __hpvm__bindIn(BFSLeafNode, 6, 6, 0); // Bind graph_edges + __hpvm__bindIn(BFSLeafNode, 7, 7, 0); // Bind bytes_graph_edges + __hpvm__bindIn(BFSLeafNode, 8, 8, 0); // Bind color + __hpvm__bindIn(BFSLeafNode, 9, 9, 0); // Bind bytes_color + __hpvm__bindIn(BFSLeafNode, 10, 10, 0); // Bind cost + __hpvm__bindIn(BFSLeafNode, 11, 11, 0); // Bind bytes_cost + __hpvm__bindIn(BFSLeafNode, 12, 12, 0); // Bind tail + __hpvm__bindIn(BFSLeafNode, 13, 13, 0); // Bind bytes_tail + __hpvm__bindIn(BFSLeafNode, 14, 14, 0); // Bind no_of_nodes + __hpvm__bindIn(BFSLeafNode, 15, 15, 0); // Bind gray_shade + __hpvm__bindIn(BFSLeafNode, 16, 16, 0); // Bind k // Create Edges between AllocationNode and BFSLeafNodeNode - __visc__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail - __visc__edge(AllocationNode, BFSLeafNode, 1, 1, 18, + __hpvm__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail + __hpvm__edge(AllocationNode, BFSLeafNode, 1, 1, 18, 0); // Edge bytes_local_q_tail - __visc__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q - __visc__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q - __visc__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift - __visc__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift + __hpvm__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q + __hpvm__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q + __hpvm__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); 
// Edge shift + __hpvm__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift } // VoidRetTy @@ -264,30 +264,30 @@ void BFS_Root(int *q1, size_t bytesq1, int *q2, size_t bytesq2, int *g_color, size_t bytesg_color, int *g_cost, size_t bytesg_cost, int *tail, size_t bytestail, int no_of_nodes, int gray_shade, int k, long block, long grid) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *BlockingBFSNode = __visc__createNodeND(1, BlockingBFS, grid); + void *BlockingBFSNode = __hpvm__createNodeND(1, BlockingBFS, grid); // Bind edges - __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 - __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 - __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 - __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 - __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes - __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes - __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges - __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges - __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color - __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color - __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost - __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost - __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail - __visc__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail - __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes - __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade - __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k - __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block + __hpvm__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 + __hpvm__bindIn(BlockingBFSNode, 1, 1, 0); // Bind 
bytes_q1 + __hpvm__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 + __hpvm__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 + __hpvm__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes + __hpvm__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes + __hpvm__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges + __hpvm__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges + __hpvm__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color + __hpvm__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color + __hpvm__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost + __hpvm__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost + __hpvm__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail + __hpvm__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail + __hpvm__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes + __hpvm__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade + __hpvm__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k + __hpvm__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block } void BFS_Wrapper(int *q1, size_t bytesq1, // 0, 1 @@ -300,31 +300,31 @@ void BFS_Wrapper(int *q1, size_t bytesq1, // 0, 1 int no_of_nodes, int gray_shade, // 14, 15 int k, long block, long grid // 16 - 18 ) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *BlockingBFSNode = __visc__createNodeND(0, BFS_Root); + void *BlockingBFSNode = __hpvm__createNodeND(0, BFS_Root); // Bind edges - __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 - __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 - __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 - __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 - __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes - __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes - 
__visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges - __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges - __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color - __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color - __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost - __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost - __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail - __visc__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail - __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes - __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade - __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k - __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block - __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid + __hpvm__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 + __hpvm__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 + __hpvm__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 + __hpvm__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 + __hpvm__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes + __hpvm__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes + __hpvm__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges + __hpvm__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges + __hpvm__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color + __hpvm__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color + __hpvm__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost + __hpvm__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost + __hpvm__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail + __hpvm__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail + __hpvm__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes + __hpvm__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade + __hpvm__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k + __hpvm__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block + __hpvm__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid } FILE *fp; @@ 
-415,7 +415,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate mem for the result on host side @@ -433,15 +433,15 @@ int main(int argc, char **argv) { int *q2 = (int *)malloc(sizeof(int) * num_of_nodes); int *tail = (int *)malloc(sizeof(int)); - llvm_visc_track_mem(graph_nodes, bytes_graph_nodes); - llvm_visc_track_mem(graph_edges, bytes_graph_edges); - llvm_visc_track_mem(cost, bytes_cost); - llvm_visc_track_mem(color, bytes_cost); + llvm_hpvm_track_mem(graph_nodes, bytes_graph_nodes); + llvm_hpvm_track_mem(graph_edges, bytes_graph_edges); + llvm_hpvm_track_mem(cost, bytes_cost); + llvm_hpvm_track_mem(color, bytes_cost); // Allocating stuff on host side, but these can also be allocated in the graph - llvm_visc_track_mem(q1, bytes_cost); - llvm_visc_track_mem(q2, bytes_cost); + llvm_hpvm_track_mem(q1, bytes_cost); + llvm_hpvm_track_mem(q2, bytes_cost); // Scalar variable read/written by both graph and host. 
- llvm_visc_track_mem(tail, sizeof(int)); + llvm_hpvm_track_mem(tail, sizeof(int)); int num_of_blocks; int num_of_threads_per_block; @@ -466,9 +466,9 @@ int main(int argc, char **argv) { graph_edges, bytes_graph_edges, color, bytes_cost, cost, bytes_cost, tail, sizeof(int), num_of_nodes, gray, k, block, grid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); do { - llvm_visc_request_mem(tail, sizeof(int)); + llvm_hpvm_request_mem(tail, sizeof(int)); num_t = *tail; // printf("tail for iteration %d = %d\n",k, num_t); *tail = 0; @@ -493,7 +493,7 @@ int main(int argc, char **argv) { } else { args->gray_shade = GRAY1; } - // void* bfsDFG = __visc__node(BFS_kernel, 2, 1, block, grid, 17, + // void* bfsDFG = __hpvm__node(BFS_kernel, 2, 1, block, grid, 17, // q1, bytes_cost, // q2, bytes_cost, // graph_nodes, bytes_graph_nodes, @@ -505,8 +505,8 @@ int main(int argc, char **argv) { // gray, // k, // 0); - void *bfsDFG = __visc__launch(0, BFS_Wrapper, (void *)args); - __visc__wait(bfsDFG); + void *bfsDFG = __hpvm__launch(0, BFS_Wrapper, (void *)args); + __hpvm__wait(bfsDFG); // Swap q1 and q2 // Swap q1 and q2 int *temp = args->q1; @@ -518,22 +518,22 @@ int main(int argc, char **argv) { // copy result from device to host pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(cost, bytes_cost); - llvm_visc_request_mem(color, bytes_cost); + llvm_hpvm_request_mem(cost, bytes_cost); + llvm_hpvm_request_mem(color, bytes_cost); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - llvm_visc_untrack_mem(graph_nodes); - llvm_visc_untrack_mem(graph_edges); - llvm_visc_untrack_mem(cost); - llvm_visc_untrack_mem(color); - llvm_visc_untrack_mem(q1); - llvm_visc_untrack_mem(q2); - llvm_visc_untrack_mem(tail); + llvm_hpvm_untrack_mem(graph_nodes); + llvm_hpvm_untrack_mem(graph_edges); + llvm_hpvm_untrack_mem(cost); + llvm_hpvm_untrack_mem(color); + llvm_hpvm_untrack_mem(q1); + llvm_hpvm_untrack_mem(q2); + 
llvm_hpvm_untrack_mem(tail); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); // Store the result into a file // FIXME: color is not even printed. Why are we reading it back?? diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll index 9abdb29a3c9cb7f4dc641d278fd8e1e001433c44..aca5667b70e9f612d833f06a8482be0a312173cc 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/bfs/src/opencl_base/kernel.cl' +; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/bfs/src/opencl_base/kernel.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp index 9b8b502688abb01934b337bc7fb178b32fda4633..8e0d34c4b8e070958d47e517bec3dedbfd9c6403 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp @@ -237,7 +237,7 @@ int main(int argc, char **argv) { NULL, NULL)); printf("Starting GPU kernel\n"); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); int num_of_blocks; int num_of_threads_per_block; @@ -272,7 +272,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL)); - pb_SwitchToTimer(&timers, 
visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); if (num_t == 0) { // frontier is empty break; } diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp index 3f9bc775574f597bdcf69c6999553c3c37bd352d..cfd0bf870a91988f5b0f67ffb3be2143e3b6e964 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp @@ -428,7 +428,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL(clSetKernelArg( BFS_kernel_S, 14, MAX_THREADS_PER_BLOCK * sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 15, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_S, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); @@ -458,7 +458,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1, 0, grid, block, 0, 0, 0)); @@ -490,7 +490,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL( clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); @@ -542,7 +542,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 16, sizeof(int), NULL)); 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); @@ -572,7 +572,7 @@ int main(int argc, char **argv) { clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); diff --git a/hpvm/test/parboil/benchmarks/cutcp/Makefile b/hpvm/test/parboil/benchmarks/cutcp/Makefile index 5e56793360aa479f604883f63b41a3ab8bb0cc58..e8edc6e7314b4b41d4712d6e4433ffc321f3f082 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/Makefile +++ b/hpvm/test/parboil/benchmarks/cutcp/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = cutcp -# Default compile visc +# Default compile hpvm ifeq ($(VERSION),) - VERSION = visc + VERSION = hpvm endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile similarity index 85% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile index d4c650a17e4261cd14a564f38bea3e9009b92dd3..43a175b947140200bc9415ccd421c198349ba32a 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile +++ b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=excl.ll cutcpu.ll cutoff6overlap.ll output.ll readatom.ll ocl.ll -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/atom.h similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/atom.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutcpu.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutcpu.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff.h similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff6overlap.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff6overlap.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/excl.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/excl.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/kernel.cl rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/macros.h 
similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/macros.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp similarity index 82% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp index caf99a5b37daaa28af83cd058c138af1270feff9..0a36196619a5013108c9bf3656ab2ce90fcfc710 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp @@ -16,7 +16,7 @@ #include "cutoff.h" #include "macros.h" #include "output.h" -#include <visc.h> +#include <hpvm.h> #define ERRTOL 1e-4f @@ -54,11 +54,11 @@ extern float rsqrt(float x); void Allocation(long block) { // Memory shared between threadblocks size_t bytes_AtomBinCache = sizeof(float) * BIN_CACHE_MAXLEN * BIN_DEPTH * 4; - void *AtomBinCache = __visc__malloc(bytes_AtomBinCache); + void *AtomBinCache = __hpvm__malloc(bytes_AtomBinCache); size_t bytes_myBinIndex = sizeof(xyz); - void *myBinIndex = __visc__malloc(bytes_myBinIndex); - __visc__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex, + void *myBinIndex = __hpvm__malloc(bytes_myBinIndex); + __hpvm__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex, bytes_myBinIndex); } @@ -76,21 +76,21 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, // local memory args float *AtomBinCache, size_t bytes_AtomBinCache, int *myBinIndex, size_t bytes_myBinIndex) { - __visc__hint(visc::DEVICE); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int ly = 
__visc__getNodeInstanceID_y(thisNode); - int lz = __visc__getNodeInstanceID_z(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); - int dimx = __visc__getNumNodeInstances_x(thisNode); - int dimy = __visc__getNumNodeInstances_y(thisNode); - int gdimx = __visc__getNumNodeInstances_x(parentNode); - int gdimy = __visc__getNumNodeInstances_y(parentNode); + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); + int lx = __hpvm__getNodeInstanceID_x(thisNode); + int ly = __hpvm__getNodeInstanceID_y(thisNode); + int lz = __hpvm__getNodeInstanceID_z(thisNode); + int gx = __hpvm__getNodeInstanceID_x(parentNode); + int gy = __hpvm__getNodeInstanceID_y(parentNode); + int dimx = __hpvm__getNumNodeInstances_x(thisNode); + int dimy = __hpvm__getNumNodeInstances_y(thisNode); + int gdimx = __hpvm__getNumNodeInstances_x(parentNode); + int gdimy = __hpvm__getNumNodeInstances_y(parentNode); float *binZeroAddr = binBaseAddr + 4 * offset; @@ -168,7 +168,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, AtomBinCache[binIndex + tidmask + 16] = p_global[tidmask + 16]; } - __visc__barrier(); + __hpvm__barrier(); /* no warp divergence */ if (totalbins + BIN_CACHE_MAXLEN > *NbrListLen) { numbins = *NbrListLen - totalbins; @@ -196,7 +196,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); energy0 += aq * rsqrt(r2) * s * s; - // energy0 += aq * (1.0/__visc__sqrt(r2)) * s * s; + // energy0 += aq * (1.0/__hpvm__sqrt(r2)) * s * s; } #else energy0 += (r2 < cutoff2); @@ -208,7 +208,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); energy1 += aq * rsqrt(r2) * s * s; - // energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s; + // energy1 += aq * (1.0/__hpvm__sqrt(r2)) * s * s; } #else energy1 += (r2 < cutoff2); @@ -219,7 +219,7 @@ void CUTCPLeaf(int 
binDim_x, int binDim_y, float *binBaseAddr, if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); energy2 += aq * rsqrt(r2) * s * s; - // energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s; + // energy2 += aq * (1.0/__hpvm__sqrt(r2)) * s * s; } #else energy2 += (r2 < cutoff2); @@ -237,7 +237,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, #endif } /* end loop over atoms in bin */ } /* end loop over cached atom bins */ - __visc__barrier(); + __hpvm__barrier(); } /* end loop over neighbor list */ /* store into global memory */ @@ -260,38 +260,38 @@ void BlockingCUTCP(int binDim_x, int binDim_y, float4 *binBaseAddr, size_t bytes_NbrList, long blockx, long blocky, long blockz) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - void *AllocationNode = __visc__createNodeND(0, Allocation); + void *AllocationNode = __hpvm__createNodeND(0, Allocation); void *CUTCPLeafNode = - __visc__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz); + __hpvm__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz); // Bind Inputs - __visc__bindIn(AllocationNode, 15, 0, 0); // Bind blockx - __visc__bindIn(CUTCPLeafNode, 0, 0, 0); // Bind binDim_x - __visc__bindIn(CUTCPLeafNode, 1, 1, 0); // Bind binDim_y - __visc__bindIn(CUTCPLeafNode, 2, 2, 0); // Bind binBaseAddr - __visc__bindIn(CUTCPLeafNode, 3, 3, 0); // Bind bytes_binBaseAddr - __visc__bindIn(CUTCPLeafNode, 4, 4, 0); // Bind offset - __visc__bindIn(CUTCPLeafNode, 5, 5, 0); // Bind h - __visc__bindIn(CUTCPLeafNode, 6, 6, 0); // Bind cutoff2 - __visc__bindIn(CUTCPLeafNode, 7, 7, 0); // Bind inv_cutoff2 - __visc__bindIn(CUTCPLeafNode, 8, 8, 0); // Bind regionZeroAddr - __visc__bindIn(CUTCPLeafNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __visc__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex - __visc__bindIn(CUTCPLeafNode, 11, 
11, 0); // Bind NbrListLen - __visc__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen - __visc__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList - __visc__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList + __hpvm__bindIn(AllocationNode, 15, 0, 0); // Bind blockx + __hpvm__bindIn(CUTCPLeafNode, 0, 0, 0); // Bind binDim_x + __hpvm__bindIn(CUTCPLeafNode, 1, 1, 0); // Bind binDim_y + __hpvm__bindIn(CUTCPLeafNode, 2, 2, 0); // Bind binBaseAddr + __hpvm__bindIn(CUTCPLeafNode, 3, 3, 0); // Bind bytes_binBaseAddr + __hpvm__bindIn(CUTCPLeafNode, 4, 4, 0); // Bind offset + __hpvm__bindIn(CUTCPLeafNode, 5, 5, 0); // Bind h + __hpvm__bindIn(CUTCPLeafNode, 6, 6, 0); // Bind cutoff2 + __hpvm__bindIn(CUTCPLeafNode, 7, 7, 0); // Bind inv_cutoff2 + __hpvm__bindIn(CUTCPLeafNode, 8, 8, 0); // Bind regionZeroAddr + __hpvm__bindIn(CUTCPLeafNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __hpvm__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex + __hpvm__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen + __hpvm__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen + __hpvm__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList + __hpvm__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList // Create Edges - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16, + __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache + __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16, 0); // Edge bytes_AtomBinCache - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18, + __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex + __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18, 0); // Edge bytes_myBinIndex } @@ -370,32 +370,32 @@ void CUTCPRoot(int binDim_x, int binDim_y, float4 *binBaseAddr, int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, size_t 
bytes_NbrList, long blockx, long blocky, long blockz, long gridx, long gridy, long gridz) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); void *BlockingCUTCPNode = - __visc__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz); + __hpvm__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz); // Bind Inputs - __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x - __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y - __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset - __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h - __visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 - __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 - __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex - __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen - __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen - __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList - __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList - __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx - __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky - __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz + __hpvm__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x + __hpvm__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y + __hpvm__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr + __hpvm__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr + __hpvm__bindIn(BlockingCUTCPNode, 4, 4, 0); // 
Bind offset + __hpvm__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h + __hpvm__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 + __hpvm__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 + __hpvm__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr + __hpvm__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __hpvm__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex + __hpvm__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen + __hpvm__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen + __hpvm__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList + __hpvm__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList + __hpvm__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx + __hpvm__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky + __hpvm__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz } void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr, @@ -410,34 +410,34 @@ void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr, int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, size_t bytes_NbrList, long blockx, long blocky, long blockz, long gridx, long gridy, long gridz) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - void *BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot); + void *BlockingCUTCPNode = __hpvm__createNodeND(0, CUTCPRoot); // Bind Inputs - __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x - __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y - __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset - __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h - __visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // 
Bind cutoff2 - __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 - __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex - __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen - __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen - __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList - __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList - __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx - __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky - __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz - __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx - __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy - __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz + __hpvm__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x + __hpvm__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y + __hpvm__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr + __hpvm__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr + __hpvm__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset + __hpvm__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h + __hpvm__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 + __hpvm__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 + __hpvm__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr + __hpvm__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __hpvm__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex + __hpvm__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen + __hpvm__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen + __hpvm__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList + __hpvm__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList + __hpvm__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx 
+ __hpvm__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky + __hpvm__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz + __hpvm__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx + __hpvm__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy + __hpvm__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz } // ==================== Host Code ============================== @@ -546,7 +546,7 @@ int main(int argc, char *argv[]) { } pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -586,7 +586,7 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); /* Print output */ // pb_SwitchToTimer(&timers, pb_TimerID_IO); @@ -873,11 +873,11 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("\n"); } - // Track visc data - llvm_visc_track_mem(regionZeroAddr, bytes_regionZeroAddr); - llvm_visc_track_mem(binBaseAddr, bytes_binBaseAddr); - llvm_visc_track_mem(nbrlistlen, sizeof(int)); - llvm_visc_track_mem(nbrlist, bytes_nbrlist); + // Track hpvm data + llvm_hpvm_track_mem(regionZeroAddr, bytes_regionZeroAddr); + llvm_hpvm_track_mem(binBaseAddr, bytes_binBaseAddr); + llvm_hpvm_track_mem(nbrlistlen, sizeof(int)); + llvm_hpvm_track_mem(nbrlist, bytes_nbrlist); /* setup OpenCL kernel parameters */ blockDim[0] = 8; @@ -914,7 +914,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( gridDim[1], gridDim[2]); /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ - pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); void *CUTCP_DFG; if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); @@ -926,9 +926,9 @@ int gpu_compute_cutoff_potential_lattice6overlap( args->zRegionIndex = zRegionIndex; - CUTCP_DFG = __visc__launch(0, CUTCPWrapper, (void *)args); - __visc__wait(CUTCP_DFG); - // llvm_visc_request_mem(regionZeroAddr, 
lnall*sizeof(ener_t)); + CUTCP_DFG = __hpvm__launch(0, CUTCPWrapper, (void *)args); + __hpvm__wait(CUTCP_DFG); + // llvm_hpvm_request_mem(regionZeroAddr, lnall*sizeof(ener_t)); } /* @@ -941,14 +941,14 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("computing extra atoms on CPU\n"); } - pb_SwitchToTimer(timers, visc_TimerID_MISC); + pb_SwitchToTimer(timers, hpvm_TimerID_MISC); if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) { fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed " "for extra atoms\n"); return -1; } - pb_SwitchToTimer(timers, visc_TimerID_MISC); + pb_SwitchToTimer(timers, hpvm_TimerID_MISC); printf("\n"); } if (verbose) @@ -957,7 +957,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* copy result regions from OpenCL device */ pb_SwitchToTimer(timers, pb_TimerID_COPY); - llvm_visc_request_mem(regionZeroAddr, lnall * sizeof(ener_t)); + llvm_hpvm_request_mem(regionZeroAddr, lnall * sizeof(ener_t)); /* * transpose on CPU, updating, producing the final lattice diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.h similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.h similarity index 100% rename from 
hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/readatom.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/readatom.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c index 06f856c1a0fa43dc95cb896450baa42f74c047fd..dfd7f1ff388be0c0a51dadbeee80345355c8bf4c 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c @@ -427,7 +427,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( /*CHECK_ERROR("clCreateCommandQueue")*/ /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ - pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc.cl b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc.cl rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm.cl diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll similarity index 99% rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll index 
7f614e66ff564c661e2388c7e9aef6d70eb4add8..85a73b291f407feae2d407385679fc0bb05b589f 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel_visc.cl' +; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel_hpvm.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.spir b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.spir similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.spir rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.spir diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll index 370e3c0f8ffec89a85e9a884a4ebcea7664a5723..5a3c1fcd5d853dcda7ba55a9a9ab84a376b1a2f0 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel.cl' +; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel.cl' target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c index 96ebeafbdf377a2d2e6e8e7f2cf5e1e58a3e7a6a..076532b709b6fa49a552f777975f596fc72e2ed3 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c @@ -423,7 +423,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( /*CHECK_ERROR("clCreateCommandQueue")*/ /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ - pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { diff --git a/hpvm/test/parboil/benchmarks/lbm/Makefile b/hpvm/test/parboil/benchmarks/lbm/Makefile index 4ebf6fc0af2f05cd10f6d556e0b52bee186540d8..af7215ff7039795e2d09ce98af675a851b32b0cb 100644 --- a/hpvm/test/parboil/benchmarks/lbm/Makefile +++ b/hpvm/test/parboil/benchmarks/lbm/Makefile @@ -5,9 +5,9 @@ ifeq ($(NUM_CORES),) NUM_CORES=8 endif -# Default compile visc +# Default compile hpvm ifeq ($(VERSION),) - VERSION = visc + VERSION = hpvm endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile similarity index 85% rename from hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile index d1664ee9880312ccfa2677e6a284851ecadf1f24..5aa206f758e87a94cdaa1cbaadfa3bf9b661d120 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=lbm.ll -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES) APP_CXXFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES) diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp similarity index 86% rename from hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp index b51864366b500fc796d9073fe1893be2f402797f..32db8e9b2c4d153a28ee1da2dd91877ba2b2a680 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp @@ -8,11 +8,11 @@ 
/*############################################################################*/ +#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <sys/stat.h> -#include <visc.h> #include "layout_config.h" #include "lbm.h" @@ -92,18 +92,18 @@ typedef struct __attribute__((__packed__)) { void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG) { - __visc__hint(visc::DEVICE); - __visc__attributes(2, srcG, dstG, 1, dstG); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(2, srcG, dstG, 1, dstG); - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); srcG += MARGIN; dstG += MARGIN; - int lx = __visc__getNodeInstanceID_x(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); + int lx = __hpvm__getNodeInstanceID_x(thisNode); + int gx = __hpvm__getNodeInstanceID_x(parentNode); + int gy = __hpvm__getNodeInstanceID_y(parentNode); // Using some predefined macros here. 
Consider this the declaration // and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z @@ -274,40 +274,40 @@ void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG, void lbmLvl1(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, size_t dim_X1) { - __visc__hint(visc::DEVICE); - __visc__attributes(2, srcG, dstG, 1, dstG); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(2, srcG, dstG, 1, dstG); void *lbm_node = - __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1); - __visc__bindIn(lbm_node, 0, 0, 0); - __visc__bindIn(lbm_node, 1, 1, 0); - __visc__bindIn(lbm_node, 2, 2, 0); - __visc__bindIn(lbm_node, 3, 3, 0); + __hpvm__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1); + __hpvm__bindIn(lbm_node, 0, 0, 0); + __hpvm__bindIn(lbm_node, 1, 1, 0); + __hpvm__bindIn(lbm_node, 2, 2, 0); + __hpvm__bindIn(lbm_node, 3, 3, 0); } void lbmLvl2(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, srcG, dstG, 1, dstG); - void *lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2); - __visc__bindIn(lbm_node, 0, 0, 0); - __visc__bindIn(lbm_node, 1, 1, 0); - __visc__bindIn(lbm_node, 2, 2, 0); - __visc__bindIn(lbm_node, 3, 3, 0); - __visc__bindIn(lbm_node, 4, 4, 0); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, srcG, dstG, 1, dstG); + void *lbm_node = __hpvm__createNodeND(2, lbmLvl1, dim_X2, dim_Y2); + __hpvm__bindIn(lbm_node, 0, 0, 0); + __hpvm__bindIn(lbm_node, 1, 1, 0); + __hpvm__bindIn(lbm_node, 2, 2, 0); + __hpvm__bindIn(lbm_node, 3, 3, 0); + __hpvm__bindIn(lbm_node, 4, 4, 0); } void lbmLvl3(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, srcG, dstG, 1, dstG); - void *lbm_node = __visc__createNodeND(0, lbmLvl2); - __visc__bindIn(lbm_node, 0, 0, 0); - 
__visc__bindIn(lbm_node, 1, 1, 0); - __visc__bindIn(lbm_node, 2, 2, 0); - __visc__bindIn(lbm_node, 3, 3, 0); - __visc__bindIn(lbm_node, 4, 4, 0); - __visc__bindIn(lbm_node, 5, 5, 0); - __visc__bindIn(lbm_node, 6, 6, 0); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, srcG, dstG, 1, dstG); + void *lbm_node = __hpvm__createNodeND(0, lbmLvl2); + __hpvm__bindIn(lbm_node, 0, 0, 0); + __hpvm__bindIn(lbm_node, 1, 1, 0); + __hpvm__bindIn(lbm_node, 2, 2, 0); + __hpvm__bindIn(lbm_node, 3, 3, 0); + __hpvm__bindIn(lbm_node, 4, 4, 0); + __hpvm__bindIn(lbm_node, 5, 5, 0); + __hpvm__bindIn(lbm_node, 6, 6, 0); } __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src, @@ -321,9 +321,9 @@ __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src, RootIn root_in_local = {src - MARGIN, size, dst - MARGIN, size, SIZE_X, SIZE_Y, SIZE_Z}; *(RootIn *)root_in = root_in_local; - void *lbmDFG = __visc__launch(0, lbmLvl3, root_in); + void *lbmDFG = __hpvm__launch(0, lbmLvl3, root_in); - __visc__wait(lbmDFG); + __hpvm__wait(lbmDFG); } void MAIN_initialize(const MAIN_Param *param) { @@ -379,12 +379,12 @@ int main(int nArgs, char *arg[]) { MAIN_initialize(&param); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(srcGrid - MARGIN, size); - llvm_visc_track_mem(dstGrid - MARGIN, size); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); + llvm_hpvm_track_mem(srcGrid - MARGIN, size); + llvm_hpvm_track_mem(dstGrid - MARGIN, size); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); for (t = 1; t <= param.nTimeSteps; t++) { @@ -404,15 +404,15 @@ int main(int nArgs, char *arg[]) { } pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(srcGrid - MARGIN, size); + llvm_hpvm_request_mem(srcGrid - MARGIN, size); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(srcGrid -
MARGIN); - llvm_visc_untrack_mem(dstGrid - MARGIN); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + llvm_hpvm_untrack_mem(srcGrid - MARGIN); + llvm_hpvm_untrack_mem(dstGrid - MARGIN); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ MAIN_finalize(&param); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/visc/main.h rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c index 59aa8daf9a018348274e20653c9c92f6995a96e4..a55f0ce785e635e1c840de8000a68b85b7295807 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c @@ -79,7 +79,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); clFinish(prm.clCommandQueue); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -101,7 +101,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c index d93a919df300c520c7105612cc54f9684f052678..64fe482b81503c4ef4ac5a88f9b0eb0a16f9a806 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c +++
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c index d93a919df300c520c7105612cc54f9684f052678..64fe482b81503c4ef4ac5a88f9b0eb0a16f9a806 100644 --- 
a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c index 
18320b7394e5d499339ee820a992b00acd9b368e..54399ee119a6c905baffae6c116ba890cafe44a8 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c @@ -79,7 +79,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); for (t = 1; t <= param.nTimeSteps; t++) { - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c index 5e43b754279910d3ca3b45d40184df666138f9e5..6d682e98e6c4df3b05bb197ef36a21623b545f96 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); 
OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c index e66cb2c47cc5bd1f62d774952a7e2397005f1e47..9dc95e7d856a5425f84d4063d7a7ba7bfddcebf6 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 4; i++) { for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + 
pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/sgemm/Makefile b/hpvm/test/parboil/benchmarks/sgemm/Makefile index ace9ded22b6ef365c9cd0f6262245dd2e086643d..4757432d224ea5a1aaa762bfc89c1c89e869bd32 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = sgemm -# Default compile visc +# Default compile hpvm ifeq ($(VERSION),) - VERSION = visc_sh + VERSION = hpvm_sh endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile index d1f6c96d0c279bc2f2e3e70313369d49881b62b8..6e63f8384190ff75c281592df1ab3843b017d07f 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O1 APP_CXXFLAGS=-ffast-math -O1 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc similarity index 69% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc index 627f5a82412374cff4a9061620ce1f27ea3c14a6..de36705707d7062b4cef2042197902c2c415e312 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc @@ -10,6 +10,7 @@ * Main entry of dense matrix-matrix multiplication kernel */ +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -19,7 +20,6 @@ #include <string.h> #include <sys/time.h> #include <vector> -#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -57,17 +57,17 @@ typedef struct __attribute__((__packed__)) { void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta) { - __visc__hint(visc::DEVICE); - __visc__attributes(3, A, B, C, 1, C); - - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int ly = 
__visc__getNodeInstanceID_y(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); - int gridx = __visc__getNumNodeInstances_x(thisNode); - int gridy = __visc__getNumNodeInstances_y(thisNode); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(3, A, B, C, 1, C); + + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); + int lx = __hpvm__getNodeInstanceID_x(thisNode); + int ly = __hpvm__getNodeInstanceID_y(thisNode); + int gx = __hpvm__getNodeInstanceID_x(parentNode); + int gy = __hpvm__getNodeInstanceID_y(parentNode); + int gridx = __hpvm__getNumNodeInstances_x(thisNode); + int gridy = __hpvm__getNumNodeInstances_y(thisNode); int m = gx * gridx + lx; int n = gy * gridy + ly; @@ -83,46 +83,46 @@ void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, void basicSgemmLvl1(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1) { - __visc__hint(visc::DEVICE); - __visc__attributes(3, A, B, C, 1, C); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(3, A, B, C, 1, C); void *sgemm_node = - __visc__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1); - __visc__bindIn(sgemm_node, 0, 0, 0); - __visc__bindIn(sgemm_node, 1, 1, 0); - __visc__bindIn(sgemm_node, 2, 2, 0); - __visc__bindIn(sgemm_node, 3, 3, 0); - __visc__bindIn(sgemm_node, 4, 4, 0); - __visc__bindIn(sgemm_node, 5, 5, 0); - __visc__bindIn(sgemm_node, 6, 6, 0); - __visc__bindIn(sgemm_node, 7, 7, 0); - __visc__bindIn(sgemm_node, 8, 8, 0); - __visc__bindIn(sgemm_node, 9, 9, 0); - __visc__bindIn(sgemm_node, 10, 10, 0); - __visc__bindIn(sgemm_node, 11, 11, 0); + __hpvm__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1); + __hpvm__bindIn(sgemm_node, 0, 0, 0); + __hpvm__bindIn(sgemm_node, 1, 1, 0); + __hpvm__bindIn(sgemm_node, 2, 2, 0); + __hpvm__bindIn(sgemm_node, 3, 3, 0); + 
__hpvm__bindIn(sgemm_node, 4, 4, 0); + __hpvm__bindIn(sgemm_node, 5, 5, 0); + __hpvm__bindIn(sgemm_node, 6, 6, 0); + __hpvm__bindIn(sgemm_node, 7, 7, 0); + __hpvm__bindIn(sgemm_node, 8, 8, 0); + __hpvm__bindIn(sgemm_node, 9, 9, 0); + __hpvm__bindIn(sgemm_node, 10, 10, 0); + __hpvm__bindIn(sgemm_node, 11, 11, 0); } void basicSgemmLvl2(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); void *sgemm_node = - __visc__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2); - __visc__bindIn(sgemm_node, 0, 0, 0); - __visc__bindIn(sgemm_node, 1, 1, 0); - __visc__bindIn(sgemm_node, 2, 2, 0); - __visc__bindIn(sgemm_node, 3, 3, 0); - __visc__bindIn(sgemm_node, 4, 4, 0); - __visc__bindIn(sgemm_node, 5, 5, 0); - __visc__bindIn(sgemm_node, 6, 6, 0); - __visc__bindIn(sgemm_node, 7, 7, 0); - __visc__bindIn(sgemm_node, 8, 8, 0); - __visc__bindIn(sgemm_node, 9, 9, 0); - __visc__bindIn(sgemm_node, 10, 10, 0); - __visc__bindIn(sgemm_node, 11, 11, 0); - __visc__bindIn(sgemm_node, 12, 12, 0); - __visc__bindIn(sgemm_node, 13, 13, 0); + __hpvm__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2); + __hpvm__bindIn(sgemm_node, 0, 0, 0); + __hpvm__bindIn(sgemm_node, 1, 1, 0); + __hpvm__bindIn(sgemm_node, 2, 2, 0); + __hpvm__bindIn(sgemm_node, 3, 3, 0); + __hpvm__bindIn(sgemm_node, 4, 4, 0); + __hpvm__bindIn(sgemm_node, 5, 5, 0); + __hpvm__bindIn(sgemm_node, 6, 6, 0); + __hpvm__bindIn(sgemm_node, 7, 7, 0); + __hpvm__bindIn(sgemm_node, 8, 8, 0); + __hpvm__bindIn(sgemm_node, 9, 9, 0); + __hpvm__bindIn(sgemm_node, 10, 10, 0); + __hpvm__bindIn(sgemm_node, 11, 11, 0); + __hpvm__bindIn(sgemm_node, 12, 12, 0); + __hpvm__bindIn(sgemm_node, 13, 13, 0); } // A wrapper level used in codegen 
for some backends @@ -130,25 +130,25 @@ void basicSgemmLvl3(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void *sgemm_node = __visc__createNodeND(0, basicSgemmLvl2); - __visc__bindIn(sgemm_node, 0, 0, 0); - __visc__bindIn(sgemm_node, 1, 1, 0); - __visc__bindIn(sgemm_node, 2, 2, 0); - __visc__bindIn(sgemm_node, 3, 3, 0); - __visc__bindIn(sgemm_node, 4, 4, 0); - __visc__bindIn(sgemm_node, 5, 5, 0); - __visc__bindIn(sgemm_node, 6, 6, 0); - __visc__bindIn(sgemm_node, 7, 7, 0); - __visc__bindIn(sgemm_node, 8, 8, 0); - __visc__bindIn(sgemm_node, 9, 9, 0); - __visc__bindIn(sgemm_node, 10, 10, 0); - __visc__bindIn(sgemm_node, 11, 11, 0); - __visc__bindIn(sgemm_node, 12, 12, 0); - __visc__bindIn(sgemm_node, 13, 13, 0); - __visc__bindIn(sgemm_node, 14, 14, 0); - __visc__bindIn(sgemm_node, 15, 15, 0); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); + void *sgemm_node = __hpvm__createNodeND(0, basicSgemmLvl2); + __hpvm__bindIn(sgemm_node, 0, 0, 0); + __hpvm__bindIn(sgemm_node, 1, 1, 0); + __hpvm__bindIn(sgemm_node, 2, 2, 0); + __hpvm__bindIn(sgemm_node, 3, 3, 0); + __hpvm__bindIn(sgemm_node, 4, 4, 0); + __hpvm__bindIn(sgemm_node, 5, 5, 0); + __hpvm__bindIn(sgemm_node, 6, 6, 0); + __hpvm__bindIn(sgemm_node, 7, 7, 0); + __hpvm__bindIn(sgemm_node, 8, 8, 0); + __hpvm__bindIn(sgemm_node, 9, 9, 0); + __hpvm__bindIn(sgemm_node, 10, 10, 0); + __hpvm__bindIn(sgemm_node, 11, 11, 0); + __hpvm__bindIn(sgemm_node, 12, 12, 0); + __hpvm__bindIn(sgemm_node, 13, 13, 0); + __hpvm__bindIn(sgemm_node, 14, 14, 0); + __hpvm__bindIn(sgemm_node, 15, 15, 0); } __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, @@ -194,8 +194,8 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, dg[0] / db[0], dg[1] / 
db[1]}; *(RootIn *)root_in = root_in_local; - void *sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in); - __visc__wait(sgemmDFG); + void *sgemmDFG = __hpvm__launch(0, basicSgemmLvl3, root_in); + __hpvm__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -233,7 +233,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -246,9 +246,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + llvm_hpvm_track_mem(&matA.front(), A_sz); + llvm_hpvm_track_mem(&matBT.front(), B_sz); + llvm_hpvm_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -263,16 +263,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(&matC.front(), C_sz); + llvm_hpvm_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + llvm_hpvm_untrack_mem(&matA.front()); + llvm_hpvm_untrack_mem(&matBT.front()); + llvm_hpvm_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile index 
f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc index 62f9285e8a8054e5597fe45adc5257470b147622..a1db2e56a5c5639319d7be5f6a890d44c3a28421 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc @@ -10,6 +10,7 @@ * Main entry of dense matrix-matrix multiplication kernel */ +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -19,7 +20,6 @@ #include <string.h> #include <sys/time.h> #include <vector> -#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -42,8 +42,8 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, 
float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __visc__hint(visc::GPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); + __hpvm__hint(hpvm::GPU_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); float c[TILE_N]; for (int i = 0; i < TILE_N; i++) @@ -96,10 +96,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, // unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; unsigned dg[2] = {m * db[0] / TILE_M, n * db[1] / TILE_N}; - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + __hpvm__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -142,9 +142,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + llvm_hpvm_track_mem(&matA.front(), A_sz); + llvm_hpvm_track_mem(&matBT.front(), B_sz); + llvm_hpvm_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -159,16 +159,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(&matC.front(), C_sz); + llvm_hpvm_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, 
hpvm_TimerID_MEM_UNTRACK); + llvm_hpvm_untrack_mem(&matA.front()); + llvm_hpvm_untrack_mem(&matBT.front()); + llvm_hpvm_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile similarity index 86% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile index a0fd0e95753970ad1c0db1038cf243635d259899..f81bac47072bc017dcdcdccf373cdfbd0f21ceac 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc similarity index 65% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc index 05d143b5884164926213ca060da341a254399bf3..de0d473ed6fe6724ef81f99b13e02d0de29b103b 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc @@ -10,6 +10,7 @@ * Main entry of dense matrix-matrix multiplication kernel */ +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -19,7 
+20,6 @@ #include <string.h> #include <sys/time.h> #include <vector> -#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -82,29 +82,29 @@ void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B, } void Allocation(long block_x, long block_y) { - void *shB = __visc__malloc(block_x * block_y * sizeof(float)); - __visc__return(2, shB, block_x * block_y * sizeof(float)); + void *shB = __hpvm__malloc(block_x * block_y * sizeof(float)); + __hpvm__return(2, shB, block_x * block_y * sizeof(float)); } void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k, float alpha, float beta, float *shB, size_t bytesshB) { - __visc__hint(visc::DEVICE); - //__visc__hint(visc::SPIR_TARGET); - //__visc__hint(visc::GPU_TARGET); + __hpvm__hint(hpvm::DEVICE); + //__hpvm__hint(hpvm::SPIR_TARGET); + //__hpvm__hint(hpvm::GPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); + __hpvm__attributes(3, A, B, C, 1, C); - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); - long lx = __visc__getNodeInstanceID_x(thisNode); - long ly = __visc__getNodeInstanceID_y(thisNode); + long lx = __hpvm__getNodeInstanceID_x(thisNode); + long ly = __hpvm__getNodeInstanceID_y(thisNode); - long gx = __visc__getNodeInstanceID_x(parentNode); - long gy = __visc__getNodeInstanceID_y(parentNode); + long gx = __hpvm__getNodeInstanceID_x(parentNode); + long gy = __hpvm__getNodeInstanceID_y(parentNode); - long dimx = __visc__getNumNodeInstances_x(thisNode); + long dimx = __hpvm__getNumNodeInstances_x(thisNode); float c[TILE_N]; for (int i = 0; i < TILE_N; i++) @@ -119,7 +119,7 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, // shB[ly][lx] = B[n+(i+ly)*ldb]; shB[ly * dimx + lx] = B[n + (i + ly) * ldb]; - __visc__barrier(); + 
__hpvm__barrier(); for (int j = 0; j < TILE_TB_HEIGHT; j++) { a = A[m + (i + j) * lda]; for (int kk = 0; kk < TILE_N; kk++) { @@ -127,7 +127,7 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, c[kk] += a * shB[j * dimx + kk]; } } - __visc__barrier(); + __hpvm__barrier(); } int t = ldc * gy * TILE_N + m; @@ -140,31 +140,31 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k, float alpha, float beta, long block_x, long block_y) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void *AllocationNode = __visc__createNodeND(0, Allocation); - void *SgemmLeafNode = __visc__createNodeND(2, SgemmLeaf, block_x, block_y); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); + void *AllocationNode = __hpvm__createNodeND(0, Allocation); + void *SgemmLeafNode = __hpvm__createNodeND(2, SgemmLeaf, block_x, block_y); // Bind edges - __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA - __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc - __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k - __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha - __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta - - __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x - __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y + __hpvm__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A + __hpvm__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA + __hpvm__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda + 
__hpvm__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B + __hpvm__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB + __hpvm__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb + __hpvm__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C + __hpvm__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC + __hpvm__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc + __hpvm__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k + __hpvm__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha + __hpvm__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta + + __hpvm__bindIn(AllocationNode, 12, 0, 0); // Bind block_x + __hpvm__bindIn(AllocationNode, 13, 1, 0); // Bind block_y // Create Edges between AllocationNode and BFSLeafNodeNode - __visc__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B - __visc__edge(AllocationNode, SgemmLeafNode, 1, 1, 13, + __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B + __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 1, 13, 0); // Edge bytes_local_B } @@ -175,25 +175,25 @@ void SgemmRoot(float *A, size_t bytesA, int lda, // 0-2 int k, float alpha, float beta, // 9-11 long block_x, long block_y, long grid_x, long grid_y // 12-15 ) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void *SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); + void *SgemmTBNode = __hpvm__createNodeND(2, SgemmTB, grid_x, grid_y); // Bind edges - __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA - __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc - __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k - 
__visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha - __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta - __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x - __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y + __hpvm__bindIn(SgemmTBNode, 0, 0, 0); // Bind A + __hpvm__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA + __hpvm__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda + __hpvm__bindIn(SgemmTBNode, 3, 3, 0); // Bind B + __hpvm__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB + __hpvm__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb + __hpvm__bindIn(SgemmTBNode, 6, 6, 0); // Bind C + __hpvm__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC + __hpvm__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc + __hpvm__bindIn(SgemmTBNode, 9, 9, 0); // Bind k + __hpvm__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha + __hpvm__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta + __hpvm__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x + __hpvm__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y } void SgemmWrapper(float *A, size_t bytesA, int lda, // 0-2 @@ -202,27 +202,27 @@ void SgemmWrapper(float *A, size_t bytesA, int lda, // 0-2 int k, float alpha, float beta, // 9-11 long block_x, long block_y, long grid_x, long grid_y // 12-15 ) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void *SgemmRootNode = __visc__createNodeND(0, SgemmRoot); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); + void *SgemmRootNode = __hpvm__createNodeND(0, SgemmRoot); // Bind edges - __visc__bindIn(SgemmRootNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA - __visc__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmRootNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmRootNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc - 
__visc__bindIn(SgemmRootNode, 9, 9, 0); // Bind k - __visc__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha - __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta - __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x - __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y - __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x - __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y + __hpvm__bindIn(SgemmRootNode, 0, 0, 0); // Bind A + __hpvm__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA + __hpvm__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda + __hpvm__bindIn(SgemmRootNode, 3, 3, 0); // Bind B + __hpvm__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB + __hpvm__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb + __hpvm__bindIn(SgemmRootNode, 6, 6, 0); // Bind C + __hpvm__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC + __hpvm__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc + __hpvm__bindIn(SgemmRootNode, 9, 9, 0); // Bind k + __hpvm__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha + __hpvm__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta + __hpvm__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x + __hpvm__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y + __hpvm__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x + __hpvm__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y } // Creates root node for sgemm @@ -262,10 +262,10 @@ __attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers, packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, block_x, block_y, grid_x, grid_y); - pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); - void *sgemmDFG = __visc__launch(0, SgemmWrapper, (void *)args); + pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); + void *sgemmDFG = __hpvm__launch(0, SgemmWrapper, (void *)args); - __visc__wait(sgemmDFG); + __hpvm__wait(sgemmDFG); pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); } @@ -296,7 +296,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, 
matBT); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -309,9 +309,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + llvm_hpvm_track_mem(&matA.front(), A_sz); + llvm_hpvm_track_mem(&matBT.front(), B_sz); + llvm_hpvm_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -324,16 +324,16 @@ int main(int argc, char *argv[]) { C_sz, matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(&matC.front(), C_sz); + llvm_hpvm_request_mem(&matC.front(), C_sz); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); + llvm_hpvm_untrack_mem(&matA.front()); + llvm_hpvm_untrack_mem(&matBT.front()); + llvm_hpvm_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (params->outFile) { /* Write C to file */ diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile index f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc index 0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8..be39d713d55d1cb518083679fb1ea1ce717a4ca9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc @@ -10,6 +10,7 @@ * Main entry of dense matrix-matrix multiplication kernel */ +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -19,7 +20,6 @@ #include <string.h> #include <sys/time.h> #include <vector> -#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -40,7 +40,7 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __visc__attributes(3, A, B, C, 1, C); + __hpvm__attributes(3, A, B, C, 1, C); float c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0.0f; int m = 4 * get_global_id(0); @@ -90,10 +90,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_SZ / 4, TILE_SZ}; unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / 
db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + __hpvm__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -107,7 +107,7 @@ int main(int argc, char *argv[]) { std::vector<float> matA, matBT; pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ @@ -138,9 +138,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + llvm_hpvm_track_mem(&matA.front(), A_sz); + llvm_hpvm_track_mem(&matBT.front(), B_sz); + llvm_hpvm_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -158,22 +158,22 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); /* Write C to file */ - llvm_visc_request_mem(&matC.front(), C_sz); + llvm_hpvm_request_mem(&matC.front(), C_sz); pb_SwitchToTimer(&timers, pb_TimerID_IO); writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + llvm_hpvm_untrack_mem(&matA.front()); + llvm_hpvm_untrack_mem(&matBT.front()); + llvm_hpvm_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 << std::endl; pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); pb_FreeParameters(params); return 0; diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile index f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc index 0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8..be39d713d55d1cb518083679fb1ea1ce717a4ca9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc @@ -10,6 +10,7 @@ * Main entry of dense matrix-matrix multiplication kernel */ +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -19,7 +20,6 @@ #include <string.h> #include <sys/time.h> 
#include <vector> -#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -40,7 +40,7 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __visc__attributes(3, A, B, C, 1, C); + __hpvm__attributes(3, A, B, C, 1, C); float c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0.0f; int m = 4 * get_global_id(0); @@ -90,10 +90,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_SZ / 4, TILE_SZ}; unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + __hpvm__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -107,7 +107,7 @@ int main(int argc, char *argv[]) { std::vector<float> matA, matBT; pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); /* Read command line. 
Expect 3 inputs: A, B and B^T in column-major layout*/ @@ -138,9 +138,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + llvm_hpvm_track_mem(&matA.front(), A_sz); + llvm_hpvm_track_mem(&matBT.front(), B_sz); + llvm_hpvm_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -158,22 +158,22 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); /* Write C to file */ - llvm_visc_request_mem(&matC.front(), C_sz); + llvm_hpvm_request_mem(&matC.front(), C_sz); pb_SwitchToTimer(&timers, pb_TimerID_IO); writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + llvm_hpvm_untrack_mem(&matA.front()); + llvm_hpvm_untrack_mem(&matBT.front()); + llvm_hpvm_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 << std::endl; pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); pb_FreeParameters(params); return 0; diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile index f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc index 76d0cefc817ea28f2ffb15cd48d8dd5c7a97d0e0..286297d6fefe0b6f72bdc9e8a9079a131a7b16bf 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc +++ 
b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc @@ -10,6 +10,7 @@ * Main entry of dense matrix-matrix multiplication kernel */ +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -19,7 +20,6 @@ #include <string.h> #include <sys/time.h> #include <vector> -#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -41,8 +41,8 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __visc__hint(visc::GPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); + __hpvm__hint(hpvm::GPU_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); float c = 0.0f; int m = get_global_id(0); @@ -99,10 +99,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_SZ / VEC_SZ, TILE_SZ}; unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + __hpvm__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -145,9 +145,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + llvm_hpvm_track_mem(&matA.front(), A_sz); + llvm_hpvm_track_mem(&matBT.front(), B_sz); + llvm_hpvm_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -162,16 +162,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(&matC.front(), C_sz); + llvm_hpvm_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + llvm_hpvm_untrack_mem(&matA.front()); + llvm_hpvm_untrack_mem(&matBT.front()); + llvm_hpvm_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e --- /dev/null +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile @@ -0,0 +1,8 @@ +# (c) 2010 The Board of Trustees of the University of Illinois. 
+ +LANGUAGE=hpvm +SRCDIR_OBJS=io.ll #compute_gold.o +HPVM_OBJS=main.hpvm.ll +APP_CUDALDFLAGS=-lm -lstdc++ +APP_CFLAGS=-ffast-math -O3 +APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc similarity index 91% rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc index a4c252d8f183e76f91349d97872dbca0b3766acf..8fbc45e08a9e2fd1e3af6cc03360086b354665d7 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc @@ -10,6 +10,7 @@ * Main entry of dense matrix-matrix multiplication kernel */ +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -19,7 +20,6 @@ #include <string.h> #include <sys/time.h> #include <vector> -#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -42,8 +42,8 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __visc__hint(visc::SPIR_TARGET); - __visc__attributes(3, A, B, C, 1, C); + __hpvm__hint(hpvm::SPIR_TARGET); + __hpvm__attributes(3, A, B, C, 1, C); float c[TILE_N]; for (int i = 0; i < TILE_N; i++) @@ 
-135,10 +135,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_N, TILE_TB_HEIGHT}; unsigned dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; - void *sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + void *sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + __hpvm__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -181,10 +181,10 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); + llvm_hpvm_track_mem(&matA.front(), A_sz); + llvm_hpvm_track_mem(&matBT.front(), B_sz); + llvm_hpvm_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -200,16 +200,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(&matC.front(), C_sz); + llvm_hpvm_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + llvm_hpvm_untrack_mem(&matA.front()); + llvm_hpvm_untrack_mem(&matBT.front()); + llvm_hpvm_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); 
pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc index 5489f6a55ce6e8ba3676b0c98ad4b37ac7f4a7fd..e8d1c69ec9a63c3328f573195a66ceaa02b73aab 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc @@ -109,7 +109,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -212,7 +212,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc index 105baf590da13dd2ffc3cb803d63291daef0854d..4285a52a01adec3b17084c058ada68b6dbe23836 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -208,7 +208,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); 
clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc index f72c18c293c52e322a35814b13c000f9b64548b0..7edbf05a4bc423d2f30b01ebde457a02263d1fa0 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc index 744ee4096664e2f11620fae388a0a848a8cd49ac..cccec04beba6122632347b1339ec6caaeac16f29 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc @@ -110,7 +110,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -254,7 +254,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = 
clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc index 45ed8e942a1a69475b75a63a24b70655f1ffa2aa..36e7b93571c24aad59c206d18f69293689bf395f 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc @@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc index d8275be777079f1a57e585b3057685f737f38ed3..2cc311d1eff010bb3c4820bb517083ac33ad8c58 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc @@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - // pb_SwitchToTimer(&timers, 
visc_TimerID_COMPUTATION); + // pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); // for(int i=0; i<15; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -212,7 +212,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc index b4e561ded6b82bf2b84aa4dbab2f5f4b5bceab7b..678b4d8131515b68b52f8c12d5384b849c1b54ae 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc @@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); // for(int i=0; i<15; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + 
pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll index ca9fcca0608a891f800e5c5a68f10d36aff268d9..9b4cf7702d777fea811ad800bacf09db63fe7e1d 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel.cl' +; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc index 8de437a4f8935d5746dbcfbbe5345e0e66ae484a..79fecfb84b536388136932789f00b9e40491df0b 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc @@ -195,7 +195,7 @@ int main(int argc, char *argv[]) { &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); // Use standard sgemm interface regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll index 
908c7104bb776bcade055ae430762e3eeab45b9d..2f72a6cebad6829711f2c8a4c33dd649497a9a30 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test.cl' +; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc index 06f5da5c319811ebfc5aa8937559219b2feed625..22f66ca0a8cfe3bd7789b93e4f96f3adbf323a31 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc @@ -190,7 +190,7 @@ int main(int argc, char *argv[]) { &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); // Use standard sgemm interface regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc index b22ebd8804bdb1204c42e2859aab69209dc77e4c..10e044545385162e2d682e77c98f801bba36dbed 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; 
cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -214,7 +214,7 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc index a7cb9793e8c1ec991d5a3f3cd1676f7a88ff8e26..59da9562a1169c27a20b699eaf49383090e7c977 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc @@ -79,7 +79,7 @@ void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 4; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, 0, NULL, NULL); @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -211,7 +211,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc index 
713fd9e88966f885919bfba7df3bb0386c815f9a..5069484492c50e921276378615df3972987559a3 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc @@ -79,7 +79,7 @@ void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 200; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, 0, NULL, NULL); @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -211,7 +211,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc index 7d5d75c53341060d5d61e21ffdd4d8123aa019a9..bad82538709cc06a07f11853c1dbd01458f034e4 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -208,7 +208,7 @@ int main(int argc, char *argv[]) { 
clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll deleted file mode 100644 index ea1e7b3b7cc4092f69dd0de9b33ad9b693bcac1c..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll +++ /dev/null @@ -1,894 +0,0 @@ -; ModuleID = 'build/visc_tc_vec_default/main.ll' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%"class.std::ios_base::Init" = type { i8 } -%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" } -%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* } -%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" } -%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 } -%"struct.std::ios_base::_Words" = type { i8*, i64 } -%"class.std::locale" = type { %"class.std::locale::_Impl"* } -%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** } -%"class.std::locale::facet" = type { i32 (...)**, i32 } -%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" } 
-%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 } -%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] } -%struct.__locale_data = type opaque -%"class.std::num_put" = type { %"class.std::locale::facet" } -%"class.std::num_get" = type { %"class.std::locale::facet" } -%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } -%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -%rtype = type {} -%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }> -%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } -%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } -%struct.pb_Timer = type { i32, i64, i64 } -%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } -%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } -%"class.std::vector" = type { %"struct.std::_Vector_base" } -%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } -%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* } -%struct.pb_Parameters = type { i8*, i8** } - -@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 -@__dso_handle = external global i8 -@_ZSt4cerr = external global %"class.std::basic_ostream" -@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1 -@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1 
-@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1 -@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1 -@stderr = external global %struct._IO_FILE* -@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1 -@_ZSt4cout = external global %"class.std::basic_ostream" -@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1 -@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] -@viscTimerSet_GenVISC = common global i8* null -@0 = internal constant [14 x i8] c"GenVISC_Timer\00" - -declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0 - -declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0 - -; Function Attrs: nounwind -declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() - %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %2 = mul i32 %0, %1 - %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %4 = add i32 %2, %3 - %mul = shl nsw i32 %4, 2 - %5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - %6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %7 = mul i32 %5, %6 - %8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %9 = add i32 %7, %8 - 
%cmp147 = icmp sgt i32 %k, 0 - %add3144 = or i32 %mul, 1 - %add8145 = or i32 %mul, 2 - %add13146 = or i32 %mul, 3 - - %mul.tmp1 = insertelement <4 x i32> < i32 0, i32 0, i32 0, i32 0 >, i32 %mul, i32 0 - %mul.tmp2 = insertelement <4 x i32> %mul.tmp1, i32 %add3144, i32 1 - %mul.tmp3 = insertelement <4 x i32> %mul.tmp2, i32 %add8145, i32 2 - %mul.vector = insertelement <4 x i32> %mul.tmp2, i32 %add13146, i32 3 - - %lda.tmp = insertelement <1 x i32> < i32 0 >, i32 %lda, i32 0 - %lda.vector = shufflevector <1 x i32> %lda.tmp, <1 x i32> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 > - - br i1 %cmp147, label %for.body, label %for.end - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - -; %c0.0152 = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ] -; %c1.0151 = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ] -; %c2.0150 = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ] -; %c3.0149 = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ] - %c.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ] - - %10 = trunc i64 %indvars.iv to i32 - %mul2 = mul nsw i32 %10, %lda - -; %add = add nsw i32 %mul2, %mul -; %idxprom = sext i32 %add to i64 -; %arrayidx = getelementptr inbounds float* %A, i64 %idxprom -; %11 = load float* %arrayidx, align 4, !tbaa !0 -; %add5 = add nsw i32 %mul2, %add3144 -; %idxprom6 = sext i32 %add5 to i64 -; %arrayidx7 = getelementptr inbounds float* %A, i64 %idxprom6 -; %12 = load float* %arrayidx7, align 4, !tbaa !0 -; %add10 = add nsw i32 %mul2, %add8145 -; %idxprom11 = sext i32 %add10 to i64 -; %arrayidx12 = getelementptr inbounds float* %A, i64 %idxprom11 -; %13 = load float* %arrayidx12, align 4, !tbaa !0 -; %add15 = add nsw i32 %mul2, %add13146 -; %idxprom16 = sext i32 %add15 to i64 -; %arrayidx17 = getelementptr inbounds float* %A, i64 %idxprom16 -; %14 = load float* 
%arrayidx17, align 4, !tbaa !0 - %add = add nsw i32 %mul2, %mul - %idxprom = sext i32 %add to i64 - %arrayidx = getelementptr inbounds float* %A, i64 %idxprom - %arrayidx.cast = bitcast float* %arrayidx to <4 x float>* - %11 = load <4 x float>* %arrayidx.cast, align 4 - - %mul18 = mul nsw i32 %10, %ldb - %add19 = add nsw i32 %mul18, %9 - %idxprom20 = sext i32 %add19 to i64 - %arrayidx21 = getelementptr inbounds float* %B, i64 %idxprom20 -; %15 = load float* %arrayidx21, align 4, !tbaa !0 - %12 = load float* %arrayidx21, align 4, !tbaa !0 - - %b.tmp = insertelement <1 x float> < float 0.000000e+00 >, float %12, i32 0 - %b.vector = shufflevector <1 x float> %b.tmp, <1 x float> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 > - -; %mul22 = fmul fast float %11, %15 -; %add23 = fadd fast float %c0.0152, %mul22 -; %mul24 = fmul fast float %12, %15 -; %add25 = fadd fast float %c1.0151, %mul24 -; %mul26 = fmul fast float %13, %15 -; %add27 = fadd fast float %c2.0150, %mul26 -; %mul28 = fmul fast float %14, %15 -; %add29 = fadd fast float %c3.0149, %mul28 - %mul22 = fmul fast <4 x float> %11, %b.vector - %add23 = fadd fast <4 x float> %c.vector, %mul22 - - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry -; %c0.0.lcssa = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ] -; %c1.0.lcssa = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ] -; %c2.0.lcssa = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ] -; %c3.0.lcssa = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ] - %c.end.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ] - - %c0.0.lcssa = extractelement <4 x float> %c.end.vector, i32 0 - %c1.0.lcssa = extractelement <4 x float> %c.end.vector, i32 1 - %c2.0.lcssa = 
extractelement <4 x float> %c.end.vector, i32 2 - %c3.0.lcssa = extractelement <4 x float> %c.end.vector, i32 3 - - %mul30 = mul nsw i32 %9, %ldc - %add31 = add nsw i32 %mul30, %mul - %idxprom32 = sext i32 %add31 to i64 - %arrayidx33 = getelementptr inbounds float* %C, i64 %idxprom32 - -; %16 = load float* %arrayidx33, align 4, !tbaa !0 -; %mul34 = fmul fast float %16, %beta - %13 = load float* %arrayidx33, align 4, !tbaa !0 - %mul34 = fmul fast float %13, %beta - - %mul35 = fmul fast float %c0.0.lcssa, %alpha - %add36 = fadd fast float %mul35, %mul34 - store float %add36, float* %arrayidx33, align 4, !tbaa !0 - %add43 = add nsw i32 %add3144, %mul30 - %idxprom44 = sext i32 %add43 to i64 - %arrayidx45 = getelementptr inbounds float* %C, i64 %idxprom44 - -; %17 = load float* %arrayidx45, align 4, !tbaa !0 -; %mul46 = fmul fast float %17, %beta - %14 = load float* %arrayidx45, align 4, !tbaa !0 - %mul46 = fmul fast float %14, %beta - - %mul47 = fmul fast float %c1.0.lcssa, %alpha - %add48 = fadd fast float %mul47, %mul46 - store float %add48, float* %arrayidx45, align 4, !tbaa !0 - %add56 = add nsw i32 %add8145, %mul30 - %idxprom57 = sext i32 %add56 to i64 - %arrayidx58 = getelementptr inbounds float* %C, i64 %idxprom57 - -; %18 = load float* %arrayidx58, align 4, !tbaa !0 -; %mul59 = fmul fast float %18, %beta - %15 = load float* %arrayidx58, align 4, !tbaa !0 - %mul59 = fmul fast float %15, %beta - - %mul60 = fmul fast float %c2.0.lcssa, %alpha - %add61 = fadd fast float %mul60, %mul59 - store float %add61, float* %arrayidx58, align 4, !tbaa !0 - %add69 = add nsw i32 %add13146, %mul30 - %idxprom70 = sext i32 %add69 to i64 - %arrayidx71 = getelementptr inbounds float* %C, i64 %idxprom70 - -; %19 = load float* %arrayidx71, align 4, !tbaa !0 -; %mul72 = fmul fast float %19, %beta - %16 = load float* %arrayidx71, align 4, !tbaa !0 - %mul72 = fmul fast float %16, %beta - - %mul73 = fmul fast float %c3.0.lcssa, %alpha - %add74 = fadd fast float %mul73, %mul72 - store 
float %add74, float* %arrayidx71, align 4, !tbaa !0 - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 { -entry: - switch i8 %transa, label %if.then [ - i8 78, label %if.end - i8 110, label %if.end - ] - -if.then: ; preds = %entry - %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1 - %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3 - %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 - %0 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load i64* %0, align 8 - %add.ptr.i.sum = add i64 %vbase.offset.i, 240 - %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum - %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"** - %2 = load %"class.std::ctype"** %1, align 8, !tbaa !4 - %tobool.i97 = icmp eq %"class.std::ctype"* %2, null - br i1 %tobool.i97, label %if.then.i98, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - -if.then.i98: ; preds = %if.then - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %if.then - %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6 - %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !1 - %tobool.i = icmp eq i8 %3, 0 - br i1 %tobool.i, label %if.end.i, label %if.then.i - -if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10 - %4 = load i8* %arrayidx.i, align 1, !tbaa !1 - 
br label %_ZNKSt5ctypeIcE5widenEc.exit - -if.end.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1 - %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !3 - %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6 - %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8 - %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit - -_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.end.i, %if.then.i - %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ] - %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1 - %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1 - br label %return - -if.end: ; preds = %entry, %entry - switch i8 %transb, label %if.then9 [ - i8 84, label %if.end12 - i8 116, label %if.end12 - ] - -if.then9: ; preds = %if.end - %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1 - %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3 - %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24 - %7 = bitcast i8* %vbase.offset.ptr.i52 to i64* - %vbase.offset.i53 = load i64* %7, align 8 - %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240 - %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum - %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"** - %9 = load %"class.std::ctype"** %8, align 8, !tbaa !4 - %tobool.i100 = icmp eq 
%"class.std::ctype"* %9, null - br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - -if.then.i101: ; preds = %if.then9 - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9 - %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6 - %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !1 - %tobool.i76 = icmp eq i8 %10, 0 - br i1 %tobool.i76, label %if.end.i82, label %if.then.i78 - -if.then.i78: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10 - %11 = load i8* %arrayidx.i77, align 1, !tbaa !1 - br label %_ZNKSt5ctypeIcE5widenEc.exit84 - -if.end.i82: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1 - %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !3 - %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6 - %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8 - %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit84 - -_ZNKSt5ctypeIcE5widenEc.exit84: ; preds = %if.end.i82, %if.then.i78 - %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ] - %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1 - %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1 - br label %return - -if.end12: ; preds = %if.end, %if.end - %rem44 = and i32 %m, 15 - %tobool = icmp eq i32 %rem44, 0 - br i1 %tobool, label %lor.lhs.false, label %if.then15 - -lor.lhs.false: ; preds = %if.end12 - %rem1345 = and i32 %n, 
15 - %tobool14 = icmp eq i32 %rem1345, 0 - br i1 %tobool14, label %if.end21, label %if.then15 - -if.then15: ; preds = %lor.lhs.false, %if.end12 - %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1 - %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1 - %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1 - %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1 - %14 = bitcast %"class.std::basic_ostream"* %call19 to i8** - %vtable.i63 = load i8** %14, align 8, !tbaa !3 - %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24 - %15 = bitcast i8* %vbase.offset.ptr.i64 to i64* - %vbase.offset.i65 = load i64* %15, align 8 - %16 = bitcast %"class.std::basic_ostream"* %call19 to i8* - %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240 - %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum - %17 = bitcast i8* %_M_ctype.i85 to %"class.std::ctype"** - %18 = load %"class.std::ctype"** %17, align 8, !tbaa !4 - %tobool.i104 = icmp eq %"class.std::ctype"* %18, null - br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 - -if.then.i105: ; preds = %if.then15 - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15 - %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6 - %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !1 - %tobool.i88 = icmp eq i8 %19, 0 - br i1 %tobool.i88, label %if.end.i94, label %if.then.i90 - 
-if.then.i90: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 - %arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10 - %20 = load i8* %arrayidx.i89, align 1, !tbaa !1 - br label %_ZNKSt5ctypeIcE5widenEc.exit96 - -if.end.i94: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1 - %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !3 - %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6 - %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8 - %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit96 - -_ZNKSt5ctypeIcE5widenEc.exit96: ; preds = %if.end.i94, %if.then.i90 - %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ] - %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i95) #1 - %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1 - br label %if.end21 - -if.end21: ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false - %div = sdiv i32 %m, 16 - %mul = and i32 %div, 1073741823 - %div22 = sdiv i32 %n, 16 - %mul24 = and i32 %div22, 268435455 - %conv33 = fpext float %alpha to double - %conv34 = fpext float %beta to double - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) - %in.addr = alloca %struct.arg - %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - store float* %A, float** %in.addr.A - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - store i64 %bytesA, i64* %in.addr.bytes_A - %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - store i32 %lda, i32* %in.addr.lda - %in.addr.B = getelementptr %struct.arg* 
%in.addr, i32 0, i32 3 - store float* %B, float** %in.addr.B - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - store i64 %bytesB, i64* %in.addr.bytes_B - %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - store i32 %ldb, i32* %in.addr.ldb - %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - store float* %C, float** %in.addr.C - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - store i64 %bytesC, i64* %in.addr.bytes_C - %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - store i32 %ldc, i32* %in.addr.ldc - %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - store i32 %k, i32* %in.addr.k - %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10 - %in.addr.alpha.cast = fptrunc double %conv33 to float - store float %in.addr.alpha.cast, float* %in.addr.alpha - %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11 - %in.addr.beta.cast = fptrunc double %conv34 to float - store float %in.addr.beta.cast, float* %in.addr.beta - %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 - store i32 4, i32* %in.addr.dimX0 - %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 - store i32 16, i32* %in.addr.dimY0 - %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 - store i32 %mul, i32* %in.addr.dimX1 - %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15 - store i32 %mul24, i32* %in.addr.dimY1 - %args = bitcast %struct.arg* %in.addr to i8* - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args) - call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2) - br label %return - -return: ; preds = %if.end21, 
%_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit - ret void -} - -declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0 - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** %argv) #2 { -entry: - %argc.addr = alloca i32, align 4 - %timers = alloca %struct.pb_TimerSet, align 8 - %matArow = alloca i32, align 4 - %matAcol = alloca i32, align 4 - %matBrow = alloca i32, align 4 - %matBcol = alloca i32, align 4 - %matA = alloca %"class.std::vector", align 8 - %matBT = alloca %"class.std::vector", align 8 - %matC = alloca %"class.std::vector", align 8 - store i32 %argc, i32* %argc.addr, align 4, !tbaa !5 - %0 = bitcast %struct.pb_TimerSet* %timers to i8* - call void @llvm.lifetime.start(i64 800, i8* %0) #1 - %1 = bitcast %"class.std::vector"* %matA to i8* - call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1 - %2 = bitcast %"class.std::vector"* %matBT to i8* - call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1 - call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - %3 = call i8* @llvm_visc_initializeTimerSet() - store i8* %3, i8** @viscTimerSet_GenVISC - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - call void @llvm.visc.init() - %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1 - %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1 - %4 = load i8*** %inpFiles, align 8, !tbaa !4 - %5 = load i8** %4, align 8, !tbaa !4 - %cmp = icmp eq i8* %5, null - br i1 %cmp, label %if.then, label %lor.lhs.false - -lor.lhs.false: ; preds = %entry - %arrayidx2 = getelementptr inbounds i8** %4, i64 1 - %6 = load i8** %arrayidx2, align 8, !tbaa !4 - %cmp3 = icmp eq i8* %6, null - br i1 %cmp3, label %if.then, label %lor.lhs.false4 - -lor.lhs.false4: ; preds = %lor.lhs.false - %arrayidx6 = getelementptr inbounds i8** %4, i64 2 - %7 = load i8** %arrayidx6, align 8, !tbaa !4 - %cmp7 = 
icmp eq i8* %7, null - br i1 %cmp7, label %if.then, label %lor.lhs.false8 - -lor.lhs.false8: ; preds = %lor.lhs.false4 - %arrayidx10 = getelementptr inbounds i8** %4, i64 3 - %8 = load i8** %arrayidx10, align 8, !tbaa !4 - %cmp11 = icmp eq i8* %8, null - br i1 %cmp11, label %if.end, label %if.then - -if.then: ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry - %9 = load %struct._IO_FILE** @stderr, align 8, !tbaa !4 - %10 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %9) - call void @exit(i32 -1) #7 - unreachable - -if.end: ; preds = %lor.lhs.false8 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1 - %11 = load i8*** %inpFiles, align 8, !tbaa !4 - %12 = load i8** %11, align 8, !tbaa !4 - %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %12, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1 - %13 = load i8*** %inpFiles, align 8, !tbaa !4 - %arrayidx17 = getelementptr inbounds i8** %13, i64 2 - %14 = load i8** %arrayidx17, align 8, !tbaa !4 - %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %14, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - %15 = load i32* %matArow, align 4, !tbaa !5 - %16 = load i32* %matAcol, align 4, !tbaa !5 - %mul = mul nsw i32 %16, %15 - %conv = sext i32 %mul to i64 - %mul19 = shl nsw i64 %conv, 2 - %17 = load i32* %matBrow, align 4, !tbaa !5 - %18 = load i32* %matBcol, align 4, !tbaa !5 - %mul20 = mul nsw i32 %18, %17 - %conv21 = sext i32 %mul20 to i64 - %mul22 = shl nsw i64 %conv21, 2 - %mul23 = mul nsw i32 %18, %15 - %conv24 = sext i32 %mul23 to i64 - %mul25 = shl nsw i64 %conv24, 2 - %19 = bitcast %"class.std::vector"* %matC to i8* - call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 24, i32 8, i1 false) #1 - %cmp.i.i.i.i = icmp eq i32 %mul23, 0 - br i1 %cmp.i.i.i.i, 
label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i - -cond.true.i.i.i.i: ; preds = %if.end - %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0 - br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !6 - -if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i - call void @_ZSt17__throw_bad_allocv() #7 - unreachable - -_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i - %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1 - %20 = bitcast i8* %call2.i.i.i.i.i to float* - br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - -_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i: ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end - %cond.i.i.i.i = phi float* [ %20, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ] - %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0 - store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !4 - %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1 - store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4 - %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24 - %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2 - store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !4 - br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i - -for.body.lr.ph.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7 - %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i - %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24 - %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 
%n.vec.i.i.i.i.i.i.i.i - br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i - -vector.body.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i - %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ] - %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i - %21 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>* - store <4 x float> zeroinitializer, <4 x float>* %21, align 4 - %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4 - %22 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i - %23 = bitcast float* %22 to <4 x float>* - store <4 x float> zeroinitializer, <4 x float>* %23, align 4 - %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8 - %24 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i - br i1 %24, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i - -middle.block.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i - %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24 - br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader - -for.body.i.i.i.i.i.i.i.i.preheader: ; preds = %middle.block.i.i.i.i.i.i.i.i - %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8* - 
%25 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2 - call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %25, i32 4, i1 false) - br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - -_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit: ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4 - %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0 - %26 = load float** %_M_start.i.i, align 8, !tbaa !4 - %27 = bitcast float* %26 to i8* - call void @llvm_visc_track_mem(i8* %27, i64 %mul19) #1 - %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0 - %28 = load float** %_M_start.i.i82, align 8, !tbaa !4 - %29 = bitcast float* %28 to i8* - call void @llvm_visc_track_mem(i8* %29, i64 %mul22) #1 - %30 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 - %31 = bitcast float* %30 to i8* - call void @llvm_visc_track_mem(i8* %31, i64 %mul25) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - %32 = load float** %_M_finish.i.i.i, align 8, !tbaa !4 - %33 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 - %cmp3399 = icmp eq float* %32, %33 - br i1 %cmp3399, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - %sub.ptr.lhs.cast.i = ptrtoint float* %32 to i64 - %sub.ptr.rhs.cast.i = ptrtoint float* %33 to i64 - %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i - %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %add.ptr.i = getelementptr inbounds float* %33, i64 %i.0100 - store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !0 - %inc = add i64 %i.0100, 1 - %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i - br i1 
%cmp33, label %for.body, label %for.end - -for.end: ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - %34 = load i32* %matArow, align 4, !tbaa !5 - %35 = load i32* %matBcol, align 4, !tbaa !5 - %36 = load i32* %matAcol, align 4, !tbaa !5 - %37 = load float** %_M_start.i.i, align 8, !tbaa !4 - %38 = load float** %_M_start.i.i82, align 8, !tbaa !4 - %39 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 - call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %34, i32 %35, i32 %36, float 1.000000e+00, float* %37, i64 %mul19, i32 %34, float* %38, i64 %mul22, i32 %35, float 0.000000e+00, float* %39, i64 %mul25, i32 %34) - %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0 - %40 = load i8** %outFile, align 8, !tbaa !4 - %tobool = icmp eq i8* %40, null - br i1 %tobool, label %if.end42, label %if.then38 - -if.then38: ; preds = %for.end - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 - %41 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 - %42 = bitcast float* %41 to i8* - call void @llvm_visc_request_mem(i8* %42, i64 %mul25) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1 - %43 = load i8** %outFile, align 8, !tbaa !4 - %44 = load i32* %matArow, align 4, !tbaa !5 - %45 = load i32* %matBcol, align 4, !tbaa !5 - %call41 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %43, i32 %44, i32 %45, %"class.std::vector"* %matC) #1 - br label %if.end42 - -if.end42: ; preds = %if.then38, %for.end - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 - %46 = load float** %_M_start.i.i, align 8, !tbaa !4 - %47 = bitcast float* %46 to i8* - call void @llvm_visc_untrack_mem(i8* %47) #1 - %48 = load float** %_M_start.i.i82, align 8, !tbaa !4 - %49 = bitcast float* %48 to i8* - call void @llvm_visc_untrack_mem(i8* %49) #1 - %50 = load float** %_M_start.i.i.i81, align 8, 
!tbaa !4 - %51 = bitcast float* %50 to i8* - call void @llvm_visc_untrack_mem(i8* %51) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2 - %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1 - %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1 - %52 = load i32* %matArow, align 4, !tbaa !5 - %conv50 = sitofp i32 %52 to double - %mul51 = fmul fast double %conv50, 2.000000e+00 - %53 = load i32* %matBcol, align 4, !tbaa !5 - %conv52 = sitofp i32 %53 to double - %mul53 = fmul fast double %mul51, %conv52 - %54 = load i32* %matAcol, align 4, !tbaa !5 - %conv54 = sitofp i32 %54 to double - %mul55 = fmul fast double %mul53, %conv54 - %div = fdiv fast double %mul55, %call48 - %div56 = fmul double %div, 1.000000e-09 - %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1 - %55 = bitcast %"class.std::basic_ostream"* %call.i to i8** - %vtable.i = load i8** %55, align 8, !tbaa !3 - %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 - %56 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load i64* %56, align 8 - %57 = bitcast %"class.std::basic_ostream"* %call.i to i8* - %add.ptr.sum.i = add i64 %vbase.offset.i, 240 - %_M_ctype.i.i = getelementptr inbounds i8* %57, i64 %add.ptr.sum.i - %58 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"** - %59 = load %"class.std::ctype"** %58, align 8, !tbaa !4 - %tobool.i.i.i = icmp eq %"class.std::ctype"* %59, null - br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - -if.then.i.i.i: ; preds = %if.end42 - call void @_ZSt16__throw_bad_castv() #7 - unreachable - 
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end42 - %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 6 - %60 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !1 - %tobool.i3.i.i = icmp eq i8 %60, 0 - br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i - -if.then.i4.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 7, i64 10 - %61 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1 - br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - -if.end.i.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %59) #1 - %62 = bitcast %"class.std::ctype"* %59 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %62, align 8, !tbaa !3 - %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6 - %63 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8 - %call.i.i.i = call signext i8 %63(%"class.std::ctype"* %59, i8 signext 10) #1 - br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - -_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i - %retval.0.i.i.i = phi i8 [ %61, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ] - %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1 - %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1 - call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 - %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 - call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) - call void @llvm.visc.cleanup() - call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1 - %64 = load float** %_M_start.i.i.i81, align 8, 
!tbaa !4 - %tobool.i.i.i.i78 = icmp eq float* %64, null - br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79 - -if.then.i.i.i.i79: ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - %65 = bitcast float* %64 to i8* - call void @_ZdlPv(i8* %65) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit80 - -_ZNSt6vectorIfSaIfEED1Ev.exit80: ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - %66 = load float** %_M_start.i.i82, align 8, !tbaa !4 - %tobool.i.i.i.i74 = icmp eq float* %66, null - br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75 - -if.then.i.i.i.i75: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80 - %67 = bitcast float* %66 to i8* - call void @_ZdlPv(i8* %67) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit76 - -_ZNSt6vectorIfSaIfEED1Ev.exit76: ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80 - %68 = load float** %_M_start.i.i, align 8, !tbaa !4 - %tobool.i.i.i.i = icmp eq float* %68, null - br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i - -if.then.i.i.i.i: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76 - %69 = bitcast float* %68 to i8* - call void @_ZdlPv(i8* %69) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit - -_ZNSt6vectorIfSaIfEED1Ev.exit: ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76 - call void @llvm.lifetime.end(i64 800, i8* %0) #1 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 - -declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0 - -declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0 - -; Function Attrs: noreturn nounwind -declare void @exit(i32) #4 - -declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 - -declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0 - -declare void @llvm_visc_track_mem(i8*, i64) #0 - 
-declare void @llvm_visc_request_mem(i8*, i64) #0 - -declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0 - -declare void @llvm_visc_untrack_mem(i8*) #0 - -declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0 - -declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 - -declare void @pb_FreeParameters(%struct.pb_Parameters*) #0 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 - -declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0 - -; Function Attrs: noreturn -declare void @_ZSt17__throw_bad_allocv() #5 - -declare noalias i8* @_Znwm(i64) #0 - -; Function Attrs: nounwind -declare void @_ZdlPv(i8*) #6 - -declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0 - -declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0 - -; Function Attrs: noreturn -declare void @_ZSt16__throw_bad_castv() #5 - -declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0 - -declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0 - -; Function Attrs: nounwind -define internal void @_GLOBAL__I_a() #1 section ".text.startup" { -entry: - tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1 - %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1 - ret void -} - -; Function Attrs: nounwind -declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 - -; Function Attrs: nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 - -declare i8* @llvm_visc_initializeTimerSet() - -declare void 
@llvm_visc_switchToTimer(i8**, i32) - -declare void @llvm_visc_printTimerSet(i8**, i8*) - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10) - call 
void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10) - call void @llvm.visc.bind.input(i8* 
%_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" 
"use-soft-float"="false" } -attributes #7 = { noreturn nounwind } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} -!3 = metadata !{metadata !"vtable pointer", metadata !2} -!4 = metadata !{metadata !"any pointer", metadata !1} -!5 = metadata !{metadata !"int", metadata !1} -!6 = metadata !{metadata !"branch_weights", i32 4, i32 64} diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll deleted file mode 100644 index b6e9e3818e997156517574f16e6fd12a1bbebc52..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll +++ /dev/null @@ -1,869 +0,0 @@ -; ModuleID = 'build/visc_vec_default/main.ll' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%"class.std::ios_base::Init" = type { i8 } -%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" } -%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* } -%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" } -%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 } -%"struct.std::ios_base::_Words" = type { i8*, i64 } -%"class.std::locale" = type { %"class.std::locale::_Impl"* } -%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** 
} -%"class.std::locale::facet" = type { i32 (...)**, i32 } -%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" } -%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 } -%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] } -%struct.__locale_data = type opaque -%"class.std::num_put" = type { %"class.std::locale::facet" } -%"class.std::num_get" = type { %"class.std::locale::facet" } -%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } -%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -%rtype = type {} -%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }> -%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } -%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } -%struct.pb_Timer = type { i32, i64, i64 } -%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } -%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } -%"class.std::vector" = type { %"struct.std::_Vector_base" } -%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } -%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* } -%struct.pb_Parameters = type { i8*, i8** } - -@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 -@__dso_handle = external global i8 -@_ZSt4cerr = external global %"class.std::basic_ostream" -@.str = private unnamed_addr constant [48 x i8] 
c"unsupported value of 'transa' in regtileSgemm()\00", align 1 -@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1 -@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1 -@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1 -@stderr = external global %struct._IO_FILE* -@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1 -@_ZSt4cout = external global %"class.std::basic_ostream" -@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1 -@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] -@viscTimerSet_GenVISC = common global i8* null -@0 = internal constant [14 x i8] c"GenVISC_Timer\00" - -declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0 - -declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0 - -; Function Attrs: nounwind -declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1 - -; Function Attrs: nounwind readnone -declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 { - %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() - %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %a0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %2 = mul i32 %a0, %1 - %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %4 = add i32 %2, %3 - %5 = call i32 
@llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - %a6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %a7 = mul i32 %5, %a6 - %a8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - %a9 = add i32 %a7, %a8 - - %a10 = shl i32 %4, 3 - - - ;a10 = %3, a9 = %5 - ;%1 = tail call i64 @_Z13get_global_idj(i32 0) #1 - ;%2 = shl i64 %1, 3 - ;%3 = trunc i64 %2 to i32 - ;%4 = tail call i64 @_Z13get_global_idj(i32 1) #1 - ;%5 = trunc i64 %4 to i32 - - - - - %6 = icmp sgt i32 %k, 0 - br i1 %6, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %.lr.ph, %0 - %cp.021 = phi <8 x float> [ %20, %.lr.ph ], [ zeroinitializer, %0 ] - %i.020 = phi i32 [ %21, %.lr.ph ], [ 0, %0 ] - %7 = mul nsw i32 %i.020, %lda - %8 = add nsw i32 %7, %a10 - %9 = sext i32 %8 to i64 - %10 = getelementptr inbounds float* %A, i64 %9 - %v10 = bitcast float* %10 to <8 x float>* - %11 = load <8 x float>* %v10 - %12 = mul nsw i32 %i.020, %ldb - %13 = add nsw i32 %12, %a9 - %14 = sext i32 %13 to i64 - %15 = getelementptr inbounds float* %B, i64 %14 - %16 = load float* %15, align 4, !tbaa !9 - %17 = insertelement <8 x float> undef, float %16, i32 0 - %18 = shufflevector <8 x float> %17, <8 x float> undef, <8 x i32> zeroinitializer - %19 = fmul <8 x float> %11, %18 - %20 = fadd <8 x float> %cp.021, %19 - %21 = add nsw i32 %i.020, 1 - %22 = icmp slt i32 %21, %k - br i1 %22, label %.lr.ph, label %._crit_edge - -._crit_edge: ; preds = %.lr.ph, %0 - %cp.0.lcssa = phi <8 x float> [ zeroinitializer, %0 ], [ %20, %.lr.ph ] - %23 = mul nsw i32 %a9, %ldc - %24 = add nsw i32 %23, %a10 - %25 = sext i32 %24 to i64 - %26 = getelementptr inbounds float* %C, i64 %25 - %v26 = bitcast float* %26 to <8 x float>* - %27 = load <8 x float>* %v26 - %28 = insertelement <8 x float> undef, float %beta, i32 0 - %29 = shufflevector <8 x float> %28, <8 x float> undef, <8 x i32> zeroinitializer - %30 = insertelement <8 x float> undef, float %alpha, i32 0 - 
%31 = shufflevector <8 x float> %30, <8 x float> undef, <8 x i32> zeroinitializer - %32 = fmul <8 x float> %31, %cp.0.lcssa - - ;%33 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %27, <8 x float> %29, <8 x float> %32) - - %mul = fmul <8 x float> %27, %29 - %33 = fadd <8 x float> %mul, %32 - store <8 x float> %33, <8 x float>* %v26 - -;entry: - ;%_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() - ;%_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - ;%0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - ;%1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - ;%2 = mul i32 %0, %1 - ;%3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - ;%4 = add i32 %2, %3 - ;%5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - ;%6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - ;%7 = mul i32 %5, %6 - ;%8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - ;%9 = add i32 %7, %8 - ;%cmp32 = icmp sgt i32 %k, 0 - ;br i1 %cmp32, label %for.body, label %for.end - -;for.body: ; preds = %for.body, %entry - ;%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - ;%c.034 = phi float [ %add7, %for.body ], [ 0.000000e+00, %entry ] - ;%10 = trunc i64 %indvars.iv to i32 - ;%mul = mul nsw i32 %10, %lda - ;%add = add nsw i32 %mul, %4 - ;%idxprom = sext i32 %add to i64 - ;%arrayidx = getelementptr inbounds float* %A, i64 %idxprom - ;%11 = load float* %arrayidx, align 4, !tbaa !3 - ;%mul2 = mul nsw i32 %10, %ldb - ;%add3 = add nsw i32 %mul2, %9 - ;%idxprom4 = sext i32 %add3 to i64 - ;%arrayidx5 = getelementptr inbounds float* %B, i64 %idxprom4 - ;%12 = load float* %arrayidx5, align 4, !tbaa !3 - ;%mul6 = fmul fast float %11, %12 - ;%add7 = fadd fast float %c.034, %mul6 - ;%indvars.iv.next = add i64 %indvars.iv, 1 - 
;%lftr.wideiv = trunc i64 %indvars.iv.next to i32 - ;%exitcond = icmp eq i32 %lftr.wideiv, %k - ;br i1 %exitcond, label %for.end, label %for.body - -;for.end: ; preds = %for.body, %entry - ;%c.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add7, %for.body ] - ;%mul8 = mul nsw i32 %9, %ldc - ;%add9 = add nsw i32 %mul8, %4 - ;%idxprom10 = sext i32 %add9 to i64 - ;%arrayidx11 = getelementptr inbounds float* %C, i64 %idxprom10 - ;%13 = load float* %arrayidx11, align 4, !tbaa !3 - ;%mul12 = fmul fast float %13, %beta - ;%mul13 = fmul fast float %c.0.lcssa, %alpha - ;%add14 = fadd fast float %mul13, %mul12 - ;store float %add14, float* %arrayidx11, align 4, !tbaa !3 - ret %rtype undef - - -} - -; Function Attrs: noinline nounwind uwtable -define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 { -entry: - switch i8 %transa, label %if.then [ - i8 78, label %if.end - i8 110, label %if.end - ] - -if.then: ; preds = %entry - %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1 - %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 - %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 - %0 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load i64* %0, align 8 - %add.ptr.i.sum = add i64 %vbase.offset.i, 240 - %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum - %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"** - %2 = load %"class.std::ctype"** %1, align 8, !tbaa !7 - %tobool.i97 = icmp eq %"class.std::ctype"* %2, null - br i1 %tobool.i97, label %if.then.i98, label 
%_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - -if.then.i98: ; preds = %if.then - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %if.then - %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6 - %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !4 - %tobool.i = icmp eq i8 %3, 0 - br i1 %tobool.i, label %if.end.i, label %if.then.i - -if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10 - %4 = load i8* %arrayidx.i, align 1, !tbaa !4 - br label %_ZNKSt5ctypeIcE5widenEc.exit - -if.end.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1 - %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !6 - %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6 - %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8 - %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit - -_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.end.i, %if.then.i - %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ] - %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1 - %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1 - br label %return - -if.end: ; preds = %entry, %entry - switch i8 %transb, label %if.then9 [ - i8 84, label %if.end12 - i8 116, label %if.end12 - ] - -if.then9: ; preds = %if.end - %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds 
([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1 - %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 - %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24 - %7 = bitcast i8* %vbase.offset.ptr.i52 to i64* - %vbase.offset.i53 = load i64* %7, align 8 - %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240 - %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum - %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"** - %9 = load %"class.std::ctype"** %8, align 8, !tbaa !7 - %tobool.i100 = icmp eq %"class.std::ctype"* %9, null - br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - -if.then.i101: ; preds = %if.then9 - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9 - %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6 - %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !4 - %tobool.i76 = icmp eq i8 %10, 0 - br i1 %tobool.i76, label %if.end.i82, label %if.then.i78 - -if.then.i78: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10 - %11 = load i8* %arrayidx.i77, align 1, !tbaa !4 - br label %_ZNKSt5ctypeIcE5widenEc.exit84 - -if.end.i82: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1 - %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !6 - %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6 - %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8 - %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit84 - 
-_ZNKSt5ctypeIcE5widenEc.exit84: ; preds = %if.end.i82, %if.then.i78 - %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ] - %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1 - %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1 - br label %return - -if.end12: ; preds = %if.end, %if.end - %rem44 = and i32 %m, 15 - %tobool = icmp eq i32 %rem44, 0 - br i1 %tobool, label %lor.lhs.false, label %if.then15 - -lor.lhs.false: ; preds = %if.end12 - %rem1345 = and i32 %n, 15 - %tobool14 = icmp eq i32 %rem1345, 0 - br i1 %tobool14, label %if.end21, label %if.then15 - -if.then15: ; preds = %lor.lhs.false, %if.end12 - %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1 - %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1 - %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1 - %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1 - %14 = bitcast %"class.std::basic_ostream"* %call19 to i8** - %vtable.i63 = load i8** %14, align 8, !tbaa !6 - %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24 - %15 = bitcast i8* %vbase.offset.ptr.i64 to i64* - %vbase.offset.i65 = load i64* %15, align 8 - %16 = bitcast %"class.std::basic_ostream"* %call19 to i8* - %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240 - %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum - %17 = bitcast i8* %_M_ctype.i85 to 
%"class.std::ctype"** - %18 = load %"class.std::ctype"** %17, align 8, !tbaa !7 - %tobool.i104 = icmp eq %"class.std::ctype"* %18, null - br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 - -if.then.i105: ; preds = %if.then15 - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15 - %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6 - %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !4 - %tobool.i88 = icmp eq i8 %19, 0 - br i1 %tobool.i88, label %if.end.i94, label %if.then.i90 - -if.then.i90: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 - %arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10 - %20 = load i8* %arrayidx.i89, align 1, !tbaa !4 - br label %_ZNKSt5ctypeIcE5widenEc.exit96 - -if.end.i94: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1 - %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !6 - %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6 - %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8 - %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit96 - -_ZNKSt5ctypeIcE5widenEc.exit96: ; preds = %if.end.i94, %if.then.i90 - %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ] - %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i95) #1 - %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1 - br label %if.end21 - -if.end21: ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false - %div = sdiv i32 %m, 16 - %mul = 
and i32 %div, 2147483647 - %div22 = sdiv i32 %n, 16 - %mul24 = and i32 %div22, 268435455 - %conv33 = fpext float %alpha to double - %conv34 = fpext float %beta to double - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) - %in.addr = alloca %struct.arg - %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - store float* %A, float** %in.addr.A - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - store i64 %bytesA, i64* %in.addr.bytes_A - %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - store i32 %lda, i32* %in.addr.lda - %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - store float* %B, float** %in.addr.B - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - store i64 %bytesB, i64* %in.addr.bytes_B - %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - store i32 %ldb, i32* %in.addr.ldb - %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - store float* %C, float** %in.addr.C - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - store i64 %bytesC, i64* %in.addr.bytes_C - %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - store i32 %ldc, i32* %in.addr.ldc - %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - store i32 %k, i32* %in.addr.k - %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10 - %in.addr.alpha.cast = fptrunc double %conv33 to float - store float %in.addr.alpha.cast, float* %in.addr.alpha - %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11 - %in.addr.beta.cast = fptrunc double %conv34 to float - store float %in.addr.beta.cast, float* %in.addr.beta - %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 - store i32 2, i32* %in.addr.dimX0 - %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 - store i32 16, i32* %in.addr.dimY0 - %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 - store i32 %mul, i32* 
%in.addr.dimX1 - %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15 - store i32 %mul24, i32* %in.addr.dimY1 - %args = bitcast %struct.arg* %in.addr to i8* - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args) - call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2) - br label %return - -return: ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit - ret void -} - -declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0 - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** %argv) #2 { -entry: - %argc.addr = alloca i32, align 4 - %timers = alloca %struct.pb_TimerSet, align 8 - %matArow = alloca i32, align 4 - %matAcol = alloca i32, align 4 - %matBrow = alloca i32, align 4 - %matBcol = alloca i32, align 4 - %matA = alloca %"class.std::vector", align 8 - %matBT = alloca %"class.std::vector", align 8 - %matC = alloca %"class.std::vector", align 8 - store i32 %argc, i32* %argc.addr, align 4, !tbaa !8 - %0 = bitcast %struct.pb_TimerSet* %timers to i8* - call void @llvm.lifetime.start(i64 800, i8* %0) #1 - %1 = bitcast %"class.std::vector"* %matA to i8* - call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1 - %2 = bitcast %"class.std::vector"* %matBT to i8* - call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1 - %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1 - %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1 - %3 = load i8*** %inpFiles, align 8, !tbaa !7 - %4 = load i8** %3, align 8, !tbaa !7 - %cmp = icmp eq i8* %4, null - br i1 %cmp, label %if.then, label %lor.lhs.false - 
-lor.lhs.false: ; preds = %entry - %arrayidx2 = getelementptr inbounds i8** %3, i64 1 - %5 = load i8** %arrayidx2, align 8, !tbaa !7 - %cmp3 = icmp eq i8* %5, null - br i1 %cmp3, label %if.then, label %lor.lhs.false4 - -lor.lhs.false4: ; preds = %lor.lhs.false - %arrayidx6 = getelementptr inbounds i8** %3, i64 2 - %6 = load i8** %arrayidx6, align 8, !tbaa !7 - %cmp7 = icmp eq i8* %6, null - br i1 %cmp7, label %if.then, label %lor.lhs.false8 - -lor.lhs.false8: ; preds = %lor.lhs.false4 - %arrayidx10 = getelementptr inbounds i8** %3, i64 3 - %7 = load i8** %arrayidx10, align 8, !tbaa !7 - %cmp11 = icmp eq i8* %7, null - br i1 %cmp11, label %if.end, label %if.then - -if.then: ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry - %8 = load %struct._IO_FILE** @stderr, align 8, !tbaa !7 - %9 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %8) - call void @exit(i32 -1) #7 - unreachable - -if.end: ; preds = %lor.lhs.false8 - %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %4, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1 - %10 = load i8*** %inpFiles, align 8, !tbaa !7 - %arrayidx17 = getelementptr inbounds i8** %10, i64 2 - %11 = load i8** %arrayidx17, align 8, !tbaa !7 - %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %11, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1 - call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - %12 = call i8* @llvm_visc_initializeTimerSet() - store i8* %12, i8** @viscTimerSet_GenVISC - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - call void @llvm.visc.init() - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - %13 = load i32* %matArow, align 4, !tbaa !8 - %14 = load i32* %matAcol, align 4, !tbaa !8 - %mul = mul nsw i32 %14, %13 - %conv = sext i32 %mul to i64 - %mul19 = shl nsw i64 %conv, 2 - %15 = load 
i32* %matBrow, align 4, !tbaa !8 - %16 = load i32* %matBcol, align 4, !tbaa !8 - %mul20 = mul nsw i32 %16, %15 - %conv21 = sext i32 %mul20 to i64 - %mul22 = shl nsw i64 %conv21, 2 - %mul23 = mul nsw i32 %16, %13 - %conv24 = sext i32 %mul23 to i64 - %mul25 = shl nsw i64 %conv24, 2 - %17 = bitcast %"class.std::vector"* %matC to i8* - call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 24, i32 8, i1 false) #1 - %cmp.i.i.i.i = icmp eq i32 %mul23, 0 - br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i - -cond.true.i.i.i.i: ; preds = %if.end - %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0 - br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !9 - -if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i - call void @_ZSt17__throw_bad_allocv() #7 - unreachable - -_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i - %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1 - %18 = bitcast i8* %call2.i.i.i.i.i to float* - br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - -_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i: ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end - %cond.i.i.i.i = phi float* [ %18, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ] - %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0 - store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !7 - %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1 - store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 - %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24 - %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2 - store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !7 - br i1 
%cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i - -for.body.lr.ph.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7 - %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i - %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24 - %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i - br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i - -vector.body.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i - %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ] - %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i - %19 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>* - store <4 x float> zeroinitializer, <4 x float>* %19, align 4 - %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4 - %20 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i - %21 = bitcast float* %20 to <4 x float>* - store <4 x float> zeroinitializer, <4 x float>* %21, align 4 - %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8 - %22 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i - br i1 %22, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i - -middle.block.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i - %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, 
%for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24 - br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader - -for.body.i.i.i.i.i.i.i.i.preheader: ; preds = %middle.block.i.i.i.i.i.i.i.i - %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8* - %23 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2 - call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %23, i32 4, i1 false) - br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - -_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit: ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 - %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0 - %24 = load float** %_M_start.i.i, align 8, !tbaa !7 - %25 = bitcast float* %24 to i8* - call void @llvm_visc_track_mem(i8* %25, i64 %mul19) #1 - %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0 - %26 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %27 = bitcast float* %26 to i8* - call void @llvm_visc_track_mem(i8* %27, i64 %mul22) #1 - %28 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %29 = bitcast float* %28 to i8* - call void @llvm_visc_track_mem(i8* %29, i64 %mul25) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - %30 = load float** %_M_finish.i.i.i, align 8, !tbaa !7 - %31 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %cmp3399 = icmp eq float* %30, %31 - br i1 %cmp3399, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - %sub.ptr.lhs.cast.i = ptrtoint float* %30 to i64 - %sub.ptr.rhs.cast.i = ptrtoint float* %31 to i64 - 
%sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i - %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %add.ptr.i = getelementptr inbounds float* %31, i64 %i.0100 - store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !3 - %inc = add i64 %i.0100, 1 - %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i - br i1 %cmp33, label %for.body, label %for.end - -for.end: ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - %32 = load i32* %matArow, align 4, !tbaa !8 - %33 = load i32* %matBcol, align 4, !tbaa !8 - %34 = load i32* %matAcol, align 4, !tbaa !8 - %35 = load float** %_M_start.i.i, align 8, !tbaa !7 - %36 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %37 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %32, i32 %33, i32 %34, float 1.000000e+00, float* %35, i64 %mul19, i32 %32, float* %36, i64 %mul22, i32 %33, float 0.000000e+00, float* %37, i64 %mul25, i32 %32) - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 - %38 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %39 = bitcast float* %38 to i8* - call void @llvm_visc_request_mem(i8* %39, i64 %mul25) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 - %40 = load float** %_M_start.i.i, align 8, !tbaa !7 - %41 = bitcast float* %40 to i8* - call void @llvm_visc_untrack_mem(i8* %41) #1 - %42 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %43 = bitcast float* %42 to i8* - call void @llvm_visc_untrack_mem(i8* %43) #1 - %44 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %45 = bitcast float* %44 to i8* - call void @llvm_visc_untrack_mem(i8* %45) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - call void 
@pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 - %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 - call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) - call void @llvm.visc.cleanup() - %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0 - %46 = load i8** %outFile, align 8, !tbaa !7 - %tobool = icmp eq i8* %46, null - br i1 %tobool, label %if.end45, label %if.then42 - -if.then42: ; preds = %for.end - %47 = load i32* %matArow, align 4, !tbaa !8 - %48 = load i32* %matBcol, align 4, !tbaa !8 - %call44 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %46, i32 %47, i32 %48, %"class.std::vector"* %matC) #1 - br label %if.end45 - -if.end45: ; preds = %if.then42, %for.end - %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2 - %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1 - %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1 - %49 = load i32* %matArow, align 4, !tbaa !8 - %conv50 = sitofp i32 %49 to double - %mul51 = fmul fast double %conv50, 2.000000e+00 - %50 = load i32* %matBcol, align 4, !tbaa !8 - %conv52 = sitofp i32 %50 to double - %mul53 = fmul fast double %mul51, %conv52 - %51 = load i32* %matAcol, align 4, !tbaa !8 - %conv54 = sitofp i32 %51 to double - %mul55 = fmul fast double %mul53, %conv54 - %div = fdiv fast double %mul55, %call48 - %div56 = fmul double %div, 1.000000e-09 - %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1 - %52 = bitcast %"class.std::basic_ostream"* %call.i to i8** - %vtable.i = load i8** %52, align 8, !tbaa !6 - %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 - %53 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load 
i64* %53, align 8 - %54 = bitcast %"class.std::basic_ostream"* %call.i to i8* - %add.ptr.sum.i = add i64 %vbase.offset.i, 240 - %_M_ctype.i.i = getelementptr inbounds i8* %54, i64 %add.ptr.sum.i - %55 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"** - %56 = load %"class.std::ctype"** %55, align 8, !tbaa !7 - %tobool.i.i.i = icmp eq %"class.std::ctype"* %56, null - br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - -if.then.i.i.i: ; preds = %if.end45 - call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end45 - %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 6 - %57 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !4 - %tobool.i3.i.i = icmp eq i8 %57, 0 - br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i - -if.then.i4.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 7, i64 10 - %58 = load i8* %arrayidx.i.i.i, align 1, !tbaa !4 - br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - -if.end.i.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %56) #1 - %59 = bitcast %"class.std::ctype"* %56 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %59, align 8, !tbaa !6 - %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6 - %60 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8 - %call.i.i.i = call signext i8 %60(%"class.std::ctype"* %56, i8 signext 10) #1 - br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - -_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i - %retval.0.i.i.i = phi i8 [ %58, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ] - 
%call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1 - %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1 - call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1 - %61 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %tobool.i.i.i.i78 = icmp eq float* %61, null - br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79 - -if.then.i.i.i.i79: ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - %62 = bitcast float* %61 to i8* - call void @_ZdlPv(i8* %62) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit80 - -_ZNSt6vectorIfSaIfEED1Ev.exit80: ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - %63 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %tobool.i.i.i.i74 = icmp eq float* %63, null - br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75 - -if.then.i.i.i.i75: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80 - %64 = bitcast float* %63 to i8* - call void @_ZdlPv(i8* %64) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit76 - -_ZNSt6vectorIfSaIfEED1Ev.exit76: ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80 - %65 = load float** %_M_start.i.i, align 8, !tbaa !7 - %tobool.i.i.i.i = icmp eq float* %65, null - br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i - -if.then.i.i.i.i: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76 - %66 = bitcast float* %65 to i8* - call void @_ZdlPv(i8* %66) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit - -_ZNSt6vectorIfSaIfEED1Ev.exit: ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76 - call void @llvm.lifetime.end(i64 800, i8* %0) #1 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 - -declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0 - -; Function Attrs: 
noreturn nounwind -declare void @exit(i32) #4 - -declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0 - -declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0 - -declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 - -declare void @llvm_visc_track_mem(i8*, i64) #0 - -declare void @llvm_visc_request_mem(i8*, i64) #0 - -declare void @llvm_visc_untrack_mem(i8*) #0 - -declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 - -declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0 - -declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0 - -declare void @pb_FreeParameters(%struct.pb_Parameters*) #0 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 - -declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0 - -; Function Attrs: noreturn -declare void @_ZSt17__throw_bad_allocv() #5 - -declare noalias i8* @_Znwm(i64) #0 - -; Function Attrs: nounwind -declare void @_ZdlPv(i8*) #6 - -declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0 - -declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0 - -; Function Attrs: noreturn -declare void @_ZSt16__throw_bad_castv() #5 - -declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0 - -declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0 - -; Function Attrs: nounwind -define internal void @_GLOBAL__I_a() #1 section ".text.startup" { -entry: - tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1 - %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* 
@_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1 - ret void -} - -; Function Attrs: nounwind -declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 - -; Function Attrs: nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 - -declare i8* @llvm_visc_initializeTimerSet() - -declare void @llvm_visc_switchToTimer(i8**, i32) - -declare void @llvm_visc_printTimerSet(i8**, i8*) - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6) - 
call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7) - call void 
@llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" 
"no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #7 = { noreturn nounwind } - -!visc_hint_gpu = !{!0, !1} -!visc_hint_cpu = !{!2} - -!0 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff} -!1 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1} -!2 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2} -!3 = metadata !{metadata !"float", metadata !4} -!4 = metadata !{metadata !"omnipotent char", metadata !5} -!5 = metadata !{metadata !"Simple C/C++ TBAA"} -!6 = metadata !{metadata !"vtable pointer", metadata !5} -!7 = metadata !{metadata !"any pointer", metadata !4} -!8 = metadata !{metadata !"int", metadata !4} -!9 = metadata !{metadata !"branch_weights", i32 4, i32 64} diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile deleted file mode 100644 index f74ee8921a534b6963ba06d089398114571d070b..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# (c) 2010 The Board of Trustees of the University of Illinois. 
- -LANGUAGE=visc -SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll -APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS=-ffast-math -O3 -APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll deleted file mode 100644 index b997cf7ebcabcb339e90258dd78f0b141483bbf9..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll +++ /dev/null @@ -1,889 +0,0 @@ -; ModuleID = 'build/visc_vec_opt_default/main.ll' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%"class.std::ios_base::Init" = type { i8 } -%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" } -%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* } -%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" } -%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 } -%"struct.std::ios_base::_Words" = type { i8*, i64 } -%"class.std::locale" = type { %"class.std::locale::_Impl"* } -%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** } -%"class.std::locale::facet" = type { i32 (...)**, i32 } -%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" } -%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, 
i32*, i16*, i8, [256 x i8], [256 x i8], i8 } -%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] } -%struct.__locale_data = type opaque -%"class.std::num_put" = type { %"class.std::locale::facet" } -%"class.std::num_get" = type { %"class.std::locale::facet" } -%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } -%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -%rtype = type {} -%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }> -%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } -%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } -%struct.pb_Timer = type { i32, i64, i64 } -%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } -%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } -%"class.std::vector" = type { %"struct.std::_Vector_base" } -%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } -%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* } -%struct.pb_Parameters = type { i8*, i8** } - -@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 -@__dso_handle = external global i8 -@_ZSt4cerr = external global %"class.std::basic_ostream" -@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1 -@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1 -@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. 
m should be multiple of \00", align 1 -@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1 -@stderr = external global %struct._IO_FILE* -@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1 -@_ZSt4cout = external global %"class.std::basic_ostream" -@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1 -@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] -@viscTimerSet_GenVISC = common global i8* null -@0 = internal constant [14 x i8] c"GenVISC_Timer\00" - -declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0 - -declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0 - -; Function Attrs: nounwind -declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() - %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - -; %call = call i32 @get_local_id(i32 1) #2 - %call = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - -; %call1 = call i32 @get_local_size(i32 0) #2 - %call1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - - %mul = mul i32 %call1, %call - -; %call2 = call i32 @get_local_id(i32 0) #2 - %call2 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) - - %add = add i32 %mul, %call2 - -; %call3 = call i32 @get_group_id(i32 0) #2 - %call3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - - %mul4 = shl i32 %call3, 6 - %add5 = add i32 %add, %mul4 - %cmp89 = icmp sgt i32 %k, 0 - - %call6 = 
call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - %mul7 = shl i32 %call6, 3 - - br i1 %cmp89, label %for.body, label %for.end23 - -for.body: ; preds = %entry, %for.inc21 - %cp.091 = phi <8 x float> [ %add20, %for.inc21 ], [ zeroinitializer, %entry ] - %i.090 = phi i32 [ %add22, %for.inc21 ], [ 0, %entry ] -; %call6 = call i32 @get_group_id(i32 1) #2 -; %call6 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) -; %mul7 = shl i32 %call6, 3 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body - %cp.188 = phi <8 x float> [ %cp.091, %for.body ], [ %add20, %for.body12 ] - %j.087 = phi i32 [ 0, %for.body ], [ %inc, %for.body12 ] - %add13 = add i32 %j.087, %i.090 - %mul14 = mul nsw i32 %add13, %lda - %add15 = add nsw i32 %mul14, %add5 - %arrayidx = getelementptr inbounds float* %A, i32 %add15 - %0 = load float* %arrayidx, align 4, !tbaa !3 - %splat.splatinsert = insertelement <8 x float> undef, float %0, i32 0 - %splat.splat = shufflevector <8 x float> %splat.splatinsert, <8 x float> undef, <8 x i32> zeroinitializer - %tmp83 = mul i32 %add13, %ldb - %add.ptr.sum = add i32 %tmp83, %mul7 - %add.ptr17 = getelementptr inbounds float* %B, i32 %add.ptr.sum - -; %call18 = call <8 x float> @_Z6vload8jPKU3AS1f(i32 0, float* %add.ptr17) #2 - %add.ptr17.cast = bitcast float* %add.ptr17 to <8 x float>* - %call18 = load <8 x float>* %add.ptr17.cast, align 8 - - %mul19 = fmul fast <8 x float> %call18, %splat.splat - %add20 = fadd fast <8 x float> %cp.188, %mul19 - %inc = add nsw i32 %j.087, 1 - %exitcond92 = icmp eq i32 %inc, 8 - br i1 %exitcond92, label %for.inc21, label %for.body12 - -for.inc21: ; preds = %for.body12 - %add22 = add nsw i32 %i.090, 8 - %cmp = icmp slt i32 %add22, %k - br i1 %cmp, label %for.body, label %for.end23 - -for.end23: ; preds = %for.inc21, %entry - %cp.0.lcssa = phi <8 x float> [ zeroinitializer, %entry ], [ %add20, %for.inc21 ] - %splat.splatinsert24 = insertelement <8 x 
float> undef, float %alpha, i32 0 - %splat.splat25 = shufflevector <8 x float> %splat.splatinsert24, <8 x float> undef, <8 x i32> zeroinitializer - %mul26 = fmul fast <8 x float> %splat.splat25, %cp.0.lcssa - %1 = extractelement <8 x float> %mul26, i32 0 - %2 = extractelement <8 x float> %mul26, i32 1 - %3 = extractelement <8 x float> %mul26, i32 2 - %4 = extractelement <8 x float> %mul26, i32 3 - %5 = extractelement <8 x float> %mul26, i32 4 - %6 = extractelement <8 x float> %mul26, i32 5 - %7 = extractelement <8 x float> %mul26, i32 6 - %8 = extractelement <8 x float> %mul26, i32 7 -; %call35 = call i32 @get_group_id(i32 1) #2 -; %call35 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) - %mul37 = shl i32 %call6, 3 - %tmp85 = mul i32 %mul37, %ldc - %add44 = add i32 %tmp85, %add5 - %arrayidx45 = getelementptr inbounds float* %C, i32 %add44 - %9 = load float* %arrayidx45, align 4, !tbaa !3 - %mul46 = fmul fast float %9, %beta - %add48 = fadd fast float %mul46, %1 - store float %add48, float* %arrayidx45, align 4, !tbaa !3 - %tmp84.193 = or i32 %mul37, 1 - %tmp85.1 = mul i32 %tmp84.193, %ldc - %add44.1 = add i32 %tmp85.1, %add5 - %arrayidx45.1 = getelementptr inbounds float* %C, i32 %add44.1 - %10 = load float* %arrayidx45.1, align 4, !tbaa !3 - %mul46.1 = fmul fast float %10, %beta - %add48.1 = fadd fast float %mul46.1, %2 - store float %add48.1, float* %arrayidx45.1, align 4, !tbaa !3 - %tmp84.294 = or i32 %mul37, 2 - %tmp85.2 = mul i32 %tmp84.294, %ldc - %add44.2 = add i32 %tmp85.2, %add5 - %arrayidx45.2 = getelementptr inbounds float* %C, i32 %add44.2 - %11 = load float* %arrayidx45.2, align 4, !tbaa !3 - %mul46.2 = fmul fast float %11, %beta - %add48.2 = fadd fast float %mul46.2, %3 - store float %add48.2, float* %arrayidx45.2, align 4, !tbaa !3 - %tmp84.395 = or i32 %mul37, 3 - %tmp85.3 = mul i32 %tmp84.395, %ldc - %add44.3 = add i32 %tmp85.3, %add5 - %arrayidx45.3 = getelementptr inbounds float* %C, i32 %add44.3 - %12 = load 
float* %arrayidx45.3, align 4, !tbaa !3 - %mul46.3 = fmul fast float %12, %beta - %add48.3 = fadd fast float %mul46.3, %4 - store float %add48.3, float* %arrayidx45.3, align 4, !tbaa !3 - %tmp84.496 = or i32 %mul37, 4 - %tmp85.4 = mul i32 %tmp84.496, %ldc - %add44.4 = add i32 %tmp85.4, %add5 - %arrayidx45.4 = getelementptr inbounds float* %C, i32 %add44.4 - %13 = load float* %arrayidx45.4, align 4, !tbaa !3 - %mul46.4 = fmul fast float %13, %beta - %add48.4 = fadd fast float %mul46.4, %5 - store float %add48.4, float* %arrayidx45.4, align 4, !tbaa !3 - %tmp84.597 = or i32 %mul37, 5 - %tmp85.5 = mul i32 %tmp84.597, %ldc - %add44.5 = add i32 %tmp85.5, %add5 - %arrayidx45.5 = getelementptr inbounds float* %C, i32 %add44.5 - %14 = load float* %arrayidx45.5, align 4, !tbaa !3 - %mul46.5 = fmul fast float %14, %beta - %add48.5 = fadd fast float %mul46.5, %6 - store float %add48.5, float* %arrayidx45.5, align 4, !tbaa !3 - %tmp84.698 = or i32 %mul37, 6 - %tmp85.6 = mul i32 %tmp84.698, %ldc - %add44.6 = add i32 %tmp85.6, %add5 - %arrayidx45.6 = getelementptr inbounds float* %C, i32 %add44.6 - %15 = load float* %arrayidx45.6, align 4, !tbaa !3 - %mul46.6 = fmul fast float %15, %beta - %add48.6 = fadd fast float %mul46.6, %7 - store float %add48.6, float* %arrayidx45.6, align 4, !tbaa !3 - %tmp84.799 = or i32 %mul37, 7 - %tmp85.7 = mul i32 %tmp84.799, %ldc - %add44.7 = add i32 %tmp85.7, %add5 - %arrayidx45.7 = getelementptr inbounds float* %C, i32 %add44.7 - %16 = load float* %arrayidx45.7, align 4, !tbaa !3 - %mul46.7 = fmul fast float %16, %beta - %add48.7 = fadd fast float %mul46.7, %8 - store float %add48.7, float* %arrayidx45.7, align 4, !tbaa !3 - - - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 { -entry: - 
switch i8 %transa, label %if.then [ - i8 78, label %if.end - i8 110, label %if.end - ] - -if.then: ; preds = %entry - %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1 - %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 - %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 - %0 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load i64* %0, align 8 - %add.ptr.i.sum = add i64 %vbase.offset.i, 240 - %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum - %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"** - %2 = load %"class.std::ctype"** %1, align 8, !tbaa !7 - %tobool.i93 = icmp eq %"class.std::ctype"* %2, null - br i1 %tobool.i93, label %if.then.i94, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - -if.then.i94: ; preds = %if.then - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %if.then - %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6 - %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !4 - %tobool.i = icmp eq i8 %3, 0 - br i1 %tobool.i, label %if.end.i, label %if.then.i - -if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10 - %4 = load i8* %arrayidx.i, align 1, !tbaa !4 - br label %_ZNKSt5ctypeIcE5widenEc.exit - -if.end.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1 - %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i67 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !6 - %vfn.i = getelementptr inbounds i8 
(%"class.std::ctype"*, i8)** %vtable.i67, i64 6 - %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8 - %call.i68 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit - -_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.end.i, %if.then.i - %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i68, %if.end.i ] - %call1.i43 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1 - %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i43) #1 - br label %return - -if.end: ; preds = %entry, %entry - switch i8 %transb, label %if.then9 [ - i8 84, label %if.end12 - i8 116, label %if.end12 - ] - -if.then9: ; preds = %if.end - %call1.i45 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1 - %vtable.i47 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 - %vbase.offset.ptr.i48 = getelementptr i8* %vtable.i47, i64 -24 - %7 = bitcast i8* %vbase.offset.ptr.i48 to i64* - %vbase.offset.i49 = load i64* %7, align 8 - %add.ptr.i50.sum = add i64 %vbase.offset.i49, 240 - %_M_ctype.i69 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i50.sum - %8 = bitcast i8* %_M_ctype.i69 to %"class.std::ctype"** - %9 = load %"class.std::ctype"** %8, align 8, !tbaa !7 - %tobool.i96 = icmp eq %"class.std::ctype"* %9, null - br i1 %tobool.i96, label %if.then.i97, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99 - -if.then.i97: ; preds = %if.then9 - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99: ; preds = %if.then9 - %_M_widen_ok.i71 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6 - %10 = load 
i8* %_M_widen_ok.i71, align 1, !tbaa !4 - %tobool.i72 = icmp eq i8 %10, 0 - br i1 %tobool.i72, label %if.end.i78, label %if.then.i74 - -if.then.i74: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99 - %arrayidx.i73 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10 - %11 = load i8* %arrayidx.i73, align 1, !tbaa !4 - br label %_ZNKSt5ctypeIcE5widenEc.exit80 - -if.end.i78: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99 - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1 - %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i75 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !6 - %vfn.i76 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i75, i64 6 - %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i76, align 8 - %call.i77 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit80 - -_ZNKSt5ctypeIcE5widenEc.exit80: ; preds = %if.end.i78, %if.then.i74 - %retval.0.i79 = phi i8 [ %11, %if.then.i74 ], [ %call.i77, %if.end.i78 ] - %call1.i52 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i79) #1 - %call.i53 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i52) #1 - br label %return - -if.end12: ; preds = %if.end, %if.end - %rem40 = and i32 %m, 63 - %tobool = icmp eq i32 %rem40, 0 - br i1 %tobool, label %lor.lhs.false, label %if.then15 - -lor.lhs.false: ; preds = %if.end12 - %rem1341 = and i32 %n, 7 - %tobool14 = icmp eq i32 %rem1341, 0 - br i1 %tobool14, label %if.end21, label %if.then15 - -if.then15: ; preds = %lor.lhs.false, %if.end12 - %call1.i55 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1 - 
%call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 64) #1 - %call1.i57 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1 - %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 8) #1 - %14 = bitcast %"class.std::basic_ostream"* %call19 to i8** - %vtable.i59 = load i8** %14, align 8, !tbaa !6 - %vbase.offset.ptr.i60 = getelementptr i8* %vtable.i59, i64 -24 - %15 = bitcast i8* %vbase.offset.ptr.i60 to i64* - %vbase.offset.i61 = load i64* %15, align 8 - %16 = bitcast %"class.std::basic_ostream"* %call19 to i8* - %add.ptr.i62.sum = add i64 %vbase.offset.i61, 240 - %_M_ctype.i81 = getelementptr inbounds i8* %16, i64 %add.ptr.i62.sum - %17 = bitcast i8* %_M_ctype.i81 to %"class.std::ctype"** - %18 = load %"class.std::ctype"** %17, align 8, !tbaa !7 - %tobool.i100 = icmp eq %"class.std::ctype"* %18, null - br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - -if.then.i101: ; preds = %if.then15 - tail call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then15 - %_M_widen_ok.i83 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6 - %19 = load i8* %_M_widen_ok.i83, align 1, !tbaa !4 - %tobool.i84 = icmp eq i8 %19, 0 - br i1 %tobool.i84, label %if.end.i90, label %if.then.i86 - -if.then.i86: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - %arrayidx.i85 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10 - %20 = load i8* %arrayidx.i85, align 1, !tbaa !4 - br label %_ZNKSt5ctypeIcE5widenEc.exit92 - -if.end.i90: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 - tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* 
%18) #1 - %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i87 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !6 - %vfn.i88 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i87, i64 6 - %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i88, align 8 - %call.i89 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1 - br label %_ZNKSt5ctypeIcE5widenEc.exit92 - -_ZNKSt5ctypeIcE5widenEc.exit92: ; preds = %if.end.i90, %if.then.i86 - %retval.0.i91 = phi i8 [ %20, %if.then.i86 ], [ %call.i89, %if.end.i90 ] - %call1.i64 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i91) #1 - %call.i65 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i64) #1 - br label %return - -if.end21: ; preds = %lor.lhs.false - %mul = shl nsw i32 %m, 3 - %div = sdiv i32 %mul, 64 - %div27 = lshr i32 %div, 3 - %div30 = lshr i32 %n, 3 - %conv31 = fpext float %alpha to double - %conv32 = fpext float %beta to double - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) - %in.addr = alloca %struct.arg - %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - store float* %A, float** %in.addr.A - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - store i64 %bytesA, i64* %in.addr.bytes_A - %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - store i32 %lda, i32* %in.addr.lda - %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - store float* %B, float** %in.addr.B - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - store i64 %bytesB, i64* %in.addr.bytes_B - %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - store i32 %ldb, i32* %in.addr.ldb - %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - store float* %C, float** %in.addr.C - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - 
store i64 %bytesC, i64* %in.addr.bytes_C - %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - store i32 %ldc, i32* %in.addr.ldc - %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - store i32 %k, i32* %in.addr.k - %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10 - %in.addr.alpha.cast = fptrunc double %conv31 to float - store float %in.addr.alpha.cast, float* %in.addr.alpha - %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11 - %in.addr.beta.cast = fptrunc double %conv32 to float - store float %in.addr.beta.cast, float* %in.addr.beta - %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 - store i32 8, i32* %in.addr.dimX0 - %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 - store i32 8, i32* %in.addr.dimY0 - %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 - store i32 %div27, i32* %in.addr.dimX1 - %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15 - store i32 %div30, i32* %in.addr.dimY1 - %args = bitcast %struct.arg* %in.addr to i8* - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args) - call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2) - br label %return - -return: ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit92, %_ZNKSt5ctypeIcE5widenEc.exit80, %_ZNKSt5ctypeIcE5widenEc.exit - ret void -} - -declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0 - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** %argv) #2 { -entry: - %argc.addr = alloca i32, align 4 - %timers = alloca %struct.pb_TimerSet, align 8 - %matArow = alloca i32, align 4 - %matAcol = alloca i32, align 4 - %matBrow = alloca i32, 
align 4 - %matBcol = alloca i32, align 4 - %matA = alloca %"class.std::vector", align 8 - %matBT = alloca %"class.std::vector", align 8 - %matC = alloca %"class.std::vector", align 8 - store i32 %argc, i32* %argc.addr, align 4, !tbaa !8 - %0 = bitcast %struct.pb_TimerSet* %timers to i8* - call void @llvm.lifetime.start(i64 800, i8* %0) #1 - %1 = bitcast %"class.std::vector"* %matA to i8* - call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1 - %2 = bitcast %"class.std::vector"* %matBT to i8* - call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1 - %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1 - %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1 - %3 = load i8*** %inpFiles, align 8, !tbaa !7 - %4 = load i8** %3, align 8, !tbaa !7 - %cmp = icmp eq i8* %4, null - br i1 %cmp, label %if.then, label %lor.lhs.false - -lor.lhs.false: ; preds = %entry - %arrayidx2 = getelementptr inbounds i8** %3, i64 1 - %5 = load i8** %arrayidx2, align 8, !tbaa !7 - %cmp3 = icmp eq i8* %5, null - br i1 %cmp3, label %if.then, label %lor.lhs.false4 - -lor.lhs.false4: ; preds = %lor.lhs.false - %arrayidx6 = getelementptr inbounds i8** %3, i64 2 - %6 = load i8** %arrayidx6, align 8, !tbaa !7 - %cmp7 = icmp eq i8* %6, null - br i1 %cmp7, label %if.then, label %lor.lhs.false8 - -lor.lhs.false8: ; preds = %lor.lhs.false4 - %arrayidx10 = getelementptr inbounds i8** %3, i64 3 - %7 = load i8** %arrayidx10, align 8, !tbaa !7 - %cmp11 = icmp eq i8* %7, null - br i1 %cmp11, label %if.end, label %if.then - -if.then: ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry - %8 = load %struct._IO_FILE** @stderr, align 8, !tbaa !7 - %9 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %8) - call void @exit(i32 -1) #7 - unreachable - -if.end: ; preds = %lor.lhs.false8 - %call15 = call zeroext i1 
@_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %4, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1 - %10 = load i8*** %inpFiles, align 8, !tbaa !7 - %arrayidx17 = getelementptr inbounds i8** %10, i64 2 - %11 = load i8** %arrayidx17, align 8, !tbaa !7 - %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %11, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1 - call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - %12 = call i8* @llvm_visc_initializeTimerSet() - store i8* %12, i8** @viscTimerSet_GenVISC - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - call void @llvm.visc.init() - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - %13 = load i32* %matArow, align 4, !tbaa !8 - %14 = load i32* %matAcol, align 4, !tbaa !8 - %mul = mul nsw i32 %14, %13 - %conv = sext i32 %mul to i64 - %mul19 = shl nsw i64 %conv, 2 - %15 = load i32* %matBrow, align 4, !tbaa !8 - %16 = load i32* %matBcol, align 4, !tbaa !8 - %mul20 = mul nsw i32 %16, %15 - %conv21 = sext i32 %mul20 to i64 - %mul22 = shl nsw i64 %conv21, 2 - %mul23 = mul nsw i32 %16, %13 - %conv24 = sext i32 %mul23 to i64 - %mul25 = shl nsw i64 %conv24, 2 - %17 = bitcast %"class.std::vector"* %matC to i8* - call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 24, i32 8, i1 false) #1 - %cmp.i.i.i.i = icmp eq i32 %mul23, 0 - br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i - -cond.true.i.i.i.i: ; preds = %if.end - %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0 - br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !9 - -if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i - call void @_ZSt17__throw_bad_allocv() #7 - unreachable - -_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i - %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1 - %18 = bitcast i8* 
%call2.i.i.i.i.i to float* - br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - -_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i: ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end - %cond.i.i.i.i = phi float* [ %18, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ] - %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0 - store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !7 - %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1 - store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 - %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24 - %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2 - store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !7 - br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i - -for.body.lr.ph.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7 - %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i - %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24 - %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i - br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i - -vector.body.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i - %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ] - %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i - %19 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>* - store <4 x float> zeroinitializer, <4 x float>* %19, 
align 4 - %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4 - %20 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i - %21 = bitcast float* %20 to <4 x float>* - store <4 x float> zeroinitializer, <4 x float>* %21, align 4 - %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8 - %22 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i - br i1 %22, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i - -middle.block.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i - %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] - %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24 - br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader - -for.body.i.i.i.i.i.i.i.i.preheader: ; preds = %middle.block.i.i.i.i.i.i.i.i - %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8* - %23 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2 - call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %23, i32 4, i1 false) - br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - -_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit: ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i - store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 - %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0 - %24 = load float** %_M_start.i.i, align 8, 
!tbaa !7 - %25 = bitcast float* %24 to i8* - call void @llvm_visc_track_mem(i8* %25, i64 %mul19) #1 - %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0 - %26 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %27 = bitcast float* %26 to i8* - call void @llvm_visc_track_mem(i8* %27, i64 %mul22) #1 - %28 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %29 = bitcast float* %28 to i8* - call void @llvm_visc_track_mem(i8* %29, i64 %mul25) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - %30 = load float** %_M_finish.i.i.i, align 8, !tbaa !7 - %31 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %cmp3399 = icmp eq float* %30, %31 - br i1 %cmp3399, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - %sub.ptr.lhs.cast.i = ptrtoint float* %30 to i64 - %sub.ptr.rhs.cast.i = ptrtoint float* %31 to i64 - %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i - %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %add.ptr.i = getelementptr inbounds float* %31, i64 %i.0100 - store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !3 - %inc = add i64 %i.0100, 1 - %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i - br i1 %cmp33, label %for.body, label %for.end - -for.end: ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - %32 = load i32* %matArow, align 4, !tbaa !8 - %33 = load i32* %matBcol, align 4, !tbaa !8 - %34 = load i32* %matAcol, align 4, !tbaa !8 - %35 = load float** %_M_start.i.i, align 8, !tbaa !7 - %36 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %37 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %32, i32 %33, i32 %34, 
float 1.000000e+00, float* %35, i64 %mul19, i32 %32, float* %36, i64 %mul22, i32 %33, float 0.000000e+00, float* %37, i64 %mul25, i32 %32) - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 - %38 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %39 = bitcast float* %38 to i8* - call void @llvm_visc_request_mem(i8* %39, i64 %mul25) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 - %40 = load float** %_M_start.i.i, align 8, !tbaa !7 - %41 = bitcast float* %40 to i8* - call void @llvm_visc_untrack_mem(i8* %41) #1 - %42 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %43 = bitcast float* %42 to i8* - call void @llvm_visc_untrack_mem(i8* %43) #1 - %44 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %45 = bitcast float* %44 to i8* - call void @llvm_visc_untrack_mem(i8* %45) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 - %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 - call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) - call void @llvm.visc.cleanup() - %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0 - %46 = load i8** %outFile, align 8, !tbaa !7 - %tobool = icmp eq i8* %46, null - br i1 %tobool, label %if.end45, label %if.then42 - -if.then42: ; preds = %for.end - %47 = load i32* %matArow, align 4, !tbaa !8 - %48 = load i32* %matBcol, align 4, !tbaa !8 - %call44 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %46, i32 %47, i32 %48, %"class.std::vector"* %matC) #1 - br label %if.end45 - -if.end45: ; preds = %if.then42, %for.end - %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2 - %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1 - %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* 
@_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1 - %49 = load i32* %matArow, align 4, !tbaa !8 - %conv50 = sitofp i32 %49 to double - %mul51 = fmul fast double %conv50, 2.000000e+00 - %50 = load i32* %matBcol, align 4, !tbaa !8 - %conv52 = sitofp i32 %50 to double - %mul53 = fmul fast double %mul51, %conv52 - %51 = load i32* %matAcol, align 4, !tbaa !8 - %conv54 = sitofp i32 %51 to double - %mul55 = fmul fast double %mul53, %conv54 - %div = fdiv fast double %mul55, %call48 - %div56 = fmul double %div, 1.000000e-09 - %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1 - %52 = bitcast %"class.std::basic_ostream"* %call.i to i8** - %vtable.i = load i8** %52, align 8, !tbaa !6 - %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 - %53 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load i64* %53, align 8 - %54 = bitcast %"class.std::basic_ostream"* %call.i to i8* - %add.ptr.sum.i = add i64 %vbase.offset.i, 240 - %_M_ctype.i.i = getelementptr inbounds i8* %54, i64 %add.ptr.sum.i - %55 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"** - %56 = load %"class.std::ctype"** %55, align 8, !tbaa !7 - %tobool.i.i.i = icmp eq %"class.std::ctype"* %56, null - br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - -if.then.i.i.i: ; preds = %if.end45 - call void @_ZSt16__throw_bad_castv() #7 - unreachable - -_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end45 - %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 6 - %57 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !4 - %tobool.i3.i.i = icmp eq i8 %57, 0 - br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i - -if.then.i4.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 7, i64 10 - %58 = load i8* 
%arrayidx.i.i.i, align 1, !tbaa !4 - br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - -if.end.i.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i - call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %56) #1 - %59 = bitcast %"class.std::ctype"* %56 to i8 (%"class.std::ctype"*, i8)*** - %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %59, align 8, !tbaa !6 - %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6 - %60 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8 - %call.i.i.i = call signext i8 %60(%"class.std::ctype"* %56, i8 signext 10) #1 - br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - -_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i - %retval.0.i.i.i = phi i8 [ %58, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ] - %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1 - %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1 - call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1 - %61 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 - %tobool.i.i.i.i78 = icmp eq float* %61, null - br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79 - -if.then.i.i.i.i79: ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - %62 = bitcast float* %61 to i8* - call void @_ZdlPv(i8* %62) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit80 - -_ZNSt6vectorIfSaIfEED1Ev.exit80: ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit - %63 = load float** %_M_start.i.i82, align 8, !tbaa !7 - %tobool.i.i.i.i74 = icmp eq float* %63, null - br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75 - -if.then.i.i.i.i75: ; preds = 
%_ZNSt6vectorIfSaIfEED1Ev.exit80 - %64 = bitcast float* %63 to i8* - call void @_ZdlPv(i8* %64) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit76 - -_ZNSt6vectorIfSaIfEED1Ev.exit76: ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80 - %65 = load float** %_M_start.i.i, align 8, !tbaa !7 - %tobool.i.i.i.i = icmp eq float* %65, null - br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i - -if.then.i.i.i.i: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76 - %66 = bitcast float* %65 to i8* - call void @_ZdlPv(i8* %66) #1 - br label %_ZNSt6vectorIfSaIfEED1Ev.exit - -_ZNSt6vectorIfSaIfEED1Ev.exit: ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76 - call void @llvm.lifetime.end(i64 800, i8* %0) #1 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 - -declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0 - -; Function Attrs: noreturn nounwind -declare void @exit(i32) #4 - -declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0 - -declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0 - -declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 - -declare void @llvm_visc_track_mem(i8*, i64) #0 - -declare void @llvm_visc_request_mem(i8*, i64) #0 - -declare void @llvm_visc_untrack_mem(i8*) #0 - -declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 - -declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0 - -declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0 - -declare void @pb_FreeParameters(%struct.pb_Parameters*) #0 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 - -declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0 - -; Function Attrs: noreturn -declare void @_ZSt17__throw_bad_allocv() #5 - -declare noalias i8* @_Znwm(i64) #0 - -; Function Attrs: 
nounwind -declare void @_ZdlPv(i8*) #6 - -declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0 - -declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0 - -; Function Attrs: noreturn -declare void @_ZSt16__throw_bad_castv() #5 - -declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0 - -declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0 - -; Function Attrs: nounwind -define internal void @_GLOBAL__I_a() #1 section ".text.startup" { -entry: - tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1 - %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1 - ret void -} - -; Function Attrs: nounwind -declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 - -; Function Attrs: nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 - -declare i8* @llvm_visc_initializeTimerSet() - -declare void @llvm_visc_switchToTimer(i8**, i32) - -declare void @llvm_visc_printTimerSet(i8**, i8*) - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 
%dimX, i32 %dimY) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) #1 - -; Function Attrs: nounwind uwtable -define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 { -entry: - %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), 
i32 %dimX1, i32 %dimY2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12) - call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind 
uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #7 = { noreturn nounwind } - -!visc_hint_gpu = !{!0, !1} -!visc_hint_cpu = !{!2} - -!0 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff} -!1 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1} -!2 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2} -!3 = metadata !{metadata !"float", metadata !4} -!4 = metadata !{metadata !"omnipotent char", metadata !5} -!5 = metadata !{metadata !"Simple C/C++ TBAA"} -!6 = metadata !{metadata !"vtable pointer", metadata !5} -!7 = metadata !{metadata !"any pointer", metadata !4} -!8 = metadata !{metadata !"int", 
metadata !4} -!9 = metadata !{metadata !"branch_weights", i32 4, i32 64} diff --git a/hpvm/test/parboil/benchmarks/spmv/Makefile b/hpvm/test/parboil/benchmarks/spmv/Makefile index 23e1d4990031404b8e365d9430499b5fddb2af01..aff3e54712256348ebd9d0054d87fd62616fa15b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/Makefile +++ b/hpvm/test/parboil/benchmarks/spmv/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = spmv -# Default compile visc +# Default compile hpvm ifeq ($(VERSION),) - VERSION = visc + VERSION = hpvm endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile similarity index 88% rename from hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile index a289d68f342ba488f8ce4d90faf26816d4d00829..06af6bebea2aa6a94f56196e0399a25ebfdda030 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile +++ b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile @@ -1,9 +1,9 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm TOOLS_SRC=common_src/convert-dataset SRCDIR_OBJS=gpu_info.ll file.ll -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O1 -I$(TOOLS_SRC) APP_CXXFLAGS=-ffast-math -O1 -I$(TOOLS_SRC) diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/visc/file.h rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp similarity index 68% rename from hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp index f6ce5ccfb2412036f4eadcdab419ceca0a6c8f30..4414744b4995a9ae09bb88fdda297150dfbe1031 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp @@ -8,11 +8,11 @@ //#include <CL/cl.h> //#include <CL/cl_ext.h> +#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <visc.h> #include "convert_dataset.h" #include "file.h" @@ -54,15 +54,15 @@ void spmv_jds(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t bytes_jds_ptr_int, int *sh_zcnt_int, size_t bytes_sh_zcnt_int) { - __visc__hint(visc::DEVICE); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gridx = __visc__getNumNodeInstances_x(thisNode); + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); + int lx = __hpvm__getNodeInstanceID_x(thisNode); + int gx = __hpvm__getNodeInstanceID_x(parentNode); + int gridx = __hpvm__getNumNodeInstances_x(thisNode); int ix = gx * gridx + lx; int warp_id = ix >> WARP_BITS; @@ -126,25 +126,25 @@ void spmvLvl1(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t bytes_jds_ptr_int, int *sh_zcnt_int, size_t bytes_sh_zcnt_int, size_t dim_X1) { - __visc__hint(visc::DEVICE); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1); - __visc__bindIn(spmv_node, 0, 0, 0); - __visc__bindIn(spmv_node, 1, 1, 0); - 
__visc__bindIn(spmv_node, 2, 2, 0); - __visc__bindIn(spmv_node, 3, 3, 0); - __visc__bindIn(spmv_node, 4, 4, 0); - __visc__bindIn(spmv_node, 5, 5, 0); - __visc__bindIn(spmv_node, 6, 6, 0); - __visc__bindIn(spmv_node, 7, 7, 0); - __visc__bindIn(spmv_node, 8, 8, 0); - __visc__bindIn(spmv_node, 9, 9, 0); - __visc__bindIn(spmv_node, 10, 10, 0); - __visc__bindIn(spmv_node, 11, 11, 0); - __visc__bindIn(spmv_node, 12, 12, 0); - __visc__bindIn(spmv_node, 13, 13, 0); - __visc__bindIn(spmv_node, 14, 14, 0); + void *spmv_node = __hpvm__createNodeND(1, spmv_jds, dim_X1); + __hpvm__bindIn(spmv_node, 0, 0, 0); + __hpvm__bindIn(spmv_node, 1, 1, 0); + __hpvm__bindIn(spmv_node, 2, 2, 0); + __hpvm__bindIn(spmv_node, 3, 3, 0); + __hpvm__bindIn(spmv_node, 4, 4, 0); + __hpvm__bindIn(spmv_node, 5, 5, 0); + __hpvm__bindIn(spmv_node, 6, 6, 0); + __hpvm__bindIn(spmv_node, 7, 7, 0); + __hpvm__bindIn(spmv_node, 8, 8, 0); + __hpvm__bindIn(spmv_node, 9, 9, 0); + __hpvm__bindIn(spmv_node, 10, 10, 0); + __hpvm__bindIn(spmv_node, 11, 11, 0); + __hpvm__bindIn(spmv_node, 12, 12, 0); + __hpvm__bindIn(spmv_node, 13, 13, 0); + __hpvm__bindIn(spmv_node, 14, 14, 0); } void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data, @@ -153,26 +153,26 @@ void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t bytes_jds_ptr_int, int *sh_zcnt_int, size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2); - __visc__bindIn(spmv_node, 0, 0, 0); - __visc__bindIn(spmv_node, 1, 1, 0); - __visc__bindIn(spmv_node, 2, 2, 0); - __visc__bindIn(spmv_node, 3, 3, 0); - __visc__bindIn(spmv_node, 4, 4, 0); - 
__visc__bindIn(spmv_node, 5, 5, 0); - __visc__bindIn(spmv_node, 6, 6, 0); - __visc__bindIn(spmv_node, 7, 7, 0); - __visc__bindIn(spmv_node, 8, 8, 0); - __visc__bindIn(spmv_node, 9, 9, 0); - __visc__bindIn(spmv_node, 10, 10, 0); - __visc__bindIn(spmv_node, 11, 11, 0); - __visc__bindIn(spmv_node, 12, 12, 0); - __visc__bindIn(spmv_node, 13, 13, 0); - __visc__bindIn(spmv_node, 14, 14, 0); - __visc__bindIn(spmv_node, 15, 15, 0); + void *spmv_node = __hpvm__createNodeND(1, spmvLvl1, dim_X2); + __hpvm__bindIn(spmv_node, 0, 0, 0); + __hpvm__bindIn(spmv_node, 1, 1, 0); + __hpvm__bindIn(spmv_node, 2, 2, 0); + __hpvm__bindIn(spmv_node, 3, 3, 0); + __hpvm__bindIn(spmv_node, 4, 4, 0); + __hpvm__bindIn(spmv_node, 5, 5, 0); + __hpvm__bindIn(spmv_node, 6, 6, 0); + __hpvm__bindIn(spmv_node, 7, 7, 0); + __hpvm__bindIn(spmv_node, 8, 8, 0); + __hpvm__bindIn(spmv_node, 9, 9, 0); + __hpvm__bindIn(spmv_node, 10, 10, 0); + __hpvm__bindIn(spmv_node, 11, 11, 0); + __hpvm__bindIn(spmv_node, 12, 12, 0); + __hpvm__bindIn(spmv_node, 13, 13, 0); + __hpvm__bindIn(spmv_node, 14, 14, 0); + __hpvm__bindIn(spmv_node, 15, 15, 0); } void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data, @@ -181,27 +181,27 @@ void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t bytes_jds_ptr_int, int *sh_zcnt_int, size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2); - __visc__bindIn(spmv_node, 0, 0, 0); - __visc__bindIn(spmv_node, 1, 1, 0); - __visc__bindIn(spmv_node, 2, 2, 0); - __visc__bindIn(spmv_node, 3, 3, 0); - __visc__bindIn(spmv_node, 4, 4, 0); - __visc__bindIn(spmv_node, 5, 5, 0); - 
__visc__bindIn(spmv_node, 6, 6, 0); - __visc__bindIn(spmv_node, 7, 7, 0); - __visc__bindIn(spmv_node, 8, 8, 0); - __visc__bindIn(spmv_node, 9, 9, 0); - __visc__bindIn(spmv_node, 10, 10, 0); - __visc__bindIn(spmv_node, 11, 11, 0); - __visc__bindIn(spmv_node, 12, 12, 0); - __visc__bindIn(spmv_node, 13, 13, 0); - __visc__bindIn(spmv_node, 14, 14, 0); - __visc__bindIn(spmv_node, 15, 15, 0); - __visc__bindIn(spmv_node, 16, 16, 0); + void *spmv_node = __hpvm__createNodeND(1, spmvLvl2, dim_X2); + __hpvm__bindIn(spmv_node, 0, 0, 0); + __hpvm__bindIn(spmv_node, 1, 1, 0); + __hpvm__bindIn(spmv_node, 2, 2, 0); + __hpvm__bindIn(spmv_node, 3, 3, 0); + __hpvm__bindIn(spmv_node, 4, 4, 0); + __hpvm__bindIn(spmv_node, 5, 5, 0); + __hpvm__bindIn(spmv_node, 6, 6, 0); + __hpvm__bindIn(spmv_node, 7, 7, 0); + __hpvm__bindIn(spmv_node, 8, 8, 0); + __hpvm__bindIn(spmv_node, 9, 9, 0); + __hpvm__bindIn(spmv_node, 10, 10, 0); + __hpvm__bindIn(spmv_node, 11, 11, 0); + __hpvm__bindIn(spmv_node, 12, 12, 0); + __hpvm__bindIn(spmv_node, 13, 13, 0); + __hpvm__bindIn(spmv_node, 14, 14, 0); + __hpvm__bindIn(spmv_node, 15, 15, 0); + __hpvm__bindIn(spmv_node, 16, 16, 0); } int main(int argc, char **argv) { @@ -261,7 +261,7 @@ int main(int argc, char **argv) { input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memset(h_Ax_vector, 0, dim * sizeof(float)); @@ -271,14 +271,14 @@ int main(int argc, char **argv) { compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(h_Ax_vector, dim * sizeof(float)); - llvm_visc_track_mem(h_data, len * sizeof(float)); - llvm_visc_track_mem(h_indices, len * sizeof(int)); - llvm_visc_track_mem(h_perm, dim * sizeof(int)); - llvm_visc_track_mem(h_x_vector, dim * sizeof(float)); - llvm_visc_track_mem(h_ptr, depth * sizeof(int)); - llvm_visc_track_mem(h_nzcnt, nzcnt_len * 
sizeof(int)); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); + llvm_hpvm_track_mem(h_Ax_vector, dim * sizeof(float)); + llvm_hpvm_track_mem(h_data, len * sizeof(float)); + llvm_hpvm_track_mem(h_indices, len * sizeof(int)); + llvm_hpvm_track_mem(h_perm, dim * sizeof(int)); + llvm_hpvm_track_mem(h_x_vector, dim * sizeof(float)); + llvm_hpvm_track_mem(h_ptr, depth * sizeof(int)); + llvm_hpvm_track_mem(h_nzcnt, nzcnt_len * sizeof(int)); // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -306,9 +306,9 @@ int main(int argc, char **argv) { block, (grid / block)}; *(RootIn *)root_in = root_in_local; - void *spmvDFG = __visc__launch(0, spmvLvl3, root_in); + void *spmvDFG = __hpvm__launch(0, spmvLvl3, root_in); - __visc__wait(spmvDFG); + __hpvm__wait(spmvDFG); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); /******************************* Issues ******************************* @@ -326,21 +326,21 @@ int main(int argc, char **argv) { // HtoD memory copy pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(h_Ax_vector, dim * sizeof(float)); + llvm_hpvm_request_mem(h_Ax_vector, dim * sizeof(float)); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(h_Ax_vector); - llvm_visc_untrack_mem(h_data); - llvm_visc_untrack_mem(h_indices); - llvm_visc_untrack_mem(h_perm); - llvm_visc_untrack_mem(h_x_vector); - llvm_visc_untrack_mem(h_ptr); - llvm_visc_untrack_mem(h_nzcnt); + llvm_hpvm_untrack_mem(h_Ax_vector); + llvm_hpvm_untrack_mem(h_data); + llvm_hpvm_untrack_mem(h_indices); + llvm_hpvm_untrack_mem(h_perm); + llvm_hpvm_untrack_mem(h_x_vector); + llvm_hpvm_untrack_mem(h_ptr); + llvm_hpvm_untrack_mem(h_nzcnt); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c 
b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c index a19184a9659eaa91223da57e1b926ac6bff54b4e..8bff8a1d0af3c22348daad7bde0fed51f4c6f58d 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c @@ -81,7 +81,7 @@ int main(int argc, char **argv) { input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // parameters declaration cl_int clStatus; @@ -127,7 +127,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ /*cl_program clProgram = * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is // %d=\n",grid,block); printf("!!! 
dim is %d\n",dim); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -240,7 +240,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int j = 0; j < 20; j++) { for (i = 0; i < 50; i++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c index d4fc026b73894e47c94dd7f2c9ef8f31e366eec6..f704f96ed291269457d99563b2779dae93da78c7 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c @@ -236,7 +236,7 @@ int main(int argc, char **argv) { // main execution int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); /*for(int j=0; j<20; j++) {*/ for (i = 0; i < 50; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c index 42ffab597d028eacba7f9975473908bdf812524e..a6fe5012f96ee73f54af85c20e665517b22c1b1e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c @@ 
-81,7 +81,7 @@ int main(int argc, char **argv) { input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // parameters declaration cl_int clStatus; @@ -127,7 +127,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ /*cl_program clProgram = * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is // %d=\n",grid,block); printf("!!! dim is %d\n",dim); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -240,7 +240,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int j = 0; j < 1; j++) { for (i = 0; i < 50; i++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c index fbd272b32f7f60fbd0c651b0f329550b47e4db27..bc3655c4abfec2463cef9082e1f3d3e0b25b7d3b 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c @@ -81,7 +81,7 @@ int main(int argc, char **argv) { input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // parameters declaration cl_int clStatus; @@ -127,7 +127,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ /*cl_program clProgram = * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is // %d=\n",grid,block); printf("!!! dim is %d\n",dim); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -240,7 +240,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int j = 0; j < 20; j++) { for (i = 0; i < 50; i++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c index 
343814149aa74139930380c2178e2f447c64e806..88fd0c878bb8e128c3790716b82b5aec8acbe41a 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c @@ -224,7 +224,7 @@ int main(int argc, char **argv) { // main execution - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); int i; for (i = 0; i < 50; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c index 4600a3e6b8d580ad6fc3986d24a712ad592e25eb..ca538e3a95f56498c8ba8deb90b7820035dcbe11 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c @@ -83,7 +83,7 @@ int main(int argc, char **argv) { printf("Col count = %d, dim = %d\n", col_count, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; @@ -137,7 +137,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCLDeviceProp clDeviceProp; clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, @@ -215,7 +215,7 @@ int main(int argc, char **argv) { compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, clDeviceProp.minor, clDeviceProp.multiProcessorCount); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -237,7 +237,7 @@ int main(int argc, char **argv) { // main execution 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); int i; for (int j = 0; j < 5; j++) { for (i = 0; i < 50; i++) { @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c index d2375af91dd8d4812fcb82b78b856e85feda376f..21973c2fa75fc95f4496d26b3d2c8870d9a1e577 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c @@ -83,7 +83,7 @@ int main(int argc, char **argv) { printf("Col count = %d, dim = %d\n", col_count, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; @@ -137,7 +137,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); OpenCLDeviceProp clDeviceProp; clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, @@ -215,7 +215,7 @@ int main(int argc, char **argv) { compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, clDeviceProp.minor, clDeviceProp.multiProcessorCount); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -237,7 +237,7 @@ int main(int argc, char **argv) { // 
main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); int i; for (int j = 0; j < 100; j++) { for (i = 0; i < 50; i++) { @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc deleted file mode 100644 index b804d14d16cff805c0c1850d1f5079ab6e973ecf..0000000000000000000000000000000000000000 Binary files a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc and /dev/null differ diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll deleted file mode 100644 index 5604d70e8a005ee7e21c5ae9bf6dbf0dbac77d15..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll +++ /dev/null @@ -1,138 +0,0 @@ -; ModuleID = 'build/visc_default/main.visc.ll.kernels.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -target triple = "spir64-unknown-unknown" - -%rtype = type {} - -; Function Attrs: optsize zeroext -define void @spmv_jds(float* %dst_vector, i64 %bytes_dst_vector, float* %d_data, i64 %bytes_d_data, i32* %d_index, i64 %bytes_d_index, i32* %d_perm, i64 %bytes_d_perm, float* %x_vec, i64 %bytes_x_vec, i32 %dim, i32* %jds_ptr_int, i64 %bytes_jds_ptr_int, i32* %sh_zcnt_int, i64 %bytes_sh_zcnt_int) #0 { -entry: - ;%0 = call 
i64 @_Z12get_group_idj(i32 0) - ;%1 = trunc i64 %0 to i32 - ;%2 = call i64 @_Z14get_local_sizej(i32 0) - ;%3 = trunc i64 %2 to i32 - ;%4 = mul i32 %1, %3 - ;%5 = call i64 @_Z12get_local_idj(i32 0) - ;%6 = trunc i64 %5 to i32 - ;%7 = add i32 %4, %6 - %0 = add i32 0, 0 - %1 = add i32 0, 0 - %2 = add i32 0, 0 - %3 = add i32 0, 0 - %4 = add i32 0, 0 - %5 = add i32 0, 0 - %6 = call i64 @_Z13get_global_idj(i32 0) - %7 = trunc i64 %6 to i32 - %cmp = icmp slt i32 %7, %dim - br i1 %cmp, label %if.then, label %if.end38 - -if.then: ; preds = %entry - %shr = ashr i32 %7, 5 - %idxprom = sext i32 %shr to i64 - %arrayidx = getelementptr inbounds i32* %sh_zcnt_int, i64 %idxprom - %8 = load i32* %arrayidx, align 4, !tbaa !4 - %9 = load i32* %jds_ptr_int, align 4, !tbaa !4 - %add = add nsw i32 %9, %7 - %idxprom3 = sext i32 %add to i64 - %arrayidx4 = getelementptr inbounds float* %d_data, i64 %idxprom3 - %10 = load float* %arrayidx4, align 4, !tbaa !8 - %arrayidx6 = getelementptr inbounds i32* %d_index, i64 %idxprom3 - %11 = load i32* %arrayidx6, align 4, !tbaa !4 - %idxprom7 = sext i32 %11 to i64 - %arrayidx8 = getelementptr inbounds float* %x_vec, i64 %idxprom7 - %12 = load float* %arrayidx8, align 4, !tbaa !8 - %cmp9 = icmp sgt i32 %8, 1 - br i1 %cmp9, label %if.then10, label %if.end - -if.then10: ; preds = %if.then - %arrayidx11 = getelementptr inbounds i32* %jds_ptr_int, i64 1 - %.pn77 = load i32* %arrayidx11, align 4 - %idxprom13.pn.in78 = add nsw i32 %.pn77, %7 - %idxprom13.pn79 = sext i32 %idxprom13.pn.in78 to i64 - %i.0.in80 = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn79 - %i.081 = load i32* %i.0.in80, align 4 - %cmp1582 = icmp sgt i32 %8, 2 - %arrayidx1783 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn79 - %13 = load float* %arrayidx1783, align 4, !tbaa !8 - br i1 %cmp1582, label %for.body, label %for.end - -for.body: ; preds = %for.body, %if.then10 - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 2, %if.then10 ] - %14 = phi float [ %16, 
%for.body ], [ %13, %if.then10 ] - %i.088 = phi i32 [ %i.0, %for.body ], [ %i.081, %if.then10 ] - %sum.086 = phi float [ %add25, %for.body ], [ 0.000000e+00, %if.then10 ] - %t.085 = phi float [ %15, %for.body ], [ %12, %if.then10 ] - %d.084 = phi float [ %14, %for.body ], [ %10, %if.then10 ] - %arrayidx19 = getelementptr inbounds i32* %jds_ptr_int, i64 %indvars.iv - %idxprom23 = sext i32 %i.088 to i64 - %arrayidx24 = getelementptr inbounds float* %x_vec, i64 %idxprom23 - %15 = load float* %arrayidx24, align 4, !tbaa !8 - %mul = fmul fast float %d.084, %t.085 - %add25 = fadd fast float %sum.086, %mul - %indvars.iv.next = add i64 %indvars.iv, 1 - %.pn = load i32* %arrayidx19, align 4 - %idxprom13.pn.in = add nsw i32 %.pn, %7 - %idxprom13.pn = sext i32 %idxprom13.pn.in to i64 - %i.0.in = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn - %i.0 = load i32* %i.0.in, align 4 - %arrayidx17 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn - %16 = load float* %arrayidx17, align 4, !tbaa !8 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %8 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %if.then10 - %.lcssa = phi float [ %13, %if.then10 ], [ %16, %for.body ] - %i.0.lcssa = phi i32 [ %i.081, %if.then10 ], [ %i.0, %for.body ] - %sum.0.lcssa = phi float [ 0.000000e+00, %if.then10 ], [ %add25, %for.body ] - %t.0.lcssa = phi float [ %12, %if.then10 ], [ %15, %for.body ] - %d.0.lcssa = phi float [ %10, %if.then10 ], [ %14, %for.body ] - %idxprom28 = sext i32 %i.0.lcssa to i64 - %arrayidx29 = getelementptr inbounds float* %x_vec, i64 %idxprom28 - %17 = load float* %arrayidx29, align 4, !tbaa !8 - %mul30 = fmul fast float %d.0.lcssa, %t.0.lcssa - %add31 = fadd fast float %sum.0.lcssa, %mul30 - br label %if.end - -if.end: ; preds = %for.end, %if.then - %d.1 = phi float [ %.lcssa, %for.end ], [ %10, %if.then ] - %t.1 = phi float [ %17, %for.end ], [ %12, %if.then ] - %sum.1 = phi float [ 
%add31, %for.end ], [ 0.000000e+00, %if.then ] - %mul32 = fmul fast float %d.1, %t.1 - %add33 = fadd fast float %sum.1, %mul32 - %idxprom34 = sext i32 %7 to i64 - %arrayidx35 = getelementptr inbounds i32* %d_perm, i64 %idxprom34 - %18 = load i32* %arrayidx35, align 4, !tbaa !4 - %idxprom36 = sext i32 %18 to i64 - %arrayidx37 = getelementptr inbounds float* %dst_vector, i64 %idxprom36 - store float %add33, float* %arrayidx37, align 4, !tbaa !8 - br label %if.end38 - -if.end38: ; preds = %if.end, %entry - ret void -} - -declare i64 @_Z13get_global_idj(i32) - -declare i64 @_Z12get_group_idj(i32) - -declare i64 @_Z14get_local_sizej(i32) - -declare i64 @_Z12get_local_idj(i32) - -attributes #0 = { optsize zeroext "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } - -!visc_hint_gpu = !{} -!visc_hint_cpu = !{!0, !1} -!opencl.kernels = !{!2} - -!0 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32)* undef} -!1 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32, i32)* undef} -!2 = metadata !{void (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64)* @spmv_jds, metadata !3} -!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64"} -!4 = metadata !{metadata !5, metadata !5, i64 0} -!5 = metadata !{metadata !"int", metadata !6} -!6 = metadata !{metadata !"omnipotent char", metadata !7} -!7 = metadata !{metadata !"Simple C/C++ TBAA"} -!8 = metadata !{metadata !9, metadata !9, i64 0} -!9 = metadata !{metadata !"float", 
metadata !6} diff --git a/hpvm/test/parboil/benchmarks/stencil/Makefile b/hpvm/test/parboil/benchmarks/stencil/Makefile index a44dd0dbf0d678c7e8417345854254a1c2676653..e761d7b4f5f020fc19c5f59040ca5eb82b117381 100644 --- a/hpvm/test/parboil/benchmarks/stencil/Makefile +++ b/hpvm/test/parboil/benchmarks/stencil/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = stencil -# Default compile visc +# Default compile hpvm ifeq ($(VERSION),) - VERSION = visc + VERSION = hpvm endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile similarity index 80% rename from hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile index cf61fb3a6c77e07bf8ccc67902bd1a1997902763..35b36dcf3c053da03017c72d442204590675ecb4 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile +++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=file.ll -VISC_OBJS=stencil.visc.ll +HPVM_OBJS=stencil.hpvm.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/visc/common.h rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/visc/file.h rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp similarity index 66% rename from hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp index 5672a3ee490917d1374783eae5ab0ba1956ef441..e5810fc8101bef72dd4636b0b6c11826a8b18318 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp +++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp @@ -9,11 +9,11 @@ #include "common.h" #include "file.h" +#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <visc.h> static int 
read_data(float *A0, int nx, int ny, int nz, FILE *fp) { int s = 0; @@ -42,23 +42,23 @@ typedef struct __attribute__((__packed__)) { void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz) { - __visc__hint(visc::DEVICE); - __visc__attributes(2, A0, Anext, 1, Anext); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(2, A0, Anext, 1, Anext); - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int ly = __visc__getNodeInstanceID_y(thisNode); - int lz = __visc__getNodeInstanceID_z(thisNode); + int lx = __hpvm__getNodeInstanceID_x(thisNode); + int ly = __hpvm__getNodeInstanceID_y(thisNode); + int lz = __hpvm__getNodeInstanceID_z(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); - int gz = __visc__getNodeInstanceID_z(parentNode); + int gx = __hpvm__getNodeInstanceID_x(parentNode); + int gy = __hpvm__getNodeInstanceID_y(parentNode); + int gz = __hpvm__getNodeInstanceID_z(parentNode); - int gridx = __visc__getNumNodeInstances_x(thisNode); - int gridy = __visc__getNumNodeInstances_y(thisNode); - int gridz = __visc__getNumNodeInstances_z(thisNode); + int gridx = __hpvm__getNumNodeInstances_x(thisNode); + int gridy = __hpvm__getNumNodeInstances_y(thisNode); + int gridz = __hpvm__getNumNodeInstances_z(thisNode); int i = gx * gridx + lx + 1; int j = gy * gridy + ly + 1; @@ -78,65 +78,65 @@ void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, void stencilLvl1(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, size_t dim_Y1, size_t dim_Z1) { - __visc__hint(visc::DEVICE); - __visc__attributes(2, A0, Anext, 1, Anext); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(2, A0, 
Anext, 1, Anext); void *stencil_node = - __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1); - __visc__bindIn(stencil_node, 0, 0, 0); - __visc__bindIn(stencil_node, 1, 1, 0); - __visc__bindIn(stencil_node, 2, 2, 0); - __visc__bindIn(stencil_node, 3, 3, 0); - __visc__bindIn(stencil_node, 4, 4, 0); - __visc__bindIn(stencil_node, 5, 5, 0); - __visc__bindIn(stencil_node, 6, 6, 0); - __visc__bindIn(stencil_node, 7, 7, 0); - __visc__bindIn(stencil_node, 8, 8, 0); + __hpvm__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1); + __hpvm__bindIn(stencil_node, 0, 0, 0); + __hpvm__bindIn(stencil_node, 1, 1, 0); + __hpvm__bindIn(stencil_node, 2, 2, 0); + __hpvm__bindIn(stencil_node, 3, 3, 0); + __hpvm__bindIn(stencil_node, 4, 4, 0); + __hpvm__bindIn(stencil_node, 5, 5, 0); + __hpvm__bindIn(stencil_node, 6, 6, 0); + __hpvm__bindIn(stencil_node, 7, 7, 0); + __hpvm__bindIn(stencil_node, 8, 8, 0); } void stencilLvl2(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2, size_t dim_Z2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, A0, Anext, 1, Anext); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, A0, Anext, 1, Anext); void *stencil_node = - __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2); - __visc__bindIn(stencil_node, 0, 0, 0); - __visc__bindIn(stencil_node, 1, 1, 0); - __visc__bindIn(stencil_node, 2, 2, 0); - __visc__bindIn(stencil_node, 3, 3, 0); - __visc__bindIn(stencil_node, 4, 4, 0); - __visc__bindIn(stencil_node, 5, 5, 0); - __visc__bindIn(stencil_node, 6, 6, 0); - __visc__bindIn(stencil_node, 7, 7, 0); - __visc__bindIn(stencil_node, 8, 8, 0); - __visc__bindIn(stencil_node, 9, 9, 0); - __visc__bindIn(stencil_node, 10, 10, 0); - __visc__bindIn(stencil_node, 11, 11, 0); + __hpvm__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2); + __hpvm__bindIn(stencil_node, 0, 0, 0); + 
__hpvm__bindIn(stencil_node, 1, 1, 0); + __hpvm__bindIn(stencil_node, 2, 2, 0); + __hpvm__bindIn(stencil_node, 3, 3, 0); + __hpvm__bindIn(stencil_node, 4, 4, 0); + __hpvm__bindIn(stencil_node, 5, 5, 0); + __hpvm__bindIn(stencil_node, 6, 6, 0); + __hpvm__bindIn(stencil_node, 7, 7, 0); + __hpvm__bindIn(stencil_node, 8, 8, 0); + __hpvm__bindIn(stencil_node, 9, 9, 0); + __hpvm__bindIn(stencil_node, 10, 10, 0); + __hpvm__bindIn(stencil_node, 11, 11, 0); } void stencilLvl3(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2, size_t dim_Z2) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, A0, Anext, 1, Anext); - void *stencil_node = __visc__createNodeND(0, stencilLvl2); - __visc__bindIn(stencil_node, 0, 0, 0); - __visc__bindIn(stencil_node, 1, 1, 0); - __visc__bindIn(stencil_node, 2, 2, 0); - __visc__bindIn(stencil_node, 3, 3, 0); - __visc__bindIn(stencil_node, 4, 4, 0); - __visc__bindIn(stencil_node, 5, 5, 0); - __visc__bindIn(stencil_node, 6, 6, 0); - __visc__bindIn(stencil_node, 7, 7, 0); - __visc__bindIn(stencil_node, 8, 8, 0); - __visc__bindIn(stencil_node, 9, 9, 0); - __visc__bindIn(stencil_node, 10, 10, 0); - __visc__bindIn(stencil_node, 11, 11, 0); - __visc__bindIn(stencil_node, 12, 12, 0); - __visc__bindIn(stencil_node, 13, 13, 0); - __visc__bindIn(stencil_node, 14, 14, 0); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, A0, Anext, 1, Anext); + void *stencil_node = __hpvm__createNodeND(0, stencilLvl2); + __hpvm__bindIn(stencil_node, 0, 0, 0); + __hpvm__bindIn(stencil_node, 1, 1, 0); + __hpvm__bindIn(stencil_node, 2, 2, 0); + __hpvm__bindIn(stencil_node, 3, 3, 0); + __hpvm__bindIn(stencil_node, 4, 4, 0); + __hpvm__bindIn(stencil_node, 5, 5, 0); + __hpvm__bindIn(stencil_node, 6, 6, 0); + __hpvm__bindIn(stencil_node, 7, 7, 0); + __hpvm__bindIn(stencil_node, 8, 8, 0); + __hpvm__bindIn(stencil_node, 9, 9, 0); + 
__hpvm__bindIn(stencil_node, 10, 10, 0); + __hpvm__bindIn(stencil_node, 11, 11, 0); + __hpvm__bindIn(stencil_node, 12, 12, 0); + __hpvm__bindIn(stencil_node, 13, 13, 0); + __hpvm__bindIn(stencil_node, 14, 14, 0); } int main(int argc, char **argv) { @@ -195,11 +195,11 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(h_A0, sizeof(float) * size); - llvm_visc_track_mem(h_Anext, sizeof(float) * size); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); + llvm_hpvm_track_mem(h_A0, sizeof(float) * size); + llvm_hpvm_track_mem(h_Anext, sizeof(float) * size); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -241,9 +241,9 @@ int main(int argc, char **argv) { grid[1] / block[1], grid[2] / block[2]}; *(RootIn *)root_in = root_in_local; - void *stencilDFG = __visc__launch(0, stencilLvl3, root_in); + void *stencilDFG = __hpvm__launch(0, stencilLvl3, root_in); - __visc__wait(stencilDFG); + __hpvm__wait(stencilDFG); // printf("iteration %d\n",t); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); float *h_temp = h_A0; @@ -255,19 +255,19 @@ int main(int argc, char **argv) { h_A0 = h_Anext; h_Anext = h_temp; pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(h_Anext, bytes); + llvm_hpvm_request_mem(h_Anext, bytes); printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]); printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(h_A0); - llvm_visc_untrack_mem(h_Anext); + llvm_hpvm_untrack_mem(h_A0); + llvm_hpvm_untrack_mem(h_Anext); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ diff --git 
a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c similarity index 90% rename from hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c index bb6e45c932a68d951f5559bd856017ecf71aade6..35c5ed960c2031b0b84124bbdd1aeb95042625ee 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c @@ -9,11 +9,11 @@ #include "common.h" #include "file.h" +#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <visc.h> static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { int s = 0; @@ -31,7 +31,7 @@ static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { void naive_kernel(float c0, float c1, float *A0, float *Anext, int nx, int ny, int nz) { - __visc__attributes(2, A0, Anext, 1, Anext); + __hpvm__attributes(2, A0, Anext, 1, Anext); int i = get_global_id(0) + 1; int j = get_global_id(1) + 1; int k = get_global_id(2) + 1; @@ -106,11 +106,11 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - __visc__init(); + __hpvm__init(); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(h_A0, sizeof(float) * size); - llvm_visc_track_mem(h_Anext, sizeof(float) * size); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); + llvm_hpvm_track_mem(h_A0, sizeof(float) * size); + llvm_hpvm_track_mem(h_Anext, sizeof(float) * size); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -133,11 +133,11 @@ int main(int argc, char **argv) { printf("A[125,1,1] = 
%f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]); for (t = 0; t < iteration; t++) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); - unsigned stencilDFG = __visc__node( + unsigned stencilDFG = __hpvm__node( naive_kernel, 2, 3, block[0], block[1], block[2], grid[0] / block[0], grid[1] / block[1], grid[2] / block[2], 9, (float)c0, (float)c1, h_A0, bytes, h_Anext, bytes, nx, ny, nz, 0); - __visc__wait(stencilDFG); + __hpvm__wait(stencilDFG); // printf("iteration %d\n",t); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); float *h_temp = h_A0; @@ -149,19 +149,19 @@ int main(int argc, char **argv) { h_A0 = h_Anext; h_Anext = h_temp; pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(h_Anext, bytes); + llvm_hpvm_request_mem(h_Anext, bytes); printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]); printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(h_A0); - llvm_visc_untrack_mem(h_Anext); + llvm_hpvm_untrack_mem(h_A0); + llvm_hpvm_untrack_mem(h_Anext); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c index ec47c22227648df094cbf03ea1b667943207207e..1157b6198888a547a7d9c29b6f17970410ddb865 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) { CHECK_ERROR("clSetKernelArg") // main execution - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); int t; for (t = 0; t < iteration; t++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, 
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c index 61382182d1c8b406a2e2ba9dee250327914dbac4..70a86245b75e98e93607d135949af5637c8ab32f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -182,7 +182,7 @@ int main(int argc, char **argv) { // main execution /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 10; i++) { int t; for (t = 0; t < iteration; t++) { @@ -219,7 +219,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + 
pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c index 217352e036b0d03bcc578286fd62c4339dedfe94..3a5dfa3b3a5d00395e01e71a54e71154a34f02c3 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -182,7 +182,7 @@ int main(int argc, char **argv) { // main execution /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { int t; for (t = 0; t < iteration; t++) { @@ -219,7 +219,7 @@ int main(int argc, 
char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c index 28c0e5fd7bf24ac79857b3488dc28f12b3c354df..264cec20a92a1ce6a6b5f821773a65ca727ecba9 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -195,7 +195,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); 
clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -213,7 +213,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c index f767f6a9d29094623296e012a6b2671954b0546a..7b5db72237cadd39a3b560f26dc5c65e58f8f6f9 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, 
sizeof(cl_mem), (void *)&d_A0); @@ -195,7 +195,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -213,7 +213,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c index 10626bed59111d3ded3429626463966914218a5c..51c263f0efaa2ef561d471af396530f7f6113d94 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -145,7 +145,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -175,7 +175,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 
0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -200,7 +200,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -218,7 +218,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c index 1d03111f209173dfc2462cb274e1bb0ac56e9c8c..a2a98e923364de634a4ba3e3cc6db2ce23203d7b 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); cl_int clStatus; cl_uint numPlatforms; @@ -184,7 +184,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int t; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); /*for(int i=0; i<1; i++) {*/ for (t = 0; t < iteration; t++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, @@ -216,7 +216,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, 
NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll index 9ea545c1841fcf2afa6dab59a6fd695aa25d0188..a288b7649ac6bb5c9a1cc90abea8e40bfe069c17 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' +; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c index cf86734a8639ce38eb2b1ac8280582e7bde4531c..9fc78af4b9a911fd0ef857209e04f13d3c931171 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -145,7 +145,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - 
pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -177,7 +177,7 @@ int main(int argc, char **argv) { printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -190,7 +190,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int t; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 2; i++) { for (t = 0; t < iteration; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -204,7 +204,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ + /*pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);*/ clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ @@ -226,7 +226,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll index 
9ea545c1841fcf2afa6dab59a6fd695aa25d0188..a288b7649ac6bb5c9a1cc90abea8e40bfe069c17 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' +; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c index 3b009e370e284a5b5b705bcc3a8122547a83c177..a1e1c4e74ebd305236e9b2c5e27eda6eca3457c7 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -145,7 +145,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -177,7 +177,7 @@ int main(int argc, char **argv) { printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = 
clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -190,7 +190,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int t; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { for (t = 0; t < iteration; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -204,7 +204,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ + /*pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);*/ clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ @@ -226,7 +226,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll deleted file mode 100644 index 7dc32f37603e16c20a72d6a4b4b808c7b38afb79..0000000000000000000000000000000000000000 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll +++ /dev/null @@ -1,673 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s -; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc -; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; 
RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll parboil.ll -S -o %t.linked.ll -; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin -; ModuleID = 'build/visc_vec_default/stencil.ll' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%rtype = type {} -%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } -%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } -%struct.pb_Timer = type { i32, i64, i64 } -%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } -%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } -%struct.pb_Parameters = type { i8*, i8** } -%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } -%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -%struct.arg = type <{ float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, %rtype }> - -@.str3 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 -@.str4 = private unnamed_addr constant [37 x i8] c"grid(%d, %d, %d), block(%d, %d, %d)\0A\00", align 1 -@.str5 = private unnamed_addr constant [17 x i8] c"A[126,1,1] = %f\0A\00", align 1 -@.str6 = private unnamed_addr constant [17 x i8] c"A[125,1,1] = %f\0A\00", align 1 -@str = private unnamed_addr constant [46 x i8] c"OpenCL accelerated 7 points stencil codes****\00" -@str7 = private unnamed_addr constant [45 x i8] c"Author: Li-Wen Chang <lchang20@illinois.edu>\00" -@str8 = private unnamed_addr constant [106 x i8] c"Usage: probe nx ny nz t\0Anx: the grid size x\0Any: the grid size y\0Anz: the grid size 
z\0At: the iteration time\00" -@viscTimerSet_GenVISC = common global i8* null -@0 = internal constant [14 x i8] c"GenVISC_Timer\00" - -; Function Attrs: nounwind uwtable -define %rtype @naive_kernel(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz) #0 { -entry: - %naive_kernel.node = call i8* @llvm.visc.getNode() - %naive_kernel.parentNode = call i8* @llvm.visc.getParentNode(i8* %naive_kernel.node) - %a0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %naive_kernel.parentNode) - %a1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %naive_kernel.node) - %a2 = mul i32 %a0, %a1 - %a3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %naive_kernel.node) - %a4 = add i32 %a2, %a3 - ;%add = add nsw i32 %4, 1 - %a5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %naive_kernel.parentNode) - %a6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %naive_kernel.node) - %a7 = mul i32 %a5, %a6 - %a8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %naive_kernel.node) - %a9 = add i32 %a7, %a8 - ;%add3 = add nsw i32 %9, 1 - %a10 = call i32 @llvm.visc.getNodeInstanceID.z(i8* %naive_kernel.parentNode) - %a11 = call i32 @llvm.visc.getNumNodeInstances.z(i8* %naive_kernel.node) - %a12 = mul i32 %a10, %a11 - %a13 = call i32 @llvm.visc.getNodeInstanceID.z(i8* %naive_kernel.node) - %a14 = add i32 %a12, %a13 - ;%sub = add nsw i32 %nx, -1 - ;%cmp = icmp slt i32 %add, %sub - ;br i1 %cmp, label %if.then, label %if.end - - - ;%call = tail call i32 @get_global_id(i32 0) #2 - ;%mul = shl i32 %call, 2 - %mul = shl i32 %a4, 2 - %add258 = or i32 %mul, 1 - ;%call1 = tail call i32 @get_global_id(i32 1) #2 - ;%add2 = add i32 %call1, 1 - %add2 = add i32 %a9, 1 - ;%call3 = tail call i32 @get_global_id(i32 2) #2 - ;%add4 = add i32 %call3, 1 - %add4 = add i32 %a14, 1 - %sub = add i32 %add258, 3 - %sub6 = add i32 %nx, -1 - %cmp = icmp slt i32 %sub, %sub6 - br i1 %cmp, label %if.then, label %if.else - -if.then: ; preds = %entry - %mul7 = mul nsw i32 
%add4, %ny - %add8 = add nsw i32 %mul7, %add2 - %mul9 = mul nsw i32 %add8, %nx - %add11 = add i32 %sub, %mul9 - %add.ptr = getelementptr inbounds float* %A0, i32 %add11 - ;%call12 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr) #2 - %vadd.ptr = bitcast float* %add.ptr to <4 x float>* - %call12 = load <4 x float>* %vadd.ptr - - %add13 = add i32 %a14, 2 - %mul14 = mul nsw i32 %add13, %ny - %add15 = add nsw i32 %mul14, %add2 - %mul16 = mul nsw i32 %add15, %nx - %add18 = add i32 %sub, %mul16 - %add.ptr19 = getelementptr inbounds float* %A0, i32 %add18 - ;%call20 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr19) #2 - %vadd.ptr19 = bitcast float* %add.ptr19 to <4 x float>* - %call20 = load <4 x float>* %vadd.ptr19 - - %mul22 = mul nsw i32 %a14, %ny - %add23 = add nsw i32 %mul22, %add2 - %mul24 = mul nsw i32 %add23, %nx - %add26 = add i32 %sub, %mul24 - %add.ptr27 = getelementptr inbounds float* %A0, i32 %add26 - ;%call28 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr27) #2 - %vadd.ptr27 = bitcast float* %add.ptr27 to <4 x float>* - %call28 = load <4 x float>* %vadd.ptr27 - - %add29 = add i32 %a9, 2 - %add31 = add nsw i32 %add29, %mul7 - %mul32 = mul nsw i32 %add31, %nx - %add34 = add i32 %sub, %mul32 - %add.ptr35 = getelementptr inbounds float* %A0, i32 %add34 - ;%call36 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr35) #2 - %vadd.ptr35 = bitcast float* %add.ptr35 to <4 x float>* - %call36 = load <4 x float>* %vadd.ptr35 - - %add39 = add nsw i32 %mul7, %a9 - %mul40 = mul nsw i32 %add39, %nx - %add42 = add i32 %sub, %mul40 - %add.ptr43 = getelementptr inbounds float* %A0, i32 %add42 - ;%call44 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr43) #2 - %vadd.ptr43 = bitcast float* %add.ptr43 to <4 x float>* - %call44 = load <4 x float>* %vadd.ptr43 - - %add49 = add i32 %add258, 4 - %add50 = add i32 %add49, %mul9 - %arrayidx = getelementptr inbounds float* %A0, i32 %add50 - %0 = 
load float* %arrayidx, align 4, !tbaa !2 - %add55261 = or i32 %mul, 3 - %add56 = add i32 %add55261, %mul9 - %arrayidx57 = getelementptr inbounds float* %A0, i32 %add56 - %1 = load float* %arrayidx57, align 4, !tbaa !2 - %2 = shufflevector <4 x float> %call12, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3> - %vext = shufflevector <3 x float> %2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> - %vecinit58 = insertelement <4 x float> %vext, float %0, i32 3 - %vecinit60 = insertelement <4 x float> undef, float %1, i32 0 - %vecinit62 = shufflevector <4 x float> %vecinit60, <4 x float> %call12, <4 x i32> <i32 0, i32 4, i32 5, i32 6> - %splat.splatinsert = insertelement <4 x float> undef, float %c1, i32 0 - %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer - %add63 = fadd fast <4 x float> %call20, %call28 - %add64 = fadd fast <4 x float> %add63, %call36 - %add65 = fadd fast <4 x float> %add64, %call44 - %add66 = fadd fast <4 x float> %add65, %vecinit58 - %add67 = fadd fast <4 x float> %add66, %vecinit62 - %mul68 = fmul fast <4 x float> %splat.splat, %add67 - %splat.splatinsert69 = insertelement <4 x float> undef, float %c0, i32 0 - %splat.splat70 = shufflevector <4 x float> %splat.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer - %mul71 = fmul fast <4 x float> %splat.splat70, %call12 - %sub72 = fsub fast <4 x float> %mul68, %mul71 - %add.ptr78 = getelementptr inbounds float* %Anext, i32 %add11 - ;tail call void @_Z7vstore4Dv4_fjPU3AS1f(<4 x float> %sub72, i32 0, float* %add.ptr78) #2 - %vadd.ptr78 = bitcast float* %add.ptr78 to <4 x float>* - store <4 x float> %sub72, <4 x float>* %vadd.ptr78 - - br label %if.end146 - -if.else: ; preds = %entry - %cmp80 = icmp slt i32 %add258, %sub6 - br i1 %cmp80, label %for.body.lr.ph, label %if.end146 - -for.body.lr.ph: ; preds = %if.else - %add84 = add i32 %a14, 2 - %mul85 = mul nsw i32 %add84, %ny - %add86 = add nsw i32 %mul85, %add2 - %mul87 = mul 
nsw i32 %add86, %nx - %add88 = add i32 %mul87, 3 - %mul92 = mul nsw i32 %a14, %ny - %add93 = add nsw i32 %mul92, %add2 - %mul94 = mul nsw i32 %add93, %nx - %add95 = add i32 %mul94, 3 - %add99 = add i32 %a9, 2 - %mul100 = mul nsw i32 %add4, %ny - %add101 = add nsw i32 %add99, %mul100 - %mul102 = mul nsw i32 %add101, %nx - %add103 = add i32 %mul102, 3 - %add109 = add nsw i32 %mul100, %a9 - %mul110 = mul nsw i32 %add109, %nx - %add111 = add i32 %mul110, 3 - %add117 = add nsw i32 %mul100, %add2 - %mul118 = mul nsw i32 %add117, %nx - %add119 = add i32 %mul118, 3 - %add127 = add i32 %mul118, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %vid.0260 = phi i32 [ %add258, %for.body.lr.ph ], [ %add115, %for.body ] - %add89 = add i32 %add88, %vid.0260 - %arrayidx90 = getelementptr inbounds float* %A0, i32 %add89 - %3 = load float* %arrayidx90, align 4, !tbaa !2 - %add96 = add i32 %add95, %vid.0260 - %arrayidx97 = getelementptr inbounds float* %A0, i32 %add96 - %4 = load float* %arrayidx97, align 4, !tbaa !2 - %add98 = fadd fast float %3, %4 - %add104 = add i32 %add103, %vid.0260 - %arrayidx105 = getelementptr inbounds float* %A0, i32 %add104 - %5 = load float* %arrayidx105, align 4, !tbaa !2 - %add106 = fadd fast float %add98, %5 - %add112 = add i32 %add111, %vid.0260 - %arrayidx113 = getelementptr inbounds float* %A0, i32 %add112 - %6 = load float* %arrayidx113, align 4, !tbaa !2 - %add114 = fadd fast float %add106, %6 - %add115 = add nsw i32 %vid.0260, 1 - %add120 = add i32 %add119, %add115 - %arrayidx121 = getelementptr inbounds float* %A0, i32 %add120 - %7 = load float* %arrayidx121, align 4, !tbaa !2 - %add122 = fadd fast float %add114, %7 - %add128 = add i32 %add127, %vid.0260 - %arrayidx129 = getelementptr inbounds float* %A0, i32 %add128 - %8 = load float* %arrayidx129, align 4, !tbaa !2 - %add130 = fadd fast float %add122, %8 - %mul131 = fmul fast float %add130, %c1 - %add136 = add i32 %add119, %vid.0260 - %arrayidx137 = getelementptr 
inbounds float* %A0, i32 %add136 - %9 = load float* %arrayidx137, align 4, !tbaa !2 - %mul138 = fmul fast float %9, %c0 - %sub139 = fsub fast float %mul131, %mul138 - %arrayidx145 = getelementptr inbounds float* %Anext, i32 %add136 - store float %sub139, float* %arrayidx145, align 4, !tbaa !2 - %exitcond = icmp eq i32 %add115, %sub6 - br i1 %exitcond, label %if.end146, label %for.body - -if.end146: ; preds = %for.body, %if.else, %if.then - ;ret void - - - - - -;if.then: ; preds = %entry - ;%add5 = add nsw i32 %14, 1 - ;%add6 = add nsw i32 %14, 2 - ;%mul = mul nsw i32 %add6, %ny - ;%add7 = add nsw i32 %mul, %add3 - ;%mul8 = mul nsw i32 %add7, %nx - ;%add9 = add i32 %4, 4 - ;%add10 = add i32 %add9, %mul8 - ;%idxprom = sext i32 %add10 to i64 - ;%arrayidx = getelementptr inbounds float* %A0, i64 %idxprom - ;%15 = load float* %arrayidx, align 4, !tbaa !2 - ;%mul12 = mul nsw i32 %14, %ny - ;%add13 = add nsw i32 %mul12, %add3 - ;%mul14 = mul nsw i32 %add13, %nx - ;%add16 = add i32 %add9, %mul14 - ;%idxprom17 = sext i32 %add16 to i64 - ;%arrayidx18 = getelementptr inbounds float* %A0, i64 %idxprom17 - ;%16 = load float* %arrayidx18, align 4, !tbaa !2 - ;%add19 = fadd fast float %15, %16 - ;%add20 = add nsw i32 %9, 2 - ;%mul21 = mul nsw i32 %add5, %ny - ;%add22 = add nsw i32 %add20, %mul21 - ;%mul23 = mul nsw i32 %add22, %nx - ;%add25 = add i32 %add9, %mul23 - ;%idxprom26 = sext i32 %add25 to i64 - ;%arrayidx27 = getelementptr inbounds float* %A0, i64 %idxprom26 - ;%17 = load float* %arrayidx27, align 4, !tbaa !2 - ;%add28 = fadd fast float %add19, %17 - ;%add31 = add nsw i32 %mul21, %9 - ;%mul32 = mul nsw i32 %add31, %nx - ;%add34 = add i32 %add9, %mul32 - ;%idxprom35 = sext i32 %add34 to i64 - ;%arrayidx36 = getelementptr inbounds float* %A0, i64 %idxprom35 - ;%18 = load float* %arrayidx36, align 4, !tbaa !2 - ;%add37 = fadd fast float %add28, %18 - ;%add40 = add nsw i32 %mul21, %add3 - ;%mul41 = mul nsw i32 %add40, %nx - ;%add42 = add i32 %4, 5 - ;%add43 = add i32 
%add42, %mul41 - ;%idxprom44 = sext i32 %add43 to i64 - ;%arrayidx45 = getelementptr inbounds float* %A0, i64 %idxprom44 - ;%19 = load float* %arrayidx45, align 4, !tbaa !2 - ;%add46 = fadd fast float %add37, %19 - ;%add51 = add i32 %4, 3 - ;%add52 = add i32 %add51, %mul41 - ;%idxprom53 = sext i32 %add52 to i64 - ;%arrayidx54 = getelementptr inbounds float* %A0, i64 %idxprom53 - ;%20 = load float* %arrayidx54, align 4, !tbaa !2 - ;%add55 = fadd fast float %add46, %20 - ;%mul56 = fmul fast float %add55, %c1 - ;%add61 = add i32 %add9, %mul41 - ;%idxprom62 = sext i32 %add61 to i64 - ;%arrayidx63 = getelementptr inbounds float* %A0, i64 %idxprom62 - ;%21 = load float* %arrayidx63, align 4, !tbaa !2 - ;%mul64 = fmul fast float %21, %c0 - ;%sub65 = fsub fast float %mul56, %mul64 - ;%arrayidx72 = getelementptr inbounds float* %Anext, i64 %idxprom62 - ;store float %sub65, float* %arrayidx72, align 4, !tbaa !2 - ;br label %if.end - -;if.end: ; preds = %if.then, %entry - ret %rtype undef -} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** %argv) #0 { -entry: - %argc.addr = alloca i32, align 4 - %timers = alloca %struct.pb_TimerSet, align 8 - store i32 %argc, i32* %argc.addr, align 4, !tbaa !5 - %0 = bitcast %struct.pb_TimerSet* %timers to i8* - call void @llvm.lifetime.start(i64 800, i8* %0) #1 - %puts = call i32 @puts(i8* getelementptr inbounds ([46 x i8]* @str, i64 0, i64 0)) - %puts186 = call i32 @puts(i8* getelementptr inbounds ([45 x i8]* @str7, i64 0, i64 0)) - %call2 = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1 - %1 = load i32* %argc.addr, align 4, !tbaa !5 - %cmp = icmp slt i32 %1, 5 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - %puts187 = call i32 @puts(i8* getelementptr inbounds ([106 x i8]* @str8, i64 0, i64 0)) - br label %cleanup - -if.end: ; preds = %entry - %arrayidx = getelementptr inbounds i8** %argv, i64 1 - %2 = load i8** %arrayidx, align 8, !tbaa !6 - %call.i = call 
i64 @strtol(i8* nocapture %2, i8** null, i32 10) #1 - %conv.i = trunc i64 %call.i to i32 - %cmp5 = icmp slt i32 %conv.i, 1 - br i1 %cmp5, label %cleanup, label %if.end7 - -if.end7: ; preds = %if.end - %arrayidx8 = getelementptr inbounds i8** %argv, i64 2 - %3 = load i8** %arrayidx8, align 8, !tbaa !6 - %call.i188 = call i64 @strtol(i8* nocapture %3, i8** null, i32 10) #1 - %conv.i189 = trunc i64 %call.i188 to i32 - %cmp10 = icmp slt i32 %conv.i189, 1 - br i1 %cmp10, label %cleanup, label %if.end12 - -if.end12: ; preds = %if.end7 - %arrayidx13 = getelementptr inbounds i8** %argv, i64 3 - %4 = load i8** %arrayidx13, align 8, !tbaa !6 - %call.i190 = call i64 @strtol(i8* nocapture %4, i8** null, i32 10) #1 - %conv.i191 = trunc i64 %call.i190 to i32 - %cmp15 = icmp slt i32 %conv.i191, 1 - br i1 %cmp15, label %cleanup, label %if.end17 - -if.end17: ; preds = %if.end12 - %arrayidx18 = getelementptr inbounds i8** %argv, i64 4 - %5 = load i8** %arrayidx18, align 8, !tbaa !6 - %call.i192 = call i64 @strtol(i8* nocapture %5, i8** null, i32 10) #1 - %conv.i193 = trunc i64 %call.i192 to i32 - %cmp20 = icmp slt i32 %conv.i193, 1 - br i1 %cmp20, label %cleanup, label %for.cond1.preheader.lr.ph.i - -for.cond1.preheader.lr.ph.i: ; preds = %if.end17 - %mul = shl i64 %call.i, 32 - %mul23 = mul i64 %mul, %call.i188 - %sext = mul i64 %mul23, %call.i190 - %add = ashr exact i64 %sext, 30 - %mul24 = add i64 %add, 12 - %call25 = call noalias i8* @malloc(i64 %mul24) #1 - %6 = bitcast i8* %call25 to float* - %call27 = call noalias i8* @malloc(i64 %mul24) #1 - %7 = bitcast i8* %call27 to float* - %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call2, i64 0, i32 1 - %8 = load i8*** %inpFiles, align 8, !tbaa !6 - %9 = load i8** %8, align 8, !tbaa !6 - %call29 = call %struct._IO_FILE* @fopen(i8* %9, i8* getelementptr inbounds ([3 x i8]* @.str3, i64 0, i64 0)) #1 - %add.ptr = getelementptr inbounds i8* %call25, i64 12 - %10 = bitcast i8* %add.ptr to float* - %cmp24.i = icmp sgt i32 
%conv.i189, 0 - %cmp51.i = icmp sgt i32 %conv.i, 0 - %or.cond = and i1 %cmp24.i, %cmp51.i - br i1 %or.cond, label %for.cond4.preheader.lr.ph.us.i.preheader.split.us, label %read_data.exit - -for.cond4.preheader.lr.ph.us.i.preheader.split.us: ; preds = %for.cond1.preheader.lr.ph.i - %11 = mul i32 %conv.i, %conv.i189 - br label %for.body6.lr.ph.us.us.i.preheader.us - -for.body6.lr.ph.us.us.i.us: ; preds = %for.body6.lr.ph.us.us.i.preheader.us, %for.inc8.us.us.i.us - %j.06.us.us.i.us = phi i32 [ %inc9.us.us.i.us, %for.inc8.us.us.i.us ], [ 0, %for.body6.lr.ph.us.us.i.preheader.us ] - %s.15.us.us.i.us = phi i32 [ %14, %for.inc8.us.us.i.us ], [ %s.09.us.i.us, %for.body6.lr.ph.us.us.i.preheader.us ] - %12 = sext i32 %s.15.us.us.i.us to i64 - br label %for.body6.us.us.i.us - -for.body6.us.us.i.us: ; preds = %for.body6.us.us.i.us, %for.body6.lr.ph.us.us.i.us - %indvars.iv.i.us = phi i64 [ %indvars.iv.next.i.us, %for.body6.us.us.i.us ], [ %12, %for.body6.lr.ph.us.us.i.us ] - %k.03.us.us.i.us = phi i32 [ %inc7.us.us.i.us, %for.body6.us.us.i.us ], [ 0, %for.body6.lr.ph.us.us.i.us ] - %add.ptr.us.us.i.us = getelementptr inbounds float* %10, i64 %indvars.iv.i.us - %13 = bitcast float* %add.ptr.us.us.i.us to i8* - %call.us.us.i.us = call i64 @fread(i8* %13, i64 4, i64 1, %struct._IO_FILE* %call29) #1 - %indvars.iv.next.i.us = add i64 %indvars.iv.i.us, 1 - %inc7.us.us.i.us = add nsw i32 %k.03.us.us.i.us, 1 - %exitcond.i.us = icmp eq i32 %inc7.us.us.i.us, %conv.i - br i1 %exitcond.i.us, label %for.inc8.us.us.i.us, label %for.body6.us.us.i.us - -for.inc8.us.us.i.us: ; preds = %for.body6.us.us.i.us - %14 = add i32 %s.15.us.us.i.us, %conv.i - %inc9.us.us.i.us = add nsw i32 %j.06.us.us.i.us, 1 - %exitcond33.i.us = icmp eq i32 %inc9.us.us.i.us, %conv.i189 - br i1 %exitcond33.i.us, label %for.inc11.us.i.us, label %for.body6.lr.ph.us.us.i.us - -for.inc11.us.i.us: ; preds = %for.inc8.us.us.i.us - %15 = add i32 %11, %s.09.us.i.us - %inc12.us.i.us = add nsw i32 %i.010.us.i.us, 1 - 
%exitcond34.i.us = icmp eq i32 %inc12.us.i.us, %conv.i191 - br i1 %exitcond34.i.us, label %read_data.exit, label %for.body6.lr.ph.us.us.i.preheader.us - -for.body6.lr.ph.us.us.i.preheader.us: ; preds = %for.inc11.us.i.us, %for.cond4.preheader.lr.ph.us.i.preheader.split.us - %i.010.us.i.us = phi i32 [ %inc12.us.i.us, %for.inc11.us.i.us ], [ 0, %for.cond4.preheader.lr.ph.us.i.preheader.split.us ] - %s.09.us.i.us = phi i32 [ %15, %for.inc11.us.i.us ], [ 0, %for.cond4.preheader.lr.ph.us.i.preheader.split.us ] - br label %for.body6.lr.ph.us.us.i.us - -read_data.exit: ; preds = %for.inc11.us.i.us, %for.cond1.preheader.lr.ph.i - %call31 = call i32 @fclose(%struct._IO_FILE* %call29) #1 - call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - %16 = call i8* @llvm_visc_initializeTimerSet() - store i8* %16, i8** @viscTimerSet_GenVISC - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - call void @llvm.visc.init() - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 15) #1 - call void @llvm_visc_track_mem(i8* %call25, i64 %mul24) #1 - call void @llvm_visc_track_mem(i8* %call27, i64 %mul24) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call27, i8* %call25, i64 %mul24, i32 4, i1 false) - %sub40 = add nsw i32 %conv.i, 253 - %div = sdiv i32 %sub40, 256 - %mul42 = shl nsw i32 %div, 6 - %sub44 = add nsw i32 %conv.i189, -2 - %sub46 = add nsw i32 %conv.i191, -2 - %call53 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([37 x i8]* @.str4, i64 0, i64 0), i32 %mul42, i32 %sub44, i32 %sub46, i32 64, i32 1, i32 1) #1 - %add56 = add nsw i32 %conv.i189, 1 - %mul57 = mul nsw i32 %add56, %conv.i - %add59 = add nsw i32 %mul57, 129 - %idxprom = sext i32 %add59 to i64 - %arrayidx60 = getelementptr inbounds float* %6, i64 %idxprom - %17 = load float* %arrayidx60, align 4, !tbaa !2 - %conv61 = fpext float %17 to double - %call62 = call i32 (i8*, ...)* @printf(i8* 
getelementptr inbounds ([17 x i8]* @.str5, i64 0, i64 0), double %conv61) #1 - %add67 = add nsw i32 %mul57, 128 - %idxprom68 = sext i32 %add67 to i64 - %arrayidx69 = getelementptr inbounds float* %6, i64 %idxprom68 - %18 = load float* %arrayidx69, align 4, !tbaa !2 - %conv70 = fpext float %18 to double - %call71 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str6, i64 0, i64 0), double %conv70) #1 - %cmp72194 = icmp sgt i32 %conv.i193, 0 - br i1 %cmp72194, label %for.body, label %for.end - -for.body: ; preds = %for.body, %read_data.exit - %h_A0.0197 = phi float* [ %h_Anext.0196, %for.body ], [ %6, %read_data.exit ] - %h_Anext.0196 = phi float* [ %h_A0.0197, %for.body ], [ %7, %read_data.exit ] - %t.0195 = phi i32 [ %inc, %for.body ], [ 0, %read_data.exit ] - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) - %in.addr = alloca %struct.arg - %in.addr.c0 = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.c0.cast = fptrunc double 0x3FC5555560000000 to float - store float %in.addr.c0.cast, float* %in.addr.c0 - %in.addr.c1 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.c1.cast = fptrunc double 0x3F9C71C720000000 to float - store float %in.addr.c1.cast, float* %in.addr.c1 - %in.addr.A0 = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - store float* %h_A0.0197, float** %in.addr.A0 - %in.addr.bytes_A0 = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - store i64 %mul24, i64* %in.addr.bytes_A0 - %in.addr.Anext = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - store float* %h_Anext.0196, float** %in.addr.Anext - %in.addr.bytes_Anext = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - store i64 %mul24, i64* %in.addr.bytes_Anext - %in.addr.nx = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - store i32 %conv.i, i32* %in.addr.nx - %in.addr.ny = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - store i32 %conv.i189, i32* 
%in.addr.ny - %in.addr.nz = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - store i32 %conv.i191, i32* %in.addr.nz - %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - store i32 64, i32* %in.addr.dimX0 - %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 10 - store i32 1, i32* %in.addr.dimY0 - %in.addr.dimZ1 = getelementptr %struct.arg* %in.addr, i32 0, i32 11 - store i32 1, i32* %in.addr.dimZ1 - %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 - store i32 %div, i32* %in.addr.dimX1 - %in.addr.dimY2 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 - store i32 %sub44, i32* %in.addr.dimY2 - %in.addr.dimZ2 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 - store i32 %sub46, i32* %in.addr.dimZ2 - %args = bitcast %struct.arg* %in.addr to i8* - call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) - %graphnaive_kernelInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level2 to i8*), i8* %args) - call void @llvm.visc.wait(i8* %graphnaive_kernelInternal_level2) - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 - %inc = add nsw i32 %t.0195, 1 - %exitcond = icmp eq i32 %inc, %conv.i193 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %read_data.exit - %h_A0.0.lcssa = phi float* [ %6, %read_data.exit ], [ %h_Anext.0196, %for.body ] - %h_Anext.0.lcssa = phi float* [ %7, %read_data.exit ], [ %h_A0.0197, %for.body ] - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 - %19 = bitcast float* %h_A0.0.lcssa to i8* - call void @llvm_visc_request_mem(i8* %19, i64 %mul24) #1 - %arrayidx97 = getelementptr inbounds float* %h_A0.0.lcssa, i64 %idxprom - %20 = load float* %arrayidx97, align 4, !tbaa !2 - %conv98 = fpext float %20 to double - %call99 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* 
@.str5, i64 0, i64 0), double %conv98) #1 - %arrayidx106 = getelementptr inbounds float* %h_A0.0.lcssa, i64 %idxprom68 - %21 = load float* %arrayidx106, align 4, !tbaa !2 - %conv107 = fpext float %21 to double - %call108 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str6, i64 0, i64 0), double %conv107) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 - %22 = bitcast float* %h_Anext.0.lcssa to i8* - call void @llvm_visc_untrack_mem(i8* %22) #1 - call void @llvm_visc_untrack_mem(i8* %19) #1 - call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 - call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 - %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 - call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) - call void @llvm.visc.cleanup() - %outFile = getelementptr inbounds %struct.pb_Parameters* %call2, i64 0, i32 0 - %23 = load i8** %outFile, align 8, !tbaa !6 - %tobool = icmp eq i8* %23, null - br i1 %tobool, label %if.end113, label %if.then110 - -if.then110: ; preds = %for.end - %add.ptr112 = getelementptr inbounds float* %h_A0.0.lcssa, i64 3 - call void @outputData(i8* %23, float* %add.ptr112, i32 %conv.i, i32 %conv.i189, i32 %conv.i191) #1 - br label %if.end113 - -if.end113: ; preds = %if.then110, %for.end - call void @free(i8* %22) #1 - call void @free(i8* %19) #1 - call void @pb_FreeParameters(%struct.pb_Parameters* %call2) #1 - br label %cleanup - -cleanup: ; preds = %if.end113, %if.end17, %if.end12, %if.end7, %if.end, %if.then - %retval.0 = phi i32 [ -1, %if.then ], [ 0, %if.end113 ], [ -1, %if.end ], [ -1, %if.end7 ], [ -1, %if.end12 ], [ -1, %if.end17 ] - call void @llvm.lifetime.end(i64 800, i8* %0) #1 - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) 
#2 - -declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #3 - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #2 - -; Function Attrs: nounwind -declare noalias %struct._IO_FILE* @fopen(i8* nocapture, i8* nocapture) #2 - -; Function Attrs: nounwind -declare i32 @fclose(%struct._IO_FILE* nocapture) #2 - -declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #3 - -declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #3 - -declare void @llvm_visc_track_mem(i8*, i64) #3 - -; Function Attrs: nounwind -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #1 - -declare void @llvm_visc_request_mem(i8*, i64) #3 - -declare void @llvm_visc_untrack_mem(i8*) #3 - -declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #3 - -declare void @outputData(i8*, float*, i32, i32, i32) #3 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #2 - -declare void @pb_FreeParameters(%struct.pb_Parameters*) #3 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 - -; Function Attrs: nounwind -declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2 - -; Function Attrs: nounwind -declare i64 @strtol(i8*, i8** nocapture, i32) #2 - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #1 - -declare i8* @llvm_visc_initializeTimerSet() - -declare void @llvm_visc_switchToTimer(i8**, i32) - -declare void @llvm_visc_printTimerSet(i8**, i8*) - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1 
- -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.z(i8*) #1 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.z(i8*) #1 - -; Function Attrs: nounwind uwtable -define %rtype @naive_kernelInternal_level1(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz, i32 %dimX, i32 %dimY, i32 %dimZ) #0 { -entry: - %naive_kernel.node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32)* @naive_kernel to i8*), i32 %dimX, i32 %dimY, i32 %dimZ) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 6, i32 6) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 7, i32 7) - call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 8, i32 8) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) #1 - -; Function Attrs: nounwind uwtable -define %rtype @naive_kernelInternal_level2(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz, i32 %dimX, i32 %dimY, i32 %dimZ, i32 %dimX1, i32 %dimY2, i32 %dimZ3) #0 { -entry: - %naive_kernelInternal_level1.node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level1 to i8*), i32 %dimX1, i32 %dimY2, i32 %dimZ3) - call void 
@llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 1, i32 1) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 2, i32 2) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 3, i32 3) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 4, i32 4) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 5, i32 5) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 6, i32 6) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 7, i32 7) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 8, i32 8) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 9, i32 9) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 10, i32 10) - call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 11, i32 11) - ret %rtype undef -} - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } - 
-!visc_hint_gpu = !{} -!visc_hint_cpu = !{!0, !1} - -!0 = metadata !{%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level1} -!1 = metadata !{%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level2} -!2 = metadata !{metadata !"float", metadata !3} -!3 = metadata !{metadata !"omnipotent char", metadata !4} -!4 = metadata !{metadata !"Simple C/C++ TBAA"} -!5 = metadata !{metadata !"int", metadata !3} -!6 = metadata !{metadata !"any pointer", metadata !3} diff --git a/hpvm/test/parboil/benchmarks/tpacf/Makefile b/hpvm/test/parboil/benchmarks/tpacf/Makefile index 6140acd5ac3a196c8750b997c2e5904ba9585839..e76139ba384fed18f7487e723d0859e4e44075f6 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/Makefile +++ b/hpvm/test/parboil/benchmarks/tpacf/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = tpacf -# Default compile visc +# Default compile hpvm ifeq ($(VERSION),) - VERSION = visc + VERSION = hpvm endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile similarity index 82% rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile index ba6459d78a16e381f4f3b75ee026b380583f87c5..040e2c7994ff0c0ace28099f6f193a7cb7b3d272 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile +++ b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=args.ll model.ll -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.cc diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.h similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.h diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc similarity index 76% rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc index 3239be6c92f641422f2ba6910894ae68cc8b220e..49208f579c87545dcbfccc01fd054c20e1123d40 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc @@ -14,7 +14,7 @@ #include "args.h" #include "model.h" -#include <visc.h> +#include <hpvm.h> extern unsigned int NUM_SETS; extern unsigned int NUM_ELEMENTS; @@ -62,13 +62,13 @@ void packData(RootIn *args, hist_t *histograms, size_t bytes_histograms, void Allocation(long block) { // Memory shared between threadblocks - // void* data_s = __visc__malloc(sizeof(struct cartesian)*BLOCK_SIZE); + // void* data_s = __hpvm__malloc(sizeof(struct cartesian)*BLOCK_SIZE); void *warp_hists = - __visc__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); + __hpvm__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); - //__visc__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE, + //__hpvm__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE, // warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS); 
- __visc__return(2, warp_hists, + __hpvm__return(2, warp_hists, sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); } @@ -80,14 +80,14 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, // struct cartesian* data_s, size_t bytes_data_s, unsigned int *warp_hists, size_t bytes_warp_hists) { - __visc__hint(visc::DEVICE); - __visc__attributes(2, all_x_data, binb, 1, histograms); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(2, all_x_data, binb, 1, histograms); - void *thisNode = __visc__getNode(); - void *parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int dimx = __visc__getNumNodeInstances_x(thisNode); + void *thisNode = __hpvm__getNode(); + void *parentNode = __hpvm__getParentNode(thisNode); + int lx = __hpvm__getNodeInstanceID_x(thisNode); + int gx = __hpvm__getNodeInstanceID_x(parentNode); + int dimx = __hpvm__getNumNodeInstances_x(thisNode); float *all_y_data = all_x_data + NUM_ELEMENTS * (NUM_SETS + 1); float *all_z_data = all_y_data + NUM_ELEMENTS * (NUM_SETS + 1); @@ -170,7 +170,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, unsigned int warpnum = tid / (WARP_SIZE / HISTS_PER_WARP); if ((distance < binb[min]) && (distance >= binb[max]) && (!do_self || (tid + j > k)) && ((tid + j) < NUM_ELEMENTS)) { - __visc__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1); + __hpvm__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1); } } } @@ -181,7 +181,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, for (unsigned int offset = NUM_HISTOGRAMS >> 1; offset > 0; offset >>= 1) { for (unsigned int bin_base = 0; bin_base < NUM_BINS; bin_base += BLOCK_SIZE / (NUM_HISTOGRAMS >> 1)) { - __visc__barrier(); + __hpvm__barrier(); if (warp_index < offset && bin_base + bin_index < NUM_BINS) { unsigned long sum = warp_hists(bin_base + bin_index, warp_index) + @@ -191,7 +191,7 @@ void TPACFLeaf(hist_t *histograms, size_t 
bytes_histograms, } } - __visc__barrier(); + __hpvm__barrier(); // Put the results back in the real histogram // warp_hists(x, 0) holds sum of all locations of bin x @@ -207,26 +207,26 @@ void BlockingTPACF(hist_t *histograms, size_t bytes_histograms, float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, all_x_data, binb, 1, histograms); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, all_x_data, binb, 1, histograms); - void *AllocationNode = __visc__createNodeND(0, Allocation); - void *TPACFLeafNode = __visc__createNodeND(1, TPACFLeaf, block); + void *AllocationNode = __hpvm__createNodeND(0, Allocation); + void *TPACFLeafNode = __hpvm__createNodeND(1, TPACFLeaf, block); // Bind Inputs - __visc__bindIn(AllocationNode, 8, 0, 0); // Bind block - __visc__bindIn(TPACFLeafNode, 0, 0, 0); // Bind histograms - __visc__bindIn(TPACFLeafNode, 1, 1, 0); // Bind bytes_histograms - __visc__bindIn(TPACFLeafNode, 2, 2, 0); // Bind all_x_data - __visc__bindIn(TPACFLeafNode, 3, 3, 0); // Bind bytes_all_data - __visc__bindIn(TPACFLeafNode, 4, 4, 0); // Bind binb - __visc__bindIn(TPACFLeafNode, 5, 5, 0); // Bind bytes_binb - __visc__bindIn(TPACFLeafNode, 6, 6, 0); // Bind NUM_SETS - __visc__bindIn(TPACFLeafNode, 7, 7, 0); // Bind NUM_ELEMENTS + __hpvm__bindIn(AllocationNode, 8, 0, 0); // Bind block + __hpvm__bindIn(TPACFLeafNode, 0, 0, 0); // Bind histograms + __hpvm__bindIn(TPACFLeafNode, 1, 1, 0); // Bind bytes_histograms + __hpvm__bindIn(TPACFLeafNode, 2, 2, 0); // Bind all_x_data + __hpvm__bindIn(TPACFLeafNode, 3, 3, 0); // Bind bytes_all_data + __hpvm__bindIn(TPACFLeafNode, 4, 4, 0); // Bind binb + __hpvm__bindIn(TPACFLeafNode, 5, 5, 0); // Bind bytes_binb + __hpvm__bindIn(TPACFLeafNode, 6, 6, 0); // Bind NUM_SETS + __hpvm__bindIn(TPACFLeafNode, 7, 7, 0); // Bind NUM_ELEMENTS // Create Edges - __visc__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists - 
__visc__edge(AllocationNode, TPACFLeafNode, 1, 1, 9, + __hpvm__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists + __hpvm__edge(AllocationNode, TPACFLeafNode, 1, 1, 9, 0); // Edge bytes_warp_hists } @@ -236,21 +236,21 @@ void TPACFRoot(hist_t *histograms, size_t bytes_histograms, float *all_x_data, float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block, long grid) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, all_x_data, binb, 1, histograms); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, all_x_data, binb, 1, histograms); - void *BlockingTPACFNode = __visc__createNodeND(1, BlockingTPACF, grid); + void *BlockingTPACFNode = __hpvm__createNodeND(1, BlockingTPACF, grid); // Bind Inputs - __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms - __visc__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms - __visc__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data - __visc__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data - __visc__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb - __visc__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb - __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS - __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS - __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block + __hpvm__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms + __hpvm__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms + __hpvm__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data + __hpvm__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data + __hpvm__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb + __hpvm__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb + __hpvm__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS + __hpvm__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS + __hpvm__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block } void TPACFWrapper(hist_t *histograms, size_t bytes_histograms, @@ -258,22 
+258,22 @@ void TPACFWrapper(hist_t *histograms, size_t bytes_histograms, // next arg is read-only constant float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block, long grid) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, all_x_data, binb, 1, histograms); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, all_x_data, binb, 1, histograms); - void *BlockingTPACFNode = __visc__createNodeND(0, TPACFRoot); + void *BlockingTPACFNode = __hpvm__createNodeND(0, TPACFRoot); // Bind Inputs - __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms - __visc__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms - __visc__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data - __visc__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data - __visc__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb - __visc__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb - __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS - __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS - __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block - __visc__bindIn(BlockingTPACFNode, 9, 9, 0); // Bind grid + __hpvm__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms + __hpvm__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms + __hpvm__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data + __hpvm__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data + __hpvm__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb + __hpvm__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb + __hpvm__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS + __hpvm__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS + __hpvm__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block + __hpvm__bindIn(BlockingTPACFNode, 9, 9, 0); // Bind grid } // **===-----------------------------------------------------------===** @@ -324,14 +324,14 @@ int main(int argc, char **argv) { } pb_InitializeTimerSet(&timers); - __visc__init(); + 
__hpvm__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // split into x, y, and z arrays // AOS to SOA transformation size_t bytes_h_x_data = 3 * f_mem_size; float *h_x_data = (float *)malloc(bytes_h_x_data); - llvm_visc_track_mem(h_x_data, bytes_h_x_data); + llvm_hpvm_track_mem(h_x_data, bytes_h_x_data); float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1); float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1); @@ -349,12 +349,12 @@ int main(int argc, char **argv) { // allocate system memory for final histograms size_t bytes_hists = NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t); hist_t *hists = (hist_t *)malloc(bytes_hists); - llvm_visc_track_mem(hists, bytes_hists); + llvm_hpvm_track_mem(hists, bytes_hists); // Initialize the boundary constants for bin search size_t bytes_binb = (NUM_BINS + 1) * sizeof(float); float *binb = (float *)malloc(bytes_binb); - llvm_visc_track_mem(binb, bytes_binb); + llvm_hpvm_track_mem(binb, bytes_binb); for (int k = 0; k < NUM_BINS + 1; k++) { binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) / @@ -369,17 +369,17 @@ int main(int argc, char **argv) { RootIn *graph_args = (RootIn *)malloc(sizeof(RootIn)); packData(graph_args, hists, bytes_hists, h_x_data, bytes_h_x_data, binb, bytes_binb, NUM_SETS, NUM_ELEMENTS, block, grid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); - void *TPACF_DFG = __visc__launch(0, TPACFRoot, (void *)graph_args); - __visc__wait(TPACF_DFG); + void *TPACF_DFG = __hpvm__launch(0, TPACFRoot, (void *)graph_args); + __hpvm__wait(TPACF_DFG); pb_SwitchToTimer(&timers, pb_TimerID_COPY); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // **===-------------------------------------------------------------===** - llvm_visc_request_mem(hists, bytes_hists); + llvm_hpvm_request_mem(hists, bytes_hists); // references into output histograms hist_t *dd_hist = hists; hist_t *rr_hist = dd_hist + NUM_BINS; @@ -407,7 +407,7 @@ int 
main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __visc__cleanup(); + __hpvm__cleanup(); FILE *outfile; if ((outfile = fopen(params->outFile, "w")) == NULL) { diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.cc diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.h similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.h diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc index d945bccf4eae7f296394d74ac0617f3e20426dcd..d89d556a100157164445ec46f649828791edfd29 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc @@ -199,7 +199,7 @@ int main(int argc, char **argv) { 3 * f_mem_size, h_x_data, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc index 791b5fbdd6aa70359d37ca5a85139c7f8374c56d..ef2a21daed14a9ada398130c4cf4ac650621056e 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc @@ -203,7 +203,7 @@ int main(int argc, char **argv) { 3 * f_mem_size, h_x_data, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + 
pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); diff --git a/hpvm/test/parboil/common/include/parboil.h b/hpvm/test/parboil/common/include/parboil.h index 30ad6721c3190610dd08ec131603b6fe622f897e..ba25726c027a5c67283c68a703216ad7ee785ef5 100644 --- a/hpvm/test/parboil/common/include/parboil.h +++ b/hpvm/test/parboil/common/include/parboil.h @@ -102,23 +102,23 @@ enum pb_TimerID { * host activity: automatically filled in, * not intended for direct usage */ // GPU FUNCTION - visc_TimerID_INIT_CTX, - visc_TimerID_CLEAR_CTX, - visc_TimerID_COPY_SCALAR, - visc_TimerID_COPY_PTR, - visc_TimerID_MEM_FREE, - visc_TimerID_READ_OUTPUT, - visc_TimerID_SETUP, - visc_TimerID_MEM_TRACK, - visc_TimerID_MEM_UNTRACK, - visc_TimerID_MISC, + hpvm_TimerID_INIT_CTX, + hpvm_TimerID_CLEAR_CTX, + hpvm_TimerID_COPY_SCALAR, + hpvm_TimerID_COPY_PTR, + hpvm_TimerID_MEM_FREE, + hpvm_TimerID_READ_OUTPUT, + hpvm_TimerID_SETUP, + hpvm_TimerID_MEM_TRACK, + hpvm_TimerID_MEM_UNTRACK, + hpvm_TimerID_MISC, // LAUNCH FUNCTION - visc_TimerID_PTHREAD_CREATE, - visc_TimerID_ARG_PACK, - visc_TimerID_ARG_UNPACK, - visc_TimerID_COMPUTATION, - visc_TimerID_OUTPUT_PACK, - visc_TimerID_OUTPUT_UNPACK, + hpvm_TimerID_PTHREAD_CREATE, + hpvm_TimerID_ARG_PACK, + hpvm_TimerID_ARG_UNPACK, + hpvm_TimerID_COMPUTATION, + hpvm_TimerID_OUTPUT_PACK, + hpvm_TimerID_OUTPUT_UNPACK, pb_TimerID_LAST /* Number of timer IDs */ }; diff --git a/hpvm/test/parboil/common/mk/visc.mk b/hpvm/test/parboil/common/mk/hpvm.mk similarity index 81% rename from hpvm/test/parboil/common/mk/visc.mk rename to hpvm/test/parboil/common/mk/hpvm.mk index eb11371ccdb931d5160e5143af907a308215eb54..1c59d4d8fd7802698df9fcc78cfd16adc64ad641 100755 --- a/hpvm/test/parboil/common/mk/visc.mk +++ b/hpvm/test/parboil/common/mk/hpvm.mk @@ -9,38 +9,38 @@ CFLAGS=$(LANG_CFLAGS) $(PLATFORM_CFLAGS) $(APP_CFLAGS) CXXFLAGS=$(LANG_CXXFLAGS) $(PLATFORM_CXXFLAGS) $(APP_CXXFLAGS) LDFLAGS=$(LANG_LDFLAGS) 
$(PLATFORM_LDFLAGS) $(APP_LDFLAGS) -# VISC +# HPVM LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs -#VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt -VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt +#HPVM_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/hpvm-rt +HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt -VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll +HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll #LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc #LIBCLC_NVPTX_LIB = nvptx64--nvidiacl.bc LLVM_34_AS = /opt/llvm/bin/llvm-as -TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce +TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl ifeq ($(TARGET),x86) DEVICE = SPIR_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG CFLAGS += -DOPENCL_CPU else ifeq ($(TARGET),seq) DEVICE = CPU_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG else ifeq ($(TARGET),seqx86) DEVICE = CPU_OR_SPIR_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG CFLAGS += -DOPENCL_CPU else ifeq ($(TARGET),seqgpu) DEVICE = 
CPU_OR_GPU_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG else DEVICE = GPU_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG endif CFLAGS += -DDEVICE=$(DEVICE) @@ -49,31 +49,31 @@ CXXFLAGS += -DDEVICE=$(DEVICE) HOST_LINKFLAGS = ifeq ($(TIMER),x86) - VISC_OPTFLAGS += -visc-timers-x86 + HPVM_OPTFLAGS += -hpvm-timers-x86 else ifeq ($(TIMER),ptx) - VISC_OPTFLAGS += -visc-timers-ptx + HPVM_OPTFLAGS += -hpvm-timers-ptx else ifeq ($(TIMER),gen) - TESTGEN_OPTFLAGS += -visc-timers-gen + TESTGEN_OPTFLAGS += -hpvm-timers-gen else ifeq ($(TIMER),spir) - TESTGEN_OPTFLAGS += -visc-timers-spir + TESTGEN_OPTFLAGS += -hpvm-timers-spir else ifeq ($(TIMER),no) else ifeq ($(TARGET),x86) - VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir + HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir else ifeq ($(TARGET),seq) - VISC_OPTFLAGS += -visc-timers-x86 + HPVM_OPTFLAGS += -hpvm-timers-x86 else ifeq ($(TARGET),seqx86) - VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir + HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir else ifeq ($(TARGET),seqgpu) - VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx + HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx else - VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx + HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx endif - TESTGEN_OPTFLAGS += -visc-timers-gen + TESTGEN_OPTFLAGS 
+= -hpvm-timers-gen endif ifeq ($(DABSTRACTION),true) - VISC_OPTFLAGS += -visc-eda + HPVM_OPTFLAGS += -hpvm-eda endif # Rules common to all makefiles @@ -121,7 +121,7 @@ endif ######################################## OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) -TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) +TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS)) PARBOIL_OBJS = $(call INBUILDDIR,parboil.ll) KERNEL = $(TEST_OBJS).kernels.ll KERNEL_OPT = $(BUILDDIR)/$(APP).kernels.opt.ll @@ -190,14 +190,14 @@ $(KERNEL_OPT) : $(KERNEL) $(BIN) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ -$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(VISC_RT_LIB) +$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(HPVM_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp +$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp make -C $(LLVM_LIB_PATH) -$(HOST) $(KERNEL): $(BUILDDIR)/$(VISC_OBJS) - $(OPT) --debug $(VISC_OPTFLAGS) -S $< -o $(HOST) +$(HOST) $(KERNEL): $(BUILDDIR)/$(HPVM_OBJS) + $(OPT) --debug $(HPVM_OPTFLAGS) -S $< -o $(HOST) $(RUNDIR) : mkdir -p $(RUNDIR) @@ -214,11 +214,11 @@ $(BUILDDIR)/%.ll : $(SRCDIR)/%.cc $(BUILDDIR)/%.ll : $(SRCDIR)/%.cpp $(CXX) $(CXXFLAGS) -S -emit-llvm $< -o $@ -$(BUILDDIR)/%.visc.ll: $(BUILDDIR)/%.ll +$(BUILDDIR)/%.hpvm.ll: $(BUILDDIR)/%.ll $(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@ cat $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil/RUN.parboil.script $@ > $@.tmp - mv $@.tmp $(BUILDDIR)/$(APP).visc.ll - #@cp $(VISC_OBJS) $(BUILDDIR)/$(VISC_OBJS) + mv $@.tmp $(BUILDDIR)/$(APP).hpvm.ll + #@cp $(HPVM_OBJS) $(BUILDDIR)/$(HPVM_OBJS) $(BUILDDIR)/%.o : $(SRCDIR)/%.c $(CC) $(CFLAGS) -c $< -o $@ diff --git a/hpvm/test/parboil/common/platform/visc.default.mk b/hpvm/test/parboil/common/platform/hpvm.default.mk similarity index 61% rename from hpvm/test/parboil/common/platform/visc.default.mk rename to hpvm/test/parboil/common/platform/hpvm.default.mk index 03a9b0874aa2b2617afab71b27470b97f5b1f4b0..ca90d453a38d0b63d16e850b57de5622cbd1f2e1 100644 
--- a/hpvm/test/parboil/common/platform/visc.default.mk +++ b/hpvm/test/parboil/common/platform/hpvm.default.mk @@ -12,20 +12,20 @@ #OPENCL_LIB_PATH=$(OPENCL_PATH)/lib/x86_64 #build -VISC_BUILD_DIR = $(LLVM_SRC_ROOT)/../build +HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build # gcc (default) -CC = $(VISC_BUILD_DIR)/bin/clang -OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include +CC = $(HPVM_BUILD_DIR)/bin/clang +OCLBE = $(HPVM_BUILD_DIR)/bin/llvm-cbe +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include -CXX = $(VISC_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include +CXX = $(HPVM_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include -LINKER = $(VISC_BUILD_DIR)/bin/clang++ +LINKER = $(HPVM_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lOpenCL -LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib -LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin +LLVM_LIB_PATH = $(HPVM_BUILD_DIR)/lib +LLVM_BIN_PATH = $(HPVM_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/parboil/driver/options.py b/hpvm/test/parboil/driver/options.py index b80fc16168b54a326d16aaa99703e0bc172385f6..e15883b753c71a0e2d4fa68294a589fa7324aeca 100644 --- a/hpvm/test/parboil/driver/options.py +++ b/hpvm/test/parboil/driver/options.py @@ -264,7 +264,7 @@ def time_options(progname, cmd, args): label_ptx = 'NVPTX_Timer' #label_ptx = 'SPIR_Timer' label_x86 = 'X86_Timer' - label_gen = 'GenVISC_Timer' + label_gen = 'GenHPVM_Timer' timings[label_f] = {} timings[label_f]['IO'] = addTime([(label_pb, 'IO')], timings) timings[label_f]['Memory Track'] = addTime([(label_pb, 'Mem_Track')], timings) @@ -297,11 +297,11 @@ def time_options(progname, cmd, args): timerName = 'Parboil' timings[timerName] = {} continue - if line.startswith('Printing 
VISC Timer'): - regex = re.search('Printing VISC Timer: *(?P<name>[a-zA-Z0-9 _]+)', line) + if line.startswith('Printing HPVM Timer'): + regex = re.search('Printing HPVM Timer: *(?P<name>[a-zA-Z0-9 _]+)', line) timerName = regex.group('name').strip() timings[timerName] = {} - if timerName != 'NVPTX_Timer' and timerName != 'X86_Timer' and timerName != 'GenVISC_Timer' and timerName != 'KernelTimer' and timerName != 'SPIR_Timer': + if timerName != 'NVPTX_Timer' and timerName != 'X86_Timer' and timerName != 'GenHPVM_Timer' and timerName != 'KernelTimer' and timerName != 'SPIR_Timer': print "Warning: Found unknown timer " + timerName continue m = re.search('(?P<timerID>[a-zA-Z _/]+) *: *(?P<value>[0-9]*\.[0-9]*) *$', line) @@ -352,67 +352,67 @@ def time_options(progname, cmd, args): globals.verbose = opts.verbose configs = [ - ('spmv', { 'VERSION' : ["opencl_nvidia", "visc"], + ('spmv', { 'VERSION' : ["opencl_nvidia", "hpvm"], 'TEST' : [("large", 10)] } ) - ,('sgemm', { 'VERSION' : ["opencl_nvidia", "visc_sh"], + ,('sgemm', { 'VERSION' : ["opencl_nvidia", "hpvm_sh"], 'TEST' : [("4K", 10)] } ) - ,('lbm', { 'VERSION' : ["opencl_nvidia", "visc"], + ,('lbm', { 'VERSION' : ["opencl_nvidia", "hpvm"], 'TEST' : [("long", 10)] } ) - ,('stencil', { 'VERSION' : ["opencl_base", "visc"], + ,('stencil', { 'VERSION' : ["opencl_base", "hpvm"], 'TEST' : [("large", 10)] } ) - ,('bfs', { 'VERSION' : ["opencl_nvidia", "visc"], + ,('bfs', { 'VERSION' : ["opencl_nvidia", "hpvm"], 'TEST' : [("1M", 10), ("SF", 10)] } ) - ,('tpacf', { 'VERSION' : ["opencl_base", "visc"], + ,('tpacf', { 'VERSION' : ["opencl_base", "hpvm"], 'TEST' : [("large", 10)] } ) - ,('cutcp', { 'VERSION' : ["opencl_nvidia", "visc"], + ,('cutcp', { 'VERSION' : ["opencl_nvidia", "hpvm"], 'TEST' : [("large", 10)] } ) - #('histo', { 'VERSION' : ["opencl_nvidia", "visc"], + #('histo', { 'VERSION' : ["opencl_nvidia", "hpvm"], #'TEST' : [("default", 10), ("large", 10)] #} #) - #('spmv', { 'VERSION' : ["opencl_cpu_baseline", 
"visc"], + #('spmv', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], #'TEST' : [("large", 10), ("huge", 10)] #} #) - #('sgemm', { 'VERSION' : ["opencl_cpu_sm", "visc_sh"], + #('sgemm', { 'VERSION' : ["opencl_cpu_sm", "hpvm_sh"], #'TEST' : [("medium", 1), ("4K", 1)] #} #) - #('lbm', { 'VERSION' : ["opencl_cpu_baseline", "visc"], + #('lbm', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], #'TEST' : [("short", 10), ("long", 10)] #} #) - #,('stencil', { 'VERSION' : ["opencl_cpu_baseline", "visc"], + #,('stencil', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], #'TEST' : [("default", 10), ("large", 10)] #} #) - #('bfs', { 'VERSION' : ["opencl_cpu_baseline", "visc_base"], + #('bfs', { 'VERSION' : ["opencl_cpu_baseline", "hpvm_base"], #'TEST' : [("1M", 5), ("SF", 5)] #} #) - #,('tpacf', { 'VERSION' : ["opencl_cpu_base", "visc"], + #,('tpacf', { 'VERSION' : ["opencl_cpu_base", "hpvm"], #'TEST' : [("medium", 1), ("large", 1)] #} #) - #,('cutcp', { 'VERSION' : ["opencl_cpu_baseline", "visc"], + #,('cutcp', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], #'TEST' : [("small", 1), ("large", 1)] #} #) - #,('histo', { 'VERSION' : ["opencl_cpu_baseline", "visc"], + #,('histo', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], #'TEST' : [("default", 1), ("large", 1)] #} #) diff --git a/hpvm/test/parboil/parboilParser.py b/hpvm/test/parboil/parboilParser.py index 0d1f10b6862c15fb8f591972dea8f13dfba45e30..5ea1346349d124c08cfe63a79a8c07e54f8f3e18 100755 --- a/hpvm/test/parboil/parboilParser.py +++ b/hpvm/test/parboil/parboilParser.py @@ -77,7 +77,7 @@ def parseCSVFile(filename): file.close() #print csvDict['a']['b']['c']['d']['e'] - #print csvDict['sgemm']['visc']['c']['d']['e'] + #print csvDict['sgemm']['hpvm']['c']['d']['e'] #print csvDict['sgemm']['opencl_base']['c']['d']['e'] #print csvDict['sgemm']['opencl_base']['small']['d']['e'] #print csvDict['sgemm']['opencl_base']['small']['Final']['e'] @@ -96,14 +96,14 @@ def parseCSVFile(filename): # returns a list of available tests for the 
given application -# the tests are found based on the visc version, because it exists +# the tests are found based on the hpvm version, because it exists # for all apps in the dict def getTests(app, csvDict): - return csvDict[app]["visc"].keys() + return csvDict[app]["hpvm"].keys() -def isViscVersion(version): - return version.startswith("visc") +def isHPVMVersion(version): + return version.startswith("hpvm") def getAllVersions(csvDict): @@ -142,7 +142,7 @@ def printTimerDecomposition(csvDict, version): # get apps apps = csvDict.keys() - isVisc = isViscVersion(version) + isHPVM = isHPVMVersion(version) # get tests for each app tests = dict() @@ -150,7 +150,7 @@ def printTimerDecomposition(csvDict, version): tests[app] = csvDict[app][version].keys() # list of timer-category pairs - if isVisc: + if isHPVM: timers =[('Final', 'Kernel'), ('Final', 'Load Program Binary'), ('Final', 'Argument Unpack'), @@ -170,7 +170,7 @@ def printTimerDecomposition(csvDict, version): ('Parboil', 'Clear_Ctx'), ('Final', 'Timer Wall - IO'), ('Final', 'IO'), - ('GenVISC_Timer', 'Timer Wall Time')] + ('GenHPVM_Timer', 'Timer Wall Time')] else: timers =[('Final', 'Init_Ctx'), ('Final', 'Arg_Unpack'), diff --git a/hpvm/test/pipeline/Makefile b/hpvm/test/pipeline/Makefile index 421c9a853264854a2ec943035a41244f892f93ab..3fc794393cf6342d949940ea74ed3bcb5002258f 100644 --- a/hpvm/test/pipeline/Makefile +++ b/hpvm/test/pipeline/Makefile @@ -23,12 +23,12 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) EXE = pipeline-$(TARGET) INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR) -INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include +INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include ## BEGIN HPVM MAKEFILE SRCDIR_OBJS= io.ll OBJS_SRC=src/io.cc -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP = $(EXE) APP_CFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize APP_CXXFLAGS += $(INCLUDES) 
-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize @@ -39,21 +39,21 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS) CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) -VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt -VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll +HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt +HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll -TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce +TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce ifeq ($(TARGET),seq) DEVICE = CPU_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG - VISC_OPTFLAGS += -visc-timers-x86 + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS += -hpvm-timers-x86 else DEVICE = GPU_TARGET - VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG - VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx + HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx endif - TESTGEN_OPTFLAGS += -visc-timers-gen + TESTGEN_OPTFLAGS += -hpvm-timers-gen CFLAGS += -DDEVICE=$(DEVICE) CXXFLAGS += -DDEVICE=$(DEVICE) @@ -64,7 +64,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1)) .PRECIOUS: $(BUILD_DIR)/%.ll OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) -TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) +TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS)) KERNEL = $(TEST_OBJS).kernels.ll ifeq ($(TARGET),seq) @@ -91,14 +91,14 @@ $(KERNEL_OCL) : $(KERNEL) $(EXE) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ -$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) +$(HOST_LINKED) : 
$(HOST) $(OBJS) $(HPVM_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp +$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp make -C $(LLVM_LIB_PATH) -$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) - $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) +$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS) + $(OPT) -debug $(HPVM_OPTFLAGS) -S $< -o $(HOST) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -109,7 +109,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.cc $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc $(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $< -$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll - $(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@ +$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll + $(OPT) -debug-only=genhpvm $(TESTGEN_OPTFLAGS) $< -S -o $@ ## END HPVM MAKEFILE diff --git a/hpvm/test/pipeline/copyToVersions.sh b/hpvm/test/pipeline/copyToVersions.sh index 3b9c19bad6dd86de7eb9a82edc7f17b92265155e..67551aff2f1b47fb2ad9c69be44936e8145a68da 100755 --- a/hpvm/test/pipeline/copyToVersions.sh +++ b/hpvm/test/pipeline/copyToVersions.sh @@ -1,12 +1,12 @@ -declare -a versionList=("viscGPU" "viscVector" "viscScalar" "viscGPU-Scalar-MaxG" "viscVector-Scalar-MaxG" "viscGPU-Scalar-ZC" "viscVector-Scalar-ZC") +declare -a versionList=("hpvmGPU" "hpvmVector" "hpvmScalar" "hpvmGPU-Scalar-MaxG" "hpvmVector-Scalar-MaxG" "hpvmGPU-Scalar-ZC" "hpvmVector-Scalar-ZC") declare -a fileList=("Makefile" "io.cc" "main.cc") for version in "${versionList[@]}"; do echo $version for filename in "${fileList[@]}"; do - echo cp ./src/visc_parallel/$filename ./src/$version/ - cp ./src/visc_parallel/$filename ./src/$version/ + echo cp ./src/hpvm_parallel/$filename ./src/$version/ + cp ./src/hpvm_parallel/$filename ./src/$version/ done echo done diff --git a/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll b/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll similarity index 95% rename from hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll rename to 
hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll index 06ec055bb746c7cc0cd58f75ed1f8090e0afa459..8056cc12eed0e4d20d45e294bf674dfc689f6bb8 100644 --- a/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll +++ b/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll @@ -1,4 +1,4 @@ -; ModuleID = 'build/Gradient_default/main.visc.ll' +; ModuleID = 'build/Gradient_default/main.hpvm.ll' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -167,9 +167,9 @@ entry: ; Function Attrs: nounwind uwtable define %emptyStruct @squareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 { entry: - %call3 = tail call i8* @llvm.visc.getNode() - %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3) - %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3) + %call3 = tail call i8* @llvm.hpvm.getNode() + %call14 = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3) + %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3) %cmp = icmp slt i32 %call14, %n %cmp3 = icmp slt i32 %call25, %m %or.cond = and i1 %cmp, %cmp3 @@ -198,51 +198,51 @@ if.end: ; preds = %if.then, %entry ; Function Attrs: nounwind uwtable define %emptyStruct.23 @WrapperSquareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 { entry: - %squareRoot.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 1, i32 
1, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false) - tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false) + %squareRoot.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false) ret %emptyStruct.23 undef } ; Function Attrs: nounwind uwtable define %emptyStruct.24 @Gradient(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %Sx, i64 %bytesSx, float* nocapture in %Sy, i64 %bytesSy, float* nocapture out %Gx, i64 %bytesGx, float* nocapture out 
%Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n) #2 { entry: - %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*)) - %WrapperSquareRoot.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*)) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 6, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false) - tail call void @llvm.visc.bind.input(i8* 
%WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false) - %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false) - %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false) + %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*)) + %WrapperSquareRoot.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*)) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, 
i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 6, i32 4, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 
false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false) + %output.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false) + %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false) ret %emptyStruct.24 undef } @@ -866,7 +866,7 @@ cond.false: ; preds = %land.lhs.true58, %l cond.end: ; preds = %land.lhs.true58 call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - call void @llvm.visc.init() + call void @llvm.hpvm.init() %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5 %104 = load i32* %103, align 4, !tbaa !9 %arrayidx.i296 = getelementptr inbounds i32* %103, i64 1 @@ -1137,15 +1137,15 @@ cond.false87: ; preds = %_Z12getNextFrameRN2 unreachable cond.end88: ; preds = %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit335 - call void @llvm_visc_track_mem(i8* %150, i64 %mul65) #1 - call void @llvm_visc_track_mem(i8* %106, i64 36) #1 - call void @llvm_visc_track_mem(i8* %113, i64 36) #1 + call void @llvm_hpvm_track_mem(i8* %150, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1 + call void @llvm_hpvm_track_mem(i8* %113, i64 36) #1 %176 = load i8** %data73, align 8, !tbaa !5 - call void @llvm_visc_track_mem(i8* %176, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %176, i64 %mul65) #1 %177 = load i8** %data74, align 8, !tbaa !5 - call void @llvm_visc_track_mem(i8* %177, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %177, i64 %mul65) #1 %178 = load i8** %data75, align 8, !tbaa !5 - call void @llvm_visc_track_mem(i8* %178, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %178, i64 %mul65) #1 %179 = load i8** 
%data, align 8, !tbaa !5 %180 = bitcast i8* %179 to float* store float* %180, float** %I1.i, align 1, !tbaa !5 @@ -1154,8 +1154,8 @@ cond.end88: ; preds = %_Z12getNextFrameRN2 for.body: ; preds = %for.body, %cond.end88 %j.0480 = phi i32 [ 0, %cond.end88 ], [ %inc, %for.body ] - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false) - call void @llvm.visc.wait(i8* %graphID) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false) + call void @llvm.hpvm.wait(i8* %graphID) %inc = add i32 %j.0480, 1 %exitcond = icmp eq i32 %inc, 2994 br i1 %exitcond, label %for.end, label %for.body @@ -1163,19 +1163,19 @@ for.body: ; preds = %for.body, %cond.end for.end: ; preds = %for.body call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 %181 = load i8** %data75, align 8, !tbaa !5 - call void @llvm_visc_request_mem(i8* %181, i64 %mul65) #1 + call void @llvm_hpvm_request_mem(i8* %181, i64 %mul65) #1 %182 = load i8** %data, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %182) #1 - call void @llvm_visc_untrack_mem(i8* %106) #1 - call void @llvm_visc_untrack_mem(i8* %113) #1 + call void @llvm_hpvm_untrack_mem(i8* %182) #1 + call void @llvm_hpvm_untrack_mem(i8* %106) #1 + call void @llvm_hpvm_untrack_mem(i8* %113) #1 %183 = load i8** %data73, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %183) #1 + call void @llvm_hpvm_untrack_mem(i8* %183) #1 %184 = load i8** %data74, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %184) #1 + call void @llvm_hpvm_untrack_mem(i8* %184) #1 %185 = load i8** %data75, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %185) #1 + call void @llvm_hpvm_untrack_mem(i8* %185) #1 call void @pb_PrintTimerSet(%struct.pb_TimerSet* 
%timers) #1 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1 %u.i.i.i342 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9 %186 = load %"struct.cv::UMatData"** %u.i.i.i342, align 8, !tbaa !5 @@ -1647,13 +1647,13 @@ declare noalias i8* @malloc(i64) #5 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0 -declare void @llvm_visc_track_mem(i8*, i64) #0 +declare void @llvm_hpvm_track_mem(i8*, i64) #0 declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 -declare void @llvm_visc_request_mem(i8*, i64) #0 +declare void @llvm_hpvm_request_mem(i8*, i64) #0 -declare void @llvm_visc_untrack_mem(i8*) #0 +declare void @llvm_hpvm_untrack_mem(i8*) #0 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 @@ -1713,50 +1713,50 @@ entry: declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 ; Function Attrs: nounwind readnone -declare i8* @llvm.visc.getNode() #7 +declare i8* @llvm.hpvm.getNode() #7 ; Function Attrs: nounwind readnone -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7 +declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7 ; Function Attrs: nounwind readnone -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7 +declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 +declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1 +declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1 +declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind readonly declare float @llvm.sqrt.f32(float) #8 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #1 +declare i8* @llvm.hpvm.createNode(i8*) #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, 
i1, i32, i32, i1) #1 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*, i1) #1 +declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #1 +declare void @llvm.hpvm.wait(i8*) #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind define %horizontal.vertical.ty @horizontal_vertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 { entry: - %call3.i = tail call i8* @llvm.visc.getNode() #1 - %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1 - %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1 + %call3.i = tail call i8* @llvm.hpvm.getNode() #1 + %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3.i) #1 + %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1 %mul.i = mul nsw i32 %call25.i, %n1_n %add.i = add nsw i32 %mul.i, %call14.i %cmp.i = icmp slt i32 %call14.i, %n1_n @@ -2139,25 +2139,25 @@ vertical.exit: ; preds = %if.end42.2.i67.us, ; Function Attrs: nounwind define %WrapperHorizontal.WrapperVertical.ty @WrapperHorizontal_WrapperVertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 { entry: - %horizontal_vertical.node = tail call i8* 
@llvm.visc.createNode2D(i8* bitcast (%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n) - tail call void @llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false) - tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false) + %horizontal_vertical.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast 
(%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n) + tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false) ret %WrapperHorizontal.WrapperVertical.ty undef } @@ -2172,9 +2172,9 @@ attributes #7 = { nounwind readnone } 
attributes #8 = { nounwind readonly } attributes #9 = { noreturn nounwind } -!visc_hint_gpu = !{!0, !1} -!visc_hint_cpu = !{!2, !3, !4} -!visc_hint_spir = !{} +!hpvm_hint_gpu = !{!0, !1} +!hpvm_hint_cpu = !{!2, !3, !4} +!hpvm_hint_spir = !{} !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot} !1 = metadata !{%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical} diff --git a/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll b/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll similarity index 95% rename from hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll rename to hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll index 4b0458625157e1c6535941ec5c663f8a16660c22..aa4a0d19a0ec80910b8d82b03de018ad41470a22 100644 --- a/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll +++ b/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll @@ -1,4 +1,4 @@ -; ModuleID = 'build/Laplacian_default/main.visc.ll' +; ModuleID = 'build/Laplacian_default/main.hpvm.ll' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -170,9 +170,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) #1 ; Function Attrs: nounwind uwtable define %emptyStruct @lincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 { entry: - %call3 = tail call i8* @llvm.visc.getNode() - %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3) - %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3) + %call3 = tail call i8* @llvm.hpvm.getNode() + %call14 = tail call i32 
@llvm.hpvm.getNodeInstanceID.x(i8* %call3) + %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3) %cmp = icmp slt i32 %call14, %n %cmp3 = icmp slt i32 %call25, %m %or.cond = and i1 %cmp, %cmp3 @@ -202,55 +202,55 @@ if.end: ; preds = %if.then, %entry ; Function Attrs: nounwind uwtable define %emptyStruct.23 @WrapperLincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 { entry: - %lincomb.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false) - tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false) + %lincomb.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false) + tail 
call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false) ret %emptyStruct.23 undef } ; Function Attrs: nounwind uwtable define %emptyStruct.24 @LaplacianEstimate(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %B, i64 %bytesB, float* nocapture out %D, i64 %bytesD, float* nocapture out %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n) #2 { entry: - %WrapperDilate_WrapperErode.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*)) - %WrapperLincomb.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*)) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false) - tail call void 
@llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 13, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 7, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 
9, i32 7, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false) - tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false) - %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false) - %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false) + %WrapperDilate_WrapperErode.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*)) + %WrapperLincomb.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*)) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false) + tail call void 
@llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 13, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 7, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 9, i32 7, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false) + %output.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false) + %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false) ret %emptyStruct.24 undef } @@ -873,7 +873,7 @@ cond.false: ; preds = %land.lhs.true58, %l cond.end: ; preds = %land.lhs.true58 call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - call void 
@llvm.visc.init() + call void @llvm.hpvm.init() %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5 %104 = load i32* %103, align 4, !tbaa !9 %arrayidx.i290 = getelementptr inbounds i32* %103, i64 1 @@ -1062,18 +1062,18 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328, call void @llvm.lifetime.end(i64 24, i8* %134) #1 %data = getelementptr inbounds %"class.cv::Mat"* %src, i64 0, i32 4 %139 = load i8** %data, align 8, !tbaa !5 - call void @llvm_visc_track_mem(i8* %139, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %139, i64 %mul65) #1 %arraydecay = getelementptr inbounds [9 x float]* %B, i64 0, i64 0 - call void @llvm_visc_track_mem(i8* %106, i64 36) #1 + call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1 %data81 = getelementptr inbounds %"class.cv::Mat"* %D, i64 0, i32 4 %140 = load i8** %data81, align 8, !tbaa !5 - call void @llvm_visc_track_mem(i8* %140, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %140, i64 %mul65) #1 %data82 = getelementptr inbounds %"class.cv::Mat"* %E, i64 0, i32 4 %141 = load i8** %data82, align 8, !tbaa !5 - call void @llvm_visc_track_mem(i8* %141, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %141, i64 %mul65) #1 %data83 = getelementptr inbounds %"class.cv::Mat"* %L, i64 0, i32 4 %142 = load i8** %data83, align 8, !tbaa !5 - call void @llvm_visc_track_mem(i8* %142, i64 %mul65) #1 + call void @llvm_hpvm_track_mem(i8* %142, i64 %mul65) #1 %143 = load i8** %data, align 8, !tbaa !5 %144 = bitcast i8* %143 to float* %145 = load i8** %data81, align 8, !tbaa !5 @@ -1126,8 +1126,8 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328, for.body: ; preds = %for.body, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332 %j.0474 = phi i32 [ 0, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332 ], [ %inc, %for.body ] - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* 
@LaplacianEstimate to i8*), i8* %call66, i1 false) - call void @llvm.visc.wait(i8* %graphID) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @LaplacianEstimate to i8*), i8* %call66, i1 false) + call void @llvm.hpvm.wait(i8* %graphID) %inc = add nsw i32 %j.0474, 1 %exitcond = icmp eq i32 %inc, 2994 br i1 %exitcond, label %for.end, label %for.body @@ -1135,18 +1135,18 @@ for.body: ; preds = %for.body, %_Z12getN for.end: ; preds = %for.body call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 %165 = load i8** %data83, align 8, !tbaa !5 - call void @llvm_visc_request_mem(i8* %165, i64 %mul65) #1 + call void @llvm_hpvm_request_mem(i8* %165, i64 %mul65) #1 %166 = load i8** %data, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %166) #1 - call void @llvm_visc_untrack_mem(i8* %106) #1 + call void @llvm_hpvm_untrack_mem(i8* %166) #1 + call void @llvm_hpvm_untrack_mem(i8* %106) #1 %167 = load i8** %data81, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %167) #1 + call void @llvm_hpvm_untrack_mem(i8* %167) #1 %168 = load i8** %data82, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %168) #1 + call void @llvm_hpvm_untrack_mem(i8* %168) #1 %169 = load i8** %data83, align 8, !tbaa !5 - call void @llvm_visc_untrack_mem(i8* %169) #1 + call void @llvm_hpvm_untrack_mem(i8* %169) #1 call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1 %u.i.i.i336 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9 %170 = load %"struct.cv::UMatData"** %u.i.i.i336, align 8, !tbaa !5 @@ -1614,13 +1614,13 @@ declare noalias i8* @malloc(i64) #5 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0 -declare void @llvm_visc_track_mem(i8*, i64) #0 +declare void @llvm_hpvm_track_mem(i8*, i64) #0 declare void 
@pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 -declare void @llvm_visc_request_mem(i8*, i64) #0 +declare void @llvm_hpvm_request_mem(i8*, i64) #0 -declare void @llvm_visc_untrack_mem(i8*) #0 +declare void @llvm_hpvm_untrack_mem(i8*) #0 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 @@ -1677,47 +1677,47 @@ declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 ; Function Attrs: nounwind readnone -declare i8* @llvm.visc.getNode() #7 +declare i8* @llvm.hpvm.getNode() #7 ; Function Attrs: nounwind readnone -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7 +declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7 ; Function Attrs: nounwind readnone -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7 +declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 +declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1 +declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1 +declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #1 +declare i8* @llvm.hpvm.createNode(i8*) #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*, i1) #1 +declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #1 +declare void @llvm.hpvm.wait(i8*) #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: 
nounwind define %dilate.erode.ty @dilate_erode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 { entry: - %call3.i = tail call i8* @llvm.visc.getNode() #1 - %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1 - %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1 + %call3.i = tail call i8* @llvm.hpvm.getNode() #1 + %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3.i) #1 + %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1 %cmp.i = icmp slt i32 %call14.i, %n1_n %cmp3.i = icmp slt i32 %call25.i, %n1_m %or.cond.i = and i1 %cmp.i, %cmp3.i @@ -2070,25 +2070,25 @@ erode.exit: ; preds = %dilate.exit, %cond. ; Function Attrs: nounwind define %WrapperDilate.WrapperErode.ty @WrapperDilate_WrapperErode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 { entry: - %dilate_erode.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n) - tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false) - tail call void @llvm.visc.bind.input(i8* 
%dilate_erode.node, i32 4, i32 4, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 2, i32 2, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false) - tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false) - tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false) + %dilate_erode.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n) + tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 4, i32 4, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false) + tail call void @llvm.hpvm.bind.input(i8* 
%dilate_erode.node, i32 2, i32 2, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false) + tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false) + tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false) ret %WrapperDilate.WrapperErode.ty undef } @@ -2103,9 +2103,9 @@ attributes #7 = { nounwind readnone } attributes #8 = { noreturn nounwind } attributes #9 = { nounwind readonly } -!visc_hint_gpu = !{!0, !1} -!visc_hint_cpu = !{!2, !3, !4} -!visc_hint_spir = !{} +!hpvm_hint_gpu = !{!0, !1} +!hpvm_hint_cpu = !{!2, !3, !4} +!hpvm_hint_spir = !{} !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb} !1 = metadata !{%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode} diff --git a/hpvm/test/pipeline/run.sh b/hpvm/test/pipeline/run.sh index 0c8435764bd87c92dd30ad51aa97011ddb07b339..5ac734026bf839c511dfdfb843b07382e6d8d4d6 100755 --- a/hpvm/test/pipeline/run.sh +++ b/hpvm/test/pipeline/run.sh @@ -4,7 +4,7 @@ echo Pipeline Script $1 $2 version=$1 pos=$2 -if [[ ($version == *"GPU"*) || ($version == "visc_parallel") ]] +if [[ ($version == 
*"GPU"*) || ($version == "hpvm_parallel") ]] then target="" elif [[ $version == *"Vector"* ]] diff --git a/hpvm/test/pipeline/runscript.sh b/hpvm/test/pipeline/runscript.sh index 5a2933e78801993ee440ead6e19f84aae66b3577..c95af8f831eeeb7f5f464e4acbc90dd49fcb67a1 100755 --- a/hpvm/test/pipeline/runscript.sh +++ b/hpvm/test/pipeline/runscript.sh @@ -2,21 +2,21 @@ echo Pipeline Script # Compile all version -make VERSION=viscGPU clean -make VERSION=viscVector TARGET=x86 clean -make VERSION=viscScalar TARGET=seq clean +make VERSION=hpvmGPU clean +make VERSION=hpvmVector TARGET=x86 clean +make VERSION=hpvmScalar TARGET=seq clean -make VERSION=viscGPU -make VERSION=viscVector TARGET=x86 -make VERSION=viscScalar TARGET=seq +make VERSION=hpvmGPU +make VERSION=hpvmVector TARGET=x86 +make VERSION=hpvmScalar TARGET=seq #Run all version -make VERSION=viscGPU run & +make VERSION=hpvmGPU run & ID_GPU=$! -make VERSION=viscVector TARGET=x86 run & +make VERSION=hpvmVector TARGET=x86 run & ID_Vector=$! -make VERSION=viscScalar TARGET=seq run +make VERSION=hpvmScalar TARGET=seq run ID_Scalar=$! #echo Wait 60 seconds diff --git a/hpvm/test/pipeline/src/Makefile b/hpvm/test/pipeline/src/Makefile index ec39b86f1cf71e2e8b6131b076c2953b566cbb56..55acb2e0982edc2a914340f2bfacbbfc1d06397f 100644 --- a/hpvm/test/pipeline/src/Makefile +++ b/hpvm/test/pipeline/src/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=visc +LANGUAGE=hpvm SRCDIR_OBJS=io.ll #compute_gold.o -VISC_OBJS=main.visc.ll +HPVM_OBJS=main.hpvm.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include diff --git a/hpvm/test/pipeline/src/main.cc b/hpvm/test/pipeline/src/main.cc index 9314833d25d0a3a25f13dfb24fb8a239b94956b1..ef9d8412c70813fcae123b0ef84de1850fa6b28c 100644 --- a/hpvm/test/pipeline/src/main.cc +++ b/hpvm/test/pipeline/src/main.cc @@ -13,6 +13,7 @@ #include "opencv2/ocl/ocl.hpp" #include "opencv2/opencv.hpp" #include <cassert> +#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,7 +21,6 @@ #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <visc.h> #define NUM_RUNS 100 #define DEPTH 3 @@ -147,12 +147,12 @@ void packData(struct InStruct *args, float *I, size_t bytesI, float *Is, void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs, float *Is, size_t bytesIs, long m, long n) { - __visc__hint(visc::DEVICE); - __visc__attributes(2, I, Gs, 1, Is); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(2, I, Gs, 1, Is); - void *thisNode = __visc__getNode(); - long gx = __visc__getNodeInstanceID_x(thisNode); - long gy = __visc__getNodeInstanceID_y(thisNode); + void *thisNode = __hpvm__getNode(); + long gx = __hpvm__getNodeInstanceID_x(thisNode); + long gy = __hpvm__getNodeInstanceID_y(thisNode); int gloc = gx + gy * n; @@ -187,26 +187,26 @@ void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs, Is[gloc] = smoothedVal; } - __visc__return(2, bytesIs, bytesIs); + __hpvm__return(2, bytesIs, bytesIs); } void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs, float *Is, size_t bytesIs, long m, long n) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, I, Gs, 1, Is); - void *GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n); - __visc__bindIn(GSNode, 0, 0, 0); // Bind I - __visc__bindIn(GSNode, 
1, 1, 0); // Bind bytesI - __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs - __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs - __visc__bindIn(GSNode, 4, 4, 0); // Bind Is - __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs - __visc__bindIn(GSNode, 6, 6, 0); // Bind m - __visc__bindIn(GSNode, 7, 7, 0); // Bind n - - __visc__bindOut(GSNode, 0, 0, 0); // bind output bytesIs - __visc__bindOut(GSNode, 1, 1, 0); // bind output bytesIs + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, I, Gs, 1, Is); + void *GSNode = __hpvm__createNodeND(2, gaussianSmoothing, m, n); + __hpvm__bindIn(GSNode, 0, 0, 0); // Bind I + __hpvm__bindIn(GSNode, 1, 1, 0); // Bind bytesI + __hpvm__bindIn(GSNode, 2, 2, 0); // Bind Gs + __hpvm__bindIn(GSNode, 3, 3, 0); // Bind bytesGs + __hpvm__bindIn(GSNode, 4, 4, 0); // Bind Is + __hpvm__bindIn(GSNode, 5, 5, 0); // Bind bytesIs + __hpvm__bindIn(GSNode, 6, 6, 0); // Bind m + __hpvm__bindIn(GSNode, 7, 7, 0); // Bind n + + __hpvm__bindOut(GSNode, 0, 0, 0); // bind output bytesIs + __hpvm__bindOut(GSNode, 1, 1, 0); // bind output bytesIs } /* Compute a non-linear laplacian estimate of input image I of size m x n */ @@ -220,14 +220,14 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs, void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB, float *L, size_t bytesL, long m, long n) { - __visc__hint(visc::DEVICE); - __visc__attributes(2, Is, B, 1, L); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(2, Is, B, 1, L); // 3x3 image area float imageArea[SZB * SZB]; - void *thisNode = __visc__getNode(); - long gx = __visc__getNodeInstanceID_x(thisNode); - long gy = __visc__getNodeInstanceID_y(thisNode); + void *thisNode = __hpvm__getNode(); + long gx = __hpvm__getNodeInstanceID_x(thisNode); + long gy = __hpvm__getNodeInstanceID_y(thisNode); int i, j; if ((gx < n) && (gy < m)) { @@ -300,25 +300,25 @@ void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB, float laplacian = dilatedPixel + erodedPixel - 2 
* imageArea[1 * SZB + 1]; L[gy * n + gx] = laplacian; } - __visc__return(1, bytesL); + __hpvm__return(1, bytesL); } void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB, float *L, size_t bytesL, long m, long n) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, Is, B, 1, L); - void *LNode = __visc__createNodeND(2, laplacianEstimate, m, n); - __visc__bindIn(LNode, 0, 0, 0); // Bind Is - __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs - __visc__bindIn(LNode, 2, 2, 0); // Bind B - __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB - __visc__bindIn(LNode, 4, 4, 0); // Bind L - __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL - __visc__bindIn(LNode, 6, 6, 0); // Bind m - __visc__bindIn(LNode, 7, 7, 0); // Bind n - - __visc__bindOut(LNode, 0, 0, 0); // bind output bytesL + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, Is, B, 1, L); + void *LNode = __hpvm__createNodeND(2, laplacianEstimate, m, n); + __hpvm__bindIn(LNode, 0, 0, 0); // Bind Is + __hpvm__bindIn(LNode, 1, 1, 0); // Bind bytesIs + __hpvm__bindIn(LNode, 2, 2, 0); // Bind B + __hpvm__bindIn(LNode, 3, 3, 0); // Bind bytesB + __hpvm__bindIn(LNode, 4, 4, 0); // Bind L + __hpvm__bindIn(LNode, 5, 5, 0); // Bind bytesL + __hpvm__bindIn(LNode, 6, 6, 0); // Bind m + __hpvm__bindIn(LNode, 7, 7, 0); // Bind n + + __hpvm__bindOut(LNode, 0, 0, 0); // bind output bytesL } /* Compute the zero crossings of input image L of size m x n */ @@ -331,16 +331,16 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B, */ void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB, float *S, size_t bytesS, long m, long n) { - __visc__hint(visc::DEVICE); - //__visc__hint(visc::CPU_TARGET); - __visc__attributes(2, L, B, 1, S); + __hpvm__hint(hpvm::DEVICE); + //__hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, L, B, 1, S); // 3x3 image area float imageArea[SZB][SZB]; - void *thisNode = __visc__getNode(); - long gx = __visc__getNodeInstanceID_x(thisNode); - long gy 
= __visc__getNodeInstanceID_y(thisNode); + void *thisNode = __hpvm__getNode(); + long gx = __hpvm__getNodeInstanceID_x(thisNode); + long gy = __hpvm__getNodeInstanceID_y(thisNode); int i, j; if ((gx < n) && (gy < m)) { @@ -416,25 +416,25 @@ void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB, float pixelSign = dilatedPixel - erodedPixel; S[gy * n + gx] = pixelSign; } - __visc__return(1, bytesS); + __hpvm__return(1, bytesS); } void WrapperComputeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB, float *S, size_t bytesS, long m, long n) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, L, B, 1, S); - void *ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n); - __visc__bindIn(ZCNode, 0, 0, 0); // Bind L - __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL - __visc__bindIn(ZCNode, 2, 2, 0); // Bind B - __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB - __visc__bindIn(ZCNode, 4, 4, 0); // Bind S - __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS - __visc__bindIn(ZCNode, 6, 6, 0); // Bind m - __visc__bindIn(ZCNode, 7, 7, 0); // Bind n - - __visc__bindOut(ZCNode, 0, 0, 0); // bind output bytesS + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, L, B, 1, S); + void *ZCNode = __hpvm__createNodeND(2, computeZeroCrossings, m, n); + __hpvm__bindIn(ZCNode, 0, 0, 0); // Bind L + __hpvm__bindIn(ZCNode, 1, 1, 0); // Bind bytesL + __hpvm__bindIn(ZCNode, 2, 2, 0); // Bind B + __hpvm__bindIn(ZCNode, 3, 3, 0); // Bind bytesB + __hpvm__bindIn(ZCNode, 4, 4, 0); // Bind S + __hpvm__bindIn(ZCNode, 5, 5, 0); // Bind bytesS + __hpvm__bindIn(ZCNode, 6, 6, 0); // Bind m + __hpvm__bindIn(ZCNode, 7, 7, 0); // Bind n + + __hpvm__bindOut(ZCNode, 0, 0, 0); // bind output bytesS } /* @@ -458,12 +458,12 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx, float *Sy, size_t bytesSy, float *G, size_t bytesG, long m, long n) { - __visc__hint(visc::DEVICE); - __visc__attributes(3, Is, Sx, Sy, 1, G); + 
__hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(3, Is, Sx, Sy, 1, G); - void *thisNode = __visc__getNode(); - long gx = __visc__getNodeInstanceID_x(thisNode); - long gy = __visc__getNodeInstanceID_y(thisNode); + void *thisNode = __hpvm__getNode(); + long gx = __hpvm__getNodeInstanceID_x(thisNode); + long gy = __hpvm__getNodeInstanceID_y(thisNode); int gloc = gx + gy * n; @@ -498,27 +498,27 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx, G[gloc] = sqrt(Gx * Gx + Gy * Gy); } - __visc__return(1, bytesG); + __hpvm__return(1, bytesG); } void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx, float *Sy, size_t bytesSy, float *G, size_t bytesG, long m, long n) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, Is, Sx, Sy, 1, G); - void *CGNode = __visc__createNodeND(2, computeGradient, m, n); - __visc__bindIn(CGNode, 0, 0, 0); // Bind Is - __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs - __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx - __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx - __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy - __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy - __visc__bindIn(CGNode, 6, 6, 0); // Bind G - __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG - __visc__bindIn(CGNode, 8, 8, 0); // Bind m - __visc__bindIn(CGNode, 9, 9, 0); // Bind n - - __visc__bindOut(CGNode, 0, 0, 0); // bind output bytesG + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(3, Is, Sx, Sy, 1, G); + void *CGNode = __hpvm__createNodeND(2, computeGradient, m, n); + __hpvm__bindIn(CGNode, 0, 0, 0); // Bind Is + __hpvm__bindIn(CGNode, 1, 1, 0); // Bind bytesIs + __hpvm__bindIn(CGNode, 2, 2, 0); // Bind Sx + __hpvm__bindIn(CGNode, 3, 3, 0); // Bind bytesSx + __hpvm__bindIn(CGNode, 4, 4, 0); // Bind Sy + __hpvm__bindIn(CGNode, 5, 5, 0); // Bind bytesSy + __hpvm__bindIn(CGNode, 6, 6, 0); // Bind G + __hpvm__bindIn(CGNode, 7, 7, 0); // Bind bytesG + __hpvm__bindIn(CGNode, 8, 8, 0); // Bind m + __hpvm__bindIn(CGNode, 9, 
9, 0); // Bind n + + __hpvm__bindOut(CGNode, 0, 0, 0); // bind output bytesG } /* @@ -531,13 +531,13 @@ void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx, void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG, size_t bytesMaxG, long m, long n) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(1, G, 1, maxG); + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(1, G, 1, maxG); - void *thisNode = __visc__getNode(); + void *thisNode = __hpvm__getNode(); - long lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x - long dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x + long lx = __hpvm__getNodeInstanceID_x(thisNode); // threadIdx.x + long dimx = __hpvm__getNumNodeInstances_x(thisNode); // blockDim.x // Assume a single thread block // Thread block iterates over all elements @@ -556,39 +556,39 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG, *maxG = G[lx]; } - __visc__return(1, bytesMaxG); + __hpvm__return(1, bytesMaxG); } void computeMaxGradientTB(float *G, size_t bytesG, float *maxG, size_t bytesMaxG, long m, long n, long block_x) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, G, maxG, 1, maxG); - void *CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x); - __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G - __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG - __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG - __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG - __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m - __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n - - __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, G, maxG, 1, maxG); + void *CMGLeafNode = __hpvm__createNodeND(1, computeMaxGradientLeaf, block_x); + __hpvm__bindIn(CMGLeafNode, 0, 0, 0); // Bind G + __hpvm__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG + __hpvm__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG + 
__hpvm__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG + __hpvm__bindIn(CMGLeafNode, 4, 4, 0); // Bind m + __hpvm__bindIn(CMGLeafNode, 5, 5, 0); // Bind n + + __hpvm__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG } void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG, size_t bytesMaxG, long m, long n, long block_x, long grid_x) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, G, maxG, 1, maxG); - void *CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x); - __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G - __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG - __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG - __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG - __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m - __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n - __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x - - __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(2, G, maxG, 1, maxG); + void *CMGTBNode = __hpvm__createNodeND(1, computeMaxGradientTB, grid_x); + __hpvm__bindIn(CMGTBNode, 0, 0, 0); // Bind G + __hpvm__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG + __hpvm__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG + __hpvm__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG + __hpvm__bindIn(CMGTBNode, 4, 4, 0); // Bind m + __hpvm__bindIn(CMGTBNode, 5, 5, 0); // Bind n + __hpvm__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x + + __hpvm__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG } /* Reject the zero crossings where the gradient is below a threshold */ @@ -604,39 +604,39 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG, void rejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG, float *maxG, size_t bytesMaxG, float *E, size_t bytesE, long m, long n) { - __visc__hint(visc::DEVICE); - __visc__attributes(3, S, G, maxG, 1, E); + __hpvm__hint(hpvm::DEVICE); + __hpvm__attributes(3, S, G, maxG, 1, E); - void *thisNode = 
__visc__getNode(); - int gx = __visc__getNodeInstanceID_x(thisNode); - int gy = __visc__getNodeInstanceID_y(thisNode); + void *thisNode = __hpvm__getNode(); + int gx = __hpvm__getNodeInstanceID_x(thisNode); + int gy = __hpvm__getNodeInstanceID_y(thisNode); float mG = *maxG; if ((gx < n) && (gy < m)) { E[gy * n + gx] = ((S[gy * n + gx] > 0.0) && (G[gy * n + gx] > THETA * mG)) ? 1.0 : 0.0; } - __visc__return(1, bytesE); + __hpvm__return(1, bytesE); } void WrapperRejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG, float *maxG, size_t bytesMaxG, float *E, size_t bytesE, long m, long n) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, S, G, maxG, 1, E); - void *RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n); - __visc__bindIn(RZCNode, 0, 0, 0); // Bind S - __visc__bindIn(RZCNode, 1, 1, 0); // Bind bytesS - __visc__bindIn(RZCNode, 2, 2, 0); // Bind G - __visc__bindIn(RZCNode, 3, 3, 0); // Bind bytesG - __visc__bindIn(RZCNode, 4, 4, 0); // Bind maxG - __visc__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG - __visc__bindIn(RZCNode, 6, 6, 0); // Bind E - __visc__bindIn(RZCNode, 7, 7, 0); // Bind bytesE - __visc__bindIn(RZCNode, 8, 8, 0); // Bind m - __visc__bindIn(RZCNode, 9, 9, 0); // Bind n - - __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesE + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes(3, S, G, maxG, 1, E); + void *RZCNode = __hpvm__createNodeND(2, rejectZeroCrossings, m, n); + __hpvm__bindIn(RZCNode, 0, 0, 0); // Bind S + __hpvm__bindIn(RZCNode, 1, 1, 0); // Bind bytesS + __hpvm__bindIn(RZCNode, 2, 2, 0); // Bind G + __hpvm__bindIn(RZCNode, 3, 3, 0); // Bind bytesG + __hpvm__bindIn(RZCNode, 4, 4, 0); // Bind maxG + __hpvm__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG + __hpvm__bindIn(RZCNode, 6, 6, 0); // Bind E + __hpvm__bindIn(RZCNode, 7, 7, 0); // Bind bytesE + __hpvm__bindIn(RZCNode, 8, 8, 0); // Bind m + __hpvm__bindIn(RZCNode, 9, 9, 0); // Bind n + + __hpvm__bindOut(RZCNode, 0, 0, 0); // bind output bytesE } 
// Pipelined Root node @@ -656,80 +656,80 @@ void edgeDetection(float *I, size_t bytesI, // 0 long block_x, // 24 long grid_x // 25 ) { - __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); - __visc__hint(visc::CPU_TARGET); - void *GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing); - void *LNode = __visc__createNodeND(0, WrapperlaplacianEstimate); - void *CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings); - void *CGNode = __visc__createNodeND(0, WrapperComputeGradient); - void *CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient); - void *RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings); + __hpvm__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); + __hpvm__hint(hpvm::CPU_TARGET); + void *GSNode = __hpvm__createNodeND(0, WrapperGaussianSmoothing); + void *LNode = __hpvm__createNodeND(0, WrapperlaplacianEstimate); + void *CZCNode = __hpvm__createNodeND(0, WrapperComputeZeroCrossings); + void *CGNode = __hpvm__createNodeND(0, WrapperComputeGradient); + void *CMGNode = __hpvm__createNodeND(0, WrapperComputeMaxGradient); + void *RZCNode = __hpvm__createNodeND(0, WrapperRejectZeroCrossings); // Gaussian Inputs - __visc__bindIn(GSNode, 0, 0, 1); // Bind I - __visc__bindIn(GSNode, 1, 1, 1); // Bind bytesI - __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs - __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs - __visc__bindIn(GSNode, 2, 4, 1); // Bind Is - __visc__bindIn(GSNode, 3, 5, 1); // Bind bytesIs - __visc__bindIn(GSNode, 22, 6, 1); // Bind m - __visc__bindIn(GSNode, 23, 7, 1); // Bind n + __hpvm__bindIn(GSNode, 0, 0, 1); // Bind I + __hpvm__bindIn(GSNode, 1, 1, 1); // Bind bytesI + __hpvm__bindIn(GSNode, 14, 2, 1); // Bind Gs + __hpvm__bindIn(GSNode, 15, 3, 1); // Bind bytesGs + __hpvm__bindIn(GSNode, 2, 4, 1); // Bind Is + __hpvm__bindIn(GSNode, 3, 5, 1); // Bind bytesIs + __hpvm__bindIn(GSNode, 22, 6, 1); // Bind m + __hpvm__bindIn(GSNode, 23, 7, 1); // Bind n // Laplacian Inputs - __visc__bindIn(LNode, 2, 
0, 1); // Bind Is - __visc__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs - __visc__bindIn(LNode, 16, 2, 1); // Bind B - __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB - __visc__bindIn(LNode, 4, 4, 1); // Bind L - __visc__bindIn(LNode, 5, 5, 1); // Bind bytesL - __visc__bindIn(LNode, 22, 6, 1); // Bind m - __visc__bindIn(LNode, 23, 7, 1); // Bind n + __hpvm__bindIn(LNode, 2, 0, 1); // Bind Is + __hpvm__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs + __hpvm__bindIn(LNode, 16, 2, 1); // Bind B + __hpvm__bindIn(LNode, 17, 3, 1); // Bind bytesB + __hpvm__bindIn(LNode, 4, 4, 1); // Bind L + __hpvm__bindIn(LNode, 5, 5, 1); // Bind bytesL + __hpvm__bindIn(LNode, 22, 6, 1); // Bind m + __hpvm__bindIn(LNode, 23, 7, 1); // Bind n // Compute ZC Inputs - __visc__bindIn(CZCNode, 4, 0, 1); // Bind L - __visc__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL - __visc__bindIn(CZCNode, 16, 2, 1); // Bind B - __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB - __visc__bindIn(CZCNode, 6, 4, 1); // Bind S - __visc__bindIn(CZCNode, 7, 5, 1); // Bind bytesS - __visc__bindIn(CZCNode, 22, 6, 1); // Bind m - __visc__bindIn(CZCNode, 23, 7, 1); // Bind n + __hpvm__bindIn(CZCNode, 4, 0, 1); // Bind L + __hpvm__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL + __hpvm__bindIn(CZCNode, 16, 2, 1); // Bind B + __hpvm__bindIn(CZCNode, 17, 3, 1); // Bind bytesB + __hpvm__bindIn(CZCNode, 6, 4, 1); // Bind S + __hpvm__bindIn(CZCNode, 7, 5, 1); // Bind bytesS + __hpvm__bindIn(CZCNode, 22, 6, 1); // Bind m + __hpvm__bindIn(CZCNode, 23, 7, 1); // Bind n // Gradient Inputs - __visc__bindIn(CGNode, 2, 0, 1); // Bind Is - __visc__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs - __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx - __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx - __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy - __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy - __visc__bindIn(CGNode, 8, 6, 1); // Bind G - __visc__bindIn(CGNode, 9, 7, 1); // Bind bytesG - __visc__bindIn(CGNode, 22, 8, 1); // 
Bind m - __visc__bindIn(CGNode, 23, 9, 1); // Bind n + __hpvm__bindIn(CGNode, 2, 0, 1); // Bind Is + __hpvm__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs + __hpvm__bindIn(CGNode, 18, 2, 1); // Bind Sx + __hpvm__bindIn(CGNode, 19, 3, 1); // Bind bytesSx + __hpvm__bindIn(CGNode, 20, 4, 1); // Bind Sy + __hpvm__bindIn(CGNode, 21, 5, 1); // Bind bytesSy + __hpvm__bindIn(CGNode, 8, 6, 1); // Bind G + __hpvm__bindIn(CGNode, 9, 7, 1); // Bind bytesG + __hpvm__bindIn(CGNode, 22, 8, 1); // Bind m + __hpvm__bindIn(CGNode, 23, 9, 1); // Bind n // Max Gradient Inputs - __visc__bindIn(CMGNode, 8, 0, 1); // Bind G - __visc__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG - __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG - __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG - __visc__bindIn(CMGNode, 22, 4, 1); // Bind m - __visc__bindIn(CMGNode, 23, 5, 1); // Bind n - __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x - __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x + __hpvm__bindIn(CMGNode, 8, 0, 1); // Bind G + __hpvm__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG + __hpvm__bindIn(CMGNode, 10, 2, 1); // Bind maxG + __hpvm__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG + __hpvm__bindIn(CMGNode, 22, 4, 1); // Bind m + __hpvm__bindIn(CMGNode, 23, 5, 1); // Bind n + __hpvm__bindIn(CMGNode, 24, 6, 1); // Bind block_x + __hpvm__bindIn(CMGNode, 25, 7, 1); // Bind grid_x // Reject ZC Inputs - __visc__bindIn(RZCNode, 6, 0, 1); // Bind S - __visc__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS - __visc__bindIn(RZCNode, 8, 2, 1); // Bind G - __visc__bindIn(RZCNode, 9, 3, 1); // Bind bytesG - __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG - __visc__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG - __visc__bindIn(RZCNode, 12, 6, 1); // Bind E - __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE - __visc__bindIn(RZCNode, 22, 8, 1); // Bind m - __visc__bindIn(RZCNode, 23, 9, 1); // Bind n - - __visc__bindOut(RZCNode, 0, 0, 1); // Bind output + __hpvm__bindIn(RZCNode, 6, 0, 
1); // Bind S + __hpvm__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS + __hpvm__bindIn(RZCNode, 8, 2, 1); // Bind G + __hpvm__bindIn(RZCNode, 9, 3, 1); // Bind bytesG + __hpvm__bindIn(RZCNode, 10, 4, 1); // Bind maxG + __hpvm__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG + __hpvm__bindIn(RZCNode, 12, 6, 1); // Bind E + __hpvm__bindIn(RZCNode, 13, 7, 1); // Bind bytesE + __hpvm__bindIn(RZCNode, 22, 8, 1); // Bind m + __hpvm__bindIn(RZCNode, 23, 9, 1); // Bind n + + __hpvm__bindOut(RZCNode, 0, 0, 1); // Bind output } } @@ -796,7 +796,7 @@ int main(int argc, char *argv[]) { assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous()); - __visc__init(); + __hpvm__init(); // copy A to device memory I_sz = src.size[0] * src.size[1] * sizeof(float); @@ -843,7 +843,7 @@ int main(int argc, char *argv[]) { for (unsigned j = 0; j < NUM_RUNS; j++) { std::cout << "Run: " << j << "\n"; - void *DFG = __visc__launch(1, edgeDetection, (void *)args); + void *DFG = __hpvm__launch(1, edgeDetection, (void *)args); cap = VideoCapture(inFile); getNextFrame(cap, src); @@ -855,25 +855,25 @@ int main(int argc, char *argv[]) { *maxG = 0.0; - llvm_visc_track_mem(src.data, I_sz); - llvm_visc_track_mem(Is.data, I_sz); - llvm_visc_track_mem(L.data, I_sz); - llvm_visc_track_mem(S.data, I_sz); - llvm_visc_track_mem(G.data, I_sz); - llvm_visc_track_mem(maxG, bytesMaxG); - llvm_visc_track_mem(E.data, I_sz); - llvm_visc_track_mem(Gs, bytesGs); - llvm_visc_track_mem(B, bytesB); - llvm_visc_track_mem(Sx, bytesSx); - llvm_visc_track_mem(Sy, bytesSy); - - __visc__push(DFG, args); - void *ret = __visc__pop(DFG); + llvm_hpvm_track_mem(src.data, I_sz); + llvm_hpvm_track_mem(Is.data, I_sz); + llvm_hpvm_track_mem(L.data, I_sz); + llvm_hpvm_track_mem(S.data, I_sz); + llvm_hpvm_track_mem(G.data, I_sz); + llvm_hpvm_track_mem(maxG, bytesMaxG); + llvm_hpvm_track_mem(E.data, I_sz); + llvm_hpvm_track_mem(Gs, bytesGs); + 
llvm_hpvm_track_mem(B, bytesB); + llvm_hpvm_track_mem(Sx, bytesSx); + llvm_hpvm_track_mem(Sy, bytesSy); + + __hpvm__push(DFG, args); + void *ret = __hpvm__pop(DFG); std::cout << "Returned size: " << *(size_t *)ret << " expected " << I_sz << '\n'; - llvm_visc_request_mem(maxG, bytesMaxG); - llvm_visc_request_mem(E.data, I_sz); + llvm_hpvm_request_mem(maxG, bytesMaxG); + llvm_hpvm_request_mem(E.data, I_sz); Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); @@ -882,26 +882,26 @@ int main(int argc, char *argv[]) { imshow(input_window, in); waitKey(1); - llvm_visc_untrack_mem(src.data); - llvm_visc_untrack_mem(Is.data); - llvm_visc_untrack_mem(L.data); - llvm_visc_untrack_mem(S.data); - llvm_visc_untrack_mem(G.data); - llvm_visc_untrack_mem(maxG); - llvm_visc_untrack_mem(E.data); - llvm_visc_untrack_mem(Gs); - llvm_visc_untrack_mem(B); - llvm_visc_untrack_mem(Sx); - llvm_visc_untrack_mem(Sy); + llvm_hpvm_untrack_mem(src.data); + llvm_hpvm_untrack_mem(Is.data); + llvm_hpvm_untrack_mem(L.data); + llvm_hpvm_untrack_mem(S.data); + llvm_hpvm_untrack_mem(G.data); + llvm_hpvm_untrack_mem(maxG); + llvm_hpvm_untrack_mem(E.data); + llvm_hpvm_untrack_mem(Gs); + llvm_hpvm_untrack_mem(B); + llvm_hpvm_untrack_mem(Sx); + llvm_hpvm_untrack_mem(Sy); getNextFrame(cap, src); } } else { - __visc__push(DFG, args); - __visc__pop(DFG); + __hpvm__push(DFG, args); + __hpvm__pop(DFG); } - __visc__wait(DFG); + __hpvm__wait(DFG); } - __visc__cleanup(); + __hpvm__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/CreateNodeAndEdge.c b/hpvm/test/unitTests/CreateNodeAndEdge.c index 1b6b1cff211d5af5a909065af988aadbe979f2ec..c3f58c95d631b5c49a47de1cbe41ed5ea871f5f4 100644 --- a/hpvm/test/unitTests/CreateNodeAndEdge.c +++ b/hpvm/test/unitTests/CreateNodeAndEdge.c @@ -1,4 +1,4 @@ -#include "visc.h" +#include "hpvm.h" #include <stdio.h> struct Root { @@ -7,33 +7,33 @@ struct Root { }; void Func1(int *In, int *Out) { - __visc__hint(CPU_TARGET); - __visc__attributes(1, In, 1, Out); + 
__hpvm__hint(CPU_TARGET); + __hpvm__attributes(1, In, 1, Out); - __visc__return(1, Out); + __hpvm__return(1, Out); } void Func2(int *BindIn, int *SrcIn, int *Out) { - __visc__hint(CPU_TARGET); - __visc__attributes(2, BindIn, SrcIn, 1, Out); + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(2, BindIn, SrcIn, 1, Out); - __visc__return(1, Out); + __hpvm__return(1, Out); } void PipeRoot(int *In, int *Out) { - __visc__hint(CPU_TARGET); + __hpvm__hint(CPU_TARGET); - __visc__attributes(1, In, 1, Out); + __hpvm__attributes(1, In, 1, Out); - void *SrcNode = __visc__createNodeND(0, Func1); - void *DestNode = __visc__createNodeND(0, Func2); + void *SrcNode = __hpvm__createNodeND(0, Func1); + void *DestNode = __hpvm__createNodeND(0, Func2); - __visc__bindIn(SrcNode, 0, 0, 0); + __hpvm__bindIn(SrcNode, 0, 0, 0); - __visc__bindIn(DestNode, 0, 0, 0); - __visc__edge(SrcNode, DestNode, 1, 0, 1, 0); + __hpvm__bindIn(DestNode, 0, 0, 0); + __hpvm__edge(SrcNode, DestNode, 1, 0, 1, 0); - __visc__bindOut(SrcNode, 0, 0, 0); + __hpvm__bindOut(SrcNode, 0, 0, 0); } int main(void) { @@ -41,10 +41,10 @@ int main(void) { int Out = 0; struct Root RootArgs = {(int *)&In, (int *)&Out}; - __visc__init(); - void *PipeDFG = __visc__launch(0, PipeRoot, (void *)&RootArgs); - __visc__wait(PipeDFG); - __visc__cleanup(); + __hpvm__init(); + void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)&RootArgs); + __hpvm__wait(PipeDFG); + __hpvm__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/Makefile b/hpvm/test/unitTests/Makefile index 539ee5e8fbf010d33663c98470b245bb2710eeea..15580e9300a119f55e4a828b645c27dd00b62ff8 100644 --- a/hpvm/test/unitTests/Makefile +++ b/hpvm/test/unitTests/Makefile @@ -2,8 +2,8 @@ PASSES := .PHONY: clean -LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install -LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc +LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install +LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc HOST:=gemm_opencl KERNELS:=matrixMul 
LLVM_CC:=$(LLVM_INSTALL)/bin/clang diff --git a/hpvm/test/unitTests/MallocIntrinsic.c b/hpvm/test/unitTests/MallocIntrinsic.c index cfd041a991d976c24b372a81b35842598b571d89..173f6b3b16d1090a98242d345cefa330910d862d 100644 --- a/hpvm/test/unitTests/MallocIntrinsic.c +++ b/hpvm/test/unitTests/MallocIntrinsic.c @@ -1,4 +1,4 @@ -#include "visc.h" +#include "hpvm.h" #include <stdlib.h> struct Root { @@ -7,12 +7,12 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __visc__hint(CPU_TARGET); - __visc__attributes(1, In, 1, Out); + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(1, In, 1, Out); - Out = (int *)__visc__malloc(*In); + Out = (int *)__hpvm__malloc(*In); - __visc__return(1, Out); + __hpvm__return(1, Out); } int main(void) { @@ -26,12 +26,12 @@ int main(void) { RootArgs->input = (int *)&In; RootArgs->output = (int *)&Out; - __visc__init(); + __hpvm__init(); - void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); - __visc__wait(PipeDFG); + void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs); + __hpvm__wait(PipeDFG); - __visc__cleanup(); + __hpvm__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/PipelineIntrinsics.c b/hpvm/test/unitTests/PipelineIntrinsics.c index 2a9bf83402891beddf13d96c6346e8fed924d17e..43ba0ef56cf160acb1fab6ea334732e56e0359d2 100644 --- a/hpvm/test/unitTests/PipelineIntrinsics.c +++ b/hpvm/test/unitTests/PipelineIntrinsics.c @@ -1,4 +1,4 @@ -#include "visc.h" +#include "hpvm.h" #include <stdlib.h> struct Root { @@ -7,9 +7,9 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __visc__hint(CPU_TARGET); - __visc__attributes(1, In, 1, Out); - __visc__return(1, Out); + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(1, In, 1, Out); + __hpvm__return(1, Out); } int main(void) { @@ -23,12 +23,12 @@ int main(void) { RootArgs->input = (int *)&In; RootArgs->output = (int *)&Out; - __visc__init(); + __hpvm__init(); - void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); - __visc__wait(PipeDFG); + void *PipeDFG = 
__hpvm__launch(0, PipeRoot, (void *)RootArgs); + __hpvm__wait(PipeDFG); - __visc__cleanup(); + __hpvm__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c index 36fc02d22b066025be4a57695265779d8e55652a..c2deed98679bf794316f283acef8e3c1db9ffa88 100644 --- a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c +++ b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c @@ -1,4 +1,4 @@ -#include "visc.h" +#include "hpvm.h" #include <stdlib.h> struct Root { @@ -7,24 +7,24 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __visc__hint(CPU_TARGET); - __visc__attributes(1, In, 1, Out); - __visc__return(1, Out); + __hpvm__hint(CPU_TARGET); + __hpvm__attributes(1, In, 1, Out); + __hpvm__return(1, Out); } int main(void) { int In, Out; - __visc__init(); + __hpvm__init(); struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root)); RootArgs->input = (int *)&In; RootArgs->output = (int *)&Out; - void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); - __visc__wait(PipeDFG); + void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs); + __hpvm__wait(PipeDFG); - __visc__cleanup(); + __hpvm__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/temp/3level.ll b/hpvm/test/unitTests/temp/3level.ll index 168e7b42322c8f7fa4be83a64cbd06d44dd9e428..2e3753f1400798d0989e2a01be78ab338205a291 100644 --- a/hpvm/test/unitTests/temp/3level.ll +++ b/hpvm/test/unitTests/temp/3level.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/3level.ll' @@ -13,31 +13,31 @@ target triple = "x86_64-unknown-linux-gnu" @.str = 
private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 +declare i8* @llvm.hpvm.getNode() #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { @@ -47,18 +47,18 @@ entry: %0 = load i8** %arrayidx, align 8, !tbaa !0 %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0 %conv.i = trunc i64 %call.i to i32 - call void @llvm.visc.init() + call void @llvm.hpvm.init() %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 
%outputstruct = load %rtype* %2 %output1 = extractvalue %rtype %outputstruct, 0 %output2 = extractvalue %rtype %outputstruct, 1 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output1) #0 %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output2) #0 ret i32 0 @@ -83,21 +83,21 @@ define %rtype_internal @foo(i32 %id) { } define %rtype_internal @subNode(i32 %id) { - %foo_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*)) - call void @llvm.visc.bind.input(i8* %foo_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %foo_node, i32 0, i32 0) + %foo_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*)) + call void @llvm.hpvm.bind.input(i8* %foo_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %foo_node, i32 0, i32 0) ret %rtype_internal zeroinitializer } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*)) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*)) - %sub_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %sub_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %sub_node, i32 0, i32 1) + %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*)) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*)) + %sub_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*)) + %edge 
= call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %sub_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %sub_node, i32 0, i32 1) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/Makefile b/hpvm/test/unitTests/temp/Makefile index 539ee5e8fbf010d33663c98470b245bb2710eeea..15580e9300a119f55e4a828b645c27dd00b62ff8 100644 --- a/hpvm/test/unitTests/temp/Makefile +++ b/hpvm/test/unitTests/temp/Makefile @@ -2,8 +2,8 @@ PASSES := .PHONY: clean -LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install -LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc +LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install +LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc HOST:=gemm_opencl KERNELS:=matrixMul LLVM_CC:=$(LLVM_INSTALL)/bin/clang diff --git a/hpvm/test/unitTests/temp/query2D.ll b/hpvm/test/unitTests/temp/query2D.ll index c994c2a3ff5b166b2f192f4b900982b3b7afc508..48358a3527553c8f4a31ff89454010289d02c072 100644 --- a/hpvm/test/unitTests/temp/query2D.ll +++ b/hpvm/test/unitTests/temp/query2D.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/query2D.ll' @@ -12,46 +12,46 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare 
void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 +declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 +declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 +declare i8* @llvm.hpvm.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 +declare i8* @llvm.hpvm.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 +declare i32 @llvm.hpvm.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 +declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { @@ -61,25 +61,25 @@ entry: %0 = load i8** %arrayidx, align 8, !tbaa !0 %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0 %conv.i = trunc i64 %call.i to i32 - call void @llvm.visc.init() + call void @llvm.hpvm.init() %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast 
(%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.visc.getNode() - %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) + %this_node = call i8* @llvm.hpvm.getNode() + %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node) %sum2 = add i32 %sum, %dim %output = insertvalue %rtype undef, i32 %sum2, 0 ret %rtype %output @@ -92,11 +92,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %dimension) { - %p_node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.hpvm.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret 
%rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/query3D.ll b/hpvm/test/unitTests/temp/query3D.ll index 438fe60a3bc6c2dfe718da76d55041addc47367f..d2ff16ef56628752b997577891c44fd904be4405 100644 --- a/hpvm/test/unitTests/temp/query3D.ll +++ b/hpvm/test/unitTests/temp/query3D.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/query3D.ll' @@ -12,57 +12,57 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 +declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 +declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 +declare i8* @llvm.hpvm.createNode3D(i8*, i32, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind 
-declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 +declare i8* @llvm.hpvm.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 +declare i8* @llvm.hpvm.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 +declare i32 @llvm.hpvm.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 +declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 +declare i32 @llvm.hpvm.getNumNodeInstances.y(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -71,21 +71,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() ret i32 0 } define %rtype 
@producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.visc.getNode() - %dim = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) + %this_node = call i8* @llvm.hpvm.getNode() + %dim = call i32 @llvm.hpvm.getNumNodeInstances.y(i8* %this_node) %sum2 = add i32 %sum, %dim %output = insertvalue %rtype undef, i32 %sum2, 0 ret %rtype %output @@ -98,11 +98,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %dimension) { - %p_node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.hpvm.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/queryNodeInst.ll b/hpvm/test/unitTests/temp/queryNodeInst.ll index 24d6a3f0d30e6661c0f1396e082f889d54dc50be..4e3dd7553045d466199c726416db220a6be2d1aa 100644 --- a/hpvm/test/unitTests/temp/queryNodeInst.ll +++ b/hpvm/test/unitTests/temp/queryNodeInst.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o 
%t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -12,40 +12,40 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 +declare i8* @llvm.hpvm.getNode() #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 +declare i32 @llvm.hpvm.getNumDims(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -54,21 +54,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail 
call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.visc.getNode() - %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node) + %this_node = call i8* @llvm.hpvm.getNode() + %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node) %sum2 = add i32 %sum, %numDim %output = insertvalue %rtype undef, i32 %sum, 0 ret %rtype %output @@ -81,11 +81,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/queryNumDim.ll b/hpvm/test/unitTests/temp/queryNumDim.ll index 500e2ff41bd52f29a56cfd49563927bf6323482b..caa0978dabab0bf6295853e35f23e3ed68f00840 100644 --- 
a/hpvm/test/unitTests/temp/queryNumDim.ll +++ b/hpvm/test/unitTests/temp/queryNumDim.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -12,42 +12,42 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 +declare i8* @llvm.hpvm.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 +declare i8* @llvm.hpvm.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 +declare i32 @llvm.hpvm.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 
@main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -56,21 +56,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.visc.getNode() - %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node) + %this_node = call i8* @llvm.hpvm.getNode() + %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node) %sum2 = add i32 %sum, %numDim %output = insertvalue %rtype undef, i32 %sum, 0 ret %rtype %output @@ -83,11 +83,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast 
(%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/queryNumNodeInst.ll b/hpvm/test/unitTests/temp/queryNumNodeInst.ll index 48add92f16125bdf33c9691896a8b7259339fe78..07418ff725c277e2e8adbe6a39d8831e2b77bc59 100644 --- a/hpvm/test/unitTests/temp/queryNumNodeInst.ll +++ b/hpvm/test/unitTests/temp/queryNumNodeInst.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -12,48 +12,48 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 +declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare 
void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 +declare i8* @llvm.hpvm.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 +declare i8* @llvm.hpvm.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 +declare i32 @llvm.hpvm.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 +declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -62,21 +62,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - 
%this_node = call i8* @llvm.visc.getNode() - %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) + %this_node = call i8* @llvm.hpvm.getNode() + %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node) %sum2 = add i32 %sum, %dim %output = insertvalue %rtype undef, i32 %sum2, 0 ret %rtype %output @@ -89,11 +89,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %dimension) { - %p_node = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.hpvm.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/singleNode.ll b/hpvm/test/unitTests/temp/singleNode.ll index 20713e955fb457acec2e2968d1b4a2ae61396fe0..99e53181317a6b27a83916682bcf1457895c0bfc 100644 --- a/hpvm/test/unitTests/temp/singleNode.ll +++ b/hpvm/test/unitTests/temp/singleNode.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll' @@ 
-12,43 +12,43 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0 %conv.i = trunc i64 %call.i to i32 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) - call void @llvm.visc.cleanup() + call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.hpvm.cleanup() ret i32 0 } @@ -59,8 +59,8 @@ define %rtype 
@foo() { } define %rtype @Root() { - %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo to i8*)) - call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0) + %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo to i8*)) + call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/singleNodeStream.ll b/hpvm/test/unitTests/temp/singleNodeStream.ll index fce75df6714240286e9a676e40e37c3f14e537a6..aa0243603c420a21f51f9842d467f9da814f1814 100644 --- a/hpvm/test/unitTests/temp/singleNodeStream.ll +++ b/hpvm/test/unitTests/temp/singleNodeStream.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll' @@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*, i1) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0 ; Function Attrs: nounwind -declare void @llvm.visc.push(i8*, i8*) #0 +declare void @llvm.hpvm.push(i8*, i8*) #0 ; Function Attrs: nounwind -declare i8* 
@llvm.visc.pop(i8*) #0 +declare i8* @llvm.hpvm.pop(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32, i1) +declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32, i1) +declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %num = alloca i32 %arrayidx = getelementptr inbounds i8** %argv, i64 1 @@ -60,27 +60,27 @@ entry: %args = bitcast %struct.arg* %in.addr to i8* ; Launch the pipeline - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1) ; Push arguments into the pipeline - call void @llvm.visc.push(i8* %graphID, i8* %args) - call void @llvm.visc.push(i8* %graphID, i8* %args) - call void @llvm.visc.push(i8* %graphID, i8* %args) - call void @llvm.visc.push(i8* %graphID, i8* %args) + call void @llvm.hpvm.push(i8* %graphID, i8* %args) + call void @llvm.hpvm.push(i8* %graphID, i8* %args) + call void @llvm.hpvm.push(i8* %graphID, i8* %args) + call void @llvm.hpvm.push(i8* %graphID, i8* %args) ; Pop out arguments and read the output - %graph_output = call i8* @llvm.visc.pop(i8* %graphID) - %graph_output1 = call i8* @llvm.visc.pop(i8* %graphID) - %graph_output2 = call i8* @llvm.visc.pop(i8* %graphID) - %graph_output3 = call i8* @llvm.visc.pop(i8* %graphID) + %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID) + %graph_output1 = call i8* @llvm.hpvm.pop(i8* %graphID) + %graph_output2 = call i8* @llvm.hpvm.pop(i8* %graphID) + %graph_output3 = call i8* @llvm.hpvm.pop(i8* %graphID) %output.addr = bitcast i8* %graph_output to 
%rptype* %outputstruct = load %rptype* %output.addr %output = extractvalue %rptype %outputstruct, 0 %output_val = load i32* %output %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0 - call void @llvm.visc.wait(i8* %graphID) - call void @llvm.visc.cleanup() + call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.hpvm.cleanup() ret i32 0 } @@ -95,11 +95,11 @@ define %rptype @producer(i32* %id, i64 %size) { } define %rptype @Root(i32* %id, i64 %size) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1) - call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 1) - call void @llvm.visc.bind.output(i8* %p_node, i32 0, i32 0, i1 1) - call void @llvm.visc.bind.output(i8* %p_node, i32 1, i32 1, i1 1) + %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 1) + call void @llvm.hpvm.bind.output(i8* %p_node, i32 0, i32 0, i1 1) + call void @llvm.hpvm.bind.output(i8* %p_node, i32 1, i32 1, i1 1) ret %rptype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoLaunch.ll b/hpvm/test/unitTests/temp/twoLaunch.ll index 48c973a7e6f1cc5422fffd8d9e4ae0a0e1a06bf9..ee602f58d82f004a7b19bf54e55e1c0759c17bef 100644 --- a/hpvm/test/unitTests/temp/twoLaunch.ll +++ b/hpvm/test/unitTests/temp/twoLaunch.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = 
'/home/psrivas2/current-test/unitTests/singleNode.ll' @@ -12,33 +12,33 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr_1 = alloca %struct.arg %in.addr_2= alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 @@ -47,12 +47,12 @@ entry: %conv.i = trunc i64 %call.i to i32 %args_1 = bitcast %struct.arg* %in.addr_1 to i8* %args_2 = bitcast %struct.arg* %in.addr_2 to i8* - %graphID_1 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1) - %graphID_2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2) + %graphID_1 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1) + %graphID_2 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2) %call1 = tail call i32 (i8*, ...)* @printf(i8* 
getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID_1) - call void @llvm.visc.wait(i8* %graphID_2) - call void @llvm.visc.cleanup() + call void @llvm.hpvm.wait(i8* %graphID_1) + call void @llvm.hpvm.wait(i8* %graphID_2) + call void @llvm.hpvm.cleanup() ret i32 0 } @@ -70,14 +70,14 @@ define %rtype @foo_2() { } define %rtype @Root_1() { - %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*)) - call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0) + %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*)) + call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0) ret %rtype zeroinitializer } define %rtype @Root_2() { - %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*)) - call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0) + %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*)) + call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNode.ll b/hpvm/test/unitTests/temp/twoNode.ll index 5e2899830b835ff50c9d2d8e4157451d4bd26f7f..74e4c64d599f7204b375743687c6da2b7ed8c9f6 100644 --- a/hpvm/test/unitTests/temp/twoNode.ll +++ b/hpvm/test/unitTests/temp/twoNode.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; 
Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -46,10 +46,10 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) - call void @llvm.visc.cleanup() + call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.hpvm.cleanup() ret i32 0 } @@ -66,10 +66,10 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @producer to i8*)) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) 
- %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @producer to i8*)) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNodeConnect.ll b/hpvm/test/unitTests/temp/twoNodeConnect.ll index 06652b94e02c2cac66ab4a07e88dec0a04da49f8..6b23ad691bacb42c39fe681967d4c584179644f1 100644 --- a/hpvm/test/unitTests/temp/twoNodeConnect.ll +++ b/hpvm/test/unitTests/temp/twoNodeConnect.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll' @@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: 
nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -46,14 +46,14 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() ret i32 0 } @@ -70,11 +70,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* 
@llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNodeQuery.ll b/hpvm/test/unitTests/temp/twoNodeQuery.ll index 2e1ea0dba4659d92b9c1b0600732748c87571671..247d1830dadff69ac5380b939d26c5f850bc08ac 100644 --- a/hpvm/test/unitTests/temp/twoNodeQuery.ll +++ b/hpvm/test/unitTests/temp/twoNodeQuery.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeQuery.ll' @@ -11,42 +11,42 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void 
@llvm.visc.bind.input(i8*, i32, i32) +declare void @llvm.hpvm.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) +declare void @llvm.hpvm.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 +declare i8* @llvm.hpvm.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 +declare i8* @llvm.hpvm.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 +declare i32 @llvm.hpvm.getNumDims(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -55,21 +55,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.visc.wait(i8* %graphID) + call void @llvm.hpvm.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.visc.cleanup() + call void @llvm.hpvm.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.visc.getNode() - %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node) + %this_node = call i8* @llvm.hpvm.getNode() + %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node) %sum2 = add i32 %sum, %numDim 
%output = insertvalue %rtype undef, i32 %sum, 0 ret %rtype %output @@ -82,11 +82,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNodeStream.ll b/hpvm/test/unitTests/temp/twoNodeStream.ll index 6e9925951884775e7ba60bb396a97fd9bc0ef52d..f9820abd19eb7b329b2c7184719d9699b15891e6 100644 --- a/hpvm/test/unitTests/temp/twoNodeStream.ll +++ b/hpvm/test/unitTests/temp/twoNodeStream.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll' @@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.visc.init() #1 +declare void @llvm.hpvm.init() #1 ; Function Attrs: nounwind -declare void 
@llvm.visc.cleanup() #1 +declare void @llvm.hpvm.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 +declare i8* @llvm.hpvm.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0 +declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*, i1) #0 +declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0 ; Function Attrs: nounwind -declare void @llvm.visc.push(i8*, i8*) #0 +declare void @llvm.hpvm.push(i8*, i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.visc.pop(i8*) #0 +declare i8* @llvm.hpvm.pop(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 +declare void @llvm.hpvm.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32, i1) +declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32, i1) +declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.visc.init() + call void @llvm.hpvm.init() %in.addr = alloca %struct.arg %num = alloca i32 %arrayidx = getelementptr inbounds i8** %argv, i64 1 @@ -60,21 +60,21 @@ entry: %args = bitcast %struct.arg* %in.addr to i8* ; Launch the pipeline - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1) + %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1) ; Push arguments into the pipeline - call void @llvm.visc.push(i8* %graphID, i8* %args) + call void @llvm.hpvm.push(i8* %graphID, i8* %args) ; Pop out arguments and read the output - %graph_output = call i8* @llvm.visc.pop(i8* %graphID) + %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID) %output.addr = bitcast i8* %graph_output to %rctype* %outputstruct = load %rctype* 
%output.addr %output = extractvalue %rctype %outputstruct, 0 %output_val = load i32* %output %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0 - call void @llvm.visc.wait(i8* %graphID) - call void @llvm.visc.cleanup() + call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.hpvm.cleanup() ret i32 0 } @@ -97,14 +97,14 @@ define %rctype @consumer(i32* %id, i64 %size) { } define %rctype @Root(i32* %id, i64 %size) { - %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) - %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*)) - %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0, i1 1) - %edge2 = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1) - call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1) - call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 0) - call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0, i1 1) - call void @llvm.visc.bind.output(i8* %c_node, i32 1, i32 1, i1 1) + %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) + %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*)) + %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0, i1 1) + %edge2 = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1) + call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 0) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0, i1 1) + call void @llvm.hpvm.bind.output(i8* %c_node, i32 1, i32 1, i1 1) ret %rctype zeroinitializer }