diff --git a/.hooks/pre-commit b/.hooks/pre-commit new file mode 100755 index 0000000000000000000000000000000000000000..4204a26e48e1ccbde141ba50d5311a0d2de38186 --- /dev/null +++ b/.hooks/pre-commit @@ -0,0 +1,37 @@ +#!/usr/bin/sh +# +# Format *.cpp|*.c|*.cc|*.h|*.hpp files with clang-format. +# Called by "git commit" with no arguments. The hook should +# exit with non-zero status after issuing an appropriate message if +# it wants to stop the commit. + +if git rev-parse --verify HEAD >/dev/null 2>&1 +then + against=HEAD +else + # Initial commit: diff against an empty tree object + against=$(git hash-object -t tree /dev/null) +fi + +# Redirect output to stderr. +exec 1>&2 + +# Find all changed C/C++ files +diff_source_files=$(git diff --cached --name-only --diff-filter=AM $against \ + -- '*.c' '*.cc' '*.cpp' '*.h' '*.hpp') +# Only perform clang-format when changed source files exist +if [[ ! -z $diff_source_files ]]; then + echo "[clang-format] Reformatting the following files: " + echo $diff_source_files + clang-format --style=file -i $diff_source_files + echo "[clang-format] Adding reformatted files." + git add $diff_source_files + # Commit can become empty after this; reject commit in that case. + diff_after_format=$(git diff --cached $against) + if [[ -z $diff_after_format ]]; then + echo "[clang-format] Commit is empty after formatting; rejected." + exit 1 + fi +else + echo "[clang-format] No C/C++ source files modified in the commit." 
+fi diff --git a/hpvm/include/BuildDFG/BuildDFG.h b/hpvm/include/BuildDFG/BuildDFG.h index d48aa3aa69822bae28a031a438dcd7ecfc9d7748..28230e135beb68c07c998e607fa3d03d40a66791 100644 --- a/hpvm/include/BuildDFG/BuildDFG.h +++ b/hpvm/include/BuildDFG/BuildDFG.h @@ -10,13 +10,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/ValueMap.h" -#include "llvm/IR/Module.h" +#include "SupportVISC/DFGraph.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Value.h" -#include "SupportVISC/DFGraph.h" +#include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" using namespace llvm; @@ -27,56 +27,54 @@ struct BuildDFG : public ModulePass { static char ID; // Pass identification, replacement for typeid BuildDFG() : ModulePass(ID) {} - typedef ValueMap<Value*, DFNode*> HandleToDFNode; - typedef ValueMap<Value*, DFEdge*> HandleToDFEdge; + typedef ValueMap<Value *, DFNode *> HandleToDFNode; + typedef ValueMap<Value *, DFEdge *> HandleToDFEdge; private: // Member variables DFInternalNode *Root; - std::vector<DFInternalNode*> Roots; + std::vector<DFInternalNode *> Roots; - HandleToDFNode HandleToDFNodeMap; // This map associates the i8* pointer + HandleToDFNode HandleToDFNodeMap; // This map associates the i8* pointer // with the DFNode structure that it // represents - HandleToDFEdge HandleToDFEdgeMap; // This map associates the i8* pointer + HandleToDFEdge HandleToDFEdgeMap; // This map associates the i8* pointer // with the DFEdge structure that it // represents - // Functions public: - void handleCreateNode (DFInternalNode* N, IntrinsicInst* II); + void handleCreateNode(DFInternalNode *N, IntrinsicInst *II); + private: - void handleCreateEdge (DFInternalNode* N, IntrinsicInst* II); - void handleGetParentNode (DFInternalNode* N, IntrinsicInst* II); - void handleBindInput (DFInternalNode* N, IntrinsicInst* II); - void handleBindOutput 
(DFInternalNode* N, IntrinsicInst* II); + void handleCreateEdge(DFInternalNode *N, IntrinsicInst *II); + void handleGetParentNode(DFInternalNode *N, IntrinsicInst *II); + void handleBindInput(DFInternalNode *N, IntrinsicInst *II); + void handleBindOutput(DFInternalNode *N, IntrinsicInst *II); - void BuildGraph (DFInternalNode* N, Function* F); + void BuildGraph(DFInternalNode *N, Function *F); public: // Functions virtual bool runOnModule(Module &M); - static bool isViscLaunchIntrinsic(Instruction * I); - static bool isViscGraphIntrinsic(Instruction * I); - static bool isViscQueryIntrinsic(Instruction* I); - static bool isViscIntrinsic(Instruction* I); + static bool isViscLaunchIntrinsic(Instruction *I); + static bool isViscGraphIntrinsic(Instruction *I); + static bool isViscQueryIntrinsic(Instruction *I); + static bool isViscIntrinsic(Instruction *I); static bool isTypeCongruent(Type *L, Type *R); - //TODO: Maybe make these fields const + // TODO: Maybe make these fields const DFInternalNode *getRoot() const; - std::vector<DFInternalNode*> &getRoots(); + std::vector<DFInternalNode *> &getRoots(); HandleToDFNode &getHandleToDFNodeMap(); HandleToDFEdge &getHandleToDFEdgeMap(); - void addElementToHandleToDFNodeMap(Value* V, DFNode* N); - void removeElementFromHandleToDFNodeMap(Value* V); - void addElementToHandleToDFEdgeMap(Value* V, DFEdge* E); - void removeElementFromHandleToDFEdgeMap(Value* V); - + void addElementToHandleToDFNodeMap(Value *V, DFNode *N); + void removeElementFromHandleToDFNodeMap(Value *V); + void addElementToHandleToDFEdgeMap(Value *V, DFEdge *E); + void removeElementFromHandleToDFEdgeMap(Value *V); }; -} // End of namespace +} // namespace builddfg #endif - diff --git a/hpvm/include/GenVISC/GenVISC.h b/hpvm/include/GenVISC/GenVISC.h index 585af33953ebd28f108a3233b30c6769334230e7..1db9929be70fdc4335e23d7e879248f0ebb45c07 100644 --- a/hpvm/include/GenVISC/GenVISC.h +++ b/hpvm/include/GenVISC/GenVISC.h @@ -7,14 +7,14 @@ // 
//===----------------------------------------------------------------------===// -#include "llvm/IR/Module.h" +#include "SupportVISC/VISCTimer.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "llvm/IR/DerivedTypes.h" -#include "SupportVISC/VISCTimer.h" using namespace llvm; @@ -24,27 +24,25 @@ struct GenVISC : public ModulePass { static char ID; // Pass identification, replacement for typeid GenVISC() : ModulePass(ID) {} - private: // Member variables - Module* M; + Module *M; FunctionCallee llvm_visc_initializeTimerSet; FunctionCallee llvm_visc_switchToTimer; FunctionCallee llvm_visc_printTimerSet; - GlobalVariable* TimerSet; + GlobalVariable *TimerSet; // Functions - void initializeTimerSet(Instruction*); - void switchToTimer(enum visc_TimerID, Instruction*); - void printTimerSet(Instruction*); - Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = ""); + void initializeTimerSet(Instruction *); + void switchToTimer(enum visc_TimerID, Instruction *); + void printTimerSet(Instruction *); + Value *getStringPointer(const Twine &S, Instruction *InsertBefore, + const Twine &Name = ""); public: // Functions virtual bool runOnModule(Module &M); - }; -} // End of namespace - +} // namespace genvisc diff --git a/hpvm/include/SupportVISC/DFG2LLVM.h b/hpvm/include/SupportVISC/DFG2LLVM.h index 841756889fd7a791c78ace8a25c59dce3645e831..b9e4cc4158b71ab18fbeadf2e4d094055feb6149 100644 --- a/hpvm/include/SupportVISC/DFG2LLVM.h +++ b/hpvm/include/SupportVISC/DFG2LLVM.h @@ -10,31 +10,37 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Module.h" +#include "BuildDFG/BuildDFG.h" +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCTimer.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/IR/Function.h" #include 
"llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "BuildDFG/BuildDFG.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCTimer.h" -#include "SupportVISC/VISCUtils.h" using namespace llvm; using namespace builddfg; -#define TIMER(X) do { if (VISCTimer) { X; } } while (0) -#define DECLARE(X) X = M.getOrInsertFunction(#X, \ - runtimeModule->getFunction(#X)->getFunctionType()); \ - //DEBUG(errs() << *X) +#define TIMER(X) \ + do { \ + if (VISCTimer) { \ + X; \ + } \ + } while (0) +#define DECLARE(X) \ + X = M.getOrInsertFunction( \ + #X, runtimeModule->getFunction(#X)->getFunctionType()); \ + // DEBUG(errs() << *X) namespace dfg2llvm { // Helper Functions -static inline ConstantInt* getTimerID(Module&, enum visc_TimerID); -static inline ConstantInt* getTimerID(Module&, enum visc::Target); +static inline ConstantInt *getTimerID(Module &, enum visc_TimerID); +static inline ConstantInt *getTimerID(Module &, enum visc::Target); -bool hasAttribute(Function*, unsigned, Attribute::AttrKind); +bool hasAttribute(Function *, unsigned, Attribute::AttrKind); // DFG2LLVM abstract class implementation class DFG2LLVM : public ModulePass { @@ -54,24 +60,23 @@ public: AU.addRequired<BuildDFG>(); AU.addPreserved<BuildDFG>(); } - }; // Abstract Visitor for Code generation traversal (tree traversal for now) class CodeGenTraversal : public DFNodeVisitor { protected: - //Member variables + // Member variables Module &M; BuildDFG &DFG; bool VISCTimer = false; std::string TargetName = "None"; - + // Map from Old function associated with DFNode to new cloned function with // extra index and dimension arguments. 
This map also serves to find out if // we already have an index and dim extended function copy or not (i.e., // "Have we visited this function before?") - DenseMap<DFNode*, Value*> OutputMap; + DenseMap<DFNode *, Value *> OutputMap; // VISC Runtime API std::unique_ptr<Module> runtimeModule; @@ -79,103 +84,107 @@ protected: FunctionCallee llvm_visc_initializeTimerSet; FunctionCallee llvm_visc_switchToTimer; FunctionCallee llvm_visc_printTimerSet; - GlobalVariable* TimerSet; - GlobalVariable* GraphIDAddr; - Instruction* InitCall; - Instruction* CleanupCall; - + GlobalVariable *TimerSet; + GlobalVariable *GraphIDAddr; + Instruction *InitCall; + Instruction *CleanupCall; // Functions - Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = ""); -// void addArgument(Function*, Type*, const Twine& Name = ""); - Function *addArgument(Function*, Type*, const Twine& Name = ""); -// void addIdxDimArgs(Function* F); - Function *addIdxDimArgs(Function* F); - std::vector<Value*> extractElements(Value*, std::vector<Type*>, - std::vector<std::string>, Instruction*); - Argument* getArgumentAt(Function* F, unsigned offset); + Value *getStringPointer(const Twine &S, Instruction *InsertBefore, + const Twine &Name = ""); + // void addArgument(Function*, Type*, const Twine& Name = ""); + Function *addArgument(Function *, Type *, const Twine &Name = ""); + // void addIdxDimArgs(Function* F); + Function *addIdxDimArgs(Function *F); + std::vector<Value *> extractElements(Value *, std::vector<Type *>, + std::vector<std::string>, Instruction *); + Argument *getArgumentAt(Function *F, unsigned offset); void initTimerAPI(); // Pure Virtual Functions virtual void init() = 0; virtual void initRuntimeAPI() = 0; - virtual void codeGen(DFInternalNode* N) = 0; - virtual void codeGen(DFLeafNode* N) = 0; + virtual void codeGen(DFInternalNode *N) = 0; + virtual void codeGen(DFLeafNode *N) = 0; // Virtual Functions - virtual void initializeTimerSet(Instruction*); - 
virtual void switchToTimer(enum visc_TimerID, Instruction*); - virtual void printTimerSet(Instruction*); + virtual void initializeTimerSet(Instruction *); + virtual void switchToTimer(enum visc_TimerID, Instruction *); + virtual void printTimerSet(Instruction *); virtual ~CodeGenTraversal() {} - public: - // Constructor CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} - static bool checkPreferredTarget(DFNode* N, visc::Target T); - static bool preferredTargetIncludes(DFNode* N, visc::Target T); + static bool checkPreferredTarget(DFNode *N, visc::Target T); + static bool preferredTargetIncludes(DFNode *N, visc::Target T); visc::Target getPreferredTarget(DFNode *N); - virtual void visit(DFInternalNode* N) { + virtual void visit(DFInternalNode *N) { // If code has already been generated for this internal node, skip the // children - if(N->getGenFunc() != NULL) + if (N->getGenFunc() != NULL) return; - DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Start: Generating Code for Node (I) - " + << N->getFuncPointer()->getName() << "\n"); // Follows a bottom-up approach for code generation. // First generate code for all the child nodes - for(DFGraph::children_iterator i = N->getChildGraph()->begin(), - e = N->getChildGraph()->end(); i != e; ++i) { - DFNode* child = *i; + for (DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); + i != e; ++i) { + DFNode *child = *i; child->applyDFNodeVisitor(*this); } // Generate code for this internal node now. This way all the cloned // functions for children exist. 
codeGen(N); - DEBUG(errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "DONE: Generating Code for Node (I) - " + << N->getFuncPointer()->getName() << "\n"); } - virtual void visit(DFLeafNode* N) { - DEBUG(errs() << "Start: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"); + virtual void visit(DFLeafNode *N) { + DEBUG(errs() << "Start: Generating Code for Node (L) - " + << N->getFuncPointer()->getName() << "\n"); codeGen(N); - DEBUG(errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "DONE: Generating Code for Node (L) - " + << N->getFuncPointer()->getName() << "\n"); } }; // -------------- CodeGenTraversal Implementation ----------------- -bool CodeGenTraversal::checkPreferredTarget(DFNode* N, visc::Target T) { - Function* F = N->getFuncPointer(); - Module* M = F->getParent(); - NamedMDNode* HintNode; +bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) { + Function *F = N->getFuncPointer(); + Module *M = F->getParent(); + NamedMDNode *HintNode; switch (T) { - case visc::GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); - break; - case visc::SPIR_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); - break; - case visc::CUDNN_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); - break; - case visc::PROMISE_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); - break; - case visc::CPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); - break; - default: - llvm_unreachable("Target Not supported yet!"); + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::SPIR_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + break; + case visc::CUDNN_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); + break; + case visc::PROMISE_TARGET: + 
HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); + break; + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + default: + llvm_unreachable("Target Not supported yet!"); } for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { - MDNode* MetaNode = HintNode->getOperand(i); - Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); - if(F == FHint) + MDNode *MetaNode = HintNode->getOperand(i); + Value *FHint = + dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if (F == FHint) return true; } return false; @@ -185,43 +194,44 @@ visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) { return viscUtils::getPreferredTarget(N->getFuncPointer()); } -bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) { +bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, visc::Target T) { - Function* F = N->getFuncPointer(); - Module* M = F->getParent(); + Function *F = N->getFuncPointer(); + Module *M = F->getParent(); std::vector<NamedMDNode *> HintNode; switch (T) { - case visc::GPU_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); - break; - case visc::SPIR_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); - break; - case visc::CPU_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); - break; - case visc::CUDNN_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn")); - break; - case visc::PROMISE_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise")); - break; - case visc::CPU_OR_GPU_TARGET: - case visc::CPU_OR_SPIR_TARGET: - assert(false && 
"Target should be one of CPU/GPU/SPIR\n"); - break; - default: - llvm_unreachable("Target Not supported yet!"); + case visc::GPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + break; + case visc::SPIR_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + break; + case visc::CPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + break; + case visc::CUDNN_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn")); + break; + case visc::PROMISE_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise")); + break; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && "Target should be one of CPU/GPU/SPIR\n"); + break; + default: + llvm_unreachable("Target Not supported yet!"); } for (unsigned h = 0; h < HintNode.size(); h++) { for (unsigned i = 0; i < HintNode[h]->getNumOperands(); i++) { MDNode *MetaNode = HintNode[h]->getOperand(i); - Value *FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + Value *FHint = + dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); if (F == FHint) return true; } @@ -230,22 +240,25 @@ bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) { return false; } - // Generate Code for declaring a constant string [L x i8] and return a pointer // to the start of it. 
-Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { - Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true); - Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true, - GlobalValue::InternalLinkage, SConstant, Name); - Value* Zero = ConstantInt::get(Type::getInt64Ty(M.getContext()), 0); - Value* GEPArgs[] = {Zero, Zero}; - GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal, - ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); +Value *CodeGenTraversal::getStringPointer(const Twine &S, Instruction *IB, + const Twine &Name) { + Constant *SConstant = + ConstantDataArray::getString(M.getContext(), S.str(), true); + Value *SGlobal = + new GlobalVariable(M, SConstant->getType(), true, + GlobalValue::InternalLinkage, SConstant, Name); + Value *Zero = ConstantInt::get(Type::getInt64Ty(M.getContext()), 0); + Value *GEPArgs[] = {Zero, Zero}; + GetElementPtrInst *SPtr = GetElementPtrInst::Create( + nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB); return SPtr; } // Add an argument of type Ty to the given function F -//void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) { +// void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) +// { // // Add the argument to argument list // new Argument(Ty, name, F); // @@ -258,14 +271,15 @@ Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const // // Adding new arguments to the function argument list, would not change the // // function type. 
We need to change the type of this function to reflect the // // added arguments -// FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); -// PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace()); +// FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, +// F->isVarArg()); PointerType* PTy = PointerType::get(FTy, +// cast<PointerType>(F->getType())->getAddressSpace()); // // // Change the function type // F->mutateType(PTy); //} -void renameNewArgument(Function *newF, const Twine& argName){ +void renameNewArgument(Function *newF, const Twine &argName) { // Get Last argument in Function Arg List and rename it to given name Argument *lastArg = &*(newF->arg_end() - 1); lastArg->setName(argName); @@ -273,29 +287,31 @@ void renameNewArgument(Function *newF, const Twine& argName){ // Creates a function with an additional argument of the specified type and // name. The previous function is not deleted. -Function *CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) { +Function *CodeGenTraversal::addArgument(Function *F, Type *Ty, + const Twine &name) { Argument *new_arg = new Argument(Ty, name); // Create the argument type list with added argument types - std::vector<Type*> ArgTypes; - for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { + std::vector<Type *> ArgTypes; + for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { ArgTypes.push_back(ai->getType()); } ArgTypes.push_back(new_arg->getType()); - + // Adding new arguments to the function argument list, would not change the // function type. We need to change the type of this function to reflect the // added arguments. So, we create a clone of this function with the correct // type. 
- FunctionType *FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); + FunctionType *FTy = + FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); Function *newF = Function::Create(FTy, F->getLinkage(), - F->getName() + "_cloned", F->getParent()); + F->getName() + "_cloned", F->getParent()); renameNewArgument(newF, name); newF = viscUtils::cloneFunction(F, newF, false); // Check if the function is used by a metadata node - if(F->isUsedByMetadata()) { + if (F->isUsedByMetadata()) { viscUtils::fixHintMetadata(*F->getParent(), F, newF); } @@ -303,17 +319,17 @@ Function *CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name } // Change the argument list of function F to add index and limit arguments -//void CodeGenTraversal::addIdxDimArgs(Function* F) { +// void CodeGenTraversal::addIdxDimArgs(Function* F) { // // Add Index and Dim arguments -// std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"}; -// for (int i = 0; i < 6; ++i) { +// std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", +// "dim_z"}; for (int i = 0; i < 6; ++i) { // addArgument(F, Type::getInt32Ty(F->getContext()), names[i]); // } //} // Return new function with additional index and limit arguments. // The original function is removed from the module and erased. -Function *CodeGenTraversal::addIdxDimArgs(Function* F) { +Function *CodeGenTraversal::addIdxDimArgs(Function *F) { DEBUG(errs() << "Function Type: " << *F->getFunctionType() << "\n"); // Add Index and Dim arguments std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"}; @@ -324,43 +340,42 @@ Function *CodeGenTraversal::addIdxDimArgs(Function* F) { F->eraseFromParent(); F = newF; } - DEBUG(errs() << "Function Type after adding args: " << *newF->getFunctionType() << "\n"); + DEBUG(errs() << "Function Type after adding args: " + << *newF->getFunctionType() << "\n"); return newF; } // Extract elements from an aggregate value. 
TyList contains the type of each // element, and names vector contains a name. IB is the instruction before which // all the generated code would be inserted. -std::vector<Value*> CodeGenTraversal::extractElements(Value* Aggregate, - std::vector<Type*> TyList, std::vector<std::string> names, Instruction* IB) { +std::vector<Value *> +CodeGenTraversal::extractElements(Value *Aggregate, std::vector<Type *> TyList, + std::vector<std::string> names, + Instruction *IB) { // Extract input data from i8* Aggregate.addr and store them in a vector. // For each argument - std::vector<Value*> Elements; - GetElementPtrInst* GEP; + std::vector<Value *> Elements; + GetElementPtrInst *GEP; unsigned argNum = 0; - for(Type* Ty: TyList) { + for (Type *Ty : TyList) { // BitCast: %arg.addr = bitcast i8* Aggregate.addr to <pointer-to-argType> - CastInst* BI = BitCastInst::CreatePointerCast(Aggregate, - Ty->getPointerTo(), - names[argNum]+".addr", - IB); + CastInst *BI = BitCastInst::CreatePointerCast(Aggregate, Ty->getPointerTo(), + names[argNum] + ".addr", IB); // Load: %arg = load <pointer-to-argType> %arg.addr - LoadInst* LI = new LoadInst(BI, names[argNum], IB); + LoadInst *LI = new LoadInst(BI, names[argNum], IB); // Patch argument to call instruction Elements.push_back(LI); - //errs() << "Pushing element " << *LI << "\n"; - //CI->setArgOperand(argNum, LI); + // errs() << "Pushing element " << *LI << "\n"; + // CI->setArgOperand(argNum, LI); // TODO: Minor Optimization - The last GEP statement can/should be left out // as no more arguments left - // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, i64 1 - // This essentially takes us to the next argument in memory - Constant* IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1); - if (argNum < TyList.size()-1) - GEP = GetElementPtrInst::Create(nullptr, BI, - ArrayRef<Value*>(IntOne), - "nextArg", - IB); + // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, + // i64 1 This 
essentially takes us to the next argument in memory + Constant *IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1); + if (argNum < TyList.size() - 1) + GEP = GetElementPtrInst::Create(nullptr, BI, ArrayRef<Value *>(IntOne), + "nextArg", IB); // Increment argNum and for the next iteration use result of this GEP to // extract next argument argNum++; @@ -370,11 +385,11 @@ std::vector<Value*> CodeGenTraversal::extractElements(Value* Aggregate, } // Traverse the function F argument list to get argument at offset -Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) { +Argument *CodeGenTraversal::getArgumentAt(Function *F, unsigned offset) { DEBUG(errs() << "Finding argument " << offset << ":\n"); - assert((F->getFunctionType()->getNumParams() > offset) - && "Invalid offset to access arguments!"); - + assert((F->getFunctionType()->getNumParams() > offset) && + "Invalid offset to access arguments!"); + Function::arg_iterator ArgIt = F->arg_begin() + offset; Argument *Arg = &*ArgIt; return Arg; @@ -388,57 +403,51 @@ void CodeGenTraversal::initTimerAPI() { // Timer Routines // Initialize the timer set -void CodeGenTraversal::initializeTimerSet(Instruction* InsertBefore) { - //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n"); - TIMER(TimerSet = new GlobalVariable(M, - Type::getInt8PtrTy(M.getContext()), - false, - GlobalValue::CommonLinkage, - Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - Twine("viscTimerSet_")+TargetName); - DEBUG(errs() << "New global variable: " << *TimerSet << "\n"); - - Value* TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, - None, - "", - InsertBefore); - new StoreInst(TimerSetAddr, TimerSet, InsertBefore); - ); +void CodeGenTraversal::initializeTimerSet(Instruction *InsertBefore) { + // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << + // "\n"); + TIMER(TimerSet = new GlobalVariable( + M, Type::getInt8PtrTy(M.getContext()), false, + 
GlobalValue::CommonLinkage, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + Twine("viscTimerSet_") + TargetName); + DEBUG(errs() << "New global variable: " << *TimerSet << "\n"); + + Value *TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, + None, "", InsertBefore); + new StoreInst(TimerSetAddr, TimerSet, InsertBefore);); } -void CodeGenTraversal::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) { - Value* switchArgs[] = {TimerSet, getTimerID(M, timer)}; +void CodeGenTraversal::switchToTimer(enum visc_TimerID timer, + Instruction *InsertBefore) { + Value *switchArgs[] = {TimerSet, getTimerID(M, timer)}; TIMER(CallInst::Create(llvm_visc_switchToTimer, - ArrayRef<Value*>(switchArgs, 2), - "", - InsertBefore)); + ArrayRef<Value *>(switchArgs, 2), "", InsertBefore)); } -void CodeGenTraversal::printTimerSet(Instruction* InsertBefore) { - Value* TimerName; - TIMER(TimerName = getStringPointer(TargetName+Twine("_Timer"), InsertBefore)); - Value* printArgs[] = {TimerSet, TimerName}; +void CodeGenTraversal::printTimerSet(Instruction *InsertBefore) { + Value *TimerName; + TIMER(TimerName = + getStringPointer(TargetName + Twine("_Timer"), InsertBefore)); + Value *printArgs[] = {TimerSet, TimerName}; TIMER(CallInst::Create(llvm_visc_printTimerSet, - ArrayRef<Value*>(printArgs, 2), - "", - InsertBefore)); + ArrayRef<Value *>(printArgs, 2), "", InsertBefore)); } // Implementation of Helper Functions -static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) { +static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); } -static inline ConstantInt* getTargetID(Module& M, enum visc::Target T) { +static inline ConstantInt *getTargetID(Module &M, enum visc::Target T) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), T); } // Find if argument has the given attribute -bool hasAttribute(Function* F, unsigned arg_index, 
Attribute::AttrKind AK) { - return F->getAttributes().hasAttribute(arg_index+1, AK); +bool hasAttribute(Function *F, unsigned arg_index, Attribute::AttrKind AK) { + return F->getAttributes().hasAttribute(arg_index + 1, AK); } -} // End of namespace +} // namespace dfg2llvm #endif - diff --git a/hpvm/include/SupportVISC/DFGTreeTraversal.h b/hpvm/include/SupportVISC/DFGTreeTraversal.h index 095ba1fb88978c3ec71fc1d1ac03e07d05b5f88c..67c317a2e9857b9000e4d77f6858494eb81c1ec1 100644 --- a/hpvm/include/SupportVISC/DFGTreeTraversal.h +++ b/hpvm/include/SupportVISC/DFGTreeTraversal.h @@ -1,6 +1,6 @@ #ifndef __DFGTREETRAVERSAL_H__ #define __DFGTREETRAVERSAL_H__ - + //=== DFGTreeTraversal.h - Header file for Tree Traversal of the HPVM DFG ====// // // The LLVM Compiler Infrastructure @@ -9,56 +9,61 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// - -#include "llvm/IR/Module.h" + +#include "llvm/BuildDFG/BuildDFG.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/BuildDFG/BuildDFG.h" - + using namespace llvm; using namespace builddfg; - + namespace dfg2llvm { - - class DFGTreeTraversal : public DFNodeVisitor { - - protected: - //Member variables - Module &M; - BuildDFG &DFG; - - virtual void process(DFInternalNode* N) = 0; - virtual void process(DFLeafNode* N) = 0; - - virtual ~DFGTreeTraversal() {} - - public: - // Constructor + +class DFGTreeTraversal : public DFNodeVisitor { + +protected: + // Member variables + Module &M; + BuildDFG &DFG; + + virtual void process(DFInternalNode *N) = 0; + virtual void process(DFLeafNode *N) = 0; + + virtual ~DFGTreeTraversal() {} + +public: + // Constructor DFGTreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} - - void visit(DFInternalNode* N) { - // May visit a nodemore than once, there is no marking it as visited - DEBUG(errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName() << "\n"); 
- - // Follows a bottom-up approach. - for (DFGraph::children_iterator i = N->getChildGraph()->begin(), - e = N->getChildGraph()->end(); i != e; ++i) { - DFNode* child = *i; - child->applyDFNodeVisitor(*this); - } - - // Process this internal node now. - process(N); - DEBUG(errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName() << "\n"); - } - - void visit(DFLeafNode* N) { - DEBUG(errs() << "Start: In Node (L) - " << N->getFuncPointer()->getName() << "\n"); - process(N); - DEBUG(errs() << "DONE: In Node (L) - " << N->getFuncPointer()->getName() << "\n"); + + void visit(DFInternalNode *N) { + // May visit a nodemore than once, there is no marking it as visited + DEBUG(errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName() + << "\n"); + + // Follows a bottom-up approach. + for (DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); + i != e; ++i) { + DFNode *child = *i; + child->applyDFNodeVisitor(*this); } - }; - + + // Process this internal node now. + process(N); + DEBUG(errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName() + << "\n"); + } + + void visit(DFLeafNode *N) { + DEBUG(errs() << "Start: In Node (L) - " << N->getFuncPointer()->getName() + << "\n"); + process(N); + DEBUG(errs() << "DONE: In Node (L) - " << N->getFuncPointer()->getName() + << "\n"); + } +}; + } // end namespace dfg2llvm - + #endif diff --git a/hpvm/include/SupportVISC/DFGraph.h b/hpvm/include/SupportVISC/DFGraph.h index 1207f1efc65ef69570425036ff4de7cf5f9cbf0c..0c224a344c4ec342f52f4816280e101518ba43dd 100644 --- a/hpvm/include/SupportVISC/DFGraph.h +++ b/hpvm/include/SupportVISC/DFGraph.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file contains the definition of the following classes: +// This file contains the definition of the following classes: // 1. DFNode // 2. DFGraph // 3. 
DFInternalNode @@ -20,17 +20,16 @@ #ifndef LLVM_IR_DFGRAPH_H #define LLVM_IR_DFGRAPH_H +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCUtils.h" +#include "llvm/ADT/GraphTraits.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" -#include "llvm/Support/Compiler.h" -#include "llvm/ADT/GraphTraits.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/GraphWriter.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCUtils.h" - +#include "llvm/Support/raw_ostream.h" namespace llvm { @@ -62,57 +61,44 @@ struct TargetGenFuncInfo { class DFGraph { private: - typedef std::vector<DFNode*> DFNodeListType; - typedef std::vector<DFEdge*> DFEdgeListType; + typedef std::vector<DFNode *> DFNodeListType; + typedef std::vector<DFEdge *> DFEdgeListType; // Important things that make up a Dataflow graph - DFNode* Entry; ///< Dummy node to act as source for edges - ///< from parent to nodes in the graph - DFNode* Exit; ///< Dummy node to act as destination for edges - ///< from nodes in the graph to parent - DFInternalNode* Parent; - DFNodeListType ChildrenList; ///< List of children Dataflow Nodes - DFEdgeListType DFEdgeList; ///< List of Dataflow edges among children - + DFNode *Entry; ///< Dummy node to act as source for edges + ///< from parent to nodes in the graph + DFNode *Exit; ///< Dummy node to act as destination for edges + ///< from nodes in the graph to parent + DFInternalNode *Parent; + DFNodeListType ChildrenList; ///< List of children Dataflow Nodes + DFEdgeListType DFEdgeList; ///< List of Dataflow edges among children public: - DFGraph(DFInternalNode* P); + DFGraph(DFInternalNode *P); virtual ~DFGraph() {} - void addChildDFNode(DFNode* child) { - ChildrenList.push_back(child); - } + void addChildDFNode(DFNode *child) { ChildrenList.push_back(child); } - void removeChildDFNode(DFNode* child) { + void 
removeChildDFNode(DFNode *child) { children_iterator position = std::find(begin(), end(), child); if (position != end()) // the child was found ChildrenList.erase(position); } // Dataflow edge connecting child dataflow nodes - void addDFEdge(DFEdge* E) { - DFEdgeList.push_back(E); - } + void addDFEdge(DFEdge *E) { DFEdgeList.push_back(E); } - DFNode* getEntry() const { - return Entry; - } + DFNode *getEntry() const { return Entry; } - DFNode* getExit() const { - return Exit; - } + DFNode *getExit() const { return Exit; } - bool isEntry(const DFNode* N) const { - return N == Entry; - } + bool isEntry(const DFNode *N) const { return N == Entry; } - bool isExit(const DFNode* N) const { - return N == Exit; - } + bool isExit(const DFNode *N) const { return N == Exit; } void sortChildren(); - static bool compareRank(DFNode* A, DFNode* B); + static bool compareRank(DFNode *A, DFNode *B); // Iterators typedef DFNodeListType::iterator children_iterator; @@ -124,56 +110,52 @@ public: //===--------------------------------------------------------------------===// // DFNodeList iterator forwarding functions // - children_iterator begin() { return ChildrenList.begin(); } + children_iterator begin() { return ChildrenList.begin(); } const_children_iterator begin() const { return ChildrenList.begin(); } - children_iterator end () { return ChildrenList.end(); } - const_children_iterator end () const { return ChildrenList.end(); } + children_iterator end() { return ChildrenList.end(); } + const_children_iterator end() const { return ChildrenList.end(); } - size_t size() const { return ChildrenList.size(); } - bool empty() const { return ChildrenList.empty(); } - const DFNode *front() const { return ChildrenList.front(); } - DFNode *front() { return ChildrenList.front(); } - const DFNode *back() const { return ChildrenList.back(); } - DFNode *back() { return ChildrenList.back(); } + size_t size() const { return ChildrenList.size(); } + bool empty() const { return ChildrenList.empty(); 
} + const DFNode *front() const { return ChildrenList.front(); } + DFNode *front() { return ChildrenList.front(); } + const DFNode *back() const { return ChildrenList.back(); } + DFNode *back() { return ChildrenList.back(); } //===--------------------------------------------------------------------===// //===--------------------------------------------------------------------===// // DFEdgeList iterator forwarding functions // - dfedge_iterator dfedge_begin() { return DFEdgeList.begin(); } + dfedge_iterator dfedge_begin() { return DFEdgeList.begin(); } const_dfedge_iterator dfedge_begin() const { return DFEdgeList.begin(); } - dfedge_iterator dfedge_end () { return DFEdgeList.end(); } - const_dfedge_iterator dfedge_end () const { return DFEdgeList.end(); } + dfedge_iterator dfedge_end() { return DFEdgeList.end(); } + const_dfedge_iterator dfedge_end() const { return DFEdgeList.end(); } - size_t dfedge_size() const { return DFEdgeList.size(); } - bool dfedge_empty() const { return DFEdgeList.empty(); } - const DFEdge *dfedge_front() const { return DFEdgeList.front(); } - DFEdge *dfedge_front() { return DFEdgeList.front(); } - const DFEdge *dfedge_back() const { return DFEdgeList.back(); } - DFEdge *dfedge_back() { return DFEdgeList.back(); } + size_t dfedge_size() const { return DFEdgeList.size(); } + bool dfedge_empty() const { return DFEdgeList.empty(); } + const DFEdge *dfedge_front() const { return DFEdgeList.front(); } + DFEdge *dfedge_front() { return DFEdgeList.front(); } + const DFEdge *dfedge_back() const { return DFEdgeList.back(); } + DFEdge *dfedge_back() { return DFEdgeList.back(); } //===--------------------------------------------------------------------===// - DFInternalNode* getParent() const { - return Parent; - } + DFInternalNode *getParent() const { return Parent; } // Child graph is streaming if any of the edges in the edge list is streaming bool isStreaming(); - //**************************************************************************// //* 
Functions to modify a dataflow graph *// //**************************************************************************// // Delete an edge of the child graph - void deleteEdge(DFEdge* E) { + void deleteEdge(DFEdge *E) { dfedge_iterator position = std::find(dfedge_begin(), dfedge_end(), E); if (position != dfedge_end()) // the edge was found DFEdgeList.erase(position); } - }; // DFNode represents a single VISC Dataflow Node in LLVM. @@ -190,35 +172,29 @@ class DFNode { public: // Discriminator for LLVM-style RTTI (dyn_cast et al.) - enum DFNodeKind { - InternalNode, - LeafNode - }; + enum DFNodeKind { InternalNode, LeafNode }; - enum PropertyKind { - Allocation, - NumProperties - }; + enum PropertyKind { Allocation, NumProperties }; private: - typedef std::vector<DFNode*> DFNodeListType; - typedef std::vector<DFEdge*> DFEdgeListType; - typedef void* PropertyType; + typedef std::vector<DFNode *> DFNodeListType; + typedef std::vector<DFEdge *> DFEdgeListType; + typedef void *PropertyType; typedef std::map<PropertyKind, PropertyType> PropertyListType; // Important things that make up a Dataflow Node - IntrinsicInst* II; ///< Associated IntrinsicInst/Value - Function* FuncPointer; ///< Associated Function - Function* GenFunc = NULL; ///< Associated Function generated by backend + IntrinsicInst *II; ///< Associated IntrinsicInst/Value + Function *FuncPointer; ///< Associated Function + Function *GenFunc = NULL; ///< Associated Function generated by backend struct TargetGenFunctions GenFuncs; - ///< Associated Functions generated by backends - ///< (if multiple are available) + ///< Associated Functions generated by backends + ///< (if multiple are available) struct TargetGenFuncInfo GenFuncInfo; - ///< True for each target generated function - ///< if the associated genFunc is an x86 function - DFInternalNode* Parent; ///< Pointer to parent dataflow Node + ///< True for each target generated function + ///< if the associated genFunc is an x86 function + DFInternalNode 
*Parent; ///< Pointer to parent dataflow Node unsigned NumOfDim; ///< Number of dimensions - std::vector<Value*> DimLimits; ///< Number of instances in each dimension + std::vector<Value *> DimLimits; ///< Number of instances in each dimension DFNodeListType Successors; ///< List of successors i.e., ///< destination DFNodes to DFEdges ///< originating from this DFNode @@ -229,7 +205,7 @@ private: ///< DFEdges originating from this DFNode to ///< successor DFNodes PropertyListType PropertyList; ///< List of Properties - StructType* OutputType; ///< Output Type + StructType *OutputType; ///< Output Type unsigned Level; ///< Distance to the top-level DFNode in the ///< hierarchy unsigned Rank; ///< Ordering based on toplogical sort @@ -255,268 +231,233 @@ public: //===--------------------------------------------------------------------===// // Successors iterator forwarding functions // - successor_iterator successors_begin() { return Successors.begin(); } - const_successor_iterator successors_begin() const { return Successors.begin(); } - successor_iterator successors_end () { return Successors.end(); } - const_successor_iterator successors_end () const { return Successors.end(); } - - size_t successors_size() const { return Successors.size(); } - bool successors_empty() const { return Successors.empty(); } - const DFNode* successors_front() const { return Successors.front(); } - DFNode* successors_front() { return Successors.front(); } - const DFNode* successors_back() const { return Successors.back(); } - DFNode* successors_back() { return Successors.back(); } + successor_iterator successors_begin() { return Successors.begin(); } + const_successor_iterator successors_begin() const { + return Successors.begin(); + } + successor_iterator successors_end() { return Successors.end(); } + const_successor_iterator successors_end() const { return Successors.end(); } + + size_t successors_size() const { return Successors.size(); } + bool successors_empty() const { return 
Successors.empty(); } + const DFNode *successors_front() const { return Successors.front(); } + DFNode *successors_front() { return Successors.front(); } + const DFNode *successors_back() const { return Successors.back(); } + DFNode *successors_back() { return Successors.back(); } //===--------------------------------------------------------------------===// //===--------------------------------------------------------------------===// // InDFEdges iterator forwarding functions // - indfedge_iterator indfedge_begin() { return InDFEdges.begin(); } + indfedge_iterator indfedge_begin() { return InDFEdges.begin(); } const_indfedge_iterator indfedge_begin() const { return InDFEdges.begin(); } - indfedge_iterator indfedge_end () { return InDFEdges.end(); } - const_indfedge_iterator indfedge_end () const { return InDFEdges.end(); } + indfedge_iterator indfedge_end() { return InDFEdges.end(); } + const_indfedge_iterator indfedge_end() const { return InDFEdges.end(); } - size_t indfedge_size() const { return InDFEdges.size(); } - bool indfedge_empty() const { return InDFEdges.empty(); } - const DFEdge *indfedge_front() const { return InDFEdges.front(); } - DFEdge *indfedge_front() { return InDFEdges.front(); } - const DFEdge *indfedge_back() const { return InDFEdges.back(); } - DFEdge *indfedge_back() { return InDFEdges.back(); } + size_t indfedge_size() const { return InDFEdges.size(); } + bool indfedge_empty() const { return InDFEdges.empty(); } + const DFEdge *indfedge_front() const { return InDFEdges.front(); } + DFEdge *indfedge_front() { return InDFEdges.front(); } + const DFEdge *indfedge_back() const { return InDFEdges.back(); } + DFEdge *indfedge_back() { return InDFEdges.back(); } //===--------------------------------------------------------------------===// //===--------------------------------------------------------------------===// // OutDFEdges iterator forwarding functions // - outdfedge_iterator outdfedge_begin() { return OutDFEdges.begin(); } - 
const_outdfedge_iterator outdfedge_begin() const { return OutDFEdges.begin(); } - outdfedge_iterator outdfedge_end () { return OutDFEdges.end(); } - const_outdfedge_iterator outdfedge_end () const { return OutDFEdges.end(); } - - size_t outdfedge_size() const { return OutDFEdges.size(); } - bool outdfedge_empty() const { return OutDFEdges.empty(); } - const DFEdge *outdfedge_front() const { return OutDFEdges.front(); } - DFEdge *outdfedge_front() { return OutDFEdges.front(); } - const DFEdge *outdfedge_back() const { return OutDFEdges.back(); } - DFEdge *outdfedge_back() { return OutDFEdges.back(); } + outdfedge_iterator outdfedge_begin() { return OutDFEdges.begin(); } + const_outdfedge_iterator outdfedge_begin() const { + return OutDFEdges.begin(); + } + outdfedge_iterator outdfedge_end() { return OutDFEdges.end(); } + const_outdfedge_iterator outdfedge_end() const { return OutDFEdges.end(); } + + size_t outdfedge_size() const { return OutDFEdges.size(); } + bool outdfedge_empty() const { return OutDFEdges.empty(); } + const DFEdge *outdfedge_front() const { return OutDFEdges.front(); } + DFEdge *outdfedge_front() { return OutDFEdges.front(); } + const DFEdge *outdfedge_back() const { return OutDFEdges.back(); } + DFEdge *outdfedge_back() { return OutDFEdges.back(); } //===--------------------------------------------------------------------===// // Functions - DFNodeKind getKind() const { - return Kind; - } - - DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint, - DFInternalNode* _Parent, unsigned _NumOfDim, std::vector<Value*> - _DimLimits, DFNodeKind _K); + DFNodeKind getKind() const { return Kind; } + + DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, + DFInternalNode *_Parent, unsigned _NumOfDim, + std::vector<Value *> _DimLimits, DFNodeKind _K); bool isRoot() const { // It is a root node is it was created from a launch intrinsic - if(II->getCalledFunction()->getName().equals("llvm.visc.launch")) { - assert(Level == 0 
&& "Root node's level is zero."); - return true; + if (II->getCalledFunction()->getName().equals("llvm.visc.launch")) { + assert(Level == 0 && "Root node's level is zero."); + return true; } return false; } - StructType* getOutputType() const { - return OutputType; - } + StructType *getOutputType() const { return OutputType; } - void addSuccessor(DFNode* N) { - Successors.push_back(N); - } + void addSuccessor(DFNode *N) { Successors.push_back(N); } // Add incoming dataflow edge - void addInDFEdge(DFEdge* E) { - InDFEdges.push_back(E); - } + void addInDFEdge(DFEdge *E) { InDFEdges.push_back(E); } // Add outgoing dataflow edge - void addOutDFEdge(DFEdge* E) { - OutDFEdges.push_back(E); - } + void addOutDFEdge(DFEdge *E) { OutDFEdges.push_back(E); } - Function* getFuncPointer() const { - return FuncPointer; - } + Function *getFuncPointer() const { return FuncPointer; } - void setFuncPointer(Function* _FuncPointer) { - FuncPointer = _FuncPointer; - } + void setFuncPointer(Function *_FuncPointer) { FuncPointer = _FuncPointer; } - IntrinsicInst* getInstruction() const { - return II; - } + IntrinsicInst *getInstruction() const { return II; } - DFInternalNode* getParent() const { - return Parent; - } + DFInternalNode *getParent() const { return Parent; } - unsigned getNumOfDim() const { - return NumOfDim; - } + unsigned getNumOfDim() const { return NumOfDim; } - std::vector<Value*> getDimLimits() const { - return DimLimits; - } + std::vector<Value *> getDimLimits() const { return DimLimits; } - unsigned getLevel() const { - return Level; - } + unsigned getLevel() const { return Level; } - unsigned getRank() const { - return Rank; - } + unsigned getRank() const { return Rank; } - void setTag(visc::Target T) { - Tag = T; - } + void setTag(visc::Target T) { Tag = T; } - visc::Target getTag() const { - return Tag; - } + visc::Target getTag() const { return Tag; } - void* getProperty(PropertyKind PType) { - assert(PropertyList.count(PType) == 1 - && "Requesting a property not 
defined!"); + void *getProperty(PropertyKind PType) { + assert(PropertyList.count(PType) == 1 && + "Requesting a property not defined!"); return PropertyList[PType]; } - void setProperty(PropertyKind PType, void* PValue) { - assert(PropertyList.count(PType) == 0 - && "Inserting a property already defined!"); + void setProperty(PropertyKind PType, void *PValue) { + assert(PropertyList.count(PType) == 0 && + "Inserting a property already defined!"); PropertyList[PType] = PValue; } - void setGenFunc(Function* F, visc::Target T) { + void setGenFunc(Function *F, visc::Target T) { GenFunc = F; Tag = T; } - Function* getGenFunc() const { - return GenFunc; - } + Function *getGenFunc() const { return GenFunc; } void setHasX86FuncForTarget(visc::Target T, bool isX86Func) { switch (T) { - case visc::None: - return; // Do nothing. - case visc::CPU_TARGET: - GenFuncInfo.cpu_hasX86Func = isX86Func; - break; - case visc::GPU_TARGET: - GenFuncInfo.gpu_hasX86Func = isX86Func; - break; - case visc::CPU_OR_GPU_TARGET: - break; - default: - assert(false && "Unknown target\n"); - break; + case visc::None: + return; // Do nothing. 
+ case visc::CPU_TARGET: + GenFuncInfo.cpu_hasX86Func = isX86Func; + break; + case visc::GPU_TARGET: + GenFuncInfo.gpu_hasX86Func = isX86Func; + break; + case visc::CPU_OR_GPU_TARGET: + break; + default: + assert(false && "Unknown target\n"); + break; } - return; + return; } bool hasX86GenFuncForTarget(visc::Target T) const { switch (T) { - case visc::None: - return false; - case visc::CPU_TARGET: - return GenFuncInfo.cpu_hasX86Func; - case visc::GPU_TARGET: - return GenFuncInfo.gpu_hasX86Func; - case visc::CPU_OR_GPU_TARGET: - assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n"); - default: - assert(false && "Unknown target\n"); + case visc::None: + return false; + case visc::CPU_TARGET: + return GenFuncInfo.cpu_hasX86Func; + case visc::GPU_TARGET: + return GenFuncInfo.gpu_hasX86Func; + case visc::CPU_OR_GPU_TARGET: + assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n"); + default: + assert(false && "Unknown target\n"); } - return false; + return false; } - void addGenFunc(Function* F, visc::Target T, bool isX86Func) { + void addGenFunc(Function *F, visc::Target T, bool isX86Func) { switch (T) { - case visc::CPU_TARGET: - if (GenFuncs.CPUGenFunc != NULL) { - DEBUG(errs() << "Warning: Second generated CPU function for node " - << FuncPointer->getName() << "\n"); - } - GenFuncs.CPUGenFunc = F; - GenFuncInfo.cpu_hasX86Func = isX86Func; - break; - case visc::GPU_TARGET: - if (GenFuncs.GPUGenFunc != NULL) { - DEBUG(errs() << "Warning: Second generated GPU function for node " - << FuncPointer->getName() << "\n"); - } - GenFuncs.GPUGenFunc = F; - GenFuncInfo.gpu_hasX86Func = isX86Func; - break; - case visc::CPU_OR_GPU_TARGET: - assert(false && - "A node function should be set with a tag specifying its \ + case visc::CPU_TARGET: + if (GenFuncs.CPUGenFunc != NULL) { + DEBUG(errs() << "Warning: Second generated CPU function for node " + << FuncPointer->getName() << "\n"); + } + GenFuncs.CPUGenFunc = F; + GenFuncInfo.cpu_hasX86Func = 
isX86Func; + break; + case visc::GPU_TARGET: + if (GenFuncs.GPUGenFunc != NULL) { + DEBUG(errs() << "Warning: Second generated GPU function for node " + << FuncPointer->getName() << "\n"); + } + GenFuncs.GPUGenFunc = F; + GenFuncInfo.gpu_hasX86Func = isX86Func; + break; + case visc::CPU_OR_GPU_TARGET: + assert(false && "A node function should be set with a tag specifying its \ type, not the node hint itself\n"); - default: - assert(false && "Unknown target for generated function\n"); + default: + assert(false && "Unknown target for generated function\n"); } - Tag = viscUtils::getUpdatedTag(Tag,T); + Tag = viscUtils::getUpdatedTag(Tag, T); } - Function* getGenFuncForTarget(visc::Target T) const { + Function *getGenFuncForTarget(visc::Target T) const { switch (T) { - case visc::None: - return NULL; - case visc::CPU_TARGET: - return GenFuncs.CPUGenFunc; - case visc::GPU_TARGET: - return GenFuncs.GPUGenFunc; - case visc::CPU_OR_GPU_TARGET: - assert(false && - "Requesting genarated node function with dual tag instead of \ + case visc::None: + return NULL; + case visc::CPU_TARGET: + return GenFuncs.CPUGenFunc; + case visc::GPU_TARGET: + return GenFuncs.GPUGenFunc; + case visc::CPU_OR_GPU_TARGET: + assert(false && + "Requesting genarated node function with dual tag instead of \ CPU/GPU/SPIR/CUDNN/PROMISE\n"); - default: - assert(false && "Unknown target for generated function\n"); + default: + assert(false && "Unknown target for generated function\n"); } return NULL; } void removeGenFuncForTarget(visc::Target T) { switch (T) { - case visc::None: - return; - case visc::CPU_TARGET: - GenFuncs.CPUGenFunc = NULL; - GenFuncInfo.cpu_hasX86Func = false; - break; - case visc::GPU_TARGET: - GenFuncs.GPUGenFunc = NULL; - GenFuncInfo.gpu_hasX86Func = false; - break; - case visc::CPU_OR_GPU_TARGET: - assert(false && - "Removing genarated node function with dual tag instead of \ + case visc::None: + return; + case visc::CPU_TARGET: + GenFuncs.CPUGenFunc = NULL; + 
GenFuncInfo.cpu_hasX86Func = false; + break; + case visc::GPU_TARGET: + GenFuncs.GPUGenFunc = NULL; + GenFuncInfo.gpu_hasX86Func = false; + break; + case visc::CPU_OR_GPU_TARGET: + assert(false && + "Removing genarated node function with dual tag instead of \ CPU/GPU/SPIR/CUDNN/PROMISE\n"); - default: - assert(false && "Unknown target for generated function\n"); + default: + assert(false && "Unknown target for generated function\n"); } return; } - void setTargetHint(visc::Target T) { - Hint = T; - } + void setTargetHint(visc::Target T) { Hint = T; } - visc::Target getTargetHint() const { - return Hint; - } + visc::Target getTargetHint() const { return Hint; } - bool isDummyNode() const { - return isEntryNode() || isExitNode(); - } + bool isDummyNode() const { return isEntryNode() || isExitNode(); } bool isAllocationNode() { // If Allocation Property is defined then it is not an allocation node @@ -525,18 +466,18 @@ public: void setRank(unsigned r); bool isEntryNode() const; bool isExitNode() const; - DFEdge* getInDFEdgeAt(unsigned inPort); - DFEdge* getExtendedInDFEdgeAt(unsigned inPort); - DFEdge* getOutDFEdgeAt(unsigned outPort); - DFEdge* getExtendedOutDFEdgeAt(unsigned outPort); + DFEdge *getInDFEdgeAt(unsigned inPort); + DFEdge *getExtendedInDFEdgeAt(unsigned inPort); + DFEdge *getOutDFEdgeAt(unsigned outPort); + DFEdge *getExtendedOutDFEdgeAt(unsigned outPort); std::map<unsigned, unsigned> getInArgMap(); - std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap(); + std::map<unsigned, std::pair<Value *, unsigned>> getSharedInArgMap(); std::vector<unsigned> getOutArgMap(); - int getAncestorHops(DFNode* N); + int getAncestorHops(DFNode *N); bool hasSideEffects(); virtual void applyDFNodeVisitor(DFNodeVisitor &V) = 0; -// virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0; + // virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0; void clearGraphElements() { Successors.clear(); @@ -544,7 +485,6 @@ public: OutDFEdges.clear(); Parent = NULL; } - }; 
/***************************************************** @@ -553,49 +493,43 @@ public: class DFInternalNode : public DFNode { private: - DFGraph* childGraph; ///< Pointer to dataflow graph + DFGraph *childGraph; ///< Pointer to dataflow graph // Constructor - DFInternalNode(IntrinsicInst* II, Function* FuncPointer, visc::Target Hint, - DFInternalNode* Parent, int NumOfDim, std::vector<Value*> DimLimits) : - DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, InternalNode) { + DFInternalNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, + DFInternalNode *Parent, int NumOfDim, + std::vector<Value *> DimLimits) + : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, + InternalNode) { childGraph = new DFGraph(this); } -public: - static DFInternalNode *Create(IntrinsicInst* II, Function* FuncPointer, - visc::Target Hint = visc::CPU_TARGET, DFInternalNode* Parent = NULL, int - NumOfDim = 0, std::vector<Value*> DimLimits = std::vector<Value*>()) { +public: + static DFInternalNode * + Create(IntrinsicInst *II, Function *FuncPointer, + visc::Target Hint = visc::CPU_TARGET, DFInternalNode *Parent = NULL, + int NumOfDim = 0, + std::vector<Value *> DimLimits = std::vector<Value *>()) { return new DFInternalNode(II, FuncPointer, Hint, Parent, NumOfDim, - DimLimits); + DimLimits); } - static bool classof(const DFNode *N) { - return N->getKind() == InternalNode; - } + static bool classof(const DFNode *N) { return N->getKind() == InternalNode; } - void addChildToDFGraph(DFNode* N) { - childGraph->addChildDFNode(N); - } + void addChildToDFGraph(DFNode *N) { childGraph->addChildDFNode(N); } - void removeChildFromDFGraph(DFNode* N) { - childGraph->removeChildDFNode(N); - } + void removeChildFromDFGraph(DFNode *N) { childGraph->removeChildDFNode(N); } - void addEdgeToDFGraph(DFEdge* E); - - DFGraph* getChildGraph() const { - return childGraph; - } + void addEdgeToDFGraph(DFEdge *E); - bool isChildGraphStreaming() { - return childGraph->isStreaming(); - } + 
DFGraph *getChildGraph() const { return childGraph; } + + bool isChildGraphStreaming() { return childGraph->isStreaming(); } void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/ -// void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ + // void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ }; /***************************************************** @@ -605,26 +539,23 @@ class DFLeafNode : public DFNode { private: // Constructor - DFLeafNode(IntrinsicInst* II, Function* FuncPointer, visc::Target Hint, - DFInternalNode* Parent, int NumOfDim = 0, std::vector<Value*> DimLimits = - std::vector<Value*>()) : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, - DimLimits, LeafNode) {} + DFLeafNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, + DFInternalNode *Parent, int NumOfDim = 0, + std::vector<Value *> DimLimits = std::vector<Value *>()) + : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {} public: - - static DFLeafNode *Create(IntrinsicInst* II, Function* FuncPointer, visc::Target Hint, - DFInternalNode* Parent, int NumOfDim = 0, - std::vector<Value*> DimLimits = std::vector<Value*>()) { + static DFLeafNode * + Create(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, + DFInternalNode *Parent, int NumOfDim = 0, + std::vector<Value *> DimLimits = std::vector<Value *>()) { return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits); } - static bool classof(const DFNode *N) { - return N->getKind() == LeafNode; - } + static bool classof(const DFNode *N) { return N->getKind() == LeafNode; } void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/ -// void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ - + // void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ }; // DFEdge represents a single VISC Dataflow Edge in LLVM. 
@@ -646,135 +577,110 @@ public: class DFEdge { private: // Important things that make up a Dataflow Edge - DFNode* SrcDF; ///< Pointer to source dataflow Node - DFNode* DestDF; ///< Pointer to destination dataflow Node - bool EdgeType; ///< ONE_TO_ONE or ALL_TO_ALL - unsigned SourcePosition; ///< Position of data in the output of source - ///< DFnode - unsigned DestPosition; ///< Position of data in the input of - ///< destination DFnode - Type* ArgType; ///< Type of the argument - bool isStreaming; ///< Is this an streaming edge + DFNode *SrcDF; ///< Pointer to source dataflow Node + DFNode *DestDF; ///< Pointer to destination dataflow Node + bool EdgeType; ///< ONE_TO_ONE or ALL_TO_ALL + unsigned SourcePosition; ///< Position of data in the output of source + ///< DFnode + unsigned DestPosition; ///< Position of data in the input of + ///< destination DFnode + Type *ArgType; ///< Type of the argument + bool isStreaming; ///< Is this an streaming edge // Functions - DFEdge(DFNode* _SrcDF, DFNode* _DestDF, bool _EdgeType, - unsigned _SourcePosition, unsigned _DestPosition, Type* _ArgType, bool _isStreaming) - : SrcDF(_SrcDF), DestDF(_DestDF), EdgeType(_EdgeType), - SourcePosition(_SourcePosition), DestPosition(_DestPosition), - ArgType(_ArgType), isStreaming(_isStreaming) {} + DFEdge(DFNode *_SrcDF, DFNode *_DestDF, bool _EdgeType, + unsigned _SourcePosition, unsigned _DestPosition, Type *_ArgType, + bool _isStreaming) + : SrcDF(_SrcDF), DestDF(_DestDF), EdgeType(_EdgeType), + SourcePosition(_SourcePosition), DestPosition(_DestPosition), + ArgType(_ArgType), isStreaming(_isStreaming) {} public: - //TODO: Decide whether we need this type -// typedef enum {ONE_TO_ONE = false, ALL_TO_ALL} DFEdgeType; + // TODO: Decide whether we need this type + // typedef enum {ONE_TO_ONE = false, ALL_TO_ALL} DFEdgeType; - static DFEdge *Create(DFNode* SrcDF, DFNode* DestDF, bool EdgeType, - unsigned SourcePosition, unsigned DestPosition, Type* - ArgType, bool isStreaming = false) { 
+ static DFEdge *Create(DFNode *SrcDF, DFNode *DestDF, bool EdgeType, + unsigned SourcePosition, unsigned DestPosition, + Type *ArgType, bool isStreaming = false) { return new DFEdge(SrcDF, DestDF, EdgeType, SourcePosition, DestPosition, ArgType, isStreaming); - } - DFNode* getSourceDF() const { - return SrcDF; - } + DFNode *getSourceDF() const { return SrcDF; } - void setSourceDF(DFNode* N) { - SrcDF = N; - } + void setSourceDF(DFNode *N) { SrcDF = N; } - DFNode* getDestDF() const { - return DestDF; - } + DFNode *getDestDF() const { return DestDF; } - void setDestDF(DFNode* N) { - DestDF = N; - } + void setDestDF(DFNode *N) { DestDF = N; } - bool getEdgeType() const { - return EdgeType; - } + bool getEdgeType() const { return EdgeType; } - unsigned getSourcePosition() const { - return SourcePosition; - } + unsigned getSourcePosition() const { return SourcePosition; } - void setSourcePosition(unsigned i) { - SourcePosition = i; - } + void setSourcePosition(unsigned i) { SourcePosition = i; } - unsigned getDestPosition() const { - return DestPosition; - } + unsigned getDestPosition() const { return DestPosition; } - void setDestPosition(unsigned i) { - DestPosition = i; - } + void setDestPosition(unsigned i) { DestPosition = i; } - Type* getType() const { - return ArgType; - } - - bool isStreamingEdge() const { - return isStreaming; - } + Type *getType() const { return ArgType; } + bool isStreamingEdge() const { return isStreaming; } }; - //===--------------------- DFGraph Outlined Functions --------------===// -DFGraph::DFGraph(DFInternalNode* P) { +DFGraph::DFGraph(DFInternalNode *P) { Parent = P; // Create dummy entry and exit nodes and add them to the graph - Entry = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent); + Entry = + DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent); Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent); addChildDFNode(Entry); addChildDFNode(Exit); } -void 
DFGraph::sortChildren() { - std::sort(begin(), end(), compareRank); -} +void DFGraph::sortChildren() { std::sort(begin(), end(), compareRank); } -bool DFGraph::compareRank(DFNode* A, DFNode* B) { +bool DFGraph::compareRank(DFNode *A, DFNode *B) { return A->getRank() < B->getRank(); } bool DFGraph::isStreaming() { - for (auto E: DFEdgeList) { - if(E->isStreamingEdge()) + for (auto E : DFEdgeList) { + if (E->isStreamingEdge()) return true; } return false; } //===--------------------- DFNode Outlined Functions --------------===// -DFNode::DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint, - DFInternalNode* _Parent, unsigned _NumOfDim, std::vector<Value*> _DimLimits, - DFNodeKind _K): II(_II), FuncPointer(_FuncPointer), Parent(_Parent), - NumOfDim(_NumOfDim), DimLimits(_DimLimits), Kind(_K) { +DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, + DFInternalNode *_Parent, unsigned _NumOfDim, + std::vector<Value *> _DimLimits, DFNodeKind _K) + : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim), + DimLimits(_DimLimits), Kind(_K) { - Type* Ty = FuncPointer->getFunctionType()->getReturnType(); + Type *Ty = FuncPointer->getFunctionType()->getReturnType(); // Allow the return type to be void too, in the hVISC IR. If return type is // void, create an empty struct type and keep that as the return type of the // node. - if(Ty->isVoidTy()) + if (Ty->isVoidTy()) Ty = StructType::get(Ty->getContext(), true); // All nodes output type must always be a struct type. 
- assert(isa<StructType>(Ty) - && "Invalid return type of a dataflow node"); + assert(isa<StructType>(Ty) && "Invalid return type of a dataflow node"); // Check that the number of dimensions is correct assert(NumOfDim <= 3 && "Invalid num of dimensions for dataflow node!"); // Check that the number of dimensions is correct - assert(DimLimits.size() == NumOfDim - && "Incompatible num of dimensions and dimension limits for DFNode!"); + assert(DimLimits.size() == NumOfDim && + "Incompatible num of dimensions and dimension limits for DFNode!"); OutputType = cast<StructType>(Ty); - Level = (_Parent) ? _Parent->getLevel() + 1 : 0 ; + Level = (_Parent) ? _Parent->getLevel() + 1 : 0; Rank = 0; Tag = visc::None; @@ -794,12 +700,12 @@ DFNode::DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint, void DFNode::setRank(unsigned r) { Rank = r; // Update rank of successors - for(outdfedge_iterator i = outdfedge_begin(), - e = outdfedge_end(); i != e; ++i) { - DFEdge* E = *i; - DFNode* D = E->getDestDF(); - if(D->getRank() <= r) - D->setRank(r+1); + for (outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e; + ++i) { + DFEdge *E = *i; + DFNode *D = E->getDestDF(); + if (D->getRank() <= r) + D->setRank(r + 1); } } @@ -815,33 +721,35 @@ bool DFNode::isExitNode() const { return Parent->getChildGraph()->isExit(this); } -DFEdge* DFNode::getInDFEdgeAt(unsigned inPort) { +DFEdge *DFNode::getInDFEdgeAt(unsigned inPort) { // If it is not a dummy node, then check if inPort should be less than the // number of arguments in the associated function. 
- assert((inPort < FuncPointer->getFunctionType()->getNumParams() - || isDummyNode()) && "Invalid input port request!"); - - for(indfedge_iterator i = indfedge_begin(), e = indfedge_end(); i != e; ++i) { - DFEdge* E = *i; - if(inPort == E->getDestPosition()) + assert((inPort < FuncPointer->getFunctionType()->getNumParams() || + isDummyNode()) && + "Invalid input port request!"); + + for (indfedge_iterator i = indfedge_begin(), e = indfedge_end(); i != e; + ++i) { + DFEdge *E = *i; + if (inPort == E->getDestPosition()) return E; } return NULL; } -DFEdge* DFNode::getExtendedInDFEdgeAt(unsigned inPort) { - DFEdge* Ein = getInDFEdgeAt(inPort); - DFNode* sn = Ein->getSourceDF(); +DFEdge *DFNode::getExtendedInDFEdgeAt(unsigned inPort) { + DFEdge *Ein = getInDFEdgeAt(inPort); + DFNode *sn = Ein->getSourceDF(); if (!sn->isEntryNode()) return Ein; - DFNode* pn = getParent(); + DFNode *pn = getParent(); if (pn->isRoot()) return Ein; - DFEdge* PEin = pn->getInDFEdgeAt(inPort); - DFInternalNode* SPN = dyn_cast<DFInternalNode>(PEin->getSourceDF()); + DFEdge *PEin = pn->getInDFEdgeAt(inPort); + DFInternalNode *SPN = dyn_cast<DFInternalNode>(PEin->getSourceDF()); if (!SPN) return PEin; @@ -849,30 +757,31 @@ DFEdge* DFNode::getExtendedInDFEdgeAt(unsigned inPort) { return SPN->getChildGraph()->getExit()->getInDFEdgeAt(outPort); } -DFEdge* DFNode::getOutDFEdgeAt(unsigned outPort) { +DFEdge *DFNode::getOutDFEdgeAt(unsigned outPort) { // Cannot perform check for the number of outputs here, // it depends on the node's return type - for(outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e; ++i) { - DFEdge* E = *i; - if(outPort == E->getSourcePosition()) + for (outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e; + ++i) { + DFEdge *E = *i; + if (outPort == E->getSourcePosition()) return E; } return NULL; } -DFEdge* DFNode::getExtendedOutDFEdgeAt(unsigned outPort) { - DFEdge* Eout = getOutDFEdgeAt(outPort); +DFEdge 
*DFNode::getExtendedOutDFEdgeAt(unsigned outPort) { + DFEdge *Eout = getOutDFEdgeAt(outPort); if (!Eout->getDestDF()->isExitNode()) return Eout; - DFNode* pn = getParent(); + DFNode *pn = getParent(); if (pn->isRoot()) return Eout; - DFEdge* PEout = pn->getOutDFEdgeAt(outPort); - DFInternalNode* DPN = dyn_cast<DFInternalNode>(PEout->getDestDF()); + DFEdge *PEout = pn->getOutDFEdgeAt(outPort); + DFInternalNode *DPN = dyn_cast<DFInternalNode>(PEout->getDestDF()); if (!DPN) return PEout; @@ -884,7 +793,7 @@ DFEdge* DFNode::getExtendedOutDFEdgeAt(unsigned outPort) { std::map<unsigned, unsigned> DFNode::getInArgMap() { std::map<unsigned, unsigned> map; for (unsigned i = 0; i < InDFEdges.size(); i++) { - DFEdge* E = getInDFEdgeAt(i); + DFEdge *E = getInDFEdgeAt(i); if (E->getSourceDF()->isAllocationNode()) continue; unsigned pos = E->getSourcePosition(); @@ -894,13 +803,13 @@ std::map<unsigned, unsigned> DFNode::getInArgMap() { } // Only Allocation Nodes - only detect relevant indices -std::map<unsigned, std::pair<Value*, unsigned> > DFNode::getSharedInArgMap() { - std::map<unsigned, std::pair<Value*, unsigned> > map; +std::map<unsigned, std::pair<Value *, unsigned>> DFNode::getSharedInArgMap() { + std::map<unsigned, std::pair<Value *, unsigned>> map; for (unsigned i = 0; i < InDFEdges.size(); i++) { - DFEdge* E = getInDFEdgeAt(i); + DFEdge *E = getInDFEdgeAt(i); if (!E->getSourceDF()->isAllocationNode()) continue; - map[i] = std::pair<Value *, unsigned>(NULL,0); + map[i] = std::pair<Value *, unsigned>(NULL, 0); } return map; } @@ -908,18 +817,18 @@ std::map<unsigned, std::pair<Value*, unsigned> > DFNode::getSharedInArgMap() { std::vector<unsigned> DFNode::getOutArgMap() { std::vector<unsigned> map(OutDFEdges.size()); for (unsigned i = 0; i < OutDFEdges.size(); i++) { - DFEdge* E = getOutDFEdgeAt(i); + DFEdge *E = getOutDFEdgeAt(i); unsigned pos = E->getDestPosition(); map[pos] = i; } return map; } -int DFNode::getAncestorHops(DFNode* N) { - DFNode* temp = this; +int 
DFNode::getAncestorHops(DFNode *N) { + DFNode *temp = this; int hops = 0; while (temp != NULL) { - if(temp == N) + if (temp == N) return hops; temp = temp->getParent(); hops++; @@ -938,22 +847,24 @@ int DFNode::getAncestorHops(DFNode* N) { bool DFNode::hasSideEffects() { bool hasSideEffects = false; // Check #1: No incoming pointer argument - for(DFEdge* E: this->InDFEdges) { + for (DFEdge *E : this->InDFEdges) { hasSideEffects |= E->getType()->isPointerTy(); } return hasSideEffects; } //===--------------------- DFInternalNode Outlined Functions --------------===// -void DFInternalNode::addEdgeToDFGraph(DFEdge* E) { - DFNode* S = E->getSourceDF(); - DFNode* D = E->getDestDF(); +void DFInternalNode::addEdgeToDFGraph(DFEdge *E) { + DFNode *S = E->getSourceDF(); + DFNode *D = E->getDestDF(); - assert(std::find(childGraph->begin(), childGraph->end(), S)!=childGraph->end() - && "Source node not found in child dataflow graph!"); + assert(std::find(childGraph->begin(), childGraph->end(), S) != + childGraph->end() && + "Source node not found in child dataflow graph!"); - assert(std::find(childGraph->begin(), childGraph->end(), D)!=childGraph->end() - && "Destination node not found in child dataflow graph!"); + assert(std::find(childGraph->begin(), childGraph->end(), D) != + childGraph->end() && + "Destination node not found in child dataflow graph!"); // Update Graph childGraph->addDFEdge(E); @@ -964,33 +875,29 @@ void DFInternalNode::addEdgeToDFGraph(DFEdge* E) { D->addInDFEdge(E); // Update Rank - if(D->getRank() <= S->getRank()) - D->setRank(S->getRank()+1); + if (D->getRank() <= S->getRank()) + D->setRank(S->getRank() + 1); } //===------------------------ Property Objects ---------------------------====// class AllocationNodeProperty { - public: - typedef std::pair<DFEdge*, Value*> AllocationType; +public: + typedef std::pair<DFEdge *, Value *> AllocationType; typedef std::vector<AllocationType> AllocationListType; - private: - AllocationListType AllocationList; 
+private: + AllocationListType AllocationList; - public: - AllocationNodeProperty() {} +public: + AllocationNodeProperty() {} - unsigned getNumAllocations() { - return AllocationList.size(); - } + unsigned getNumAllocations() { return AllocationList.size(); } - AllocationListType getAllocationList() { - return AllocationList; - } + AllocationListType getAllocationList() { return AllocationList; } - void insertAllocation(DFEdge* E, Value* V) { - AllocationList.push_back(AllocationType(E,V)); - } + void insertAllocation(DFEdge *E, Value *V) { + AllocationList.push_back(AllocationType(E, V)); + } }; //===-------------------------- Visitor Classes ---------------------------===// @@ -998,42 +905,40 @@ class AllocationNodeProperty { class DFNodeVisitor { public: virtual ~DFNodeVisitor() {} - virtual void visit(DFInternalNode* N) = 0; - virtual void visit(DFLeafNode* N) = 0; + virtual void visit(DFInternalNode *N) = 0; + virtual void visit(DFLeafNode *N) = 0; }; -void DFInternalNode::applyDFNodeVisitor(DFNodeVisitor &V) { - V.visit(this); -} +void DFInternalNode::applyDFNodeVisitor(DFNodeVisitor &V) { V.visit(this); } -void DFLeafNode::applyDFNodeVisitor(DFNodeVisitor &V) { - V.visit(this); -} +void DFLeafNode::applyDFNodeVisitor(DFNodeVisitor &V) { V.visit(this); } class DFTreeTraversal : public DFNodeVisitor { public: virtual ~DFTreeTraversal() {} - virtual void visit(DFInternalNode* N){ - DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName() << "\n"); - for(DFGraph::children_iterator i = N->getChildGraph()->begin(), - e = N->getChildGraph()->end(); i != e; ++i) { - DFNode* child = *i; + virtual void visit(DFInternalNode *N) { + DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName() + << "\n"); + for (DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); + i != e; ++i) { + DFNode *child = *i; child->applyDFNodeVisitor(*this); } } - virtual void visit(DFLeafNode* N) { - DEBUG(errs() << "Visited 
Node (L) - " << N->getFuncPointer()->getName() << "\n"); + virtual void visit(DFLeafNode *N) { + DEBUG(errs() << "Visited Node (L) - " << N->getFuncPointer()->getName() + << "\n"); } - }; class FollowSuccessors : public DFNodeVisitor { public: - virtual void visit(DFInternalNode* N) { + virtual void visit(DFInternalNode *N) { /*DFNodeListType L; // Empty List that will contain the sorted elements DFNodeListType S; // Set of all nodes with no incoming edges @@ -1047,9 +952,11 @@ public: if } }*/ - DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName() << "\n"); - for(DFInternalNode::successor_iterator i = N->successors_begin(), - e = N->successors_end(); i != e; ++i) { + DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName() + << "\n"); + for (DFInternalNode::successor_iterator i = N->successors_begin(), + e = N->successors_end(); + i != e; ++i) { /* Traverse the graph. * Choose the kind of traversal we want * Do we do a DAG kind of traversal? @@ -1057,67 +964,68 @@ public: } } - virtual void visit(DFLeafNode* N) { - DEBUG(errs() << "Visited Node (L) - " << N->getFuncPointer()->getName() << "\n"); + virtual void visit(DFLeafNode *N) { + DEBUG(errs() << "Visited Node (L) - " << N->getFuncPointer()->getName() + << "\n"); } }; class ReplaceNodeFunction : public DFNodeVisitor { protected: - //Member variables + // Member variables Module &M; - Function* F = NULL; // Function to replace - Function* G = NULL; // Function to be replaced by + Function *F = NULL; // Function to replace + Function *G = NULL; // Function to be replaced by // Functions - void replaceNodeFunction(DFInternalNode* N) { + void replaceNodeFunction(DFInternalNode *N) { if (N->getFuncPointer() == F) N->setFuncPointer(G); } - void replaceNodeFunction(DFLeafNode* N) { + void replaceNodeFunction(DFLeafNode *N) { if (N->getFuncPointer() == F) N->setFuncPointer(G); } - ~ReplaceNodeFunction() {}; + ~ReplaceNodeFunction(){}; public: - // Constructor - 
ReplaceNodeFunction(Module &_M, - Function* _F, Function* _G) : M(_M), F(_F), G(_G) {} + ReplaceNodeFunction(Module &_M, Function *_F, Function *_G) + : M(_M), F(_F), G(_G) {} ReplaceNodeFunction(Module &_M) : M(_M), F(NULL), G(NULL) {} - void setF(Function* _F) { - F = _F; - } + void setF(Function *_F) { F = _F; } - void setG(Function* _G) { - G = _G; - } + void setG(Function *_G) { G = _G; } - virtual void visit(DFInternalNode* N) { - DEBUG(errs() << "Start: Replace Node Function for Node (I) - " << N->getFuncPointer()->getName() << "\n"); + virtual void visit(DFInternalNode *N) { + DEBUG(errs() << "Start: Replace Node Function for Node (I) - " + << N->getFuncPointer()->getName() << "\n"); // Follows a bottom-up approach. - for(DFGraph::children_iterator i = N->getChildGraph()->begin(), - e = N->getChildGraph()->end(); i != e; ++i) { - DFNode* child = *i; + for (DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); + i != e; ++i) { + DFNode *child = *i; child->applyDFNodeVisitor(*this); } // Generate code for this internal node now. This way all the cloned // functions for children exist. 
replaceNodeFunction(N); - DEBUG(errs() << "DONE: Replace Node Function for Node (I) - " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "DONE: Replace Node Function for Node (I) - " + << N->getFuncPointer()->getName() << "\n"); } - virtual void visit(DFLeafNode* N) { - DEBUG(errs() << "Start: Replace Node Function for Node (L) - " << N->getFuncPointer()->getName() << "\n"); + virtual void visit(DFLeafNode *N) { + DEBUG(errs() << "Start: Replace Node Function for Node (L) - " + << N->getFuncPointer()->getName() << "\n"); replaceNodeFunction(N); - DEBUG(errs() << "DONE: Replace Node Function for Node (L) - " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "DONE: Replace Node Function for Node (L) - " + << N->getFuncPointer()->getName() << "\n"); } }; @@ -1133,7 +1041,7 @@ public: // GraphTraits specializations for DFNode graph (DFG) //===--------------------------------------------------------------------===// -template <> struct GraphTraits<DFNode*> { +template <> struct GraphTraits<DFNode *> { typedef DFNode *NodeRef; typedef typename DFNode::successor_iterator ChildIteratorType; @@ -1143,110 +1051,100 @@ template <> struct GraphTraits<DFNode*> { static inline ChildIteratorType child_end(NodeRef N) { return N->successors_end(); } - }; -template <> struct GraphTraits<DFGraph*> : public GraphTraits<DFNode*> { - typedef typename DFGraph::children_iterator nodes_iterator; +template <> struct GraphTraits<DFGraph *> : public GraphTraits<DFNode *> { + typedef typename DFGraph::children_iterator nodes_iterator; - static NodeRef getEntryNode(DFGraph* G) { - return G->front(); - } + static NodeRef getEntryNode(DFGraph *G) { return G->front(); } - static nodes_iterator nodes_begin(DFGraph *G) { - return G->begin(); - } + static nodes_iterator nodes_begin(DFGraph *G) { return G->begin(); } - static inline nodes_iterator nodes_end(DFGraph *G) { - return G->end(); - } + static inline nodes_iterator nodes_end(DFGraph *G) { return G->end(); } }; 
-template<> -struct DOTGraphTraits<DFGraph*> : public DefaultDOTGraphTraits { +template <> struct DOTGraphTraits<DFGraph *> : public DefaultDOTGraphTraits { - DOTGraphTraits (bool isSimple=false) - : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} - static std::string getGraphName(DFGraph* G) { - DFInternalNode* Parent = G->getParent(); - if(Parent != NULL) + static std::string getGraphName(DFGraph *G) { + DFInternalNode *Parent = G->getParent(); + if (Parent != NULL) return Parent->getFuncPointer()->getName(); else return "Dataflow Graph"; } - static std::string getGraphProperties(DFGraph* G) { + static std::string getGraphProperties(DFGraph *G) { return "\tcompound=true;"; } - std::string getNodeLabel (DFNode* N, DFGraph* G) { - if(N->isEntryNode()) + std::string getNodeLabel(DFNode *N, DFGraph *G) { + if (N->isEntryNode()) return "Entry"; - if(N->isExitNode()) + if (N->isExitNode()) return "Exit"; return N->getFuncPointer()->getName(); } - static bool isCompoundNode(DFNode* N) { + static bool isCompoundNode(DFNode *N) { bool ret = isa<DFInternalNode>(N); return ret; } - - static DFGraph* getSubGraph(DFNode* N, DFGraph* G) { - DFInternalNode* IN = dyn_cast<DFInternalNode>(N); + + static DFGraph *getSubGraph(DFNode *N, DFGraph *G) { + DFInternalNode *IN = dyn_cast<DFInternalNode>(N); assert(IN && "No subgraph for leaf dataflow node!"); return IN->getChildGraph(); } - static DFNode* getAnySimpleNodeForSrc(DFNode* N) { - DFInternalNode* IN = dyn_cast<DFInternalNode>(N); + static DFNode *getAnySimpleNodeForSrc(DFNode *N) { + DFInternalNode *IN = dyn_cast<DFInternalNode>(N); assert(IN && "No subgraph for leaf dataflow node!"); return IN->getChildGraph()->getExit(); } - static DFNode* getAnySimpleNodeForDest(DFNode* N) { - DFInternalNode* IN = dyn_cast<DFInternalNode>(N); + static DFNode *getAnySimpleNodeForDest(DFNode *N) { + DFInternalNode *IN = dyn_cast<DFInternalNode>(N); assert(IN && "No subgraph for 
leaf dataflow node!"); return IN->getChildGraph()->getEntry(); } - static std::string getNodeAttributes(DFNode* N, DFGraph* G) { + static std::string getNodeAttributes(DFNode *N, DFGraph *G) { std::string Attr = ""; raw_string_ostream OS(Attr); OS << "shape=oval"; return OS.str(); } - static std::string getEdgeAttributes(DFNode* N, DFNode::successor_iterator SI, DFGraph* G) { + static std::string getEdgeAttributes(DFNode *N, DFNode::successor_iterator SI, + DFGraph *G) { std::string Attr = ""; raw_string_ostream OS(Attr); bool comma = false; - if(DFInternalNode* SrcNode = dyn_cast<DFInternalNode>(N)) { + if (DFInternalNode *SrcNode = dyn_cast<DFInternalNode>(N)) { comma = true; - OS << "ltail=cluster"; - OS << static_cast<const void*>(SrcNode); + OS << "ltail=cluster"; + OS << static_cast<const void *>(SrcNode); } - DFNode* DN = *SI; - if(DFInternalNode* DestNode = dyn_cast<DFInternalNode>(DN)) { - if(comma) + DFNode *DN = *SI; + if (DFInternalNode *DestNode = dyn_cast<DFInternalNode>(DN)) { + if (comma) OS << ", "; - OS << "lhead=cluster"; - OS << static_cast<const void*>(DestNode); + OS << "lhead=cluster"; + OS << static_cast<const void *>(DestNode); } return OS.str(); } - static void addCustomGraphFeatures(DFGraph* G, GraphWriter<DFGraph*> &GW) { - - } + static void addCustomGraphFeatures(DFGraph *G, GraphWriter<DFGraph *> &GW) {} }; void viewDFGraph(DFGraph *G) { llvm::WriteGraph(G, "DataflowGraph"); - //llvm::ViewGraph(G, "DataflowGraph"); + // llvm::ViewGraph(G, "DataflowGraph"); } -} // End llvm namespace +} // namespace llvm #endif diff --git a/hpvm/include/SupportVISC/VISCHint.h b/hpvm/include/SupportVISC/VISCHint.h index 5324c0fabddeef5f85a540176ffffb278ac1dfdf..99266b071843ab0417ea73c6e4533dfa381d52cd 100644 --- a/hpvm/include/SupportVISC/VISCHint.h +++ b/hpvm/include/SupportVISC/VISCHint.h @@ -15,21 +15,21 @@ namespace visc { #endif - enum Target { - None, - CPU_TARGET, - GPU_TARGET, - SPIR_TARGET, - CUDNN_TARGET, - PROMISE_TARGET, - CPU_OR_GPU_TARGET, 
- CPU_OR_SPIR_TARGET, -// ALL_TARGETS, - NUM_TARGETS - }; +enum Target { + None, + CPU_TARGET, + GPU_TARGET, + SPIR_TARGET, + CUDNN_TARGET, + PROMISE_TARGET, + CPU_OR_GPU_TARGET, + CPU_OR_SPIR_TARGET, + // ALL_TARGETS, + NUM_TARGETS +}; #ifdef __cplusplus } #endif -#endif //VISC_HINT_HEADER +#endif // VISC_HINT_HEADER diff --git a/hpvm/include/SupportVISC/VISCTimer.h b/hpvm/include/SupportVISC/VISCTimer.h index 4dbadbd34f47e8fd35413317a5df28ba0589e3d5..ce3dc8a5e0f7c77ff06fec5857f223ca4f0e142f 100644 --- a/hpvm/include/SupportVISC/VISCTimer.h +++ b/hpvm/include/SupportVISC/VISCTimer.h @@ -27,57 +27,53 @@ enum visc_TimerState { struct visc_Timer { enum visc_TimerState state; - visc_Timestamp elapsed; /* Amount of time elapsed so far */ - visc_Timestamp init; /* Beginning of the current time interval, - * if state is RUNNING. End of the last - * recorded time interfal otherwise. */ + visc_Timestamp elapsed; /* Amount of time elapsed so far */ + visc_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interfal otherwise. */ }; /* Reset a timer. * Use this to initialize a timer or to clear * its elapsed time. The reset timer is stopped. */ -void -visc_ResetTimer(struct visc_Timer *timer); +void visc_ResetTimer(struct visc_Timer *timer); /* Start a timer. The timer is set to RUNNING mode and * time elapsed while the timer is running is added to * the timer. * The timer should not already be running. */ -void -visc_StartTimer(struct visc_Timer *timer); +void visc_StartTimer(struct visc_Timer *timer); /* Stop a timer. * This stops adding elapsed time to the timer. * The timer should not already be stopped. */ -void -visc_StopTimer(struct visc_Timer *timer); +void visc_StopTimer(struct visc_Timer *timer); /* Get the elapsed time in seconds. */ -double -visc_GetElapsedTime(struct visc_Timer *timer); +double visc_GetElapsedTime(struct visc_Timer *timer); /* Execution time is assigned to one of these categories. 
*/ enum visc_TimerID { visc_TimerID_NONE = 0, - visc_TimerID_IO, /* Time spent in input/output */ - visc_TimerID_KERNEL, /* Time spent computing on the device, - * recorded asynchronously */ - visc_TimerID_COPY, /* Time spent synchronously moving data - * to/from device and allocating/freeing - * memory on the device */ - visc_TimerID_DRIVER, /* Time spent in the host interacting with the - * driver, primarily for recording the time - * spent queueing asynchronous operations */ - visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ - visc_TimerID_COMPUTE, /* Time for all program execution other - * than parsing command line arguments, - * I/O, kernel, and copy */ - visc_TimerID_OVERLAP, /* Time double-counted in asynchronous and - * host activity: automatically filled in, - * not intended for direct usage */ + visc_TimerID_IO, /* Time spent in input/output */ + visc_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + visc_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + visc_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + visc_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + visc_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ // GPU FUNCTION visc_TimerID_INIT_CTX, visc_TimerID_CLEAR_CTX, @@ -97,16 +93,16 @@ enum visc_TimerID { visc_TimerID_OUTPUT_PACK, visc_TimerID_OUTPUT_UNPACK, - visc_TimerID_LAST /* Number of timer IDs */ + visc_TimerID_LAST /* Number of timer IDs */ }; /* Dynamic list of asynchronously tracked times between events */ struct visc_async_time_marker_list { - char *label; // 
actually just a pointer to a string - enum visc_TimerID timerID; /* The ID to which the interval beginning - * with this marker should be attributed */ - void * marker; - //cudaEvent_t marker; /* The driver event for this marker */ + char *label; // actually just a pointer to a string + enum visc_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void *marker; + // cudaEvent_t marker; /* The driver event for this marker */ struct visc_async_time_marker_list *next; }; @@ -124,7 +120,7 @@ struct visc_SubTimerList { /* A set of timers for recording execution times. */ struct visc_TimerSet { enum visc_TimerID current; - struct visc_async_time_marker_list* async_markers; + struct visc_async_time_marker_list *async_markers; visc_Timestamp async_begin; visc_Timestamp wall_begin; struct visc_Timer timers[visc_TimerID_LAST]; @@ -132,28 +128,24 @@ struct visc_TimerSet { }; /* Reset all timers in the set. */ -void -visc_InitializeTimerSet(struct visc_TimerSet *timers); +void visc_InitializeTimerSet(struct visc_TimerSet *timers); -void -visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category); +void visc_AddSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID visc_Category); /* Select which timer the next interval of time should be accounted * to. The selected timer is started and other timers are stopped. * Using visc_TimerID_NONE stops all timers. */ -inline void -visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer); +inline void visc_SwitchToTimer(struct visc_TimerSet *timers, + enum visc_TimerID timer); -void -visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category); +void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID category); /* Print timer values to standard output. 
*/ -void -visc_PrintTimerSet(struct visc_TimerSet *timers); +void visc_PrintTimerSet(struct visc_TimerSet *timers); /* Release timer resources */ -void -visc_DestroyTimerSet(struct visc_TimerSet * timers); - +void visc_DestroyTimerSet(struct visc_TimerSet *timers); } -#endif //VISC_RT_HEADER +#endif // VISC_RT_HEADER diff --git a/hpvm/include/SupportVISC/VISCUtils.h b/hpvm/include/SupportVISC/VISCUtils.h index 325acfaf1993964bc93d98eadd7ee06df0fd7140..0efd20b5b5eb57943de1feb6d2afa886c6c48a5c 100644 --- a/hpvm/include/SupportVISC/VISCUtils.h +++ b/hpvm/include/SupportVISC/VISCUtils.h @@ -12,18 +12,18 @@ #define VISC_UTILS_HEADER #include <assert.h> - -#include "llvm/IR/Module.h" + +#include "SupportVISC/VISCHint.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "llvm/IR/Metadata.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "SupportVISC/VISCHint.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -32,125 +32,126 @@ using namespace llvm; namespace viscUtils { // Helper Functions -static bool isViscCreateNodeIntrinsic(Instruction* I) { - if(!isa<IntrinsicInst>(I)) +static bool isViscCreateNodeIntrinsic(Instruction *I) { + if (!isa<IntrinsicInst>(I)) return false; - IntrinsicInst* II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.visc.createNode"); + IntrinsicInst *II = cast<IntrinsicInst>(I); + return (II->getCalledFunction()->getName()) + .startswith("llvm.visc.createNode"); } -static bool isViscCreateNodeCall(Instruction* I) { - if(!isa<CallInst>(I)) +static bool isViscCreateNodeCall(Instruction *I) { + if (!isa<CallInst>(I)) return false; - CallInst* CI = cast<CallInst>(I); - return 
(CI->getCalledValue()->stripPointerCasts()->getName()).startswith("__visc__createNode"); + CallInst *CI = cast<CallInst>(I); + return (CI->getCalledValue()->stripPointerCasts()->getName()) + .startswith("__visc__createNode"); } -static bool isViscLaunchCall(Instruction* I) { - if(!isa<CallInst>(I)) +static bool isViscLaunchCall(Instruction *I) { + if (!isa<CallInst>(I)) return false; - CallInst* CI = cast<CallInst>(I); - return (CI->getCalledValue()->stripPointerCasts()->getName()).startswith("__visc__launch"); + CallInst *CI = cast<CallInst>(I); + return (CI->getCalledValue()->stripPointerCasts()->getName()) + .startswith("__visc__launch"); } // Creates a new createNode intrinsic, similar to II but with different // associated function F instead -IntrinsicInst* createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function* F, - IntrinsicInst* II) { - Module* M = F->getParent(); +IntrinsicInst * +createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, + IntrinsicInst *II) { + Module *M = F->getParent(); // Find which createNode intrinsic we need to create - Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID()); - Constant* Fp = ConstantExpr::getPointerCast(F, - Type::getInt8PtrTy(II->getContext())); + Function *CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID()); + Constant *Fp = + ConstantExpr::getPointerCast(F, Type::getInt8PtrTy(II->getContext())); - ArrayRef<Value*> CreateNodeArgs; + ArrayRef<Value *> CreateNodeArgs; switch (II->getIntrinsicID()) { - case Intrinsic::visc_createNode: - { - CreateNodeArgs = ArrayRef<Value*>(Fp); - break; - } - case Intrinsic::visc_createNode1D: - { - Value* CreateNode1DArgs[] = {Fp, II->getArgOperand(1)}; - CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2); - break; - } - case Intrinsic::visc_createNode2D: - { - Value* CreateNode2DArgs[] = {Fp, II->getArgOperand(1), - II->getArgOperand(2)}; - CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3); - break; - } - case 
Intrinsic::visc_createNode3D: - { - Value* CreateNode3DArgs[] = {Fp, II->getArgOperand(1), - II->getArgOperand(2), - II->getArgOperand(3)}; - CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4); - break; - } - default : - assert(false && "Unknown createNode intrinsic"); - break; + case Intrinsic::visc_createNode: { + CreateNodeArgs = ArrayRef<Value *>(Fp); + break; + } + case Intrinsic::visc_createNode1D: { + Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)}; + CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2); + break; + } + case Intrinsic::visc_createNode2D: { + Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1), + II->getArgOperand(2)}; + CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3); + break; + } + case Intrinsic::visc_createNode3D: { + Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2), + II->getArgOperand(3)}; + CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4); + break; + } + default: + assert(false && "Unknown createNode intrinsic"); + break; } - CallInst* CI = CallInst::Create(CreateNodeF, - CreateNodeArgs, - F->getName()+".node"); - IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI); + CallInst *CI = + CallInst::Create(CreateNodeF, CreateNodeArgs, F->getName() + ".node"); + IntrinsicInst *CreateNodeII = cast<IntrinsicInst>(CI); return CreateNodeII; } // Fix VISC hints for this function -void fixHintMetadata(Module &M, Function* F, Function* G) { - Metadata* MD_F = ValueAsMetadata::getIfExists(F); - MDTuple* MDT_F = MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata*>(MD_F)); - DEBUG(errs() << "Associated Metadata: " << *MDT_F << "\n"); - MDTuple* MDT_G = MDNode::get(F->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(G))); - DEBUG(errs() << "New Metadata: " << *MDT_G << "\n"); - - auto FixHint = [&](StringRef Name) { - NamedMDNode* HintNode = M.getOrInsertNamedMetadata(Name); - for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { - if(HintNode->getOperand(i) == MDT_F) - 
HintNode->setOperand(i, MDT_G); - } - }; - - FixHint("visc_hint_gpu"); - FixHint("visc_hint_cpu"); - FixHint("visc_hint_cpu_gpu"); +void fixHintMetadata(Module &M, Function *F, Function *G) { + Metadata *MD_F = ValueAsMetadata::getIfExists(F); + MDTuple *MDT_F = + MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata *>(MD_F)); + DEBUG(errs() << "Associated Metadata: " << *MDT_F << "\n"); + MDTuple *MDT_G = MDNode::get(F->getContext(), + ArrayRef<Metadata *>(ValueAsMetadata::get(G))); + DEBUG(errs() << "New Metadata: " << *MDT_G << "\n"); + + auto FixHint = [&](StringRef Name) { + NamedMDNode *HintNode = M.getOrInsertNamedMetadata(Name); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if (HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + }; + + FixHint("visc_hint_gpu"); + FixHint("visc_hint_cpu"); + FixHint("visc_hint_cpu_gpu"); } // Assuming that the changed function is a node function, it is only used as a // first operand of createNode*. It is enough to iterate through all createNode* // calls in the program. 
-void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) { +void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { for (auto &Func : M) { DEBUG(errs() << "Function: " << Func.getName() << "\n"); - std::vector<Instruction*> toBeErased; + std::vector<Instruction *> toBeErased; - for (inst_iterator i = inst_begin(&Func), e = inst_end(&Func); i != e ; ++i) { - Instruction* I = &*i; // Grab pointer to Instruction + for (inst_iterator i = inst_begin(&Func), e = inst_end(&Func); i != e; + ++i) { + Instruction *I = &*i; // Grab pointer to Instruction if (isViscCreateNodeIntrinsic(I)) { - IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst *II = cast<IntrinsicInst>(I); // The found createNode is not associated with the changed function if (II->getArgOperand(0) != F) continue; // skip it // Otherwise, create a new createNode similar to the other one, // but with the changed function as first operand. - IntrinsicInst* CreateNodeII = - createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II); + IntrinsicInst *CreateNodeII = + createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II); II->replaceAllUsesWith(CreateNodeII); toBeErased.push_back(II); } else if (isViscCreateNodeCall(I)) { - CallInst* CI = cast<CallInst>(I); + CallInst *CI = cast<CallInst>(I); // The found createNode is not associated with the changed function if (CI->getArgOperand(1) != F) continue; // skip it @@ -160,8 +161,8 @@ void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) { // Replace use of F with use of G CI->setArgOperand(1, G); DEBUG(errs() << "Fixed use: " << *CI << "\n"); - } else if(isViscLaunchCall(I)) { - CallInst* CI = cast<CallInst>(I); + } else if (isViscLaunchCall(I)) { + CallInst *CI = cast<CallInst>(I); // The found launch call is not associated with the changed function if (CI->getArgOperand(1)->stripPointerCasts() != F) continue; @@ -171,31 +172,29 @@ void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) { DEBUG(errs() << 
*CI->getArgOperand(1)->getType() << "\n"); CI->setArgOperand(1, G); } - } - for(auto I: toBeErased) { + for (auto I : toBeErased) { DEBUG(errs() << "\tErasing " << *I << "\n"); I->eraseFromParent(); } } // Check if the function is used by a metadata node - if(F->isUsedByMetadata()) { + if (F->isUsedByMetadata()) { fixHintMetadata(M, F, G); } - DEBUG(errs() << "DONE: Replacing function " << F->getName() << " with " << G->getName() << "\n"); + DEBUG(errs() << "DONE: Replacing function " << F->getName() << " with " + << G->getName() << "\n"); // Remove replaced function from the module - //assert(F->user_empty() && "Still some uses of older function left\n"); + // assert(F->user_empty() && "Still some uses of older function left\n"); F->replaceAllUsesWith(UndefValue::get(F->getType())); F->eraseFromParent(); - } - -// Create new function F' as a copy of old function F with a new signature and input VMAP. -// The following two most used cases are handled by this function. +// Create new function F' as a copy of old function F with a new signature and +// input VMAP. The following two most used cases are handled by this function. // 1. When some extra arguments need to be added to this function // - Here we can map the old function arguments to // new ones @@ -204,77 +203,92 @@ void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) { // over extra pointer arguments. // The function returns the list of return instructions to the caller to fix in // case the return type is also changed. 
-Function* cloneFunction(Function* F, FunctionType* newFT, - bool isAddingPtrSizeArg, SmallVectorImpl<ReturnInst*>* Returns = NULL, std::vector<Argument*> *Args = NULL) { +Function *cloneFunction(Function *F, FunctionType *newFT, + bool isAddingPtrSizeArg, + SmallVectorImpl<ReturnInst *> *Returns = NULL, + std::vector<Argument *> *Args = NULL) { DEBUG(errs() << "Cloning Function: " << F->getName() << "\n"); DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n"); DEBUG(errs() << "New Function Type: " << *newFT << "\n"); - assert(F->getFunctionType()->getNumParams() <= newFT->getNumParams() - && "This function assumes that the new function has more arguments than the old function!"); + assert(F->getFunctionType()->getNumParams() <= newFT->getNumParams() && + "This function assumes that the new function has more arguments than " + "the old function!"); // Create Function of specified type - Function* newF = Function::Create(newFT, F->getLinkage(), F->getName()+"_cloned", F->getParent()); + Function *newF = Function::Create(newFT, F->getLinkage(), + F->getName() + "_cloned", F->getParent()); DEBUG(errs() << "Old Function name: " << F->getName() << "\n"); DEBUG(errs() << "New Function name: " << newF->getName() << "\n"); ValueToValueMapTy VMap; DEBUG(errs() << "No value map provided. Creating default value map\n"); - if(isAddingPtrSizeArg) { - DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in new function\n"); + if (isAddingPtrSizeArg) { + DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in " + "new function\n"); Function::arg_iterator new_ai = newF->arg_begin(); if (Args == NULL) { - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - DEBUG(errs() << ai->getArgNo() << ". 
" << *ai << " : " << *new_ai << "\n"); - assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai + << "\n"); + assert(ai->getType() == new_ai->getType() && + "Arguments type do not match!"); VMap[&*ai] = &*new_ai; new_ai->takeName(&*ai); - if(ai->getType()->isPointerTy()) { + if (ai->getType()->isPointerTy()) { std::string oldName = new_ai->getName(); // If the current argument is pointer type, the next argument in new // function would be an i64 type containing the data size of this // argument. Hence, skip the next arguement in new function. ++new_ai; - new_ai->setName("bytes_"+oldName); + new_ai->setName("bytes_" + oldName); } ++new_ai; } } else { - DEBUG(errs() << "Arguments of original function will be read from a vector!\n"); + DEBUG(errs() + << "Arguments of original function will be read from a vector!\n"); for (auto *ai : *(Args)) { - DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n"); - assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai + << "\n"); + assert(ai->getType() == new_ai->getType() && + "Arguments type do not match!"); VMap[ai] = &*new_ai; new_ai->takeName(ai); - if(ai->getType()->isPointerTy()) { + if (ai->getType()->isPointerTy()) { std::string oldName = new_ai->getName(); // If the current argument is pointer type, the next argument in new // function would be an i64 type containing the data size of this // argument. Hence, skip the next arguement in new function. 
++new_ai; - new_ai->setName("bytes_"+oldName); + new_ai->setName("bytes_" + oldName); } ++new_ai; - } + } } - } - else { - DEBUG(errs() << "Case 2: Extra arguments are added at the end of old function\n"); + } else { + DEBUG(errs() + << "Case 2: Extra arguments are added at the end of old function\n"); Function::arg_iterator new_ai = newF->arg_begin(); if (Args == NULL) { - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai, ++new_ai) { - DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n"); - assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai, ++new_ai) { + DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai + << "\n"); + assert(ai->getType() == new_ai->getType() && + "Arguments type do not match!"); VMap[&*ai] = &*new_ai; new_ai->takeName(&*ai); } } else { - DEBUG(errs() << "Arguments of original function will be read from a vector!\n"); + DEBUG(errs() + << "Arguments of original function will be read from a vector!\n"); for (auto *ai : *(Args)) { - DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n"); - assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + DEBUG(errs() << ai->getArgNo() << ". 
" << *ai << " : " << *new_ai + << "\n"); + assert(ai->getType() == new_ai->getType() && + "Arguments type do not match!"); VMap[ai] = &*new_ai; new_ai->takeName(ai); ++new_ai; @@ -284,58 +298,62 @@ Function* cloneFunction(Function* F, FunctionType* newFT, // Clone function if (Returns == NULL) - Returns = new SmallVector<ReturnInst*, 8>(); + Returns = new SmallVector<ReturnInst *, 8>(); CloneFunctionInto(newF, F, VMap, false, *Returns); return newF; } // Overloaded version of cloneFunction -Function *cloneFunction(Function *F, Function *newF, - bool isAddingPtrSizeArg, - SmallVectorImpl<ReturnInst *> *Returns = NULL) { +Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg, + SmallVectorImpl<ReturnInst *> *Returns = NULL) { DEBUG(errs() << "Cloning Function: " << F->getName() << "\n"); DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n"); DEBUG(errs() << "New Function Type: " << *newF->getFunctionType() << "\n"); assert(F->getFunctionType()->getNumParams() <= - newF->getFunctionType()->getNumParams() && - "This function assumes that the new function has more arguments than " - "the old function!"); + newF->getFunctionType()->getNumParams() && + "This function assumes that the new function has more arguments than " + "the old function!"); // Create Function of specified type DEBUG(errs() << "Old Function name: " << F->getName() << "\n"); DEBUG(errs() << "New Function name: " << newF->getName() << "\n"); ValueToValueMapTy VMap; DEBUG(errs() << "No value map provided. Creating default value map\n"); - if(isAddingPtrSizeArg) { - DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in new function\n"); + if (isAddingPtrSizeArg) { + DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in " + "new function\n"); Function::arg_iterator new_ai = newF->arg_begin(); - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - DEBUG(errs() << ai->getArgNo() << ". 
" << *ai << " : " << *new_ai << "\n"); - assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai + << "\n"); + assert(ai->getType() == new_ai->getType() && + "Arguments type do not match!"); VMap[&*ai] = &*new_ai; new_ai->takeName(&*ai); - if(ai->getType()->isPointerTy()) { + if (ai->getType()->isPointerTy()) { std::string oldName = new_ai->getName(); // If the current argument is pointer type, the next argument in new // function would be an i64 type containing the data size of this // argument. Hence, skip the next arguement in new function. ++new_ai; - new_ai->setName("bytes_"+oldName); + new_ai->setName("bytes_" + oldName); } ++new_ai; } - } - else { - DEBUG(errs() << "Case 2: Extra arguments are added at the end of old function\n"); + } else { + DEBUG(errs() + << "Case 2: Extra arguments are added at the end of old function\n"); Function::arg_iterator new_ai = newF->arg_begin(); - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai, ++new_ai) { - DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n"); - assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai, ++new_ai) { + DEBUG(errs() << ai->getArgNo() << ". 
" << *ai << " : " << *new_ai + << "\n"); + assert(ai->getType() == new_ai->getType() && + "Arguments type do not match!"); VMap[&*ai] = &*new_ai; new_ai->takeName(&*ai); } @@ -343,134 +361,133 @@ Function *cloneFunction(Function *F, Function *newF, // Clone function if (Returns == NULL) - Returns = new SmallVector<ReturnInst*, 8>(); + Returns = new SmallVector<ReturnInst *, 8>(); CloneFunctionInto(newF, F, VMap, false, *Returns); return newF; } - //------------------- Helper Functions For Handling Hints -------------------// // Return true if 1st arg (tag) contains 2nd (target) bool tagIncludesTarget(visc::Target Tag, visc::Target T) { switch (Tag) { - case visc::None: - return false; - case visc::CPU_TARGET: - if (T == visc::CPU_TARGET) - return true; - return false; - case visc::GPU_TARGET: - if (T == visc::GPU_TARGET) - return true; - return false; - case visc::CPU_OR_GPU_TARGET: - if ((T == visc::CPU_TARGET) || - (T == visc::GPU_TARGET) || - (T == visc::CPU_OR_GPU_TARGET)) - return true; - return false; - default: - assert(false && "Unknown Target\n"); + case visc::None: + return false; + case visc::CPU_TARGET: + if (T == visc::CPU_TARGET) + return true; + return false; + case visc::GPU_TARGET: + if (T == visc::GPU_TARGET) + return true; + return false; + case visc::CPU_OR_GPU_TARGET: + if ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET) || + (T == visc::CPU_OR_GPU_TARGET)) + return true; + return false; + default: + assert(false && "Unknown Target\n"); } } bool isSingleTargetTag(visc::Target T) { - return ((T == visc::CPU_TARGET) || - (T == visc::GPU_TARGET)); + return ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)); } // Add the specified target to the given tag visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) { - assert(((T == visc::CPU_TARGET) || - (T == visc::GPU_TARGET)) && - "The target is only allowed to be a single target: CPU, GPU, SPIR, CUDNN, PROMISE\n"); + assert(((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)) && + "The 
target is only allowed to be a single target: CPU, GPU, SPIR, " + "CUDNN, PROMISE\n"); switch (Tag) { - case visc::None: - return T; - case visc::CPU_TARGET: - if (T == visc::CPU_TARGET) - return visc::CPU_TARGET; - if (T == visc::GPU_TARGET) - return visc::CPU_OR_GPU_TARGET; - return T; - case visc::GPU_TARGET: - if (T == visc::CPU_TARGET) - return visc::CPU_OR_GPU_TARGET; - if (T == visc::GPU_TARGET) - return visc::GPU_TARGET; - return T; - case visc::CPU_OR_GPU_TARGET: + case visc::None: + return T; + case visc::CPU_TARGET: + if (T == visc::CPU_TARGET) + return visc::CPU_TARGET; + if (T == visc::GPU_TARGET) return visc::CPU_OR_GPU_TARGET; - default: - assert(false && "Unknown Target\n"); + return T; + case visc::GPU_TARGET: + if (T == visc::CPU_TARGET) + return visc::CPU_OR_GPU_TARGET; + if (T == visc::GPU_TARGET) + return visc::GPU_TARGET; + return T; + case visc::CPU_OR_GPU_TARGET: + return visc::CPU_OR_GPU_TARGET; + default: + assert(false && "Unknown Target\n"); } return T; } // This functions add the hint as metadata in visc code -void addHint(Function* F, visc::Target T) { +void addHint(Function *F, visc::Target T) { // Get Module - Module* M = F->getParent(); + Module *M = F->getParent(); DEBUG(errs() << "Set preferred target for " << F->getName() << ": "); // Based on the hint, get the hint metadata - NamedMDNode* HintNode; + NamedMDNode *HintNode; switch (T) { - case visc::GPU_TARGET: - DEBUG(errs() << "GPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); - break; - case visc::CPU_TARGET: - DEBUG(errs() << "CPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); - break; - case visc::CPU_OR_GPU_TARGET: - DEBUG(errs() << "CPU or GPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); - break; - default: - llvm_unreachable("Unsupported Target Hint!"); - break; + case visc::GPU_TARGET: + DEBUG(errs() << "GPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case 
visc::CPU_TARGET: + DEBUG(errs() << "CPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + case visc::CPU_OR_GPU_TARGET: + DEBUG(errs() << "CPU or GPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + break; + default: + llvm_unreachable("Unsupported Target Hint!"); + break; } // Create a node for the function and add it to the hint node - MDTuple* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F))); + MDTuple *N = MDNode::get(M->getContext(), + ArrayRef<Metadata *>(ValueAsMetadata::get(F))); HintNode->addOperand(N); } // This function removes the hint as metadata in visc code -void removeHint(Function* F, visc::Target T) { +void removeHint(Function *F, visc::Target T) { // Get Module - Module* M = F->getParent(); - DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T << "\n"); + Module *M = F->getParent(); + DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T + << "\n"); // Based on the hint, get the hint metadata - NamedMDNode* HintNode; + NamedMDNode *HintNode; switch (T) { - case visc::GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); - break; - case visc::CPU_OR_GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); - break; - case visc::CPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); - break; - default: - llvm_unreachable("Unsupported Target Hint!"); - break; + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::CPU_OR_GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + break; + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + default: + llvm_unreachable("Unsupported Target Hint!"); + break; } // Gather metadata nodes, and keep those not associated with this function - MDNode* N = MDNode::get(M->getContext(), - 
ArrayRef<Metadata*>(ValueAsMetadata::get(F))); - std::vector<MDNode*> MDNodes; + MDNode *N = MDNode::get(M->getContext(), + ArrayRef<Metadata *>(ValueAsMetadata::get(F))); + std::vector<MDNode *> MDNodes; for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { - MDNode* MDN = HintNode->getOperand(i); + MDNode *MDN = HintNode->getOperand(i); if (MDN == N) { continue; } @@ -482,32 +499,34 @@ void removeHint(Function* F, visc::Target T) { for (unsigned i = 0; i < MDNodes.size(); i++) { HintNode->addOperand(MDNodes[i]); } - } -visc::Target getPreferredTarget(Function* F) { +visc::Target getPreferredTarget(Function *F) { DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); - Module* M = F->getParent(); + Module *M = F->getParent(); auto FoundPrefTarget = [=](StringRef Name) { - NamedMDNode* HintNode = M->getOrInsertNamedMetadata(Name); - for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { - MDNode* N = HintNode->getOperand(i); - Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); - if(F == FHint) + NamedMDNode *HintNode = M->getOrInsertNamedMetadata(Name); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode *N = HintNode->getOperand(i); + Value *FHint = + dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if (F == FHint) return true; } - return false; + return false; }; - if(FoundPrefTarget("visc_hint_cpu")) return visc::CPU_TARGET; - if(FoundPrefTarget("visc_hint_gpu")) return visc::GPU_TARGET; - if(FoundPrefTarget("visc_hint_cpu_gpu")) return visc::CPU_OR_GPU_TARGET; + if (FoundPrefTarget("visc_hint_cpu")) + return visc::CPU_TARGET; + if (FoundPrefTarget("visc_hint_gpu")) + return visc::GPU_TARGET; + if (FoundPrefTarget("visc_hint_cpu_gpu")) + return visc::CPU_OR_GPU_TARGET; return visc::None; } +} // namespace viscUtils -} // End of namespace - -#endif //VISC_UTILS_HEADER +#endif // VISC_UTILS_HEADER diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp 
b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp index dc5a044dd7efc852f6b547c120b271eeea6cc107..058419f1dc80a8650e7a3b834090a88099741431 100644 --- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp +++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp @@ -10,13 +10,13 @@ #define DEBUG_TYPE "buildDFG" #include "BuildDFG/BuildDFG.h" +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/ADT/Statistic.h" -#include "llvm/IR/ValueSymbolTable.h" #include "llvm/IR/InstIterator.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Debug.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCUtils.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -26,15 +26,15 @@ bool BuildDFG::runOnModule(Module &M) { DEBUG(errs() << "\nBUILDDFG PASS\n"); DEBUG(errs() << "-------- Searching for launch sites ----------\n"); - IntrinsicInst* II; + IntrinsicInst *II; // Iterate over all functions in the module for (auto &Func : M) { - Function* F = &Func; + Function *F = &Func; DEBUG(errs() << "Function: " << F->getName() << "\n"); - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e ; ++i) { - Instruction* I = &*i; // Grab pointer to Instruction + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &*i; // Grab pointer to Instruction if (isViscLaunchIntrinsic(I)) { DEBUG(errs() << "------------ Found launch site --------------\n"); II = cast<IntrinsicInst>(I); @@ -42,24 +42,25 @@ bool BuildDFG::runOnModule(Module &M) { assert(II && "Launch intrinsic not recognized."); // Intrinsic Instruction has been initialized from this point on. 
- Function* F = cast<Function>(II->getOperand(0)->stripPointerCasts()); + Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts()); Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F)); Roots.push_back(Root); BuildGraph(Root, F); - for(DFGraph::children_iterator i = Root->getChildGraph()->begin(), - e = Root->getChildGraph()->end(); i!=e; i++) { - DFNode* N = *i; + for (DFGraph::children_iterator i = Root->getChildGraph()->begin(), + e = Root->getChildGraph()->end(); + i != e; i++) { + DFNode *N = *i; DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n"); } Root->getChildGraph()->sortChildren(); - for(DFGraph::children_iterator i = Root->getChildGraph()->begin(), - e = Root->getChildGraph()->end(); i!=e; i++) { - DFNode* N = *i; + for (DFGraph::children_iterator i = Root->getChildGraph()->begin(), + e = Root->getChildGraph()->end(); + i != e; i++) { + DFNode *N = *i; DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n"); } viewDFGraph(Root->getChildGraph()); - } } } @@ -75,85 +76,85 @@ DFInternalNode *BuildDFG::getRoot() const { return Root; } -std::vector<DFInternalNode*> &BuildDFG::getRoots() { +std::vector<DFInternalNode *> &BuildDFG::getRoots() { assert((Roots.size() != 0) && "Number of roots cannot be zero."); - + // All roots should have the same level - for(auto *Node : Roots) - assert(Node->getLevel() == 0 && "Invalid root node."); + for (auto *Node : Roots) + assert(Node->getLevel() == 0 && "Invalid root node."); return Roots; } -//TODO: Maybe make this const +// TODO: Maybe make this const BuildDFG::HandleToDFNode &BuildDFG::getHandleToDFNodeMap() { return HandleToDFNodeMap; } -//TODO: Maybe make this const +// TODO: Maybe make this const BuildDFG::HandleToDFEdge &BuildDFG::getHandleToDFEdgeMap() { return HandleToDFEdgeMap; } -void BuildDFG::addElementToHandleToDFNodeMap(Value* V, DFNode* N) { +void BuildDFG::addElementToHandleToDFNodeMap(Value *V, DFNode *N) { assert((HandleToDFNodeMap.find(V) == 
HandleToDFNodeMap.end()) && "Attempted to insert duplicate key in HandleToDFNodeMap"); - HandleToDFNodeMap.insert(std::pair<Value*, DFNode*>(V,N)); + HandleToDFNodeMap.insert(std::pair<Value *, DFNode *>(V, N)); } -//TODO: check if the removed element was not there -void BuildDFG::removeElementFromHandleToDFNodeMap(Value* V) { +// TODO: check if the removed element was not there +void BuildDFG::removeElementFromHandleToDFNodeMap(Value *V) { HandleToDFNodeMap.erase(V); } -void BuildDFG::addElementToHandleToDFEdgeMap(Value* V, DFEdge* E) { +void BuildDFG::addElementToHandleToDFEdgeMap(Value *V, DFEdge *E) { assert((HandleToDFEdgeMap.find(V) == HandleToDFEdgeMap.end()) && "Attempted to insert duplicate key in HandleToDFEdgeMap"); - HandleToDFEdgeMap.insert(std::pair<Value*, DFEdge*>(V,E)); + HandleToDFEdgeMap.insert(std::pair<Value *, DFEdge *>(V, E)); } -//TODO: check if the removed element was not there -void BuildDFG::removeElementFromHandleToDFEdgeMap(Value* V) { +// TODO: check if the removed element was not there +void BuildDFG::removeElementFromHandleToDFEdgeMap(Value *V) { HandleToDFEdgeMap.erase(V); } // Returns true if instruction I is a visc launch intrinsic, false otherwise -bool BuildDFG::isViscLaunchIntrinsic(Instruction* I) { - if(!isa<IntrinsicInst>(I)) +bool BuildDFG::isViscLaunchIntrinsic(Instruction *I) { + if (!isa<IntrinsicInst>(I)) return false; - IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst *II = cast<IntrinsicInst>(I); return (II->getCalledFunction()->getName()).equals("llvm.visc.launch"); } // Returns true if instruction I is a visc graph intrinsic, false otherwise -bool BuildDFG::isViscGraphIntrinsic(Instruction* I) { - if(!isa<IntrinsicInst>(I)) +bool BuildDFG::isViscGraphIntrinsic(Instruction *I) { + if (!isa<IntrinsicInst>(I)) return false; - IntrinsicInst* II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") - || 
(II->getCalledFunction()->getName()).startswith("llvm.visc.bind"); + IntrinsicInst *II = cast<IntrinsicInst>(I); + return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") || + (II->getCalledFunction()->getName()).startswith("llvm.visc.bind"); } // Returns true if instruction I is a visc query intrinsic, false otherwise -bool BuildDFG::isViscQueryIntrinsic(Instruction* I) { - if(!isa<IntrinsicInst>(I)) +bool BuildDFG::isViscQueryIntrinsic(Instruction *I) { + if (!isa<IntrinsicInst>(I)) return false; - IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst *II = cast<IntrinsicInst>(I); return (II->getCalledFunction()->getName()).startswith("llvm.visc.get"); } // Returns true if instruction I is a visc intrinsic, false otherwise -bool BuildDFG::isViscIntrinsic(Instruction* I) { - if(!isa<IntrinsicInst>(I)) +bool BuildDFG::isViscIntrinsic(Instruction *I) { + if (!isa<IntrinsicInst>(I)) return false; - IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst *II = cast<IntrinsicInst>(I); return (II->getCalledFunction()->getName()).startswith("llvm.visc"); } // Two types are "congruent" if they are identical, or if they are both // pointer types with different pointee types and the same address space. -bool BuildDFG::isTypeCongruent(Type* L, Type* R) { - if(L == R) +bool BuildDFG::isTypeCongruent(Type *L, Type *R) { + if (L == R) return true; PointerType *PL = dyn_cast<PointerType>(L); PointerType *PR = dyn_cast<PointerType>(R); @@ -163,15 +164,15 @@ bool BuildDFG::isTypeCongruent(Type* L, Type* R) { } // Handles all the createNodeXX visc intrinsics. 
-void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) { +void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { bool isInternalNode = false; - Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts()); + Function *F = cast<Function>((II->getOperand(0))->stripPointerCasts()); // Check if the function associated with this intrinsic is a leaf or // internal node for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - Instruction* I = &*i; // Grab pointer to Instruction + Instruction *I = &*i; // Grab pointer to Instruction if (isViscGraphIntrinsic(I)) isInternalNode = true; } @@ -179,46 +180,49 @@ void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) { // Number of Dimensions would be equal to the (number of operands - 1) as // the first operand is the pointer to associated Function and the // remaining operands are the limits in each dimension. - unsigned numOfDim = II->getCalledFunction()->getFunctionType()->getNumParams()-1; - assert(numOfDim <= 3 - && "Invalid number of dimensions for createNode intrinsic!"); - std::vector<Value*> dimLimits; + unsigned numOfDim = + II->getCalledFunction()->getFunctionType()->getNumParams() - 1; + assert(numOfDim <= 3 && + "Invalid number of dimensions for createNode intrinsic!"); + std::vector<Value *> dimLimits; for (unsigned i = 1; i <= numOfDim; i++) { // The operands of II are same as the operands of the called // intrinsic. It has one extra operand at the end, which is the intrinsic // being called. 
- dimLimits.push_back(cast<Value> (II->getOperand(i))); + dimLimits.push_back(cast<Value>(II->getOperand(i))); } - if(isInternalNode) { + if (isInternalNode) { // Create Internal DFNode, add it to the map and recursively build its // dataflow graph - DFInternalNode* childDFNode = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + DFInternalNode *childDFNode = DFInternalNode::Create( + II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; BuildGraph(childDFNode, F); - } - else { + } else { // Create Leaf DFnode and add it to the map. - DFLeafNode* childDFNode = DFLeafNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + DFLeafNode *childDFNode = DFLeafNode::Create( + II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; } } -void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) { +void BuildDFG::handleCreateEdge(DFInternalNode *N, IntrinsicInst *II) { // The DFNode structures must be in the map before the edge is processed HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0)); assert(DFI != HandleToDFNodeMap.end()); DFI = HandleToDFNodeMap.find(II->getOperand(1)); assert(DFI != HandleToDFNodeMap.end()); - DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)]; - DFNode* DestDF = HandleToDFNodeMap[II->getOperand(1)]; + DFNode *SrcDF = HandleToDFNodeMap[II->getOperand(0)]; + DFNode *DestDF = HandleToDFNodeMap[II->getOperand(1)]; bool EdgeType = !cast<ConstantInt>(II->getOperand(2))->isZero(); - unsigned SourcePosition = cast<ConstantInt>(II->getOperand(3))->getZExtValue(); + unsigned SourcePosition = + cast<ConstantInt>(II->getOperand(3))->getZExtValue(); unsigned DestPosition = cast<ConstantInt>(II->getOperand(4))->getZExtValue(); bool isStreaming = !cast<ConstantInt>(II->getOperand(5))->isZero(); @@ 
-227,27 +231,22 @@ void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) { // Get destination type FunctionType *FT = DestDF->getFuncPointer()->getFunctionType(); - assert((FT->getNumParams() > DestPosition) - && "Invalid argument number for destination dataflow node!"); + assert((FT->getNumParams() > DestPosition) && + "Invalid argument number for destination dataflow node!"); DestTy = FT->getParamType(DestPosition); // Get source type - StructType* OutTy = SrcDF->getOutputType(); - assert((OutTy->getNumElements() > SourcePosition) - && "Invalid argument number for source dataflow node!"); + StructType *OutTy = SrcDF->getOutputType(); + assert((OutTy->getNumElements() > SourcePosition) && + "Invalid argument number for source dataflow node!"); SrcTy = OutTy->getElementType(SourcePosition); // check if the types are compatible - assert(isTypeCongruent(SrcTy, DestTy) - && "Source and destination type of edge do not match"); + assert(isTypeCongruent(SrcTy, DestTy) && + "Source and destination type of edge do not match"); - DFEdge* newDFEdge = DFEdge::Create(SrcDF, - DestDF, - EdgeType, - SourcePosition, - DestPosition, - DestTy, - isStreaming); + DFEdge *newDFEdge = DFEdge::Create(SrcDF, DestDF, EdgeType, SourcePosition, + DestPosition, DestTy, isStreaming); HandleToDFEdgeMap[II] = newDFEdge; @@ -255,43 +254,39 @@ void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) { N->addEdgeToDFGraph(newDFEdge); } -void BuildDFG::handleBindInput(DFInternalNode* N, IntrinsicInst* II) { +void BuildDFG::handleBindInput(DFInternalNode *N, IntrinsicInst *II) { // The DFNode structures must be in the map before the edge is processed HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0)); assert(DFI != HandleToDFNodeMap.end()); - DFNode* SrcDF = N->getChildGraph()->getEntry(); - DFNode* DestDF = HandleToDFNodeMap[II->getOperand(0)]; + DFNode *SrcDF = N->getChildGraph()->getEntry(); + DFNode *DestDF = 
HandleToDFNodeMap[II->getOperand(0)]; - unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + unsigned SourcePosition = + cast<ConstantInt>(II->getOperand(1))->getZExtValue(); unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero(); - + // Get destination type FunctionType *FT = DestDF->getFuncPointer()->getFunctionType(); - assert((FT->getNumParams() > DestPosition) - && "Invalid argument number for destination dataflow node!"); - Type* DestTy = FT->getParamType(DestPosition); + assert((FT->getNumParams() > DestPosition) && + "Invalid argument number for destination dataflow node!"); + Type *DestTy = FT->getParamType(DestPosition); // Get source type FT = SrcDF->getFuncPointer()->getFunctionType(); - assert((FT->getNumParams() > SourcePosition) - && "Invalid argument number for parent dataflow node!"); - Type* SrcTy = FT->getParamType(SourcePosition); + assert((FT->getNumParams() > SourcePosition) && + "Invalid argument number for parent dataflow node!"); + Type *SrcTy = FT->getParamType(SourcePosition); // check if the types are compatible - assert(isTypeCongruent(SrcTy, DestTy) - && "Source and destination type of edge do not match"); + assert(isTypeCongruent(SrcTy, DestTy) && + "Source and destination type of edge do not match"); // Add Binding as an edge between Entry and child Node - DFEdge* newDFEdge = DFEdge::Create(SrcDF, - DestDF, - false, - SourcePosition, - DestPosition, - DestTy, - isStreaming); + DFEdge *newDFEdge = DFEdge::Create(SrcDF, DestDF, false, SourcePosition, + DestPosition, DestTy, isStreaming); HandleToDFEdgeMap[II] = newDFEdge; @@ -299,43 +294,39 @@ void BuildDFG::handleBindInput(DFInternalNode* N, IntrinsicInst* II) { N->addEdgeToDFGraph(newDFEdge); } -void BuildDFG::handleBindOutput(DFInternalNode* N, IntrinsicInst* II) { +void BuildDFG::handleBindOutput(DFInternalNode *N, IntrinsicInst *II) { // The DFNode structures must 
be in the map before the edge is processed HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0)); assert(DFI != HandleToDFNodeMap.end()); - DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)]; - DFNode* DestDF = N->getChildGraph()->getExit(); + DFNode *SrcDF = HandleToDFNodeMap[II->getOperand(0)]; + DFNode *DestDF = N->getChildGraph()->getExit(); - unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + unsigned SourcePosition = + cast<ConstantInt>(II->getOperand(1))->getZExtValue(); unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero(); - + // Get destination type - StructType* OutTy = DestDF->getOutputType(); - assert((OutTy->getNumElements() > DestPosition) - && "Invalid argument number for destination parent dataflow node!"); - Type* DestTy = OutTy->getElementType(DestPosition); + StructType *OutTy = DestDF->getOutputType(); + assert((OutTy->getNumElements() > DestPosition) && + "Invalid argument number for destination parent dataflow node!"); + Type *DestTy = OutTy->getElementType(DestPosition); // Get source type OutTy = SrcDF->getOutputType(); - assert((OutTy->getNumElements() > SourcePosition) - && "Invalid argument number for source dataflow node!"); - Type* SrcTy = OutTy->getElementType(SourcePosition); + assert((OutTy->getNumElements() > SourcePosition) && + "Invalid argument number for source dataflow node!"); + Type *SrcTy = OutTy->getElementType(SourcePosition); // check if the types are compatible - assert(isTypeCongruent(SrcTy, DestTy) - && "Source and destination type of edge do not match"); + assert(isTypeCongruent(SrcTy, DestTy) && + "Source and destination type of edge do not match"); // Add Binding as an edge between child and exit node - DFEdge* newDFEdge = DFEdge::Create(SrcDF, - DestDF, - false, - SourcePosition, - DestPosition, - DestTy, - isStreaming); + DFEdge *newDFEdge = DFEdge::Create(SrcDF, DestDF, 
false, SourcePosition, + DestPosition, DestTy, isStreaming); HandleToDFEdgeMap[II] = newDFEdge; @@ -343,7 +334,7 @@ void BuildDFG::handleBindOutput(DFInternalNode* N, IntrinsicInst* II) { N->addEdgeToDFGraph(newDFEdge); } -void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) { +void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) { DEBUG(errs() << "FUNCTION: " << F->getName() << "\n"); // TODO: Place checks for valid visc functions. For example one of the // check can be that any function that contains visc dataflow graph @@ -351,49 +342,55 @@ void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) { // Iterate over all the instructions of a function and look for visc // intrinsics. - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e ; ++i) { - Instruction* I = &*i; // Grab pointer to Instruction + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &*i; // Grab pointer to Instruction DEBUG(errs() << *I << "\n"); - if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) { - DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " << II->getCalledFunction()->getName()<<"\n"); - switch(II->getIntrinsicID()) { - case Intrinsic::visc_createNode: - case Intrinsic::visc_createNode1D: - case Intrinsic::visc_createNode2D: - case Intrinsic::visc_createNode3D: - handleCreateNode (N, II); - break; - case Intrinsic::visc_createEdge: - handleCreateEdge(N, II); - break; - case Intrinsic::visc_bind_input: - handleBindInput(N, II); - break; - case Intrinsic::visc_bind_output: - handleBindOutput(N, II); - break; - - //TODO: Reconsider launch within a dataflow graph (recursion?) 
- case Intrinsic::visc_wait: - case Intrinsic::visc_launch: - DEBUG(errs() << "Error: Launch/wait intrinsic used within a dataflow graph\n\t" << *II << "\n"); - break; - - default: - DEBUG(errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" << *II << "\n"); - break; - } - continue; - } - if(!isa<ReturnInst>(I) && !isa<CastInst>(I)) { - DEBUG(errs() << "Non-intrinsic instruction: " << *I << "\n"); - llvm_unreachable("Found non-intrinsic instruction inside an internal node. Only return instruction is allowed!"); - } - } + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " + << II->getCalledFunction()->getName() << "\n"); + switch (II->getIntrinsicID()) { + case Intrinsic::visc_createNode: + case Intrinsic::visc_createNode1D: + case Intrinsic::visc_createNode2D: + case Intrinsic::visc_createNode3D: + handleCreateNode(N, II); + break; + case Intrinsic::visc_createEdge: + handleCreateEdge(N, II); + break; + case Intrinsic::visc_bind_input: + handleBindInput(N, II); + break; + case Intrinsic::visc_bind_output: + handleBindOutput(N, II); + break; + + // TODO: Reconsider launch within a dataflow graph (recursion?) + case Intrinsic::visc_wait: + case Intrinsic::visc_launch: + DEBUG(errs() + << "Error: Launch/wait intrinsic used within a dataflow graph\n\t" + << *II << "\n"); + break; + + default: + DEBUG( + errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" + << *II << "\n"); + break; + } + continue; + } + if (!isa<ReturnInst>(I) && !isa<CastInst>(I)) { + DEBUG(errs() << "Non-intrinsic instruction: " << *I << "\n"); + llvm_unreachable("Found non-intrinsic instruction inside an internal " + "node. 
Only return instruction is allowed!"); + } + } } char BuildDFG::ID = 0; -static RegisterPass<BuildDFG> X("buildDFG", "Hierarchical Dataflow Graph Builder Pass", false, false); +static RegisterPass<BuildDFG> + X("buildDFG", "Hierarchical Dataflow Graph Builder Pass", false, false); } // End of namespace builddfg - diff --git a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp index e8e814f881b5066d86f60bd10ae5941eed9179d6..6dae9e6977d31a0b62a9fa903966ec10810a2f71 100644 --- a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp +++ b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp @@ -8,17 +8,17 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "ClearDFG" +#include "BuildDFG/BuildDFG.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Support/Debug.h" -#include "BuildDFG/BuildDFG.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; using namespace builddfg; -//STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted"); +// STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted"); namespace { @@ -35,18 +35,14 @@ private: public: bool runOnModule(Module &M); - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<BuildDFG>(); - } - - + void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<BuildDFG>(); } }; // Visitor for Code generation traversal (tree traversal for now) class TreeTraversal : public DFNodeVisitor { private: - //Member variables + // Member variables Module &M; BuildDFG &DFG; @@ -54,37 +50,43 @@ private: // extra index and dimension arguments. 
This map also serves to find out if // we already have an index and dim extended function copy or not (i.e., // "Have we visited this function before?") - ValueMap<Function*, Function*> FMap; - DenseMap<DFNode*, CallInst*> CallMap; + ValueMap<Function *, Function *> FMap; + DenseMap<DFNode *, CallInst *> CallMap; + + // Functions + void deleteNode(DFNode *N); - //Functions - void deleteNode(DFNode* N); public: // Constructor - TreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { } + TreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} - virtual void visit(DFInternalNode* N) { + virtual void visit(DFInternalNode *N) { // Follows a bottom-up approach for code generation. // First generate code for all the child nodes - for(DFGraph::children_iterator i = N->getChildGraph()->begin(), - e = N->getChildGraph()->end(); i != e; ++i) { - DFNode* child = *i; + for (DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); + i != e; ++i) { + DFNode *child = *i; child->applyDFNodeVisitor(*this); } - DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() + << "\n"); // Generate code for this internal node now. This way all the cloned // functions for children exist. 
deleteNode(N); - DEBUG(errs() << "\tDone - " << "\n"); - //errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"; + DEBUG(errs() << "\tDone - " + << "\n"); + // errs() << "DONE: Generating Code for Node (I) - " << + // N->getFuncPointer()->getName() << "\n"; } - virtual void visit(DFLeafNode* N) { - DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n"); + virtual void visit(DFLeafNode *N) { + DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() + << "\n"); deleteNode(N); - DEBUG(errs() << "DONE" << "\n"); + DEBUG(errs() << "DONE" + << "\n"); } - }; bool ClearDFG::runOnModule(Module &M) { @@ -95,26 +97,28 @@ bool ClearDFG::runOnModule(Module &M) { BuildDFG &DFG = getAnalysis<BuildDFG>(); // DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode*> Roots = DFG.getRoots(); + std::vector<DFInternalNode *> Roots = DFG.getRoots(); // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - Function* VI = M.getFunction("llvm.visc.init"); + Function *VI = M.getFunction("llvm.visc.init"); assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n"); - for(Value::user_iterator ui = VI->user_begin(), ue = VI->user_end(); ui != ue; ui++) { - Instruction* I = dyn_cast<Instruction>(*ui); + for (Value::user_iterator ui = VI->user_begin(), ue = VI->user_end(); + ui != ue; ui++) { + Instruction *I = dyn_cast<Instruction>(*ui); I->eraseFromParent(); } VI->replaceAllUsesWith(UndefValue::get(VI->getType())); VI->eraseFromParent(); - Function* VC = M.getFunction("llvm.visc.cleanup"); + Function *VC = M.getFunction("llvm.visc.cleanup"); assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n"); - for(Value::user_iterator ui = VC->user_begin(), ue = VC->user_end(); ui != ue; ui++) { - Instruction* I = dyn_cast<Instruction>(*ui); + for (Value::user_iterator ui = VC->user_begin(), ue 
= VC->user_end(); + ui != ue; ui++) { + Instruction *I = dyn_cast<Instruction>(*ui); I->eraseFromParent(); } - + VC->replaceAllUsesWith(UndefValue::get(VC->getType())); VC->eraseFromParent(); @@ -122,25 +126,25 @@ bool ClearDFG::runOnModule(Module &M) { TreeTraversal *Visitor = new TreeTraversal(M, DFG); // Initiate code generation for root DFNode - for (auto rootNode: Roots) { + for (auto rootNode : Roots) { Visitor->visit(rootNode); } delete Visitor; return true; } -void TreeTraversal::deleteNode(DFNode* N) { - if(N->isDummyNode()) +void TreeTraversal::deleteNode(DFNode *N) { + if (N->isDummyNode()) return; // Erase Function associated with this node - Function* F = N->getFuncPointer(); + Function *F = N->getFuncPointer(); F->replaceAllUsesWith(UndefValue::get(F->getType())); F->eraseFromParent(); // If N is not a root node, we are done. Return. - if(!N->isRoot()) + if (!N->isRoot()) return; // N is a root node. Delete the Launch Intrinsic associated it with as well. - IntrinsicInst* LI = N->getInstruction(); + IntrinsicInst *LI = N->getInstruction(); LI->replaceAllUsesWith(UndefValue::get(LI->getType())); LI->eraseFromParent(); } @@ -148,8 +152,7 @@ void TreeTraversal::deleteNode(DFNode* N) { } // End of namespace char ClearDFG::ID = 0; -static RegisterPass<ClearDFG> X("clearDFG", - "Delete all DFG functions for which code has been generated", - false /* does not modify the CFG */, - true /* transformation, not just analysis */); - +static RegisterPass<ClearDFG> + X("clearDFG", "Delete all DFG functions for which code has been generated", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 08f6314a812844f85e7fe6d5ce50cf6e8393a2e0..c9ce98cb7230cc694d50303eeff8f007a24aecdd 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ 
-15,29 +15,28 @@ #define SHARED_ADDRSPACE 3 #define DEBUG_TYPE "DFG2LLVM_NVPTX" +#include "SupportVISC/DFG2LLVM.h" +#include "SupportVISC/VISCTimer.h" +#include "SupportVISC/VISCUtils.h" +#include "llvm-c/Core.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" #include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IR/Module.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Pass.h" #include "llvm/Support/FileSystem.h" -#include "llvm/IR/Attributes.h" -#include "llvm-c/Core.h" -#include "SupportVISC/VISCTimer.h" -#include "SupportVISC/DFG2LLVM.h" -#include "SupportVISC/VISCUtils.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/Support/ToolOutputFile.h" #include "llvm/IR/UseListOrder.h" - +#include "llvm/Support/ToolOutputFile.h" #include <sstream> @@ -47,8 +46,8 @@ using namespace dfg2llvm; using namespace viscUtils; // VISC Command line option to use timer or not -static cl::opt<bool> -VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers")); +static cl::opt<bool> VISCTimer_NVPTX("visc-timers-ptx", + cl::desc("Enable visc timers")); namespace { // Helper class declarations @@ -57,94 +56,88 @@ namespace { // in bytes. 
Would have preferred to use tuple but support not yet available class OutputPtr { public: - OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) - : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} - Value* h_ptr; - Value* d_ptr; - Value* bytes; + Value *h_ptr; + Value *d_ptr; + Value *bytes; }; // Class to maintain important kernel info required for generating runtime // calls class Kernel { public: - Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap = - std::map<unsigned, unsigned>(), - std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap = - std::map<unsigned, std::pair<Value*, unsigned> >(), - std::vector<unsigned> _outArgMap = std::vector<unsigned>(), - unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), - unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) - : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), - sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim), - globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) { - - assert(gridDim == globalWGSize.size() - && "gridDim should be same as the size of vector globalWGSize"); - assert(blockDim == localWGSize.size() - && "blockDim should be same as the size of vector localWGSize"); + Kernel( + Function *_KF, DFLeafNode *_KLeafNode, + std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(), + std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap = + std::map<unsigned, std::pair<Value *, unsigned>>(), + std::vector<unsigned> _outArgMap = std::vector<unsigned>(), + unsigned _gridDim = 0, + std::vector<Value *> _globalWGSize = std::vector<Value *>(), + unsigned _blockDim = 0, + std::vector<Value *> _localWGSize = std::vector<Value *>()) + : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), + 
sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), + gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), + localWGSize(_localWGSize) { + + assert(gridDim == globalWGSize.size() && + "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() && + "blockDim should be same as the size of vector localWGSize"); } - Function* KernelFunction; - DFLeafNode* KernelLeafNode; + Function *KernelFunction; + DFLeafNode *KernelLeafNode; std::map<unsigned, unsigned> inArgMap; // Map for shared memory arguments - std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap; + std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap; // Fields for (potential) allocation node - DFLeafNode* AllocationNode; - Function* AllocationFunction; + DFLeafNode *AllocationNode; + Function *AllocationFunction; std::map<unsigned, unsigned> allocInArgMap; std::vector<unsigned> outArgMap; unsigned gridDim; - std::vector<Value*> globalWGSize; + std::vector<Value *> globalWGSize; unsigned blockDim; - std::vector<Value*> localWGSize; + std::vector<Value *> localWGSize; std::vector<int> localDimMap; - std::map<unsigned, unsigned> &getInArgMap() { - return inArgMap; - } - void setInArgMap(std::map<unsigned, unsigned> map) { - inArgMap = map; - } + std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; } + void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; } - std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() { + std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() { return sharedInArgMap; } - void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) { + void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) { sharedInArgMap = map; } - std::vector<unsigned> &getOutArgMap() { - return outArgMap; - } - void setOutArgMap(std::vector<unsigned> map) { - outArgMap = map; - } + std::vector<unsigned> &getOutArgMap() { return outArgMap; } 
+ void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; } - void setLocalWGSize(std::vector<Value*> V) { - localWGSize = V; - } + void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; } - bool hasLocalWG() const { - return blockDim != 0; - } + bool hasLocalWG() const { return blockDim != 0; } }; // Helper function declarations -static bool canBePromoted(Argument* arg, Function* F); -static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*, - ValueToValueMapTy&, Instruction*); -static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&, - Instruction*, const Twine& WGName = "WGSize"); -static std::string getPTXFilename(const Module&); -static std::string getFilenameFromModule(const Module& M); +static bool canBePromoted(Argument *arg, Function *F); +static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&, + Kernel *, ValueToValueMapTy &, Instruction *); +static Value *genWorkGroupPtr(Module &M, std::vector<Value *>, + ValueToValueMapTy &, Instruction *, + const Twine &WGName = "WGSize"); +static std::string getPTXFilename(const Module &); +static std::string getFilenameFromModule(const Module &M); static void changeDataLayout(Module &); static void changeTargetTriple(Module &); static void findReturnInst(Function *, std::vector<ReturnInst *> &); -static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &); +static void findIntrinsicInst(Function *, Intrinsic::ID, + std::vector<IntrinsicInst *> &); static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID); static std::string getAtomicOpName(Intrinsic::ID); @@ -154,7 +147,6 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM { DFG2LLVM_NVPTX() : DFG2LLVM(ID) {} private: - public: bool runOnModule(Module &M); }; @@ -163,10 +155,10 @@ public: class CGT_NVPTX : public CodeGenTraversal { private: - //Member variables + // Member variables std::unique_ptr<Module> KernelM; - DFNode* KernelLaunchNode = NULL; - Kernel* kernel; 
+ DFNode *KernelLaunchNode = NULL; + Kernel *kernel; // VISC Runtime API FunctionCallee llvm_visc_ocl_launch; @@ -181,14 +173,16 @@ private: FunctionCallee llvm_visc_ocl_getOutput; FunctionCallee llvm_visc_ocl_executeNode; - //Functions + // Functions std::string getKernelsModuleName(Module &M); - void fixValueAddrspace(Value* V, unsigned addrspace); - std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*); - Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i); - void addCLMetadata(Function* F); - Function* transformFunctionToVoid(Function* F); - void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); + void fixValueAddrspace(Value *V, unsigned addrspace); + std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *, + Function *); + Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags, + unsigned i); + void addCLMetadata(Function *F); + Function *transformFunctionToVoid(Function *F); + void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName); // Virtual Functions void init() { @@ -196,24 +190,25 @@ private: TargetName = "NVPTX"; } void initRuntimeAPI(); - void codeGen(DFInternalNode* N); - void codeGen(DFLeafNode* N); + void codeGen(DFInternalNode *N); + void codeGen(DFLeafNode *N); public: - // Constructor - CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { + CGT_NVPTX(Module &_M, BuildDFG &_DFG) + : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { init(); initRuntimeAPI(); DEBUG(errs() << "Old module pointer: " << &_M << "\n"); - DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n"); + DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n"); - // Copying instead of creating new, in order to preserve required info (metadata) - // Remove functions, global variables and aliases - std::vector<GlobalVariable*> GVVect; + // Copying instead of creating new, in order to 
preserve required info + // (metadata) Remove functions, global variables and aliases + std::vector<GlobalVariable *> GVVect; for (Module::global_iterator mi = KernelM->global_begin(), - me = KernelM->global_end(); (mi != me); ++mi) { - GlobalVariable* GV = &*mi; + me = KernelM->global_end(); + (mi != me); ++mi) { + GlobalVariable *GV = &*mi; GVVect.push_back(GV); } for (auto *GV : GVVect) { @@ -221,10 +216,10 @@ public: GV->eraseFromParent(); } - std::vector<Function*> FuncVect; - for (Module::iterator mi = KernelM->begin(), - me = KernelM->end(); (mi != me); ++mi) { - Function* F = &*mi; + std::vector<Function *> FuncVect; + for (Module::iterator mi = KernelM->begin(), me = KernelM->end(); + (mi != me); ++mi) { + Function *F = &*mi; FuncVect.push_back(F); } for (auto *F : FuncVect) { @@ -232,10 +227,11 @@ public: F->eraseFromParent(); } - std::vector<GlobalAlias*> GAVect; + std::vector<GlobalAlias *> GAVect; for (Module::alias_iterator mi = KernelM->alias_begin(), - me = KernelM->alias_end(); (mi != me); ++mi) { - GlobalAlias* GA = &*mi; + me = KernelM->alias_end(); + (mi != me); ++mi) { + GlobalAlias *GA = &*mi; GAVect.push_back(GA); } for (auto *GA : GAVect) { @@ -246,9 +242,7 @@ public: changeDataLayout(*KernelM); changeTargetTriple(*KernelM); - DEBUG(errs() << *KernelM); - } void writeKernelsModule(); @@ -260,14 +254,14 @@ void CGT_NVPTX::initRuntimeAPI() { // Load Runtime API Module SMDiagnostic Err; - char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll"; runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - if(runtimeModule == nullptr) + if (runtimeModule == nullptr) DEBUG(errs() << Err.getMessage()); else DEBUG(errs() << "Successfully loaded visc-rt API module\n"); @@ -290,27 +284,25 @@ void 
CGT_NVPTX::initRuntimeAPI() { // Insert init context in main DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n"); - Function* VI = M.getFunction("llvm.visc.init"); + Function *VI = M.getFunction("llvm.visc.init"); assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); InitCall = cast<Instruction>(*VI->user_begin()); initializeTimerSet(InitCall); switchToTimer(visc_TimerID_INIT_CTX, InitCall); CallInst::Create(llvm_visc_ocl_initContext, - ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)), - "", InitCall); + ArrayRef<Value *>(getTargetID(M, visc::GPU_TARGET)), "", + InitCall); switchToTimer(visc_TimerID_NONE, InitCall); // Insert print instruction at visc exit DEBUG(errs() << "Gen Code to print NVPTX Timer\n"); - Function* VC = M.getFunction("llvm.visc.cleanup"); + Function *VC = M.getFunction("llvm.visc.cleanup"); DEBUG(errs() << *VC << "\n"); assert(VC->getNumUses() == 1 && "__visc__clear should only be used once"); CleanupCall = cast<Instruction>(*VC->user_begin()); printTimerSet(CleanupCall); - - } // Generate Code to call the kernel @@ -318,36 +310,37 @@ void CGT_NVPTX::initRuntimeAPI() { // used to generate a function to associate with this leaf node. The function // is responsible for all the memory allocation/transfer and invoking the // kernel call on the device -void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { +void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, + const Twine &FileName) { // Check if clone already exists. If it does, it means we have visited this // function before. 
-// assert(N->getGenFunc() == NULL && "Code already generated for this node"); + // assert(N->getGenFunc() == NULL && "Code already generated for this node"); assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL && "Code already generated for this node"); // Useful values - Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); - Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); // If kernel struct has not been initialized with kernel function, then fail assert(K != NULL && "No kernel found!!"); DEBUG(errs() << "Generating kernel call code\n"); - Function* F = N->getFuncPointer(); - + Function *F = N->getFuncPointer(); // Create of clone of F with no instructions. Only the type is the same as F // without the extra arguments. - Function* F_X86; + Function *F_X86; // Clone the function, if we are seeing this function for the first time. We // only need a clone in terms of type. ValueToValueMapTy VMap; // Create new function with the same type - F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + F_X86 = + Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); // Loop over the arguments, copying the names of arguments over. 
Function::arg_iterator dest_iterator = F_X86->arg_begin(); @@ -360,27 +353,26 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi // Add a basic block to this empty function BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86); - ReturnInst* RI = ReturnInst::Create(M.getContext(), - UndefValue::get(F_X86->getReturnType()), BB); + ReturnInst *RI = ReturnInst::Create( + M.getContext(), UndefValue::get(F_X86->getReturnType()), BB); // FIXME: Adding Index and Dim arguments are probably not required except // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do // have those arguments) // Add Index and Dim arguments except for the root node - if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + if (!N->isRoot() && !N->getParent()->isChildGraphStreaming()) F_X86 = addIdxDimArgs(F_X86); BB = &*F_X86->begin(); RI = cast<ReturnInst>(BB->getTerminator()); - //Add the generated function info to DFNode -// N->setGenFunc(F_X86, visc::CPU_TARGET); + // Add the generated function info to DFNode + // N->setGenFunc(F_X86, visc::CPU_TARGET); N->addGenFunc(F_X86, visc::GPU_TARGET, true); DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " << N->getFuncPointer()->getName() << "\n"); - // Loop over the arguments, to create the VMap dest_iterator = F_X86->arg_begin(); for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); @@ -412,51 +404,53 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi break; } - assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); + assert(C->isDummyNode() == false && "Internal Node only contains dummy + nodes!"); Function* CF = C->getFuncPointer(); */ - Function* KF = K->KernelLeafNode->getFuncPointer(); + Function *KF = K->KernelLeafNode->getFuncPointer(); // Initialize context - //DEBUG(errs() << "Initializing context" << "\n"); - //CallInst::Create(llvm_visc_ocl_initContext, None, "", 
RI); + // DEBUG(errs() << "Initializing context" << "\n"); + // CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); - DEBUG(errs() << "Initializing commandQ" << "\n"); + DEBUG(errs() << "Initializing commandQ" + << "\n"); // Initialize command queue switchToTimer(visc_TimerID_SETUP, InitCall); - Value* fileStr = getStringPointer(FileName, InitCall, "Filename"); + Value *fileStr = getStringPointer(FileName, InitCall, "Filename"); DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); - DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n"); - Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName"); - - Value* LaunchInstArgs[] = {fileStr, kernelStr}; - - DEBUG(errs() << "Inserting launch call" << "\n"); - CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, - ArrayRef<Value*>(LaunchInstArgs, 2), - "graph"+KF->getName(), - InitCall); + DEBUG(errs() << "Generating code for kernel - " + << K->KernelFunction->getName() << "\n"); + Value *kernelStr = + getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName"); + + Value *LaunchInstArgs[] = {fileStr, kernelStr}; + + DEBUG(errs() << "Inserting launch call" + << "\n"); + CallInst *NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, + ArrayRef<Value *>(LaunchInstArgs, 2), + "graph" + KF->getName(), InitCall); DEBUG(errs() << *NVPTX_Ctx << "\n"); - GraphIDAddr = new GlobalVariable(M, - NVPTX_Ctx->getType(), - false, + GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false, GlobalValue::CommonLinkage, Constant::getNullValue(NVPTX_Ctx->getType()), - "graph"+KF->getName()+".addr"); + "graph" + KF->getName() + ".addr"); DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n"); - StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); + StoreInst *SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); DEBUG(errs() << *SI << "\n"); switchToTimer(visc_TimerID_NONE, InitCall); switchToTimer(visc_TimerID_SETUP, RI); - 
Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI); + Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI); // Iterate over the required input edges of the node and use the visc-rt API // to set inputs DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); std::vector<OutputPtr> OutputPointers; - // Vector to hold the device memory object that need to be cleared before we release - // context - std::vector<Value*> DevicePointers; + // Vector to hold the device memory object that need to be cleared before we + // release context + std::vector<Value *> DevicePointers; std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap(); /* @@ -468,133 +462,134 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi */ - for(auto &InArgMapPair : kernelInArgMap) { + for (auto &InArgMapPair : kernelInArgMap) { unsigned i = InArgMapPair.first; - Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second); - DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); + Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second); + DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n"); // input value has been obtained. // Check if input is a scalar value or a pointer operand // For scalar values such as int, float, etc. the size is simply the size of // type on target machine, but for pointers, the size of data would be the // next integer argument - if(inputVal->getType()->isPointerTy()) { + if (inputVal->getType()->isPointerTy()) { switchToTimer(visc_TimerID_COPY_PTR, RI); // Pointer Input // CheckAttribute - Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False; - Value* isInput = ((hasAttribute(KF, i, Attribute::Out)) - && !(hasAttribute(KF, i, Attribute::In)))? False : True; - - Argument* A = getArgumentAt(KF, i); - if(isOutput == True) { + Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? 
True : False; + Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) && + !(hasAttribute(KF, i, Attribute::In))) + ? False + : True; + + Argument *A = getArgumentAt(KF, i); + if (isOutput == True) { DEBUG(errs() << *A << " is an OUTPUT argument\n"); } - if(isInput == True) { + if (isInput == True) { DEBUG(errs() << *A << " is an INPUT argument\n"); } - - Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, - Type::getInt8PtrTy(M.getContext()), - inputVal->getName()+".i8ptr", - RI); + Value *inputValI8Ptr = CastInst::CreatePointerCast( + inputVal, Type::getInt8PtrTy(M.getContext()), + inputVal->getName() + ".i8ptr", RI); // Assert that the pointer argument size (next argument) is in the map - assert(kernelInArgMap.find(i+1) != kernelInArgMap.end()); - - Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]); - assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) - && "Pointer type input must always be followed by size (integer type)"); - Value* setInputArgs[] = {GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - inputSize, - isInput, - isOutput - }; - Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, - ArrayRef<Value*>(setInputArgs, 6), "", RI); + assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end()); + + Value *inputSize = getArgumentAt(F_X86, kernelInArgMap[i + 1]); + assert( + inputSize->getType() == Type::getInt64Ty(M.getContext()) && + "Pointer type input must always be followed by size (integer type)"); + Value *setInputArgs[] = { + GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()), i), + inputSize, + isInput, + isOutput}; + Value *d_ptr = + CallInst::Create(llvm_visc_ocl_argument_ptr, + ArrayRef<Value *>(setInputArgs, 6), "", RI); DevicePointers.push_back(d_ptr); // If this has out attribute, store the returned device pointer in // memory to read device memory later - if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); 
- } - else { + if (isOutput == True) + OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + } else { switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Scalar Input // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI); - StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); - - Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, - Type::getInt8PtrTy(M.getContext()), - inputVal->getName()+".i8ptr", - RI); - - Value* setInputArgs[] = {GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - ConstantExpr::getSizeOf(inputVal->getType()) - }; + AllocaInst *inputValPtr = new AllocaInst( + inputVal->getType(), 0, inputVal->getName() + ".ptr", RI); + StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI); + + Value *inputValI8Ptr = CastInst::CreatePointerCast( + inputValPtr, Type::getInt8PtrTy(M.getContext()), + inputVal->getName() + ".i8ptr", RI); + + Value *setInputArgs[] = { + GraphID, inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()), i), + ConstantExpr::getSizeOf(inputVal->getType())}; CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + ArrayRef<Value *>(setInputArgs, 4), "", RI); } } - DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n"); + DEBUG( + errs() << "Setup shared memory arguments of node and insert visc api\n"); // Check to see if all the allocation sizes are constant (determined // statically) bool constSizes = true; - for (auto& e: K->getSharedInArgMap()) { + for (auto &e : K->getSharedInArgMap()) { constSizes &= isa<Constant>(e.second.first); } // If the sizes are all constant if (constSizes) { - for (auto& e: K->getSharedInArgMap()) { + for (auto &e : K->getSharedInArgMap()) { unsigned argNum = e.first; - Value* allocSize = e.second.first; + Value *allocSize = e.second.first; - 
DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at " << argNum + << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position switchToTimer(visc_TimerID_COPY_SCALAR, RI); - assert(isa<Constant>(allocSize) && "Constant shared memory size is expected"); + assert(isa<Constant>(allocSize) && + "Constant shared memory size is expected"); - Value* setInputArgs[] = {GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - allocSize - }; + Value *setInputArgs[] = { + GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + allocSize}; CallInst::Create(llvm_visc_ocl_argument_shared, - ArrayRef<Value*>(setInputArgs, 3), "", RI); - } - else { + ArrayRef<Value *>(setInputArgs, 3), "", RI); + } else { // Sharem memory size argument - scalar at address position switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, - allocSize->getName()+".sharedMem.ptr", RI); - StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, - Type::getInt8PtrTy(M.getContext()), - allocSize->getName()+".sharedMem.i8ptr", - RI); - - Value* setInputArgs[] = {GraphID, - allocSizeI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - ConstantExpr::getSizeOf(allocSize->getType()) - }; + AllocaInst *allocSizePtr = + new AllocaInst(allocSize->getType(), 0, + allocSize->getName() + ".sharedMem.ptr", RI); + StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value *allocSizeI8Ptr = CastInst::CreatePointerCast( + allocSizePtr, Type::getInt8PtrTy(M.getContext()), + allocSize->getName() + ".sharedMem.i8ptr", RI); + + Value *setInputArgs[] = { + GraphID, allocSizeI8Ptr, + 
ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + ConstantExpr::getSizeOf(allocSize->getType())}; CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + ArrayRef<Value *>(setInputArgs, 4), "", RI); } } } else { @@ -615,68 +610,64 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi ExtractValueInstVec.push_back(EI); } - for (auto& e: K->getSharedInArgMap()) { + for (auto &e : K->getSharedInArgMap()) { unsigned argNum = e.first; - Value* allocSize = ExtractValueInstVec[e.second.second/2]; + Value *allocSize = ExtractValueInstVec[e.second.second / 2]; - DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at " << argNum + << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position switchToTimer(visc_TimerID_COPY_SCALAR, RI); - Value* setInputArgs[] = {GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - allocSize - }; + Value *setInputArgs[] = { + GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + allocSize}; CallInst::Create(llvm_visc_ocl_argument_shared, - ArrayRef<Value*>(setInputArgs, 3), "", RI); - } - else { + ArrayRef<Value *>(setInputArgs, 3), "", RI); + } else { // Sharem memory size argument - scalar at address position switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, - allocSize->getName()+".sharedMem.ptr", RI); - StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, - Type::getInt8PtrTy(M.getContext()), - allocSize->getName()+".sharedMem.i8ptr", - RI); - - Value* setInputArgs[] = {GraphID, - allocSizeI8Ptr, - 
ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), - ConstantExpr::getSizeOf(allocSize->getType()) - }; + AllocaInst *allocSizePtr = + new AllocaInst(allocSize->getType(), 0, + allocSize->getName() + ".sharedMem.ptr", RI); + StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value *allocSizeI8Ptr = CastInst::CreatePointerCast( + allocSizePtr, Type::getInt8PtrTy(M.getContext()), + allocSize->getName() + ".sharedMem.i8ptr", RI); + + Value *setInputArgs[] = { + GraphID, allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), + ConstantExpr::getSizeOf(allocSize->getType())}; CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + ArrayRef<Value *>(setInputArgs, 4), "", RI); } } } - DEBUG(errs() << "Setup output edges of node and insert visc api\n"); // Set output if struct is not an empty struct - StructType* OutputTy = K->KernelLeafNode->getOutputType(); - std::vector<Value*> d_Outputs; - if(!OutputTy->isEmptyTy()) { + StructType *OutputTy = K->KernelLeafNode->getOutputType(); + std::vector<Value *> d_Outputs; + if (!OutputTy->isEmptyTy()) { switchToTimer(visc_TimerID_COPY_PTR, RI); // Not an empty struct // Iterate over all elements of the struct and put them in - for(unsigned i=0; i < OutputTy->getNumElements(); i++) { - unsigned outputIndex = KF->getFunctionType()->getNumParams()+i; - Value* setOutputArgs[] = {GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), - ConstantExpr::getSizeOf(OutputTy->getElementType(i)) - }; - - CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, - ArrayRef<Value*>(setOutputArgs, 3), - "d_output."+KF->getName(), - RI); + for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { + unsigned outputIndex = KF->getFunctionType()->getNumParams() + i; + Value *setOutputArgs[] = { + GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex), + ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; + + CallInst 
*d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, + ArrayRef<Value *>(setOutputArgs, 3), + "d_output." + KF->getName(), RI); d_Outputs.push_back(d_Output); } } @@ -690,46 +681,37 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi Value *workDim, *LocalWGPtr, *GlobalWGPtr; getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI); switchToTimer(visc_TimerID_KERNEL, RI); - Value* ExecNodeArgs[] = {GraphID, - workDim, - LocalWGPtr, - GlobalWGPtr - }; - CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, - ArrayRef<Value*>(ExecNodeArgs, 4), - "event."+KF->getName(), - RI); + Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr}; + CallInst *Event = CallInst::Create(llvm_visc_ocl_executeNode, + ArrayRef<Value *>(ExecNodeArgs, 4), + "event." + KF->getName(), RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); // Wait for Kernel to Finish - CallInst::Create(llvm_visc_ocl_wait, - ArrayRef<Value*>(GraphID), - "", - RI); + CallInst::Create(llvm_visc_ocl_wait, ArrayRef<Value *>(GraphID), "", RI); switchToTimer(visc_TimerID_READ_OUTPUT, RI); // Read Output Struct if not empty - if(!OutputTy->isEmptyTy()) { - std::vector<Value*>h_Outputs; - Value* KernelOutput = UndefValue::get(OutputTy); - for(unsigned i=0; i < OutputTy->getNumElements(); i++) { - Value* GetOutputArgs[] = {GraphID, - Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - d_Outputs[i], - ConstantExpr::getSizeOf(OutputTy->getElementType(i)) - }; - CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput, - ArrayRef<Value*>(GetOutputArgs, 4), - "h_output."+KF->getName()+".addr", - RI); + if (!OutputTy->isEmptyTy()) { + std::vector<Value *> h_Outputs; + Value *KernelOutput = UndefValue::get(OutputTy); + for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { + Value *GetOutputArgs[] = { + GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + d_Outputs[i], 
ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; + CallInst *h_Output = CallInst::Create( + llvm_visc_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4), + "h_output." + KF->getName() + ".addr", RI); // Read each device pointer listed in output struct // Load the output struct - CastInst* BI = BitCastInst::CreatePointerCast(h_Output, - OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI); - - Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI); - KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i), - KF->getName()+"output", RI); + CastInst *BI = BitCastInst::CreatePointerCast( + h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr", + RI); + + Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI); + KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, + ArrayRef<unsigned>(i), + KF->getName() + "output", RI); } OutputMap[K->KernelLeafNode] = KernelOutput; } @@ -744,75 +726,76 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n"); DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); - Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; - CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, + output.bytes}; CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "", RI); }*/ switchToTimer(visc_TimerID_MEM_FREE, RI); // Clear Context and free device memory - DEBUG(errs() << "Clearing context" << "\n"); + DEBUG(errs() << "Clearing context" + << "\n"); // Free Device Memory - for(auto d_ptr: DevicePointers) { - CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); + for (auto d_ptr : DevicePointers) { + CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value *>(d_ptr), "", RI); } switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); 
// Clear Context - LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); - CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); + LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall); + CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value *>(LI), "", + CleanupCall); switchToTimer(visc_TimerID_NONE, CleanupCall); switchToTimer(visc_TimerID_MISC, RI); DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings // Get Exit node - DFNode* C = N->getChildGraph()->getExit(); + DFNode *C = N->getChildGraph()->getExit(); // Get OutputType of this node - StructType* OutTy = N->getOutputType(); + StructType *OutTy = N->getOutputType(); Value *retVal = UndefValue::get(F_X86->getReturnType()); // Find the kernel's output arg map, to use instead of the bindings std::vector<unsigned> outArgMap = kernel->getOutArgMap(); // Find all the input edges to exit node - for (unsigned i=0; i < OutTy->getNumElements(); i++) { + for (unsigned i = 0; i < OutTy->getNumElements(); i++) { DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port - DFEdge* E = C->getInDFEdgeAt(i); + DFEdge *E = C->getInDFEdgeAt(i); assert(E && "No Binding for output element!"); // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); + DFNode *SrcDF = E->getSourceDF(); - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() + << "\n"); // If Source DFNode is a dummyNode, edge is from parent. 
Get the // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { + Value *inputVal; + if (SrcDF->isEntryNode()) { inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { + DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); + } else { // edge is from a internal node // Check - code should already be generated for this source dfnode // FIXME: Since the 2-level kernel code gen has aspecific structure, we // can assume the SrcDF is same as Kernel Leaf node. // Use outArgMap to get correct mapping SrcDF = K->KernelLeafNode; - assert(OutputMap.count(SrcDF) - && "Source node call not found. Dependency violation!"); + assert(OutputMap.count(SrcDF) && + "Source node call not found. Dependency violation!"); // Find Output Value associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; + Value *CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction std::vector<unsigned> IndexList; // i is the destination of DFEdge E // Use the mapping instead of the bindings -// IndexList.push_back(E->getSourcePosition()); + // IndexList.push_back(E->getSourcePosition()); IndexList.push_back(outArgMap[i]); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "",RI); + DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n"); + ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI); inputVal = EI; } std::vector<unsigned> IdxList; @@ -823,29 +806,31 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi DEBUG(errs() << "Extracted all\n"); switchToTimer(visc_TimerID_NONE, RI); retVal->setName("output"); - ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal); ReplaceInstWithInst(RI, newRI); } - 
// Right now, only targeting the one level case. In general, device functions // can return values so we don't need to change them -void CGT_NVPTX::codeGen(DFInternalNode* N) { - DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName() << "\n"); - if(KernelLaunchNode == NULL) +void CGT_NVPTX::codeGen(DFInternalNode *N) { + DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName() + << "\n"); + if (KernelLaunchNode == NULL) DEBUG(errs() << "No kernel launch node\n"); else { - DEBUG(errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "KernelLaunchNode: " + << KernelLaunchNode->getFuncPointer()->getName() << "\n"); } if (!KernelLaunchNode) { - DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); + DEBUG(errs() + << "No code generated (host code for kernel launch complete).\n"); return; } if (N == KernelLaunchNode) { DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); - //TODO + // TODO // Now the remaining nodes to be visited should be ignored KernelLaunchNode = NULL; @@ -860,7 +845,8 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // TODO: Structure assumed: one thread node, one allocation node (at most), // TB node std::map<unsigned, unsigned> inmapFinal; - for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end(); + for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), + ie = inmap2.end(); ib != ie; ++ib) { inmapFinal[ib->first] = inmap1[ib->second]; } @@ -877,8 +863,9 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // 0 ... 
outmap2.size()-1 // The limit is the size of outmap2, because this is the number of kernel // output arguments for which the mapping matters - // For now, it reasonable to assume that all the kernel arguments are returned, - // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size() + // For now, it reasonable to assume that all the kernel arguments are + // returned, maybe plys some others from other nodes, thus outmap2.size() <= + // outmap1.size() for (unsigned i = 0; i < outmap2.size(); i++) { outmap1[i] = outmap2[outmap1[i]]; } @@ -886,15 +873,14 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // Track the source of local dimlimits for the kernel // Dimension limit can either be a constant or an argument of parent - // function. Since Internal node would no longer exist, we need to insert the - // localWGSize with values from the parent of N. - std::vector<Value*> localWGSizeMapped; + // function. Since Internal node would no longer exist, we need to insert + // the localWGSize with values from the parent of N. + std::vector<Value *> localWGSizeMapped; for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { if (isa<Constant>(kernel->localWGSize[i])) { // if constant, use as it is localWGSizeMapped.push_back(kernel->localWGSize[i]); - } - else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { + } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { // if argument, find the argument location in N. Use InArgMap of N to // find the source location in Parent of N. Retrieve the argument from // parent to insert in the vector. 
@@ -904,46 +890,49 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { assert(N->getInArgMap().find(argNum) != N->getInArgMap().end()); unsigned parentArgNum = N->getInArgMap()[argNum]; - Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); + Argument *A = + getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); localWGSizeMapped.push_back(A); - } - else { - assert(false && "LocalWGsize using value which is neither argument nor constant!"); + } else { + assert( + false && + "LocalWGsize using value which is neither argument nor constant!"); } } // Update localWGSize vector of kernel kernel->setLocalWGSize(localWGSizeMapped); } - } -void CGT_NVPTX::codeGen(DFLeafNode* N) { - DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n"); +void CGT_NVPTX::codeGen(DFLeafNode *N) { + DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName() + << "\n"); // Skip code generation if it is a dummy node - if(N->isDummyNode()) { + if (N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); return; } // Skip code generation if it is an allocation node - if(N->isAllocationNode()) { + if (N->isAllocationNode()) { DEBUG(errs() << "Skipping allocation node\n"); return; } // Generate code only if it has the right hint -// if(!checkPreferredTarget(N, visc::GPU_TARGET)) { -// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; -// return; -// } - if(!preferredTargetIncludes(N, visc::GPU_TARGET)) { - DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"); + // if(!checkPreferredTarget(N, visc::GPU_TARGET)) { + // errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + // return; + // } + if (!preferredTargetIncludes(N, visc::GPU_TARGET)) { + DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName() + << "\n"); return; } // Checking which node is the kernel launch - DFNode* PNode = N->getParent(); + DFNode *PNode = N->getParent(); int pLevel = 
PNode->getLevel(); int pReplFactor = PNode->getNumOfDim(); @@ -956,37 +945,35 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node."); // Only these options are supported - enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy; - if(pLevel == 1 || !pReplFactor) { - DEBUG(errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n"); + enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy; + if (pLevel == 1 || !pReplFactor) { + DEBUG(errs() + << "*************** Kernel Gen: 1-Level Hierarchy **************\n"); SelectedHierarchy = ONE_LEVEL; KernelLaunchNode = PNode; - kernel = new Kernel(NULL, - N, - N->getInArgMap(), - N->getSharedInArgMap(), - N->getOutArgMap(), - N->getNumOfDim(), - N->getDimLimits()); - } - else { + kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(), + N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits()); + } else { // Converting a 2-level DFG to opencl kernel - DEBUG(errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n"); - assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node."); + DEBUG(errs() + << "*************** Kernel Gen: 2-Level Hierarchy **************\n"); + assert((pLevel >= 2) && + "Selected node not nested deep enough to be Kernel Node."); SelectedHierarchy = TWO_LEVEL; KernelLaunchNode = PNode->getParent(); - assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); + assert((PNode->getNumOfDim() == N->getNumOfDim()) && + "Dimension number must match"); // Contains the instructions generating the kernel configuration parameters - kernel = new Kernel(NULL, // kernel function - N, // kernel leaf node - N->getInArgMap(), // kenel argument mapping + kernel = new Kernel(NULL, // kernel function + N, // kernel leaf node + N->getInArgMap(), // kenel argument mapping N->getSharedInArgMap(), - N->getOutArgMap(), // kernel output mapping from the leaf to 
the interemediate node - PNode->getNumOfDim(), // gridDim - PNode->getDimLimits(),// grid size - N->getNumOfDim(), // blockDim - N->getDimLimits()); // block size - + N->getOutArgMap(), // kernel output mapping from the + // leaf to the interemediate node + PNode->getNumOfDim(), // gridDim + PNode->getDimLimits(), // grid size + N->getNumOfDim(), // blockDim + N->getDimLimits()); // block size } std::vector<Instruction *> IItoRemove; @@ -998,58 +985,62 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // Look up if we have visited this function before. If we have, then just // get the cloned function pointer from DFNode. Otherwise, create the cloned // function and add it to the DFNode GenFunc. -// Function *F_nvptx = N->getGenFunc(); + // Function *F_nvptx = N->getGenFunc(); Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET); - assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated"); + assert(F_nvptx == NULL && + "Error: Visiting a node for which code already generated"); // Clone the function ValueToValueMapTy VMap; - //F_nvptx->setName(FName+"_nvptx"); + // F_nvptx->setName(FName+"_nvptx"); Twine FName = F->getName(); StringRef fStr = FName.getSingleStringRef(); - Twine newFName = Twine(fStr, "_nvptx"); + Twine newFName = Twine(fStr, "_nvptx"); F_nvptx = CloneFunction(F, VMap); F_nvptx->setName(newFName); - // errs() << "Old Function Name: " << F->getName() << "\n"; // errs() << "New Function Name: " << F_nvptx->getName() << "\n"; F_nvptx->removeFromParent(); - // Insert the cloned function into the kernels module KernelM->getFunctionList().push_back(F_nvptx); - - //TODO: Iterate over all the instructions of F_nvptx and identify the - //callees and clone them into this module. + // TODO: Iterate over all the instructions of F_nvptx and identify the + // callees and clone them into this module. 
DEBUG(errs() << *F_nvptx->getType()); DEBUG(errs() << *F_nvptx); // Transform the function to void and remove all target dependent attributes // from the function F_nvptx = transformFunctionToVoid(F_nvptx); - - //Add generated function info to DFNode -// N->setGenFunc(F_nvptx, visc::GPU_TARGET); + + // Add generated function info to DFNode + // N->setGenFunc(F_nvptx, visc::GPU_TARGET); N->addGenFunc(F_nvptx, visc::GPU_TARGET, false); - DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n"); - F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes()); + DEBUG( + errs() + << "Removing all attributes from Kernel Function and adding nounwind\n"); + F_nvptx->removeAttributes(AttributeList::FunctionIndex, + F_nvptx->getAttributes().getFnAttributes()); F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind); - //FIXME: For now, assume only one allocation node + // FIXME: For now, assume only one allocation node kernel->AllocationNode = NULL; - for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end(); + for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), + iee = N->indfedge_end(); ieb != iee; ++ieb) { DFNode *SrcDFNode = (*ieb)->getSourceDF(); - DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Found edge from node: " + << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n"); + DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode() + << "\n"); if (!SrcDFNode->isDummyNode()) { assert(SrcDFNode->isAllocationNode()); kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode); @@ -1065,10 +1056,11 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { if (kernel->AllocationNode) { ValueToValueMapTy VMap; - Function 
*F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); - //F_alloc->removeFromParent(); + Function *F_alloc = + CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); + // F_alloc->removeFromParent(); // Insert the cloned function into the kernels module - //M.getFunctionList().push_back(F_alloc); + // M.getFunctionList().push_back(F_alloc); std::vector<IntrinsicInst *> ViscMallocInstVec; findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec); @@ -1076,7 +1068,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) { IntrinsicInst *II = ViscMallocInstVec[i]; assert(II->hasOneUse() && "visc_malloc result is used more than once"); - II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); + II->replaceAllUsesWith( + ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); II->eraseFromParent(); } kernel->AllocationFunction = F_alloc; @@ -1091,15 +1084,19 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { assert(RetStructTy && "Allocation node does not return a struct type"); unsigned numFields = RetStructTy->getNumElements(); */ - std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap(); - AllocationNodeProperty* APN = - (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation); - for (auto& AllocPair: APN->getAllocationList()) { + std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap = + kernel->getSharedInArgMap(); + AllocationNodeProperty *APN = + (AllocationNodeProperty *)kernel->AllocationNode->getProperty( + DFNode::Allocation); + for (auto &AllocPair : APN->getAllocationList()) { unsigned destPos = AllocPair.first->getDestPosition(); unsigned srcPos = AllocPair.first->getSourcePosition(); SharedMemArgs.push_back(destPos); - sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); - sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, 
srcPos+1); + sharedInMap[destPos] = + std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); + sharedInMap[destPos + 1] = + std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); } kernel->setSharedInArgMap(sharedInMap); } @@ -1109,12 +1106,14 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // global address space unsigned argIndex = 0; std::vector<unsigned> GlobalMemArgs; - for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end(); - ai != ae; ++ai) { - if (ai->getType()->isPointerTy()) { - // If the arguement is already chosen for shared memory arguemnt list, skip. - // Else put it in Global memory arguement list - if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) { + for (Function::arg_iterator ai = F_nvptx->arg_begin(), + ae = F_nvptx->arg_end(); + ai != ae; ++ai) { + if (ai->getType()->isPointerTy()) { + // If the arguement is already chosen for shared memory arguemnt list, + // skip. Else put it in Global memory arguement list + if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == + 0) { GlobalMemArgs.push_back(argIndex); } } @@ -1128,20 +1127,21 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // Optimization: Gloabl memory arguments, which are not modified and whose // loads are not dependent on node id of current node, should be moved to // constant memory, subject to size of course - std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); + std::vector<unsigned> ConstantMemArgs = + globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE); -// Function to replace call instructions to functions in the kernel + // Function to replace call instructions to functions in the kernel std::map<Function *, Function *> OrgToClonedFuncMap; std::vector<Function 
*> FuncToBeRemoved; - auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) { - Function* NewFunc; + auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) { + Function *NewFunc; // Check if the called function has already been cloned before. auto It = OrgToClonedFuncMap.find(OrgFunc); - if(It == OrgToClonedFuncMap.end()) { + if (It == OrgToClonedFuncMap.end()) { ValueToValueMapTy VMap; NewFunc = CloneFunction(OrgFunc, VMap); OrgToClonedFuncMap[OrgFunc] = NewFunc; @@ -1150,42 +1150,47 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { NewFunc = (*It).second; } // Replace the calls to this function - std::vector<Value*> args; - for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + std::vector<Value *> args; + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { args.push_back(CI->getArgOperand(i)); } - CallInst* Inst = CallInst::Create(NewFunc, args, - OrgFunc->getReturnType()->isVoidTy()? "" : CI->getName(), CI); + CallInst *Inst = CallInst::Create( + NewFunc, args, + OrgFunc->getReturnType()->isVoidTy() ? 
"" : CI->getName(), CI); CI->replaceAllUsesWith(Inst); IItoRemove.push_back(CI); return NewFunc; }; - // Go through all the instructions - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; + ++i) { Instruction *I = &(*i); // Leaf nodes should not contain VISC graph intrinsics or launch - assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + assert(!BuildDFG::isViscLaunchIntrinsic(I) && + "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && + "VISC graph intrinsic within a leaf dataflow node!"); if (BuildDFG::isViscIntrinsic(I)) { - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - IntrinsicInst* ArgII; - DFNode* ArgDFNode; + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + IntrinsicInst *ArgII; + DFNode *ArgDFNode; - /************************ Handle VISC Query intrinsics ************************/ + /************************ Handle VISC Query intrinsics + * ************************/ switch (II->getIntrinsicID()) { - /**************************** llvm.visc.getNode() *****************************/ + /**************************** llvm.visc.getNode() + * *****************************/ case Intrinsic::visc_getNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); // add mapping <intrinsic, this node> to the node-specific map Leaf_HandleToDFNodeMap[II] = N; IItoRemove.push_back(II); - } - break; - /************************* llvm.visc.getParentNode() **************************/ + } break; + /************************* llvm.visc.getParentNode() + * **************************/ case Intrinsic::visc_getParentNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); // get the parent node of the arg node @@ -1199,9 +1204,9 @@ void 
CGT_NVPTX::codeGen(DFLeafNode* N) { Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); IItoRemove.push_back(II); - } - break; - /*************************** llvm.visc.getNumDims() ***************************/ + } break; + /*************************** llvm.visc.getNumDims() + * ***************************/ case Intrinsic::visc_getNumDims: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); // get node from map @@ -1210,47 +1215,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; int numOfDim = ArgDFNode->getNumOfDim(); DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); - IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext()); - ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); + IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext()); + ConstantInt *numOfDimConstant = + ConstantInt::getSigned(IntTy, (int64_t)numOfDim); // Replace the result of the intrinsic with the computed value II->replaceAllUsesWith(numOfDimConstant); IItoRemove.push_back(II); - } - break; - /*********************** llvm.visc.getNodeInstanceID() ************************/ + } break; + /*********************** llvm.visc.getNodeInstanceID() + * ************************/ case Intrinsic::visc_getNodeInstanceID_x: case Intrinsic::visc_getNodeInstanceID_y: case Intrinsic::visc_getNodeInstanceID_z: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n"); + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" + << "\t: " << *II << "\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; assert(ArgDFNode && "Arg node is NULL"); // A leaf node always has a parent - DFNode* ParentDFNode = ArgDFNode->getParent(); + DFNode *ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required 
dimension // FIXME: The order is important! // These three intrinsics need to be consecutive x,y,z - uint64_t dim = II->getIntrinsicID() - - Intrinsic::visc_getNodeInstanceID_x; + uint64_t dim = + II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt * DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - //ArrayRef<Value *> Args(DimConstant); + ConstantInt *DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + // ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function * OpenCLFunction; + Function *OpenCLFunction; - FunctionType* FT = - FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), - false); + FunctionType *FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + Type::getInt32Ty(KernelM->getContext()), false); if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) { // We only have one level in the hierarchy or the parent node is not // replicated. 
This indicates that the parent node is the kernel @@ -1259,20 +1265,23 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // itself DEBUG(errs() << "Substitute with get_global_id()\n"); DEBUG(errs() << *II << "\n"); - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)) + .getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { - //DEBUG(errs() << "Here inside cond 2\n"); + // DEBUG(errs() << "Here inside cond 2\n"); // We are asking for this node's id with respect to its parent // this is a local id call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee()); - //DEBUG(errs() << "exiting condition 2\n"); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)) + .getCallee()); + // DEBUG(errs() << "exiting condition 2\n"); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's id with respect to its // parent: this is a group id call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)) + .getCallee()); } else { DEBUG(errs() << N->getFuncPointer()->getName() << "\n"); DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n"); @@ -1281,21 +1290,21 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { assert(false && "Unable to translate getNodeInstanceID intrinsic"); } - //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n"); - //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n"); - //DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); - //DEBUG(errs() << "Argument: " << Args[0] << "\n"); - //DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); + // DEBUG(errs() << 
"Create call instruction, insert it before the + // instrinsic\n"); DEBUG(errs() << "Function: " << *OpenCLFunction << + // "\n"); DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); + // DEBUG(errs() << "Argument: " << Args[0] << "\n"); + // DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); - //DEBUG(errs() << "Replace uses\n"); + CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + // DEBUG(errs() << "Replace uses\n"); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } - break; - /********************** llvm.visc.getNumNodeInstances() ***********************/ + } break; + /********************** llvm.visc.getNumNodeInstances() + * ***********************/ case Intrinsic::visc_getNumNodeInstances_x: case Intrinsic::visc_getNumNodeInstances_y: case Intrinsic::visc_getNumNodeInstances_z: { @@ -1304,78 +1313,82 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // then, why do we need to keep that info in the graph? (only for the // kernel configuration during the call) - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); + DEBUG(errs() << F_nvptx->getName() + << "\t: Handling getNumNodeInstances\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; // A leaf node always has a parent - DFNode* ParentDFNode = ArgDFNode->getParent(); + DFNode *ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required dimension // FIXME: The order is important! 
// These three intrinsics need to be consecutive x,y,z - uint64_t dim = II->getIntrinsicID() - - Intrinsic::visc_getNumNodeInstances_x; + uint64_t dim = + II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt * DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - //ArrayRef<Value *> Args(DimConstant); + ConstantInt *DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + // ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function * OpenCLFunction; - FunctionType* FT = + Function *OpenCLFunction; + FunctionType *FT = FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), - false); + Type::getInt32Ty(KernelM->getContext()), false); if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) { // We only have one level in the hierarchy or the parent node is not // replicated. 
This indicates that the parent node is the kernel // launch, so the instances are global_size (gridDim x blockDim) - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)) + .getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { // We are asking for this node's instances // this is a local size (block dim) call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)) + .getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's instances // this is a (global_size/local_size) (grid dim) call - OpenCLFunction = cast<Function> - ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee()); + OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)) + .getCallee()); } else { assert(false && "Unable to translate getNumNodeInstances intrinsic"); } // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } - break; - case Intrinsic::visc_barrier: - { + } break; + case Intrinsic::visc_barrier: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n"); DEBUG(errs() << "Substitute with barrier()\n"); DEBUG(errs() << *II << "\n"); - FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()), - std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())), - false); - Function* OpenCLFunction = cast<Function> - 
((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee()); - CallInst* CI = CallInst::Create(OpenCLFunction, - ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)), - "", II); + FunctionType *FT = FunctionType::get( + Type::getVoidTy(KernelM->getContext()), + std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())), + false); + Function *OpenCLFunction = cast<Function>( + (KernelM->getOrInsertFunction(StringRef("barrier"), FT)) + .getCallee()); + CallInst *CI = + CallInst::Create(OpenCLFunction, + ArrayRef<Value *>(ConstantInt::get( + Type::getInt32Ty(KernelM->getContext()), 1)), + "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } - break; + } break; case Intrinsic::visc_atomic_cmpxchg: break; case Intrinsic::visc_atomic_add: @@ -1386,607 +1399,627 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { case Intrinsic::visc_atomic_and: case Intrinsic::visc_atomic_or: case Intrinsic::visc_atomic_xor: - //case Intrinsic::visc_atomic_inc: - //case Intrinsic::visc_atomic_dec: - { - DEBUG(errs() << *II << "\n"); - // Only have support for i32 atomic intrinsics - assert(II->getType() == Type::getInt32Ty(II->getContext()) - && "Only support i32 atomic intrinsics for now"); - // Substitute with atomicrmw instruction - assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics"); - Value* Ptr = II->getArgOperand(0); - Value* Val = II->getArgOperand(1); - assert(Ptr->getType()->isPointerTy() - && "First argument of supported atomics is expected to be a pointer"); - PointerType* PtrTy = cast<PointerType>(Ptr->getType()); - PointerType* TargetTy = Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()); - if (PtrTy != TargetTy) { - Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); - PtrTy = TargetTy; + // case Intrinsic::visc_atomic_inc: + // case Intrinsic::visc_atomic_dec: + { + DEBUG(errs() << *II << "\n"); + // Only have support for i32 atomic intrinsics + assert(II->getType() == 
Type::getInt32Ty(II->getContext()) && + "Only support i32 atomic intrinsics for now"); + // Substitute with atomicrmw instruction + assert(II->getNumArgOperands() == 2 && + "Expecting 2 operands for these atomics"); + Value *Ptr = II->getArgOperand(0); + Value *Val = II->getArgOperand(1); + assert(Ptr->getType()->isPointerTy() && + "First argument of supported atomics is expected to be a " + "pointer"); + PointerType *PtrTy = cast<PointerType>(Ptr->getType()); + PointerType *TargetTy = + Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()); + if (PtrTy != TargetTy) { + Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); + PtrTy = TargetTy; + } + + std::string name; + if (II->getIntrinsicID() == Intrinsic::visc_atomic_add) + name = "atomic_add"; + else if (II->getIntrinsicID() == Intrinsic::visc_atomic_sub) + name = "atomic_sub"; + else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xchg) + name = "atomic_xchg"; + else if (II->getIntrinsicID() == Intrinsic::visc_atomic_min) + name = "atomic_min"; + else if (II->getIntrinsicID() == Intrinsic::visc_atomic_max) + name = "atomic_max"; + else if (II->getIntrinsicID() == Intrinsic::visc_atomic_and) + name = "atomic_and"; + else if (II->getIntrinsicID() == Intrinsic::visc_atomic_or) + name = "atomic_or"; + else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xor) + name = "atomic_xor"; + Type *paramTypes[] = {PtrTy, Val->getType()}; + FunctionType *AtomFuncT = FunctionType::get( + II->getType(), ArrayRef<Type *>(paramTypes, 2), false); + FunctionCallee AtomFunc = + KernelM->getOrInsertFunction(name, AtomFuncT); + + Value *Params[] = {Ptr, Val}; + CallInst *AtomCI = CallInst::Create( + AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II); + DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); + II->replaceAllUsesWith(AtomCI); + IItoRemove.push_back(II); } + break; + default: + llvm_unreachable("Unknown VISC Intrinsic!"); + break; + } - std::string name; - if(II->getIntrinsicID() == 
Intrinsic::visc_atomic_add) - name = "atomic_add"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub) - name = "atomic_sub"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg) - name = "atomic_xchg"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min) - name = "atomic_min"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max) - name = "atomic_max"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and) - name = "atomic_and"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or) - name = "atomic_or"; - else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor) - name = "atomic_xor"; - Type* paramTypes[] = {PtrTy, Val->getType()}; - FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false); - FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT); - - Value* Params[] = {Ptr, Val}; - CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II); - DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); - II->replaceAllUsesWith(AtomCI); - IItoRemove.push_back(II); - } - break; - default: - llvm_unreachable("Unknown VISC Intrinsic!"); - break; - } - - } - else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { - IRBuilder<> Builder(I); - Value *Source = MemCpyI->getSource(); - Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); - Value *Length = MemCpyI->getOperand(2); - DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); - DEBUG(errs() << "Source: " << *Source << "\n"); - DEBUG(errs() << "Destination: " << *Destination << "\n"); - DEBUG(errs() << "Length: " << *Length << "\n"); - - size_t memcpy_length; - unsigned int memcpy_count; - if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) { - if (CI->getBitWidth() <= 64) { - memcpy_length = CI->getSExtValue(); - DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); - Type *Source_Type = Source->getType()->getPointerElementType(); - DEBUG(errs() 
<< "Source Type : " << *Source_Type << "\n"); - memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); - DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); - if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) { - if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) { - Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); - Value *DestPtrOperand = destGEPI->getPointerOperand(); - for(int i = 0; i < memcpy_count; ++i) { - Constant *increment; - LoadInst *newLoadI; - StoreInst *newStoreI; - // First, need to increment the correct index for both source and dest - // This invluves checking to see how many indeces the GEP has - // Assume for now only 1 or 2 are the viable options. - - std::vector<Value*> GEPlIndex; - if (sourceGEPI->getNumIndices() == 1) { - Value *Index = sourceGEPI->getOperand(1); - increment = ConstantInt::get(Index->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPlIndex.push_back(incAdd); - Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex)); - DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); - newLoadI = Builder.CreateLoad(newGEPIl); - DEBUG(errs() << "Load: " << *newLoadI << "\n"); - } else { - llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n"); - } - - - std::vector<Value*> GEPsIndex; - if (destGEPI->getNumIndices() == 1) { - - } else if (destGEPI->getNumIndices() == 2) { - Value *Index0 = destGEPI->getOperand(1); - GEPsIndex.push_back(Index0); - Value *Index1 = destGEPI->getOperand(2); - increment = ConstantInt::get(Index1->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index1, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPsIndex.push_back(incAdd); - Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex)); - DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); - newStoreI 
= Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile()); - DEBUG(errs() << "Store: " << *newStoreI << "\n"); - } else { - llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n"); - } - } - IItoRemove.push_back(sourceGEPI); - IItoRemove.push_back(destGEPI); - Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); - Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); - IItoRemove.push_back(destBitcastI); - IItoRemove.push_back(sourceBitcastI); - IItoRemove.push_back(MemCpyI); - } - } - - } - } else { - llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); - } - // llvm_unreachable("HERE!"); - } - - else if(CallInst* CI = dyn_cast<CallInst>(I)) { - DEBUG(errs() << "Found a call: " << *CI << "\n"); - Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); - if(calleeF->isDeclaration()) { - // Add the declaration to kernel module - if (calleeF->getName() == "sqrtf") { - calleeF->setName(Twine("sqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } else if (calleeF->getName() == "rsqrtf") { - calleeF->setName(Twine("rsqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } - DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); - KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); - } - else { - // Check if the called function has already been cloned before. - Function *NewFunc = CloneAndReplaceCall(CI, calleeF); - // Iterate over the new function to see if it calls any other functions - // in the module. 
- for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) { - if(auto *Call = dyn_cast<CallInst>(&*i)) { - Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts()); - CloneAndReplaceCall(Call, CalledFunc); - } - } - } - //TODO: how to handle address space qualifiers in load/store - } - - } - // search for pattern where float is being casted to int and loaded/stored and change it. - DEBUG(errs() << "finding pattern for replacement!\n"); - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { - bool cont = false; - bool keepGEPI = false; - bool keepGEPI2= false; - Instruction *I = &(*i); - GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I); - - if (!GEPI) { - // did nod find pattern start, continue - continue; - } - // may have found pattern, check - DEBUG(errs() << "GEPI " << *GEPI << "\n"); - // print whatever we want for debug - Value* PtrOp = GEPI->getPointerOperand(); - Type *SrcTy = GEPI->getSourceElementType(); - unsigned GEPIaddrspace = GEPI->getAddressSpace(); - - if (SrcTy->isArrayTy()) - DEBUG(errs() << *SrcTy << " is an array type! 
" << *(SrcTy->getArrayElementType()) << "\n"); - else - DEBUG(errs() << *SrcTy << " is not an array type!\n"); - // check that source element type is float - if (SrcTy->isArrayTy()) { - if (!(SrcTy->getArrayElementType()->isFloatTy())) { - DEBUG(errs() << "GEPI type is array but not float!\n"); - continue; - } - } - else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) { - DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); - // does not fit this pattern - no float GEP instruction - continue; - } - // check that addressspace is 1 - // if (GEPIaddrspace != 1) { - // // does not fit this pattern - addrspace of pointer argument is not global - // continue; - // } - if (!(GEPI->hasOneUse())) { - // does not fit this pattern - more than one uses - //continue; - // Keep GEPI around if it has other uses - keepGEPI = true; - } - DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); - - // 1st GEPI it has one use - // assert(GEPI->hasOneUse() && "GEPI has a single use"); - - // See if it is a bitcast - BitCastInst *BitCastI; - for (User * U : GEPI->users()) { - if(Instruction *ui = dyn_cast<Instruction> (U)) { - DEBUG(errs() << "--" << *ui << "\n"); - if (isa<BitCastInst>(ui)) { - BitCastI = dyn_cast<BitCastInst>(ui); - DEBUG(errs() << "---Found bitcast as only use of GEP\n"); - break; - } - } - DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); - cont = true; - } - // for (Value::user_iterator ui = GEPI->user_begin(), - // ue = GEPI->user_end(); ui!=ue; ++ui) { - // DEBUG(errs() << "--" << *ui << "\n"); - // if (isa<BitCastInst>(*ui)) { - // BitCastI = dyn_cast<BitCastInst>(*ui); - // DEBUG(errs() << "Found bitcast as only use of GEP\n"); - // } - // } - - if (cont/*!BitCastI*/) { - continue; // not in pattern - } - - // DEBUG(errs() << *BitCastI << "\n"); - // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP. 
- Value *Op2 = BitCastI->getOperand(0); - DEBUG(errs() << "----" << *Op2 << "\n"); - // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); - // Type *OpTy = cast<Type>(Op2); - Type *OpTy = BitCastI->getDestTy(); - DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); - // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n"); - if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { - // maybe right syntax is (Type::getInt32Ty)->getPointerTo() - continue; // not in pattern - } - - DEBUG(errs() << "----Here!\n"); - // We are in GEP, bitcast. - - // user_iterator, to find the load. - - if (!(BitCastI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - } - DEBUG(errs() << "----Bitcast has one use!\n"); - // it has one use - assert(BitCastI->hasOneUse() && "BitCastI has a single use"); - LoadInst *LoadI; - for (User * U : BitCastI->users()) { - if (Instruction *ui = dyn_cast<Instruction> (U)) { - DEBUG(errs() << "-----" << *ui << "\n"); - if (isa<LoadInst>(ui)) { - LoadI = dyn_cast<LoadInst>(ui); - DEBUG(errs() << "-----Found load as only use of bitcast\n"); - break; - } - } - DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); - cont = true; - } - // for (Value::user_iterator ui = BitCastI->user_begin(), - // ue = BitCastI->user_end(); ui!=ue; ++ui) { - // if (isa<LoadInst>(*ui)) { - // LoadI = dyn_cast<LoadInst>(*ui); - // errs() << "Found load as only use of bitcast\n"; - // } - // } - - if (cont) { - continue; // not in pattern - } - - DEBUG("HERE!\n"); - // check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from - assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n"); - - // Copy user_iterator, to find the store. 
- - if (!(LoadI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - // TODO: generalize: one load can have more than one store users - } - - // it has one use - assert(LoadI->hasOneUse() && "LoadI has a single use"); - Value::user_iterator ui = LoadI->user_begin(); - // skipped loop, because is has a single use - StoreInst *StoreI = dyn_cast<StoreInst>(*ui); - if (!StoreI) { - continue; // not in pattern - } - - // Also check that the store uses the loaded value as the value operand - if (StoreI->getValueOperand() != LoadI) { - continue; - } - - DEBUG(errs() << "-------Found store instruction\n"); - - // Look for its bitcast, which is its pointer operand - Value *StPtrOp = StoreI->getPointerOperand(); - DEBUG(errs() << "-------" << *StPtrOp << "\n"); - BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); - DEBUG(errs() << "-------" << *BitCastI2 << "\n"); - if (!BitCastI2) { - continue; //not in pattern - } - - DEBUG(errs() << "-------- Found Bit Cast of store!\n" ); - // found bitcast. Look for the second GEP, its from operand. - Value *BCFromOp = BitCastI2->getOperand(0); - GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); - DEBUG(errs() << "---------- " << *GEPI2 << "\n"); - if (!GEPI2) { - continue; //not in pattern - } - - if (!(GEPI2->hasOneUse())) { - // does not fit this pattern - more than one uses - //continue; - // Keep GEPI around if it has other uses - keepGEPI2 = true; - } - DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); - - Value *PtrOp2 = GEPI2->getPointerOperand(); - - // Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above. 
- - // Assume we found pattern - if (!keepGEPI) { - IItoRemove.push_back(GEPI); - DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); - } else { - DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); - } - IItoRemove.push_back(BitCastI); - DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); - IItoRemove.push_back(LoadI); - DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); - IItoRemove.push_back(GEPI2); - DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); - IItoRemove.push_back(BitCastI2); - DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); - if (!keepGEPI2) { - IItoRemove.push_back(StoreI); - DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); - } else { - - DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n"); - } - - std::vector<Value*> GEPlIndex; - if (GEPI->hasIndices()) { - for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); - GEPlIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); - - std::vector<Value*> GEPsIndex; - if (GEPI2->hasIndices()) { - for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); - GEPsIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); - - - - // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); - GetElementPtrInst* newlGEP = - GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()), - PtrOp, // operand from 1st GEP - ArrayRef<Value*>(GEPlIndex), - Twine(), - StoreI); - DEBUG(errs() << "Adding: " << *newlGEP << "\n"); - // insert load before GEPI - LoadInst *newLoadI = - new LoadInst(Type::getFloatTy(M.getContext()), - newlGEP, // new GEP - Twine(), - LoadI->isVolatile(), - LoadI->getAlignment(), - LoadI->getOrdering(), - 
LoadI->getSyncScopeID(), - StoreI); - DEBUG(errs() << "Adding: " << *newLoadI << "\n"); - // same for GEP for store, for store operand - GetElementPtrInst* newsGEP = - GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), - PtrOp2, // operand from 2nd GEP - ArrayRef<Value*>(GEPsIndex), - Twine(), - StoreI); - DEBUG(errs() << "Adding: " << *newsGEP << "\n"); - // insert store before GEPI - StoreInst *newStoreI = - new StoreInst(newLoadI, - newsGEP, // new GEP - StoreI->isVolatile(), - StoreI->getAlignment(), - StoreI->getOrdering(), - StoreI->getSyncScopeID(), - StoreI); - DEBUG(errs() << "Adding: " << *newStoreI << "\n"); - - } - - // We need to do this explicitly: DCE pass will not remove them because we - // have assumed theworst memory behaviour for these function calls - // Traverse the vector backwards, otherwise definitions are deleted while - // their subsequent uses are still around - for (auto *I : reverse(IItoRemove)) { - DEBUG(errs() << "Erasing: " << *I << "\n"); - I->eraseFromParent(); - } - - // Removed the cloned functions from the parent module into the new module - for(auto *F : FuncToBeRemoved) { - F->removeFromParent(); //TODO: MARIA check - KernelM->getFunctionList().push_back(F); - } - - addCLMetadata(F_nvptx); - kernel->KernelFunction = F_nvptx; - DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"); - DEBUG(errs() << *KernelM); - - return; -} + } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { + IRBuilder<> Builder(I); + Value *Source = MemCpyI->getSource(); + Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); + Value *Length = MemCpyI->getOperand(2); + DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); + DEBUG(errs() << "Source: " << *Source << "\n"); + DEBUG(errs() << "Destination: " << *Destination << "\n"); + DEBUG(errs() << "Length: " << *Length << "\n"); + + size_t memcpy_length; + unsigned int memcpy_count; + if (ConstantInt 
*CI = dyn_cast<ConstantInt>(Length)) { + if (CI->getBitWidth() <= 64) { + memcpy_length = CI->getSExtValue(); + DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); + Type *Source_Type = Source->getType()->getPointerElementType(); + DEBUG(errs() << "Source Type : " << *Source_Type << "\n"); + memcpy_count = + memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); + DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); + if (GetElementPtrInst *sourceGEPI = + dyn_cast<GetElementPtrInst>(Source)) { + if (GetElementPtrInst *destGEPI = + dyn_cast<GetElementPtrInst>(Destination)) { + Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); + Value *DestPtrOperand = destGEPI->getPointerOperand(); + for (int i = 0; i < memcpy_count; ++i) { + Constant *increment; + LoadInst *newLoadI; + StoreInst *newStoreI; + // First, need to increment the correct index for both source + // and dest This invluves checking to see how many indeces the + // GEP has Assume for now only 1 or 2 are the viable options. 
+ + std::vector<Value *> GEPlIndex; + if (sourceGEPI->getNumIndices() == 1) { + Value *Index = sourceGEPI->getOperand(1); + increment = ConstantInt::get(Index->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPlIndex.push_back(incAdd); + Value *newGEPIl = Builder.CreateGEP( + SourcePtrOperand, ArrayRef<Value *>(GEPlIndex)); + DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); + newLoadI = Builder.CreateLoad(newGEPIl); + DEBUG(errs() << "Load: " << *newLoadI << "\n"); + } else { + llvm_unreachable("Unhandled case where source GEPI has more " + "than 1 indices!\n"); + } + + std::vector<Value *> GEPsIndex; + if (destGEPI->getNumIndices() == 1) { + + } else if (destGEPI->getNumIndices() == 2) { + Value *Index0 = destGEPI->getOperand(1); + GEPsIndex.push_back(Index0); + Value *Index1 = destGEPI->getOperand(2); + increment = ConstantInt::get(Index1->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index1, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPsIndex.push_back(incAdd); + Value *newGEPIs = Builder.CreateGEP( + DestPtrOperand, ArrayRef<Value *>(GEPsIndex)); + DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); + newStoreI = Builder.CreateStore(newLoadI, newGEPIs, + MemCpyI->isVolatile()); + DEBUG(errs() << "Store: " << *newStoreI << "\n"); + } else { + llvm_unreachable("Unhandled case where dest GEPI has more " + "than 2 indices!\n"); + } + } + IItoRemove.push_back(sourceGEPI); + IItoRemove.push_back(destGEPI); + Instruction *destBitcastI = + dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); + Instruction *sourceBitcastI = + dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); + IItoRemove.push_back(destBitcastI); + IItoRemove.push_back(sourceBitcastI); + IItoRemove.push_back(MemCpyI); + } + } + } + } else { + llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); + } + // llvm_unreachable("HERE!"); + } -bool DFG2LLVM_NVPTX::runOnModule(Module &M) { 
- DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n"); + else if (CallInst *CI = dyn_cast<CallInst>(I)) { + DEBUG(errs() << "Found a call: " << *CI << "\n"); + Function *calleeF = + cast<Function>(CI->getCalledValue()->stripPointerCasts()); + if (calleeF->isDeclaration()) { + // Add the declaration to kernel module + if (calleeF->getName() == "sqrtf") { + calleeF->setName(Twine("sqrt")); + DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } else if (calleeF->getName() == "rsqrtf") { + calleeF->setName(Twine("rsqrt")); + DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF + << "\n"); + KernelM->getOrInsertFunction(calleeF->getName(), + calleeF->getFunctionType()); + } else { + // Check if the called function has already been cloned before. + Function *NewFunc = CloneAndReplaceCall(CI, calleeF); + // Iterate over the new function to see if it calls any other functions + // in the module. + for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); + i != e; ++i) { + if (auto *Call = dyn_cast<CallInst>(&*i)) { + Function *CalledFunc = + cast<Function>(Call->getCalledValue()->stripPointerCasts()); + CloneAndReplaceCall(Call, CalledFunc); + } + } + } + // TODO: how to handle address space qualifiers in load/store + } + } + // search for pattern where float is being casted to int and loaded/stored and + // change it. 
+ DEBUG(errs() << "finding pattern for replacement!\n"); + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; + ++i) { + bool cont = false; + bool keepGEPI = false; + bool keepGEPI2 = false; + Instruction *I = &(*i); + GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I); - // Get the BuildDFG Analysis Results: - // - Dataflow graph - // - Maps from i8* hansles to DFNode and DFEdge - BuildDFG &DFG = getAnalysis<BuildDFG>(); + if (!GEPI) { + // did nod find pattern start, continue + continue; + } + // may have found pattern, check + DEBUG(errs() << "GEPI " << *GEPI << "\n"); + // print whatever we want for debug + Value *PtrOp = GEPI->getPointerOperand(); + Type *SrcTy = GEPI->getSourceElementType(); + unsigned GEPIaddrspace = GEPI->getAddressSpace(); + + if (SrcTy->isArrayTy()) + DEBUG(errs() << *SrcTy << " is an array type! " + << *(SrcTy->getArrayElementType()) << "\n"); + else + DEBUG(errs() << *SrcTy << " is not an array type!\n"); + // check that source element type is float + if (SrcTy->isArrayTy()) { + if (!(SrcTy->getArrayElementType()->isFloatTy())) { + DEBUG(errs() << "GEPI type is array but not float!\n"); + continue; + } + } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) { + DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); + // does not fit this pattern - no float GEP instruction + continue; + } + // check that addressspace is 1 + // if (GEPIaddrspace != 1) { + // // does not fit this pattern - addrspace of pointer argument is + //not global continue; + // } + if (!(GEPI->hasOneUse())) { + // does not fit this pattern - more than one uses + // continue; + // Keep GEPI around if it has other uses + keepGEPI = true; + } + DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); + + // 1st GEPI it has one use + // assert(GEPI->hasOneUse() && "GEPI has a single use"); + + // See if it is a bitcast + BitCastInst *BitCastI; + for (User *U : GEPI->users()) { + if (Instruction *ui = dyn_cast<Instruction>(U)) { + DEBUG(errs() << 
"--" << *ui << "\n"); + if (isa<BitCastInst>(ui)) { + BitCastI = dyn_cast<BitCastInst>(ui); + DEBUG(errs() << "---Found bitcast as only use of GEP\n"); + break; + } + } + DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); + cont = true; + } + // for (Value::user_iterator ui = GEPI->user_begin(), + // ue = GEPI->user_end(); ui!=ue; ++ui) { + // DEBUG(errs() << "--" << *ui << "\n"); + // if (isa<BitCastInst>(*ui)) { + // BitCastI = dyn_cast<BitCastInst>(*ui); + // DEBUG(errs() << "Found bitcast as only use of GEP\n"); + // } + // } + + if (cont /*!BitCastI*/) { + continue; // not in pattern + } - // DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode*> Roots = DFG.getRoots(); - // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); - // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + // DEBUG(errs() << *BitCastI << "\n"); + // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand + // has to be the GEP, since this is a use of the GEP. + Value *Op2 = BitCastI->getOperand(0); + DEBUG(errs() << "----" << *Op2 << "\n"); + // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); + // Type *OpTy = cast<Type>(Op2); + Type *OpTy = BitCastI->getDestTy(); + DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); + // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << + // "\n"); + if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { + // maybe right syntax is (Type::getInt32Ty)->getPointerTo() + continue; // not in pattern + } - // Visitor for Code Generation Graph Traversal - CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); + DEBUG(errs() << "----Here!\n"); + // We are in GEP, bitcast. - // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode: Roots) { - // Initiate code generation for root DFNode - CGTVisitor->visit(rootNode); - } + // user_iterator, to find the load. 
- CGTVisitor->writeKernelsModule(); + if (!(BitCastI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + } + DEBUG(errs() << "----Bitcast has one use!\n"); + // it has one use + assert(BitCastI->hasOneUse() && "BitCastI has a single use"); + LoadInst *LoadI; + for (User *U : BitCastI->users()) { + if (Instruction *ui = dyn_cast<Instruction>(U)) { + DEBUG(errs() << "-----" << *ui << "\n"); + if (isa<LoadInst>(ui)) { + LoadI = dyn_cast<LoadInst>(ui); + DEBUG(errs() << "-----Found load as only use of bitcast\n"); + break; + } + } + DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); + cont = true; + } + // for (Value::user_iterator ui = BitCastI->user_begin(), + // ue = BitCastI->user_end(); ui!=ue; ++ui) { + // if (isa<LoadInst>(*ui)) { + // LoadI = dyn_cast<LoadInst>(*ui); + // errs() << "Found load as only use of bitcast\n"; + // } + // } + + if (cont) { + continue; // not in pattern + } - //TODO: Edit module epilogue to remove the VISC intrinsic declarations - delete CGTVisitor; + DEBUG("HERE!\n"); + // check that we load from pointer we got from bitcast - assert - the unique + // argument must be the use we found it from + assert(LoadI->getPointerOperand() == BitCastI && + "Unexpected Load Instruction Operand\n"); - return true; -} + // Copy user_iterator, to find the store. 
-std::string CGT_NVPTX::getKernelsModuleName(Module &M) { - /*SmallString<128> currentDir; - llvm::sys::fs::current_path(currentDir); - std::string fileName = getFilenameFromModule(M); - Twine output = Twine(currentDir) + "/Output/" + fileName + ""; - return output.str().append(".kernels.ll");*/ - std::string mid = M.getModuleIdentifier(); - return mid.append(".kernels.ll"); -} + if (!(LoadI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + // TODO: generalize: one load can have more than one store users + } -void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) { - assert(isa<PointerType>(V->getType()) - && "Value should be of Pointer Type!"); - PointerType* OldTy = cast<PointerType>(V->getType()); - PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); - V->mutateType(NewTy); - for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { - // Change all uses producing pointer type in same address space to new - // addressspace. 
- if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { - if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { - fixValueAddrspace(*ui, addrspace); - } - } - } + // it has one use + assert(LoadI->hasOneUse() && "LoadI has a single use"); + Value::user_iterator ui = LoadI->user_begin(); + // skipped loop, because is has a single use + StoreInst *StoreI = dyn_cast<StoreInst>(*ui); + if (!StoreI) { + continue; // not in pattern + } + + // Also check that the store uses the loaded value as the value operand + if (StoreI->getValueOperand() != LoadI) { + continue; + } + + DEBUG(errs() << "-------Found store instruction\n"); + + // Look for its bitcast, which is its pointer operand + Value *StPtrOp = StoreI->getPointerOperand(); + DEBUG(errs() << "-------" << *StPtrOp << "\n"); + BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); + DEBUG(errs() << "-------" << *BitCastI2 << "\n"); + if (!BitCastI2) { + continue; // not in pattern + } + + DEBUG(errs() << "-------- Found Bit Cast of store!\n"); + // found bitcast. Look for the second GEP, its from operand. + Value *BCFromOp = BitCastI2->getOperand(0); + GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); + DEBUG(errs() << "---------- " << *GEPI2 << "\n"); + if (!GEPI2) { + continue; // not in pattern + } + + if (!(GEPI2->hasOneUse())) { + // does not fit this pattern - more than one uses + // continue; + // Keep GEPI around if it has other uses + keepGEPI2 = true; + } + DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); + + Value *PtrOp2 = GEPI2->getPointerOperand(); + + // Found GEPI2. TODO: kind of confused as o what checks I need to add here, + // let's add them together- all the code for int-float type checks is + // already above. 
+ + // Assume we found pattern + if (!keepGEPI) { + IItoRemove.push_back(GEPI); + DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); + } else { + DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); + } + IItoRemove.push_back(BitCastI); + DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); + IItoRemove.push_back(LoadI); + DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); + IItoRemove.push_back(GEPI2); + DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); + IItoRemove.push_back(BitCastI2); + DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); + if (!keepGEPI2) { + IItoRemove.push_back(StoreI); + DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); + } else { + + DEBUG(errs() << "Keeping " << *StoreI + << " since it has multiple uses!\n"); + } + + std::vector<Value *> GEPlIndex; + if (GEPI->hasIndices()) { + for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); + GEPlIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); + + std::vector<Value *> GEPsIndex; + if (GEPI2->hasIndices()) { + for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); + GEPsIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); + + // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); + GetElementPtrInst *newlGEP = GetElementPtrInst::Create( + GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()), + PtrOp, // operand from 1st GEP + ArrayRef<Value *>(GEPlIndex), Twine(), StoreI); + DEBUG(errs() << "Adding: " << *newlGEP << "\n"); + // insert load before GEPI + LoadInst *newLoadI = + new LoadInst(Type::getFloatTy(M.getContext()), + newlGEP, // new GEP + Twine(), LoadI->isVolatile(), LoadI->getAlignment(), + LoadI->getOrdering(), LoadI->getSyncScopeID(), 
StoreI); + DEBUG(errs() << "Adding: " << *newLoadI << "\n"); + // same for GEP for store, for store operand + GetElementPtrInst *newsGEP = GetElementPtrInst::Create( + GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), + PtrOp2, // operand from 2nd GEP + ArrayRef<Value *>(GEPsIndex), Twine(), StoreI); + DEBUG(errs() << "Adding: " << *newsGEP << "\n"); + // insert store before GEPI + StoreInst *newStoreI = + new StoreInst(newLoadI, + newsGEP, // new GEP + StoreI->isVolatile(), StoreI->getAlignment(), + StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI); + DEBUG(errs() << "Adding: " << *newStoreI << "\n"); + } + + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (auto *I : reverse(IItoRemove)) { + DEBUG(errs() << "Erasing: " << *I << "\n"); + I->eraseFromParent(); + } + + // Removed the cloned functions from the parent module into the new module + for (auto *F : FuncToBeRemoved) { + F->removeFromParent(); // TODO: MARIA check + KernelM->getFunctionList().push_back(F); + } + + addCLMetadata(F_nvptx); + kernel->KernelFunction = F_nvptx; + DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName() + << "\n"); + DEBUG(errs() << *KernelM); + + return; } +bool DFG2LLVM_NVPTX::runOnModule(Module &M) { + DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n"); + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode *> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = + // DFG.getHandleToDFNodeMap(); BuildDFG::HandleToDFEdge &HandleToDFEdgeMap + // = DFG.getHandleToDFEdgeMap(); + + // Visitor for Code Generation Graph Traversal + 
CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode : Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } -std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) { - std::vector<unsigned> ConstantMemArgs; - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Argument* arg = &*ai; - std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(), - GlobalMemArgs->end(), arg->getArgNo()); - // It has to be a global memory argument to be promotable - if(pos == GlobalMemArgs->end()) - continue; - - // Check if it can/should be promoted - if(canBePromoted(arg, F)) { - DEBUG(errs() << "Promoting << " << arg->getName() << " to constant memory."<< "\n"); - ConstantMemArgs.push_back(arg->getArgNo()); - GlobalMemArgs->erase(pos); - } - } - return ConstantMemArgs; + CGTVisitor->writeKernelsModule(); + + // TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; + + return true; } -Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { - unsigned idx = 0; - std::vector<Type*> ArgTypes; - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Argument *arg = &*ai; - DEBUG(errs() << *arg << "\n"); - unsigned argno = arg->getArgNo(); - if ((idx < Args.size()) && (argno == Args[idx])) { - fixValueAddrspace(arg, addrspace); - idx++; - } - ArgTypes.push_back(arg->getType()); - } - FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); - - //F->mutateType(PTy); - Function* newF = cloneFunction(F, newFT, false); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - - DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); - return newF; +std::string CGT_NVPTX::getKernelsModuleName(Module &M) { + /*SmallString<128> 
currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); } -/* Add metadata to module KernelM, for OpenCL kernels */ -void CGT_NVPTX::addCLMetadata(Function *F) { +void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) { + assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!"); + PointerType *OldTy = cast<PointerType>(V->getType()); + PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; + ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. + if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) { + if (PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); + } + } + } +} - IRBuilder<> Builder(&*F->begin()); +std::vector<unsigned> +CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs, + Function *F) { + std::vector<unsigned> ConstantMemArgs; + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai) { + Argument *arg = &*ai; + std::vector<unsigned>::iterator pos = std::find( + GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo()); + // It has to be a global memory argument to be promotable + if (pos == GlobalMemArgs->end()) + continue; + + // Check if it can/should be promoted + if (canBePromoted(arg, F)) { + DEBUG(errs() << "Promoting << " << arg->getName() + << " to constant memory." 
+ << "\n"); + ConstantMemArgs.push_back(arg->getArgNo()); + GlobalMemArgs->erase(pos); + } + } + return ConstantMemArgs; +} - SmallVector<Metadata*,8> KernelMD; - KernelMD.push_back(ValueAsMetadata::get(F)); +Function *CGT_NVPTX::changeArgAddrspace(Function *F, + std::vector<unsigned> &Args, + unsigned addrspace) { + unsigned idx = 0; + std::vector<Type *> ArgTypes; + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai) { + Argument *arg = &*ai; + DEBUG(errs() << *arg << "\n"); + unsigned argno = arg->getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(arg, addrspace); + idx++; + } + ArgTypes.push_back(arg->getType()); + } + FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); - // TODO: There is additional metadata used by kernel files but we skip them as - // they are not mandatory. In future they might be useful to enable - // optimizations + // F->mutateType(PTy); + Function *newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); - MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); - MDN_kernels->addOperand(MDKernelNode); + DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n"); + return newF; +} - KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); - // TODO: Replace 1 with the number of the kernel. 
- // Add when support for multiple launces is added - KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1))); - MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); - MDN_annotations->addOperand(MDNvvmAnnotationsNode); +/* Add metadata to module KernelM, for OpenCL kernels */ +void CGT_NVPTX::addCLMetadata(Function *F) { + IRBuilder<> Builder(&*F->begin()); + + SmallVector<Metadata *, 8> KernelMD; + KernelMD.push_back(ValueAsMetadata::get(F)); + + // TODO: There is additional metadata used by kernel files but we skip them as + // they are not mandatory. In future they might be useful to enable + // optimizations + + MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_kernels = + KernelM->getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); + + KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); + // TODO: Replace 1 with the number of the kernel. + // Add when support for multiple launces is added + KernelMD.push_back(ValueAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1))); + MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_annotations = + KernelM->getOrInsertNamedMetadata("nvvm.annotations"); + MDN_annotations->addOperand(MDNvvmAnnotationsNode); } void CGT_NVPTX::writeKernelsModule() { - // In addition to deleting all other functions, we also want to spiff it - // up a little bit. Do this now. - legacy::PassManager Passes; + // In addition to deleting all other functions, we also want to spiff it + // up a little bit. Do this now. 
+ legacy::PassManager Passes; DEBUG(errs() << "Writing to File --- "); DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n"); @@ -1996,105 +2029,103 @@ void CGT_NVPTX::writeKernelsModule() { DEBUG(errs() << EC.message() << '\n'); } - Passes.add( - createPrintModulePass(Out.os())); + Passes.add(createPrintModulePass(Out.os())); - Passes.run(*KernelM); + Passes.run(*KernelM); - // Declare success. - Out.keep(); + // Declare success. + Out.keep(); } -Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { - - DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); - // FIXME: Maybe do that using the Node? - StructType* FRetTy = dyn_cast<StructType>(F->getReturnType()); - assert(FRetTy && "Return Type must always be a struct"); - - // Keeps return statements, because we will need to replace them - std::vector<ReturnInst *> RItoRemove; - findReturnInst(F, RItoRemove); - - std::vector<Type *> RetArgTypes; - std::vector<Argument*> RetArgs; - std::vector<Argument*> Args; - // Check for { } return struct, which means that the function returns void - if (FRetTy->isEmptyTy()) { - - DEBUG(errs() << "\tFunction output struct is void\n"); - DEBUG(errs() << "\tNo parameters added\n"); - - // Replacing return statements with others returning void - for (auto *RI : RItoRemove) { - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - } - DEBUG(errs() << "\tChanged return statements to return void\n"); - } - else { - // The struct has return values, thus needs to be converted to parameter - - // Iterate over all element types of return struct and add arguments to the - // function - for (unsigned i=0; i<FRetTy->getNumElements(); i++) { - Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); - RetArgs.push_back(RetArg); - RetArgTypes.push_back(RetArg->getType()); - DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); - } - - DEBUG(errs() << "\tReplacing Return statements\n"); - // Replace return 
statements with extractValue and store instructions - for (auto *RI : RItoRemove) { - Value* RetVal = RI->getReturnValue(); - for(unsigned i = 0; i < RetArgs.size(); i++) { - ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), - RetArgs[i]->getName()+".val", RI); - new StoreInst(EI, RetArgs[i], RI); - } - // assert(RetVal && "Return value should not be null at this point"); - // StructType* RetType = cast<StructType>(RetVal->getType()); - // assert(RetType && "Return type is not a struct"); - - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - - } - } - DEBUG(errs() << "\tReplaced return statements\n"); - - // Create the argument type list with the added argument's type - std::vector<Type*> ArgTypes; - for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - for(auto *RATy: RetArgTypes) { - ArgTypes.push_back(RATy); - } - - // Creating Args vector to use in cloning! - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Args.push_back(&*ai); - } - for(auto *ai : RetArgs) { - Args.push_back(ai); - } - - // Adding new arguments to the function argument list, would not change the - // function type. We need to change the type of this function to reflect the - // added arguments - Type* VoidRetType = Type::getVoidTy(F->getContext()); - FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); - - // Change the function type - //F->mutateType(PTy); - Function* newF = cloneFunction(F, newFT, false, NULL, &Args); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - //F->eraseFromParent(); - return newF; +Function *CGT_NVPTX::transformFunctionToVoid(Function *F) { + + DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); + // FIXME: Maybe do that using the Node? 
+ StructType *FRetTy = dyn_cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); + + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); + + std::vector<Type *> RetArgTypes; + std::vector<Argument *> RetArgs; + std::vector<Argument *> Args; + // Check for { } return struct, which means that the function returns void + if (FRetTy->isEmptyTy()) { + + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); + + // Replacing return statements with others returning void + for (auto *RI : RItoRemove) { + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + } + DEBUG(errs() << "\tChanged return statements to return void\n"); + } else { + // The struct has return values, thus needs to be converted to parameter + + // Iterate over all element types of return struct and add arguments to the + // function + for (unsigned i = 0; i < FRetTy->getNumElements(); i++) { + Argument *RetArg = + new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); + RetArgs.push_back(RetArg); + RetArgTypes.push_back(RetArg->getType()); + DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); + } + + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (auto *RI : RItoRemove) { + Value *RetVal = RI->getReturnValue(); + for (unsigned i = 0; i < RetArgs.size(); i++) { + ExtractValueInst *EI = ExtractValueInst::Create( + RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI); + new StoreInst(EI, RetArgs[i], RI); + } + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + } + } + DEBUG(errs() << 
"\tReplaced return statements\n"); + + // Create the argument type list with the added argument's type + std::vector<Type *> ArgTypes; + for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + for (auto *RATy : RetArgTypes) { + ArgTypes.push_back(RATy); + } + + // Creating Args vector to use in cloning! + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai) { + Args.push_back(&*ai); + } + for (auto *ai : RetArgs) { + Args.push_back(ai); + } + + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + Type *VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + + // Change the function type + // F->mutateType(PTy); + Function *newF = cloneFunction(F, newFT, false, NULL, &Args); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + // F->eraseFromParent(); + return newF; } /****************************************************************************** @@ -2105,326 +2136,344 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { // 1. No stores // 2. 
Loads not dependent on getNodeInstanceID itrinsic -static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) { - if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - VisitedList->push_back(V); - for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); - ui != ue; ++ui) { - Instruction* I = dyn_cast<Instruction>(*ui); - if(!I) { - // if use is not an instruction, then skip it - continue; - } - DEBUG(errs() << "\t" << *I << "\n"); - if(isa<LoadInst>(I)) { - DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); - DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); - UseList->push_back(V); - } - else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { - // found a store in use chain - DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); - return true; - } - else if(BuildDFG::isViscIntrinsic(I)) { - // If it is an atomic intrinsic, we found a store - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic") - && "Only visc atomic intrinsics can have an argument as input"); - return true; - } - else { - DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); - if(findLoadStoreUses(I, UseList, VisitedList)) - return true; - } - } - return false; +static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList, + std::vector<Value *> *VisitedList) { + if (std::find(VisitedList->begin(), VisitedList->end(), V) != + VisitedList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + VisitedList->push_back(V); + for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; + ++ui) { + Instruction *I = dyn_cast<Instruction>(*ui); + if (!I) { + // if use is not an instruction, then skip it + continue; + } + DEBUG(errs() << "\t" << *I << "\n"); + if 
(isa<LoadInst>(I)) { + DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); + DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); + UseList->push_back(V); + } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { + // found a store in use chain + DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); + return true; + } else if (BuildDFG::isViscIntrinsic(I)) { + // If it is an atomic intrinsic, we found a store + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + assert(II && + II->getCalledValue()->getName().startswith("llvm.visc.atomic") && + "Only visc atomic intrinsics can have an argument as input"); + return true; + } else { + DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); + if (findLoadStoreUses(I, UseList, VisitedList)) + return true; + } + } + return false; } -static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) { - if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - DependenceList->push_back(V); - // If not an instruction, then not dependent on node instance id - if(!isa<Instruction>(V) || isa<Constant>(V)) { - DEBUG(errs() << "\tStop\n"); - return false; - } - - Instruction* I = cast<Instruction>(V); - for(unsigned i = 0; i < I->getNumOperands(); i++) { - Value* operand = I->getOperand(i); - if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) { - if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x - || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y - || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { - Value* Node = II->getArgOperand(0); - IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node); - assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n"); - if(GN->getIntrinsicID() == Intrinsic::visc_getNode) { - DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n"); - return true; - 
} - } - } - if(CmpInst* CI = dyn_cast<CmpInst>(operand)) { - DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n"); - continue; - } - DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n"); - if(isDependentOnNodeInstanceID(operand, DependenceList)) { - return true; - } - } - return false; +static bool isDependentOnNodeInstanceID(Value *V, + std::vector<Value *> *DependenceList) { + if (std::find(DependenceList->begin(), DependenceList->end(), V) != + DependenceList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + DependenceList->push_back(V); + // If not an instruction, then not dependent on node instance id + if (!isa<Instruction>(V) || isa<Constant>(V)) { + DEBUG(errs() << "\tStop\n"); + return false; + } + + Instruction *I = cast<Instruction>(V); + for (unsigned i = 0; i < I->getNumOperands(); i++) { + Value *operand = I->getOperand(i); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) { + if ((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x || + II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y || + II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { + Value *Node = II->getArgOperand(0); + IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node); + assert( + GN && + "NodeInstanceID operande should be node/parent node intrinsic\n"); + if (GN->getIntrinsicID() == Intrinsic::visc_getNode) { + DEBUG(errs() << "\tDependency found on Node instance ID: " << *II + << "\n"); + return true; + } + } + } + if (CmpInst *CI = dyn_cast<CmpInst>(operand)) { + DEBUG(errs() << "Found compare instruction: " << *CI + << "\nNot following its dependency list\n"); + continue; + } + DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n"); + if (isDependentOnNodeInstanceID(operand, DependenceList)) { + return true; + } + } + return false; } // Function to check if argument arg can be changed to a constant memory pointer -static 
bool canBePromoted(Argument* arg, Function* F) { - DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); - std::vector<Value*> UseList; - std::vector<Value*> VisitedList; - // recursively traverse use chain - // if find a store instruction return false, everything fails, cannot be - // promoted - // if find a load instruction as use, add the GEP instruction to list - bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); - if(foundStore == true) - return false; - // See that the GEP instructions are not dependent on getNodeInstanceID - // intrinsic - DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n"); - std::vector<Value*>DependenceList; - for(auto U: UseList) { - if(isDependentOnNodeInstanceID(U, &DependenceList)) - return false; - } - DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); - return true; +static bool canBePromoted(Argument *arg, Function *F) { + DEBUG(errs() << "OPT: Check if Argument " << *arg + << " can be changed to constant memory\n"); + std::vector<Value *> UseList; + std::vector<Value *> VisitedList; + // recursively traverse use chain + // if find a store instruction return false, everything fails, cannot be + // promoted + // if find a load instruction as use, add the GEP instruction to list + bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); + if (foundStore == true) + return false; + // See that the GEP instructions are not dependent on getNodeInstanceID + // intrinsic + DEBUG(errs() << foundStore + << "\tNo Store Instruction found. 
Check dependence on node " + "instance ID\n"); + std::vector<Value *> DependenceList; + for (auto U : UseList) { + if (isDependentOnNodeInstanceID(U, &DependenceList)) + return false; + } + DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); + return true; } - // Calculate execute node parameters which include, number of diemnsions for // dynamic instances of the kernel, local and global work group sizes. -static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value* - &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { - - // Assign number of dimenstions a constant value - workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); - - // If local work group size if null - if(!kernel->hasLocalWG()) { - LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); - } - else { - for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { - if(isa<Argument>(kernel->localWGSize[i])) - kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; - } - LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); - } - - for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { - if(isa<Argument>(kernel->globalWGSize[i])) - kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; - } - - // For OpenCL, global work group size is the total bumber of instances in each - // dimension. So, multiply local and global dim limits. 
- std::vector<Value*> globalWGSizeInsts; - if(kernel->hasLocalWG()) { - for (unsigned i = 0; i < kernel->gridDim; i++) { - BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); - globalWGSizeInsts.push_back(MulInst); - } - } - else { - globalWGSizeInsts = kernel->globalWGSize; - } - GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); - DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); +static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr, + Value *&GlobalWGPtr, Kernel *kernel, + ValueToValueMapTy &VMap, Instruction *IB) { + + // Assign number of dimenstions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); + + // If local work group size if null + if (!kernel->hasLocalWG()) { + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); + } else { + for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if (isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } + LocalWGPtr = + genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); + } + + for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if (isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } + + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. 
+ std::vector<Value *> globalWGSizeInsts; + if (kernel->hasLocalWG()) { + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator *MulInst = + BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], + kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + } else { + globalWGSizeInsts = kernel->globalWGSize; + } + GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); } // CodeGen for allocating space for Work Group on stack and returning a pointer // to its address -static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { - Value* WGPtr; - // Get int64_t and or ease of use - Type* Int64Ty = Type::getInt64Ty(M.getContext()); - - // Work Group type is [#dim x i64] - Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); - // Allocate space of Global work group data on stack and get pointer to - // first element. - AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB); - WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); - Value* nextDim = WGPtr; - DEBUG(errs() << *WGPtr << "\n"); - - // Iterate over the number of dimensions and store the global work group - // size in that dimension - for(unsigned i=0; i < WGSize.size(); i++) { - DEBUG(errs() << *WGSize[i] << "\n"); - assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); - - if(WGSize[i]->getType() != Int64Ty) { - // If number of dimensions are mentioned in any other integer format, - // generate code to extend it to i64. We need to use the mapped value in - // the new generated function, hence the use of VMap - // FIXME: Why are we changing the kernel WGSize vector here? - DEBUG(errs() << "Not i64. 
Zero extend required.\n"); - DEBUG(errs() << *WGSize[i] << "\n"); - CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); - DEBUG(errs() << "Bitcast done.\n"); - StoreInst* SI = new StoreInst(CI, nextDim, IB); - DEBUG(errs() << "Zero extend done.\n"); - DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); - } else { - // Store the value representing work group size in ith dimension on - // stack - StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); - - DEBUG(errs() << "\t Work group size: " << *SI << "\n"); - } - if(i+1 < WGSize.size()) { - // Move to next dimension - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, - ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), - WG->getName()+"."+Twine(i+1), - IB); - DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); - nextDim = GEP; - } - } - return WGPtr; +static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize, + ValueToValueMapTy &VMap, Instruction *IB, + const Twine &WGName) { + Value *WGPtr; + // Get int64_t and or ease of use + Type *Int64Ty = Type::getInt64Ty(M.getContext()); + + // Work Group type is [#dim x i64] + Type *WGTy = ArrayType::get(Int64Ty, WGSize.size()); + // Allocate space of Global work group data on stack and get pointer to + // first element. + AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB); + WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), + WG->getName() + ".0", IB); + Value *nextDim = WGPtr; + DEBUG(errs() << *WGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for (unsigned i = 0; i < WGSize.size(); i++) { + DEBUG(errs() << *WGSize[i] << "\n"); + assert(WGSize[i]->getType()->isIntegerTy() && + "Dimension not an integer type!"); + + if (WGSize[i]->getType() != Int64Ty) { + // If number of dimensions are mentioned in any other integer format, + // generate code to extend it to i64. 
We need to use the mapped value in + // the new generated function, hence the use of VMap + // FIXME: Why are we changing the kernel WGSize vector here? + DEBUG(errs() << "Not i64. Zero extend required.\n"); + DEBUG(errs() << *WGSize[i] << "\n"); + CastInst *CI = + BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); + DEBUG(errs() << "Bitcast done.\n"); + StoreInst *SI = new StoreInst(CI, nextDim, IB); + DEBUG(errs() << "Zero extend done.\n"); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB); + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if (i + 1 < WGSize.size()) { + // Move to next dimension + GetElementPtrInst *GEP = GetElementPtrInst::Create( + nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)), + WG->getName() + "." + Twine(i + 1), IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } + } + return WGPtr; } // Get generated PTX binary name -static std::string getPTXFilename(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - moduleID.append(".kernels.cl"); - return moduleID; +static std::string getPTXFilename(const Module &M) { + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".kernels.cl"); + return moduleID; } // Get the name of the input file from module ID -static std::string getFilenameFromModule(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - return moduleID.substr(moduleID.find_last_of("/")+1); +static std::string getFilenameFromModule(const Module &M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/") + 1); } // Changes the data layout of the Module to be compiled with NVPTX backend // TODO: Figure out when to call it, probably after duplicating the modules static void 
changeDataLayout(Module &M) { - std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; - std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; - if (TARGET_PTX == 32) - M.setDataLayout(StringRef(nvptx32_layoutStr)); - else if (TARGET_PTX == 64) - M.setDataLayout(StringRef(nvptx64_layoutStr)); - else assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setDataLayout(StringRef(nvptx32_layoutStr)); + else if (TARGET_PTX == 64) + M.setDataLayout(StringRef(nvptx64_layoutStr)); + else + assert(false && "Invalid PTX target"); - return; + return; } static void changeTargetTriple(Module &M) { - std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; - std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; + std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; + std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; - if (TARGET_PTX == 32) - M.setTargetTriple(StringRef(nvptx32_TargetTriple)); - else if (TARGET_PTX == 64) - M.setTargetTriple(StringRef(nvptx64_TargetTriple)); - else assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setTargetTriple(StringRef(nvptx32_TargetTriple)); + else if (TARGET_PTX == 64) + M.setTargetTriple(StringRef(nvptx64_TargetTriple)); + else + assert(false && "Invalid PTX target"); - return; + return; } // Helper function, populate a vector with all return statements in a function -static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { - for (auto &BB : *F) { - if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) - ReturnInstVec.push_back(RI); - } +static void findReturnInst(Function *F, + std::vector<ReturnInst *> &ReturnInstVec) { + for (auto &BB : *F) { + if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) + ReturnInstVec.push_back(RI); + } } -// Helper function, populate a vector with all 
IntrinsicID intrinsics in a function -static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) { - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - Instruction *I = &(*i); - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - if (II && II->getIntrinsicID() == IntrinsicID) { - IntrinsicInstVec.push_back(II); - } - } +// Helper function, populate a vector with all IntrinsicID intrinsics in a +// function +static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID, + std::vector<IntrinsicInst *> &IntrinsicInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == IntrinsicID) { + IntrinsicInstVec.push_back(II); + } + } } -// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op +// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic +// op static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) { - switch(ID) { - case Intrinsic::visc_atomic_add: - return AtomicRMWInst::Add; - case Intrinsic::visc_atomic_sub: - return AtomicRMWInst::Sub; - case Intrinsic::visc_atomic_min: - return AtomicRMWInst::Min; - case Intrinsic::visc_atomic_umin: - return AtomicRMWInst::UMin; - case Intrinsic::visc_atomic_max: - return AtomicRMWInst::Max; - case Intrinsic::visc_atomic_umax: - return AtomicRMWInst::UMax; - //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc; - //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec; - case Intrinsic::visc_atomic_xchg: - return AtomicRMWInst::Xchg; - case Intrinsic::visc_atomic_and: - return AtomicRMWInst::And; - case Intrinsic::visc_atomic_or: - return AtomicRMWInst::Or; - case Intrinsic::visc_atomic_xor: - return AtomicRMWInst::Xor; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch (ID) { + case Intrinsic::visc_atomic_add: + 
return AtomicRMWInst::Add; + case Intrinsic::visc_atomic_sub: + return AtomicRMWInst::Sub; + case Intrinsic::visc_atomic_min: + return AtomicRMWInst::Min; + case Intrinsic::visc_atomic_umin: + return AtomicRMWInst::UMin; + case Intrinsic::visc_atomic_max: + return AtomicRMWInst::Max; + case Intrinsic::visc_atomic_umax: + return AtomicRMWInst::UMax; + // case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc; + // case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec; + case Intrinsic::visc_atomic_xchg: + return AtomicRMWInst::Xchg; + case Intrinsic::visc_atomic_and: + return AtomicRMWInst::And; + case Intrinsic::visc_atomic_or: + return AtomicRMWInst::Or; + case Intrinsic::visc_atomic_xor: + return AtomicRMWInst::Xor; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } - // Helper funtion, returns the OpenCL function name, corresponding to atomic op static std::string getAtomicOpName(Intrinsic::ID ID) { - switch(ID) { - case Intrinsic::visc_atomic_cmpxchg: - return "atom_cmpxchg"; - case Intrinsic::visc_atomic_add: - return "atom_add"; - case Intrinsic::visc_atomic_sub: - return "atom_sub"; - case Intrinsic::visc_atomic_min: - return "atom_min"; - case Intrinsic::visc_atomic_max: - return "atom_max"; - case Intrinsic::visc_atomic_inc: - return "atom_inc"; - case Intrinsic::visc_atomic_dec: - return "atom_dec"; - case Intrinsic::visc_atomic_xchg: - return "atom_xchg"; - case Intrinsic::visc_atomic_and: - return "atom_and"; - case Intrinsic::visc_atomic_or: - return "atom_or"; - case Intrinsic::visc_atomic_xor: - return "atom_xor"; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch (ID) { + case Intrinsic::visc_atomic_cmpxchg: + return "atom_cmpxchg"; + case Intrinsic::visc_atomic_add: + return "atom_add"; + case Intrinsic::visc_atomic_sub: + return "atom_sub"; + case Intrinsic::visc_atomic_min: + return "atom_min"; + case Intrinsic::visc_atomic_max: + return "atom_max"; + case Intrinsic::visc_atomic_inc: + 
return "atom_inc"; + case Intrinsic::visc_atomic_dec: + return "atom_dec"; + case Intrinsic::visc_atomic_xchg: + return "atom_xchg"; + case Intrinsic::visc_atomic_and: + return "atom_and"; + case Intrinsic::visc_atomic_or: + return "atom_or"; + case Intrinsic::visc_atomic_xor: + return "atom_xor"; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } } // End of namespace @@ -2435,4 +2484,3 @@ static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx", false /* does not modify the CFG */, true /* transformation, * * not just analysis */); - diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 90d7de11fa4a2e44be72360c0568ca63b6882b14..8ec14c80805c052b4a356df7b29b6f1cae2ab775 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -8,31 +8,30 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "DFG2LLVM_X86" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" +#include "SupportVISC/DFG2LLVM.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IR/Module.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" +#include "llvm/Pass.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Constant.h" -#include "SupportVISC/DFG2LLVM.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; using namespace builddfg; using namespace dfg2llvm; // VISC Command line option to use timer or not -static cl::opt<bool> -VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers")); +static cl::opt<bool> 
VISCTimer_X86("visc-timers-x86", + cl::desc("Enable visc timers")); // Command line option to enable device abstraction or not static cl::opt<bool> -DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, - cl::desc("Enable visc device abstraction")); - + DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, + cl::desc("Enable visc device abstraction")); namespace { @@ -41,7 +40,8 @@ static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { if (!isa<CallInst>(I)) return false; CallInst *CI = cast<CallInst>(I); - return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion"); + return (CI->getCalledValue()->stripPointerCasts()->getName()) + .equals("llvm_visc_policy_getVersion"); } CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { @@ -56,7 +56,7 @@ CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { // DFG2LLVM_X86 - The first implementation. struct DFG2LLVM_X86 : public DFG2LLVM { static char ID; // Pass identification, replacement for typeid - DFG2LLVM_X86() :DFG2LLVM(ID) {} + DFG2LLVM_X86() : DFG2LLVM(ID) {} private: // Member variables @@ -71,7 +71,7 @@ public: class CGT_X86 : public CodeGenTraversal { private: - //Member variables + // Member variables FunctionCallee malloc; // VISC Runtime API @@ -88,34 +88,35 @@ private: FunctionCallee llvm_visc_createEdgeBuffer; FunctionCallee llvm_visc_createLastInputBuffer; FunctionCallee llvm_visc_createThread; - //Constant* llvm_visc_freeThreads; + // Constant* llvm_visc_freeThreads; FunctionCallee llvm_visc_bufferPush; FunctionCallee llvm_visc_bufferPop; FunctionCallee llvm_visc_x86_dstack_push; FunctionCallee llvm_visc_x86_dstack_pop; FunctionCallee llvm_visc_x86_getDimLimit; FunctionCallee llvm_visc_x86_getDimInstance; - - //Functions - std::vector<IntrinsicInst*>* getUseList(Value* LI); - Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = ""); - void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*); + + // 
Functions + std::vector<IntrinsicInst *> *getUseList(Value *LI); + Value *addLoop(Instruction *I, Value *limit, const Twine &indexName = ""); + void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *); Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *); - Argument* getArgumentFromEnd(Function* F, unsigned offset); - Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, - Instruction* InsertBefore); - void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, - Instruction* InsertBefore); - void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, - Instruction* InsertBefore); - StructType* getArgumentListStructTy(DFNode*); - Function* createFunctionFilter(DFNode* C); - void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>, - Value*, Value*, Instruction*); - Function* createLaunchFunction(DFInternalNode*); - Function* createPushFunction(DFInternalNode*); - Function* createPopFunction(DFInternalNode*); - Function* createWaitFunction(DFInternalNode*); + Argument *getArgumentFromEnd(Function *F, unsigned offset); + Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86, + Instruction *InsertBefore); + void invokeChild_X86(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap, + Instruction *InsertBefore); + void invokeChild_PTX(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap, + Instruction *InsertBefore); + StructType *getArgumentListStructTy(DFNode *); + Function *createFunctionFilter(DFNode *C); + void startNodeThread(DFNode *, std::vector<Value *>, + DenseMap<DFEdge *, Value *>, Value *, Value *, + Instruction *); + Function *createLaunchFunction(DFInternalNode *); + Function *createPushFunction(DFInternalNode *); + Function *createPopFunction(DFInternalNode *); + Function *createWaitFunction(DFInternalNode *); // Virtual Functions void init() { @@ -123,10 +124,10 @@ private: TargetName = "X86"; } void initRuntimeAPI(); - void 
codeGen(DFInternalNode* N); - void codeGen(DFLeafNode* N); - Function* codeGenStreamPush(DFInternalNode* N); - Function* codeGenStreamPop(DFInternalNode* N); + void codeGen(DFInternalNode *N); + void codeGen(DFLeafNode *N); + Function *codeGenStreamPush(DFInternalNode *N); + Function *codeGenStreamPop(DFInternalNode *N); public: // Constructor @@ -135,8 +136,8 @@ public: initRuntimeAPI(); } - void codeGenLaunch(DFInternalNode* Root); - void codeGenLaunchStreaming(DFInternalNode* Root); + void codeGenLaunch(DFInternalNode *Root); + void codeGenLaunchStreaming(DFInternalNode *Root); }; bool DFG2LLVM_X86::runOnModule(Module &M) { @@ -147,8 +148,8 @@ bool DFG2LLVM_X86::runOnModule(Module &M) { // - Maps from i8* hansles to DFNode and DFEdge BuildDFG &DFG = getAnalysis<BuildDFG>(); - //DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode *> Roots = DFG.getRoots(); // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); @@ -156,16 +157,17 @@ bool DFG2LLVM_X86::runOnModule(Module &M) { CGT_X86 *CGTVisitor = new CGT_X86(M, DFG); // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode: Roots) { + for (auto rootNode : Roots) { // Initiate code generation for root DFNode CGTVisitor->visit(rootNode); - // Go ahead and replace the launch intrinsic with pthread call, otherwise return now. + // Go ahead and replace the launch intrinsic with pthread call, otherwise + // return now. 
// TODO: Later on, we might like to do this in a separate pass, which would - // allow us the flexibility to switch between complete static code generation - // for DFG or having a customized runtime+scheduler - + // allow us the flexibility to switch between complete static code + // generation for DFG or having a customized runtime+scheduler + // Do streaming code generation if root node is streaming. Usual otherwise - if(rootNode->isChildGraphStreaming()) + if (rootNode->isChildGraphStreaming()) CGTVisitor->codeGenLaunchStreaming(rootNode); else CGTVisitor->codeGenLaunch(rootNode); @@ -181,7 +183,7 @@ void CGT_X86::initRuntimeAPI() { // Load Runtime API Module SMDiagnostic Err; - char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; @@ -189,7 +191,7 @@ void CGT_X86::initRuntimeAPI() { runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - if(runtimeModule == NULL) + if (runtimeModule == NULL) DEBUG(errs() << Err.getMessage()); else DEBUG(errs() << "Successfully loaded visc-rt API module\n"); @@ -208,7 +210,7 @@ void CGT_X86::initRuntimeAPI() { DECLARE(llvm_visc_createEdgeBuffer); DECLARE(llvm_visc_createLastInputBuffer); DECLARE(llvm_visc_createThread); - //DECLARE(llvm_visc_freeThreads); + // DECLARE(llvm_visc_freeThreads); DECLARE(llvm_visc_bufferPush); DECLARE(llvm_visc_bufferPop); DECLARE(llvm_visc_x86_dstack_push); @@ -220,36 +222,40 @@ void CGT_X86::initRuntimeAPI() { initTimerAPI(); // Insert init context in main - Function* VI = M.getFunction("llvm.visc.init"); + Function *VI = M.getFunction("llvm.visc.init"); assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); DEBUG(errs() << "Inserting x86 timer initialization\n"); - Instruction* I = cast<Instruction>(*VI->user_begin()); + Instruction *I = cast<Instruction>(*VI->user_begin()); initializeTimerSet(I); 
switchToTimer(visc_TimerID_NONE, I); // Insert code for initializing the sceduling policy - FunctionCallee IP = M.getOrInsertFunction("llvm_visc_policy_init", - runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()); - CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); + FunctionCallee IP = M.getOrInsertFunction( + "llvm_visc_policy_init", + runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()); + CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IPCallInst << "\n"); // If device abstraction is enabled, we add a runtime call to start the // device status simulation if (DeviceAbstraction) { - FunctionCallee ID = M.getOrInsertFunction("llvm_visc_deviceAbstraction_start", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType()); - CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I); + FunctionCallee ID = M.getOrInsertFunction( + "llvm_visc_deviceAbstraction_start", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_start") + ->getFunctionType()); + CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IDCallInst << "\n"); } // Insert print instruction at visc exit - Function* VC = M.getFunction("llvm.visc.cleanup"); + Function *VC = M.getFunction("llvm.visc.cleanup"); assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); // Insert code for clearing the sceduling policy I = cast<Instruction>(*VC->user_begin()); - IP = M.getOrInsertFunction("llvm_visc_policy_clear", - runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()); - IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); + IP = M.getOrInsertFunction( + "llvm_visc_policy_clear", + runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()); + IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IPCallInst << "\n"); DEBUG(errs() << "Inserting 
x86 timer print\n"); @@ -258,22 +264,24 @@ void CGT_X86::initRuntimeAPI() { // If device abstraction is enabled, we add a runtime call to end the // device status simulation if (DeviceAbstraction) { - FunctionCallee ID = M.getOrInsertFunction("llvm_visc_deviceAbstraction_end", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType()); - CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I); + FunctionCallee ID = M.getOrInsertFunction( + "llvm_visc_deviceAbstraction_end", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_end") + ->getFunctionType()); + CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IDCallInst << "\n"); } - } /* Returns vector of all wait instructions */ -std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) { - std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>(); +std::vector<IntrinsicInst *> *CGT_X86::getUseList(Value *GraphID) { + std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>(); // It must have been loaded from memory somewhere - for(Value::user_iterator ui = GraphID->user_begin(), - ue = GraphID->user_end(); ui!=ue; ++ui) { - if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) { + for (Value::user_iterator ui = GraphID->user_begin(), + ue = GraphID->user_end(); + ui != ue; ++ui) { + if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) { UseList->push_back(waitI); } else { llvm_unreachable("Error: Operation on Graph ID not supported!\n"); @@ -285,14 +293,14 @@ std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) { /* Traverse the function argument list in reverse order to get argument at a * distance offset fromt he end of argument list of function F */ -Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) { - assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) - && "Invalid offset to access arguments!"); +Argument 
*CGT_X86::getArgumentFromEnd(Function *F, unsigned offset) { + assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) && + "Invalid offset to access arguments!"); Function::arg_iterator e = F->arg_end(); // Last element of argument iterator is dummy. Skip it. e--; - Argument* arg; - for( ; offset != 0; e--) { + Argument *arg; + for (; offset != 0; e--) { offset--; arg = &*e; } @@ -310,25 +318,24 @@ Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) { * which loops over bidy if true and goes to end if false * (5) Update phi node of body */ -void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart, - Instruction* BodyEnd, Value* TerminationCond) { - BasicBlock* Entry = CondBlockStart->getParent(); - BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); - BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); - BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); +void CGT_X86::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart, + Instruction *BodyEnd, Value *TerminationCond) { + BasicBlock *Entry = CondBlockStart->getParent(); + BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); + BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); + BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); // Replace the terminator instruction of conditional with new conditional // branch which goes to while.body if true and branches to while.end otherwise - BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); + BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); ReplaceInstWithInst(CondBlock->getTerminator(), BI); // While Body should jump to condition block - BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock); + BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock); ReplaceInstWithInst(WhileBody->getTerminator(), 
UnconditionalBranch); - } -Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, +Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, BasicBlock *Body) { Module *M = Entry->getParent()->getParent(); Type *Int64Ty = Type::getInt64Ty(M->getContext()); @@ -338,10 +345,10 @@ Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB); ConstantInt *IConst = - ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); + ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); Instruction *CounterIncr = - BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, - "cnt_incr", Body->getTerminator()); + BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, + "cnt_incr", Body->getTerminator()); // Set incoming values for Phi node IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true); @@ -363,39 +370,40 @@ Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, * which loops over bidy if true and goes to end if false * (5) Update phi node of body */ -Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) { - BasicBlock* Entry = I->getParent(); - BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body"); +Value *CGT_X86::addLoop(Instruction *I, Value *limit, const Twine &indexName) { + BasicBlock *Entry = I->getParent(); + BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body"); BasicBlock::iterator i(I); ++i; - Instruction* NextI = &*i; + Instruction *NextI = &*i; // Next Instruction should also belong to the same basic block as the basic // block will have a terminator instruction - assert(NextI->getParent() == ForBody - && "Next Instruction should also belong to the same basic block!"); - BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end"); - + assert(NextI->getParent() == ForBody && + "Next Instruction should also 
belong to the same basic block!"); + BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end"); // Add Phi Node for index variable - PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), - 2, "index."+indexName, I); + PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2, + "index." + indexName, I); // Add incoming edge to phi IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0), Entry); // Increment index variable - BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add, - IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1), - "index."+indexName+".inc", ForBody->getTerminator()); + BinaryOperator *IndexInc = BinaryOperator::Create( + Instruction::Add, IndexPhi, + ConstantInt::get(Type::getInt64Ty(I->getContext()), 1), + "index." + indexName + ".inc", ForBody->getTerminator()); // Compare index variable with limit - CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, - limit, "cond."+indexName, ForBody->getTerminator()); + CmpInst *Cond = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit, + "cond." + indexName, ForBody->getTerminator()); // Replace the terminator instruction of for.body with new conditional // branch which loops over body if true and branches to for.end otherwise - BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond); + BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond); ReplaceInstWithInst(ForBody->getTerminator(), BI); // Add incoming edge to phi node in body @@ -407,263 +415,274 @@ Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) { // types, output types and isLastInput buffer type. All the streaming // inputs/outputs are converted to i8*, since this is the type of buffer // handles. 
-StructType* CGT_X86::getArgumentListStructTy(DFNode* C) { - std::vector<Type*> TyList; +StructType *CGT_X86::getArgumentListStructTy(DFNode *C) { + std::vector<Type *> TyList; // Input types - Function* CF = C->getFuncPointer(); - for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end(); - ai != ae; ++ai) { - if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge()) + Function *CF = C->getFuncPointer(); + for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end(); + ai != ae; ++ai) { + if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge()) TyList.push_back(Type::getInt8PtrTy(CF->getContext())); - else + else TyList.push_back(ai->getType()); } // Output Types - StructType* OutStructTy = cast<StructType>(CF->getReturnType()); + StructType *OutStructTy = cast<StructType>(CF->getReturnType()); for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) { // All outputs of a node are streaming edge - assert(C->getOutDFEdgeAt(i)->isStreamingEdge() - && "All output edges of child node have to be streaming"); + assert(C->getOutDFEdgeAt(i)->isStreamingEdge() && + "All output edges of child node have to be streaming"); TyList.push_back(Type::getInt8PtrTy(CF->getContext())); } // isLastInput buffer element TyList.push_back(Type::getInt8PtrTy(CF->getContext())); - StructType* STy = StructType::create(CF->getContext(), TyList, - Twine("struct.thread."+CF->getName()).str(), true); + StructType *STy = + StructType::create(CF->getContext(), TyList, + Twine("struct.thread." 
+ CF->getName()).str(), true); return STy; - } -void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*> - EdgeBufferMap, Value* isLastInputBuffer, Value* graphID, - Instruction* IB) { - DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n"); +void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args, + DenseMap<DFEdge *, Value *> EdgeBufferMap, + Value *isLastInputBuffer, Value *graphID, + Instruction *IB) { + DEBUG(errs() << "Starting Pipeline for child node: " + << C->getFuncPointer()->getName() << "\n"); // Create a filter/pipeline function for the child node - Function* C_Pipeline = createFunctionFilter(C); - Function* CF = C->getFuncPointer(); + Function *C_Pipeline = createFunctionFilter(C); + Function *CF = C->getFuncPointer(); // Get module context and i32 0 constant, as they would be frequently used in // this function. - LLVMContext& Ctx = IB->getParent()->getContext(); - Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + LLVMContext &Ctx = IB->getParent()->getContext(); + Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); // Marshall arguments // Create a packed struct type with inputs of C followed by outputs and then // another i8* to indicate isLastInput buffer. 
Streaming inputs are replaced // by i8* // - StructType* STy = getArgumentListStructTy(C); + StructType *STy = getArgumentListStructTy(C); // Allocate the struct on heap *NOT* stack and bitcast i8* to STy* - CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)), - C->getFuncPointer()->getName()+".inputs", IB); - CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB); - //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB); + CallInst *CI = + CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)), + C->getFuncPointer()->getName() + ".inputs", IB); + CastInst *Struct = BitCastInst::CreatePointerCast( + CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB); + // AllocaInst* AI = new AllocaInst(STy, + // C->getFuncPointer()->getName()+".inputs", IB); // Insert elements in the struct - DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Marshall inputs for child node: " + << C->getFuncPointer()->getName() << "\n"); // Marshall Inputs - for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) { + for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) { // Create constant int (i) - Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); + Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); // Get Element pointer instruction - Value* GEPIndices[] = { IntZero, Int_i }; - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, - ArrayRef<Value*>(GEPIndices, 2), - Struct->getName()+".arg_"+Twine(i), - IB); - DFEdge* E = C->getInDFEdgeAt(i); + Value *GEPIndices[] = {IntZero, Int_i}; + GetElementPtrInst *GEP = GetElementPtrInst::Create( + nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), + Struct->getName() + ".arg_" + Twine(i), IB); + DFEdge *E = C->getInDFEdgeAt(i); if (E->getSourceDF()->isEntryNode()) { // This is a Bind Input Edge - 
if(E->isStreamingEdge()) { + if (E->isStreamingEdge()) { // Streaming Bind Input edge. Get buffer corresponding to it - assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!"); + assert(EdgeBufferMap.count(E) && + "No mapping buffer for a Streaming Bind DFEdge!"); new StoreInst(EdgeBufferMap[E], GEP, IB); - } - else { + } else { // Non-streaming Bind edge new StoreInst(Args[i], GEP, IB); } - } - else { - // This is an edge between siblings. + } else { + // This is an edge between siblings. // This must be an streaming edge. As it is our assumption that all edges // between two nodes in a DFG are streaming. - assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!"); + assert(EdgeBufferMap.count(E) && + "No mapping buffer for a Streaming DFEdge!"); new StoreInst(EdgeBufferMap[E], GEP, IB); } } unsigned numInputs = CF->getFunctionType()->getNumParams(); unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements(); // Marshall Outputs - DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n"); - for(unsigned i = 0; i < numOutputs; i++ ) { + DEBUG(errs() << "Marshall outputs for child node: " + << C->getFuncPointer()->getName() << "\n"); + for (unsigned i = 0; i < numOutputs; i++) { // Create constant int (i+numInputs) - Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs); + Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs); // Get Element pointer instruction - Value* GEPIndices[] = { IntZero, Int_i }; - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, - ArrayRef<Value*>(GEPIndices, 2), - Struct->getName()+".out_"+Twine(i), - IB); - DFEdge* E = C->getOutDFEdgeAt(i); - assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes"); - assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!"); + Value *GEPIndices[] = {IntZero, Int_i}; + GetElementPtrInst *GEP = 
GetElementPtrInst::Create( + nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), + Struct->getName() + ".out_" + Twine(i), IB); + DFEdge *E = C->getOutDFEdgeAt(i); + assert(E->isStreamingEdge() && + "Output Edge must be streaming of all nodes"); + assert(EdgeBufferMap.count(E) && + "No mapping buffer for a Out Streaming DFEdge!"); new StoreInst(EdgeBufferMap[E], GEP, IB); } // Marshall last argument. isLastInput buffer - DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Marshall isLastInput for child node: " + << C->getFuncPointer()->getName() << "\n"); // Create constant int (i+numInputs) - Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs); + Constant *Int_index = + ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs); // Get Element pointer instruction - Value* GEPIndices[] = { IntZero, Int_index }; - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, - ArrayRef<Value*>(GEPIndices, 2), - Struct->getName()+".isLastInput", IB); + Value *GEPIndices[] = {IntZero, Int_index}; + GetElementPtrInst *GEP = GetElementPtrInst::Create( + nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), + Struct->getName() + ".isLastInput", IB); new StoreInst(isLastInputBuffer, GEP, IB); // AllocaInst AI points to memory with all the arguments packed // Call runtime to create the thread with these arguments - DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n"); -// DEBUG(errs() << *llvm_visc_createThread << "\n"); + DEBUG(errs() << "Start Thread for child node: " + << C->getFuncPointer()->getName() << "\n"); + // DEBUG(errs() << *llvm_visc_createThread << "\n"); DEBUG(errs() << *graphID->getType() << "\n"); DEBUG(errs() << *C_Pipeline->getType() << "\n"); DEBUG(errs() << *Struct->getType() << "\n"); // Bitcast AI to i8* - CastInst* BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB); - 
Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI}; - CallInst* CreateThread = CallInst::Create(llvm_visc_createThread, - ArrayRef<Value*>(CreateThreadArgs, 3), - "", - IB); - + CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), + Struct->getName(), IB); + Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI}; + CallInst *CreateThread = CallInst::Create( + llvm_visc_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); } -Function* CGT_X86::createLaunchFunction(DFInternalNode* N) { +Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { DEBUG(errs() << "Generating Streaming Launch Function\n"); // Get Function associated with Node N - Function* NF = N->getFuncPointer(); + Function *NF = N->getFuncPointer(); - // Map from Streaming edge to buffer - DenseMap<DFEdge*, Value*> EdgeBufferMap; + // Map from Streaming edge to buffer + DenseMap<DFEdge *, Value *> EdgeBufferMap; /* Now we have all the necessary global declarations necessary to generate the - * Launch function, pointer to which can be passed to pthread utils to execute - * DFG. The Launch function has just one input: i8* data.addr - * This is the address of the all the input data that needs to be passed to - * this function. In our case it contains the input arguments of the Root - * function in the correct order. - * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) - * (2) Extract each of inputs from data.addr - * (3) create Buffers for all the streaming edges - * - Put buffers in the context - * (4) Go over each child node - * - marshall its arguments together (use buffers in place of streaming - * arguments) - * - Start the threads - * (5) The return value from Root is stored in memory, pointer to which is - * passed to pthread_exit call. - */ + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. 
The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. + * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) + * (2) Extract each of inputs from data.addr + * (3) create Buffers for all the streaming edges + * - Put buffers in the context + * (4) Go over each child node + * - marshall its arguments together (use buffers in place of streaming + * arguments) + * - Start the threads + * (5) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ // (1) Create Launch Function of type void (i8* args, i8* GraphID) - Type* i8Ty = Type::getInt8Ty(M.getContext()); - Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; - FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()), - ArrayRef<Type*>(ArgTypes, 2), false); - Function* LaunchFunc = Function::Create(LaunchFuncTy, - NF->getLinkage(), - NF->getName()+".LaunchFunction", - &M); + Type *i8Ty = Type::getInt8Ty(M.getContext()); + Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; + FunctionType *LaunchFuncTy = FunctionType::get( + Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false); + Function *LaunchFunc = Function::Create( + LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M); DEBUG(errs() << "Generating Code for Streaming Launch Function\n"); // Give a name to the argument which is used pass data to this thread - Argument* data = &*LaunchFunc->arg_begin(); + Argument *data = &*LaunchFunc->arg_begin(); // NOTE-HS: Check correctness with Maria - Argument* graphID = &*(LaunchFunc->arg_begin() + 1); + Argument *graphID = &*(LaunchFunc->arg_begin() + 1); data->setName("data.addr"); graphID->setName("graphID"); // Add a basic block to this empty function and a return null statement 
to it DEBUG(errs() << *LaunchFunc->getReturnType() << "\n"); - BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); - ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(), - BB); + BasicBlock *BB = + BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); + ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB); DEBUG(errs() << "Created Empty Launch Function\n"); // (2) Extract each of inputs from data.addr - std::vector<Type*> TyList; + std::vector<Type *> TyList; std::vector<std::string> names; - std::vector<Value*> Args; + std::vector<Value *> Args; for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end(); - ai != ae; ++ai) { - if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) { + ai != ae; ++ai) { + if (N->getChildGraph() + ->getEntry() + ->getOutDFEdgeAt(ai->getArgNo()) + ->isStreamingEdge()) { TyList.push_back(i8Ty->getPointerTo()); - names.push_back(Twine(ai->getName()+"_buffer").str()); + names.push_back(Twine(ai->getName() + "_buffer").str()); continue; } TyList.push_back(ai->getType()); names.push_back(ai->getName()); } Args = extractElements(data, TyList, names, RI); - DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc << "\n"); + DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc + << "\n"); // (3) Create buffers for all the streaming edges - for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), - de = N->getChildGraph()->dfedge_end(); di != de; ++di) { - DFEdge* Edge = *di; + for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), + de = N->getChildGraph()->dfedge_end(); + di != de; ++di) { + DFEdge *Edge = *di; DEBUG(errs() << *Edge->getType() << "\n"); - Value* size = ConstantExpr::getSizeOf(Edge->getType()); - Value* CallArgs[] = {graphID, size}; + Value *size = ConstantExpr::getSizeOf(Edge->getType()); + Value *CallArgs[] = {graphID, size}; if (Edge->isStreamingEdge()) { 
- CallInst* CI; + CallInst *CI; // Create a buffer call - if(Edge->getSourceDF()->isEntryNode()) { + if (Edge->getSourceDF()->isEntryNode()) { // Bind Input Edge - Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()), - Edge->getSourcePosition()); - Value* BindInCallArgs[] = {graphID, size, Int_ArgNo}; - CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3), - "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(), - RI); - } - else if(Edge->getDestDF()->isExitNode()) { + Constant *Int_ArgNo = ConstantInt::get( + Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition()); + Value *BindInCallArgs[] = {graphID, size, Int_ArgNo}; + CI = CallInst::Create( + llvm_visc_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3), + "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI); + } else if (Edge->getDestDF()->isExitNode()) { // Bind Output Edge - CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2), - "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(), - RI); - } - else { + CI = CallInst::Create( + llvm_visc_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2), + "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI); + } else { // Streaming Edge - CI = CallInst::Create(llvm_visc_createEdgeBuffer, - ArrayRef<Value*>(CallArgs, 2), - Edge->getSourceDF()->getFuncPointer()->getName()+"." - +Edge->getDestDF()->getFuncPointer()->getName(), - RI); + CI = CallInst::Create( + llvm_visc_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2), + Edge->getSourceDF()->getFuncPointer()->getName() + "." 
+ + Edge->getDestDF()->getFuncPointer()->getName(), + RI); } EdgeBufferMap[Edge] = CI; } } // Create buffer for isLastInput for all the child nodes - DFGraph* G = N->getChildGraph(); - DenseMap<DFNode*, Value*> NodeLastInputMap; - for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) { - DFNode* child = *ci; - if(child->isDummyNode()) + DFGraph *G = N->getChildGraph(); + DenseMap<DFNode *, Value *> NodeLastInputMap; + for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; + ++ci) { + DFNode *child = *ci; + if (child->isDummyNode()) continue; - Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); - Value* CallArgs[] = {graphID, size}; - CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2), - "BindIn.isLastInput."+child->getFuncPointer()->getName(), - RI); + Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); + Value *CallArgs[] = {graphID, size}; + CallInst *CI = CallInst::Create( + llvm_visc_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2), + "BindIn.isLastInput." 
+ child->getFuncPointer()->getName(), RI); NodeLastInputMap[child] = CI; } - DEBUG(errs() << "Start Each child node filter\n"); + DEBUG(errs() << "Start Each child node filter\n"); // (4) Marshall arguments for each child node and start the thread with its // pipeline funtion - for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); ci != ce; ++ci) { - DFNode* C = *ci; + for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); + ci != ce; ++ci) { + DFNode *C = *ci; // Skip dummy node call if (C->isDummyNode()) continue; - + // Marshall all the arguments for this node into an i8* // Pass to the runtime to create the thread // Start the thread for child node C @@ -676,22 +695,21 @@ Function* CGT_X86::createLaunchFunction(DFInternalNode* N) { return LaunchFunc; } - -Function* CGT_X86::createPushFunction(DFInternalNode* N) { +Function *CGT_X86::createPushFunction(DFInternalNode *N) { DEBUG(errs() << "Generating Push function\n"); - Function* PushFunc; + Function *PushFunc; return PushFunc; } -Function* CGT_X86::createPopFunction(DFInternalNode* N) { +Function *CGT_X86::createPopFunction(DFInternalNode *N) { DEBUG(errs() << "Generating Pop function\n"); - Function* PushFunc; + Function *PushFunc; return PushFunc; } -Function* CGT_X86::createWaitFunction(DFInternalNode* N) { +Function *CGT_X86::createWaitFunction(DFInternalNode *N) { DEBUG(errs() << "Generating Wait function\n"); - Function* PushFunc; + Function *PushFunc; return PushFunc; } /* This fuction does the steps necessary to launch a streaming graph @@ -701,171 +719,162 @@ Function* CGT_X86::createWaitFunction(DFInternalNode* N) { * Modify each of the instrinsic in host code * Launch, Push, Pop, Wait */ -void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) { - IntrinsicInst* LI = Root->getInstruction(); - Function* RootLaunch = createLaunchFunction(Root); - //Function* RootPush = createPushFunction(Root); - //Function* 
RootPop = createPopFunction(Root); - //Function* RootWait = createWaitFunction(Root); +void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) { + IntrinsicInst *LI = Root->getInstruction(); + Function *RootLaunch = createLaunchFunction(Root); + // Function* RootPush = createPushFunction(Root); + // Function* RootPop = createPopFunction(Root); + // Function* RootWait = createWaitFunction(Root); // Substitute launch intrinsic main - DEBUG(errs() << "Substitute launch intrinsic\n"); - Value* LaunchInstArgs[] = {RootLaunch, - LI->getArgOperand(1) - }; - CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch, - ArrayRef<Value*>(LaunchInstArgs,2), - "graph"+Root->getFuncPointer()->getName(), LI); - //ReplaceInstWithInst(LI, LaunchInst); + DEBUG(errs() << "Substitute launch intrinsic\n"); + Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)}; + CallInst *LaunchInst = CallInst::Create( + llvm_visc_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2), + "graph" + Root->getFuncPointer()->getName(), LI); + // ReplaceInstWithInst(LI, LaunchInst); DEBUG(errs() << *LaunchInst << "\n"); // Replace all wait instructions with x86 specific wait instructions - DEBUG(errs() << "Substitute wait, push, pop intrinsics\n"); - std::vector<IntrinsicInst*>* UseList = getUseList(LI); - for(unsigned i=0; i < UseList->size(); ++i) { - IntrinsicInst* II = UseList->at(i); - CallInst* CI; - Value* PushArgs[] = {LaunchInst, II->getOperand(1)}; - switch(II->getIntrinsicID()) { + DEBUG(errs() << "Substitute wait, push, pop intrinsics\n"); + std::vector<IntrinsicInst *> *UseList = getUseList(LI); + for (unsigned i = 0; i < UseList->size(); ++i) { + IntrinsicInst *II = UseList->at(i); + CallInst *CI; + Value *PushArgs[] = {LaunchInst, II->getOperand(1)}; + switch (II->getIntrinsicID()) { case Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_streamWait, - ArrayRef<Value*>(LaunchInst), + CI = CallInst::Create(llvm_visc_streamWait, ArrayRef<Value *>(LaunchInst), ""); break; case 
Intrinsic::visc_push: CI = CallInst::Create(llvm_visc_streamPush, - ArrayRef<Value*>(PushArgs, 2), - ""); + ArrayRef<Value *>(PushArgs, 2), ""); break; case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_streamPop, - ArrayRef<Value*>(LaunchInst), + CI = CallInst::Create(llvm_visc_streamPop, ArrayRef<Value *>(LaunchInst), ""); break; default: - llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); + llvm_unreachable( + "GraphID is used by an instruction other than wait, push, pop"); }; DEBUG(errs() << "Replace:\n\t" << *II << "\n"); ReplaceInstWithInst(II, CI); DEBUG(errs() << "\twith " << *CI << "\n"); } - - } -void CGT_X86::codeGenLaunch(DFInternalNode* Root) { +void CGT_X86::codeGenLaunch(DFInternalNode *Root) { // TODO: Place an assert to check if the constant passed by launch intrinsic // as the number of arguments to DFG is same as the number of arguments of the // root of DFG DEBUG(errs() << "Generating Launch Function\n"); // Get Launch Instruction - IntrinsicInst* LI = Root->getInstruction(); + IntrinsicInst *LI = Root->getInstruction(); switchToTimer(visc_TimerID_PTHREAD_CREATE, LI); DEBUG(errs() << "Generating Launch Function\n"); /* Now we have all the necessary global declarations necessary to generate the - * Launch function, pointer to which can be passed to pthread utils to execute - * DFG. The Launch function has just one input: i8* data.addr - * This is the address of the all the input data that needs to be passed to - * this function. In our case it contains the input arguments of the Root - * function in the correct order. - * (1) Create an empty Launch function of type i8*(i8*) - * (2) Extract each of inputs from data.addr and pass them as arguments to the - * call to Root function - * (3) The return value from Root is stored in memory, pointer to which is - * passed to pthread_exit call. - */ + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. 
The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. + * (1) Create an empty Launch function of type i8*(i8*) + * (2) Extract each of inputs from data.addr and pass them as arguments to the + * call to Root function + * (3) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ // Create Launch Function of type i8*(i8*) which calls the root function - Type* i8Ty = Type::getInt8Ty(M.getContext()); - FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), - ArrayRef<Type*>(i8Ty->getPointerTo()), - false); - Function* AppFunc = Function::Create(AppFuncTy, - Root->getFuncPointer()->getLinkage(), - "LaunchDataflowGraph", - &M); + Type *i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType *AppFuncTy = FunctionType::get( + i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false); + Function *AppFunc = + Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(), + "LaunchDataflowGraph", &M); DEBUG(errs() << "Generating Launch Function\n"); // Give a name to the argument which is used pass data to this thread - Value* data = &*AppFunc->arg_begin(); + Value *data = &*AppFunc->arg_begin(); data->setName("data.addr"); // Add a basic block to this empty function and a return null statement to it BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc); - ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(), - Constant::getNullValue(AppFunc->getReturnType()), - BB); + ReturnInst *RI = + ReturnInst::Create(AppFunc->getContext(), + Constant::getNullValue(AppFunc->getReturnType()), BB); switchToTimer(visc_TimerID_ARG_UNPACK, RI); DEBUG(errs() << "Created Empty Launch Function\n"); // Find the X86 function generated for Root and -// Function* RootF_X86 = Root->getGenFunc(); - Function* 
RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); + // Function* RootF_X86 = Root->getGenFunc(); + Function *RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); assert(RootF_X86 && "Error: No generated CPU function for Root node\n"); assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) && "Error: Generated Function for Root node with no x86 wrapper\n"); // Generate a call to RootF_X86 with null parameters for now - std::vector<Value*>Args; - for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) { - Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i))); + std::vector<Value *> Args; + for (unsigned i = 0; i < RootF_X86->getFunctionType()->getNumParams(); i++) { + Args.push_back( + Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i))); } - CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI); + CallInst *CI = + CallInst::Create(RootF_X86, Args, RootF_X86->getName() + ".output", RI); // Extract input data from i8* data.addr and patch them to correct argument of // call to RootF_X86. 
For each argument - std::vector<Type*> TyList; + std::vector<Type *> TyList; std::vector<std::string> names; - for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end(); - ai != ae; ++ai) { + for (Function::arg_iterator ai = RootF_X86->arg_begin(), + ae = RootF_X86->arg_end(); + ai != ae; ++ai) { TyList.push_back(ai->getType()); names.push_back(ai->getName()); } - std::vector<Value*> elements = extractElements(data, TyList, names, CI); + std::vector<Value *> elements = extractElements(data, TyList, names, CI); // Patch the elements to the call arguments - for(unsigned i=0; i<CI->getNumArgOperands(); i++) + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) CI->setArgOperand(i, elements[i]); // Add timers around Call to RootF_X86 function switchToTimer(visc_TimerID_COMPUTATION, CI); switchToTimer(visc_TimerID_OUTPUT_PACK, RI); - StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType()); + StructType *RootRetTy = + cast<StructType>(RootF_X86->getFunctionType()->getReturnType()); - // if Root has non empty return + // if Root has non empty return if (RootRetTy->getNumElements()) { // We can't access the type of the arg struct - build it - std::vector<Type*> TyList; - for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end(); - ai != ae; ++ai) { + std::vector<Type *> TyList; + for (Function::arg_iterator ai = RootF_X86->arg_begin(), + ae = RootF_X86->arg_end(); + ai != ae; ++ai) { TyList.push_back(ai->getType()); } TyList.push_back(CI->getType()); - StructType* ArgStructTy = StructType::create(M.getContext(), - ArrayRef<Type*>(TyList), - (RootF_X86->getName()+".arg.struct.ty").str(), true); + StructType *ArgStructTy = StructType::create( + M.getContext(), ArrayRef<Type *>(TyList), + (RootF_X86->getName() + ".arg.struct.ty").str(), true); // Cast the data pointer to the type of the arg struct - CastInst* OutputAddrCast = CastInst::CreatePointerCast(data, - ArgStructTy->getPointerTo(), - 
"argStructCast.addr", - RI); + CastInst *OutputAddrCast = CastInst::CreatePointerCast( + data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI); // Result struct is the last element of the packed struct passed to launch unsigned outStructIdx = ArgStructTy->getNumElements() - 1; - ConstantInt *IntZero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); - ConstantInt *IntIdx = ConstantInt::get(Type::getInt32Ty(M.getContext()), - outStructIdx); + ConstantInt *IntZero = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + ConstantInt *IntIdx = + ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx); - Value* GEPIIdxList[] = { IntZero, - IntIdx - }; + Value *GEPIIdxList[] = {IntZero, IntIdx}; // Get data pointer to the last element of struct - result field - GetElementPtrInst *OutGEPI = - GetElementPtrInst::Create(ArgStructTy, - OutputAddrCast, - ArrayRef<Value*>(GEPIIdxList, 2), - CI->getName()+".addr", - RI); + GetElementPtrInst *OutGEPI = GetElementPtrInst::Create( + ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2), + CI->getName() + ".addr", RI); // Store result there new StoreInst(CI, OutGEPI, RI); } else { @@ -874,10 +883,8 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) { // We were casting the data pointer to the result type of Root, and // returning result there. This would work at the LLVM level, but not // at the C level, thus the rewrite. 
- CastInst* OutputAddrCast = CastInst::CreatePointerCast(data, - CI->getType()->getPointerTo(), - CI->getName()+".addr", - RI); + CastInst *OutputAddrCast = CastInst::CreatePointerCast( + data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI); new StoreInst(CI, OutputAddrCast, RI); } @@ -887,104 +894,100 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) { DEBUG(errs() << *AppFunc << "\n"); // Substitute launch intrinsic main - Value* LaunchInstArgs[] = {AppFunc, - LI->getArgOperand(1) - }; - CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch, - ArrayRef<Value*>(LaunchInstArgs,2), - "graph"+Root->getFuncPointer()->getName(), LI); - //ReplaceInstWithInst(LI, LaunchInst); + Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)}; + CallInst *LaunchInst = CallInst::Create( + llvm_visc_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2), + "graph" + Root->getFuncPointer()->getName(), LI); + // ReplaceInstWithInst(LI, LaunchInst); DEBUG(errs() << *LaunchInst << "\n"); // Replace all wait instructions with x86 specific wait instructions - std::vector<IntrinsicInst*>* UseList = getUseList(LI); - for(unsigned i=0; i < UseList->size(); ++i) { - IntrinsicInst* II = UseList->at(i); - CallInst* CI; - switch(II->getIntrinsicID()) { + std::vector<IntrinsicInst *> *UseList = getUseList(LI); + for (unsigned i = 0; i < UseList->size(); ++i) { + IntrinsicInst *II = UseList->at(i); + CallInst *CI; + switch (II->getIntrinsicID()) { case Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_x86_wait, - ArrayRef<Value*>(LaunchInst), + CI = CallInst::Create(llvm_visc_x86_wait, ArrayRef<Value *>(LaunchInst), ""); break; case Intrinsic::visc_push: - CI = CallInst::Create(llvm_visc_bufferPush, - ArrayRef<Value*>(LaunchInst), + CI = CallInst::Create(llvm_visc_bufferPush, ArrayRef<Value *>(LaunchInst), ""); break; case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_bufferPop, - ArrayRef<Value*>(LaunchInst), + CI = CallInst::Create(llvm_visc_bufferPop, 
ArrayRef<Value *>(LaunchInst), ""); break; default: - llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); + llvm_unreachable( + "GraphID is used by an instruction other than wait, push, pop"); }; ReplaceInstWithInst(II, CI); DEBUG(errs() << *CI << "\n"); } - } -Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) { +Value *CGT_X86::getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86, + Instruction *InsertBefore) { // TODO: Assumption is that each input port of a node has just one // incoming edge. May change later on. // Find the incoming edge at the requested input port - DFEdge* E = Child->getInDFEdgeAt(i); + DFEdge *E = Child->getInDFEdgeAt(i); assert(E && "No incoming edge or binding for input element!"); // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); + DFNode *SrcDF = E->getSourceDF(); // If Source DFNode is a dummyNode, edge is from parent. Get the // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { + Value *inputVal; + if (SrcDF->isEntryNode()) { inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition()); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { + DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); + } else { // edge is from a sibling // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) - && "Source node call not found. Dependency violation!"); + assert(OutputMap.count(SrcDF) && + "Source node call not found. 
Dependency violation!"); // Find CallInst associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; + Value *CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction std::vector<unsigned> IndexList; IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "", InsertBefore); + DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n"); + ExtractValueInst *EI = + ExtractValueInst::Create(CI, IndexList, "", InsertBefore); inputVal = EI; } return inputVal; } -void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86, - ValueToValueMapTy &VMap,Instruction* IB) { - Function* CF = C->getFuncPointer(); +void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, + ValueToValueMapTy &VMap, Instruction *IB) { + Function *CF = C->getFuncPointer(); -// Function* CF_X86 = C->getGenFunc(); + // Function* CF_X86 = C->getGenFunc(); Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET); - assert(CF_X86 != NULL - && "Found leaf node for which code generation has not happened yet!\n"); + assert(CF_X86 != NULL && + "Found leaf node for which code generation has not happened yet!\n"); assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) && - "The generated function to be called from x86 backend is not an x86 function\n"); + "The generated function to be called from x86 backend is not an x86 " + "function\n"); DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); - std::vector<Value*> Args; + std::vector<Value *> Args; // Create argument list to pass to call instruction // First find the correct values using the edges // The remaing six values are inserted as constants for now. 
- for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { + for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) { Args.push_back(getInValueAt(C, i, F_X86, IB)); } - Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0); - for(unsigned j=0; j<6; j++) + Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0); + for (unsigned j = 0; j < 6; j++) Args.push_back(I64Zero); DEBUG(errs() << "Gen Function type: " << *CF_X86->getType() << "\n"); @@ -992,9 +995,8 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86, DEBUG(errs() << "Arguments: " << Args.size() << "\n"); // Call the F_X86 function associated with this node - CallInst* CI = CallInst::Create(CF_X86, Args, - CF_X86->getName()+"_output", - IB); + CallInst *CI = + CallInst::Create(CF_X86, Args, CF_X86->getName() + "_output", IB); DEBUG(errs() << *CI << "\n"); OutputMap[C] = CI; @@ -1002,55 +1004,56 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86, // Based on number of dimensions, insert loop instructions std::string varNames[3] = {"x", "y", "z"}; unsigned numArgs = CI->getNumArgOperands(); - for(unsigned j=0; j < C->getNumOfDim(); j++) { - Value* indexLimit = NULL; + for (unsigned j = 0; j < C->getNumOfDim(); j++) { + Value *indexLimit = NULL; // Limit can either be a constant or an arguement of the internal node. // In case of constant we can use that constant value directly in the // new F_X86 function. 
In case of an argument, we need to get the mapped // value using VMap - if(isa<Constant>(C->getDimLimits()[j])) { + if (isa<Constant>(C->getDimLimits()[j])) { indexLimit = C->getDimLimits()[j]; DEBUG(errs() << "In Constant case:\n" - << " indexLimit type = " << *indexLimit->getType() << "\n"); - } - else { + << " indexLimit type = " << *indexLimit->getType() << "\n"); + } else { indexLimit = VMap[C->getDimLimits()[j]]; DEBUG(errs() << "In VMap case:" - <<" indexLimit type = " << *indexLimit->getType() << "\n"); + << " indexLimit type = " << *indexLimit->getType() << "\n"); } assert(indexLimit && "Invalid dimension limit!"); // Insert loop - Value* indexVar = addLoop(CI, indexLimit, varNames[j]); + Value *indexVar = addLoop(CI, indexLimit, varNames[j]); DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n"); // Insert index variable and limit arguments - CI->setArgOperand(numArgs-6+j, indexVar); - CI->setArgOperand(numArgs-3+j, indexLimit); + CI->setArgOperand(numArgs - 6 + j, indexVar); + CI->setArgOperand(numArgs - 3 + j, indexLimit); } // Insert call to runtime to push the dim limits and instanceID on the depth // stack - Value* args[] = { - ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim - CI->getArgOperand(numArgs-3+0), // limitX - CI->getArgOperand(numArgs-6+0), // iX - CI->getArgOperand(numArgs-3+1), // limitY - CI->getArgOperand(numArgs-6+1), // iY - CI->getArgOperand(numArgs-3+2), // limitZ - CI->getArgOperand(numArgs-6+2) // iZ + Value *args[] = { + ConstantInt::get(Type::getInt32Ty(CI->getContext()), + C->getNumOfDim()), // numDim + CI->getArgOperand(numArgs - 3 + 0), // limitX + CI->getArgOperand(numArgs - 6 + 0), // iX + CI->getArgOperand(numArgs - 3 + 1), // limitY + CI->getArgOperand(numArgs - 6 + 1), // iY + CI->getArgOperand(numArgs - 3 + 2), // limitZ + CI->getArgOperand(numArgs - 6 + 2) // iZ }; - CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI); + 
CallInst *Push = CallInst::Create(llvm_visc_x86_dstack_push, + ArrayRef<Value *>(args, 7), "", CI); DEBUG(errs() << "Push on stack: " << *Push << "\n"); // Insert call to runtime to pop the dim limits and instanceID from the depth // stack BasicBlock::iterator i(CI); ++i; - Instruction* NextI = &*i; + Instruction *NextI = &*i; // Next Instruction should also belong to the same basic block as the basic // block will have a terminator instruction - assert(NextI->getParent() == CI->getParent() - && "Next Instruction should also belong to the same basic block!"); + assert(NextI->getParent() == CI->getParent() && + "Next Instruction should also belong to the same basic block!"); - CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); + CallInst *Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); DEBUG(errs() << "Pop from stack: " << *Pop << "\n"); DEBUG(errs() << *CI->getParent()->getParent()); } @@ -1071,34 +1074,33 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86, // Add runtime API calls to push output for each of the streaming outputs // Add loop around the basic block, which exits the loop if isLastInput is false -Function* CGT_X86::createFunctionFilter(DFNode* C) { - DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n"); +Function *CGT_X86::createFunctionFilter(DFNode *C) { + DEBUG(errs() << "*********Creating Function filter for " + << C->getFuncPointer()->getName() << "*****\n"); /* Create a function with same argument list as child.*/ DEBUG(errs() << "\tCreate a function with the same argument list as child\n"); // Get the generated function for child node - Function* CF = C->getFuncPointer(); + Function *CF = C->getFuncPointer(); // Create Filter Function of type i8*(i8*) which calls the root function - Type* i8Ty = Type::getInt8Ty(M.getContext()); - FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(), - ArrayRef<Type*>(i8Ty->getPointerTo()), - 
false); - Function* CF_Pipeline = Function::Create(CF_PipelineTy, - CF->getLinkage(), - CF->getName()+"_Pipeline", - &M); + Type *i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType *CF_PipelineTy = FunctionType::get( + i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false); + Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(), + CF->getName() + "_Pipeline", &M); DEBUG(errs() << "Generating Pipline Function\n"); // Give a name to the argument which is used pass data to this thread - Value* data = &*CF_Pipeline->arg_begin(); + Value *data = &*CF_Pipeline->arg_begin(); data->setName("data.addr"); // Create a new basic block DEBUG(errs() << "\tCreate new BB and add a return function\n"); // Add a basic block to this empty function - BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline); + BasicBlock *BB = + BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline); // Add a return instruction to the basic block - ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(), - UndefValue::get(CF_Pipeline->getReturnType()), BB); - + ReturnInst *RI = + ReturnInst::Create(CF_Pipeline->getContext(), + UndefValue::get(CF_Pipeline->getReturnType()), BB); /* Extract the elements from the aggregate argument to the function. * Replace the streaming inputs with i8* types signifying handle to @@ -1109,25 +1111,24 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n"); // These Args will be used when passing arguments to the generated function // inside loop, and reading outputs as well. 
- std::vector<Value*> Args; - std::vector<Type*> TyList; + std::vector<Value *> Args; + std::vector<Type *> TyList; std::vector<std::string> names; // Adding inputs - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); - i != e; ++i) { - if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e; + ++i) { + if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { TyList.push_back(i8Ty->getPointerTo()); - names.push_back((Twine(i->getName())+"_buffer").str()); - } - else { + names.push_back((Twine(i->getName()) + "_buffer").str()); + } else { TyList.push_back(i->getType()); names.push_back(i->getName()); } } // Adding outputs. FIXME: Since we assume all outputs to be streaming edges, // because we get there buffer handles - StructType* RetTy = cast<StructType>(CF->getReturnType()); - for (unsigned i=0; i<RetTy->getNumElements(); i++) { + StructType *RetTy = cast<StructType>(CF->getReturnType()); + for (unsigned i = 0; i < RetTy->getNumElements(); i++) { TyList.push_back(i8Ty->getPointerTo()); names.push_back("out"); } @@ -1138,64 +1139,52 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { // Extract the inputs, outputs and Args = extractElements(data, TyList, names, RI); - for(unsigned i=0; i<Args.size(); i++) { + for (unsigned i = 0; i < Args.size(); i++) { DEBUG(errs() << *Args[i] << "\n"); } // Split the Args vector into, input output and isLastInput unsigned numInputs = CF->getFunctionType()->getNumParams(); unsigned numOutputs = RetTy->getNumElements(); - std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs); - std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs); - Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]); + std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs); + std::vector<Value *> OutputArgs(Args.begin() + numInputs, + Args.begin() + numInputs + numOutputs); + 
Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]); /* Add runtime API calls to get input for each of the streaming input edges */ - DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n"); + DEBUG(errs() << "\tAdd runtime API calls to get input for each of the " + "streaming input edges\n"); // First read the termination condition variable islastInput - CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop, - ArrayRef<Value*>(isLastInput), - "", - RI); - - CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop, - Type::getInt64Ty(CF_Pipeline->getContext()), - false, - "isLastInput", - RI); + CallInst *isLastInputPop = CallInst::Create( + llvm_visc_bufferPop, ArrayRef<Value *>(isLastInput), "", RI); + + CastInst *BI = BitCastInst::CreateIntegerCast( + isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false, + "isLastInput", RI); isLastInput = BI; // Create a loop termination condition - CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, - isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero", - RI); + CmpInst *Cond = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_NE, isLastInput, + Constant::getNullValue(Type::getInt64Ty(CF->getContext())), + "isLastInputNotZero", RI); // Get input from buffers of all the incoming streaming edges - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); - i != e; ++i) { - if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { - CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop, - ArrayRef<Value*>(InputArgs[i->getArgNo()]), - "", - RI); - CastInst* BI; - if(i->getType()->isPointerTy()) { - BI = CastInst::Create(CastInst::IntToPtr, - bufferIn, - i->getType(), - i->getName()+".addr", - RI); - } - else if(i->getType()->isFloatTy()) { - BI = CastInst::CreateFPCast(bufferIn, - i->getType(), - i->getName()+".addr", - RI); - } - else { - BI = 
CastInst::CreateIntegerCast(bufferIn, - i->getType(), - false, - i->getName()+".addr", - RI); + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e; + ++i) { + if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + CallInst *bufferIn = + CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI); + CastInst *BI; + if (i->getType()->isPointerTy()) { + BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(), + i->getName() + ".addr", RI); + } else if (i->getType()->isFloatTy()) { + BI = CastInst::CreateFPCast(bufferIn, i->getType(), + i->getName() + ".addr", RI); + } else { + BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false, + i->getName() + ".addr", RI); } // Replace the argument in Args vector. We would be using the vector as // parameters passed to the call @@ -1204,46 +1193,40 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { } /* Add a call to the generated function of the child node */ DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); -// DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); -// CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, -// C->getGenFunc()->getName()+".output", RI); + // DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); + // CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, + // C->getGenFunc()->getName()+".output", RI); Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); - DEBUG(errs() << "Type: " - << *CGenF->getType() - << "\n"); - CallInst* CI = CallInst::Create(CGenF, - InputArgs, - CGenF->getName()+".output", - RI); + DEBUG(errs() << "Type: " << *CGenF->getType() << "\n"); + CallInst *CI = + CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI); /* Add runtime API calls to push output for each of the streaming outputs */ // FIXME: Assumption // All edges between siblings are streaming edges - DEBUG(errs() << "\tAdd runtime API calls to push 
output for each of the streaming outputs\n"); - for (unsigned i=0; i< numOutputs; i++) { + DEBUG(errs() << "\tAdd runtime API calls to push output for each of the " + "streaming outputs\n"); + for (unsigned i = 0; i < numOutputs; i++) { // Extract output - ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), - "",RI); + ExtractValueInst *EI = + ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI); // Convert to i64 - CastInst* BI; - if(EI->getType()->isPointerTy()) - BI = CastInst::Create(CastInst::PtrToInt,EI, - Type::getInt64Ty(CF_Pipeline->getContext()), - "", - RI); + CastInst *BI; + if (EI->getType()->isPointerTy()) + BI = + CastInst::Create(CastInst::PtrToInt, EI, + Type::getInt64Ty(CF_Pipeline->getContext()), "", RI); else - BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()), - false, "", RI); + BI = CastInst::CreateIntegerCast( + EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI); // Push to Output buffer - Value* bufferOutArgs[] = {OutputArgs[i], BI}; - CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush, - ArrayRef<Value*>(bufferOutArgs, 2), - "", - RI); + Value *bufferOutArgs[] = {OutputArgs[i], BI}; + CallInst *bufferOut = CallInst::Create( + llvm_visc_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI); } - // Add loop around the basic block, which exits the loop if isLastInput is false - // Pointers to keep the created loop structure + // Add loop around the basic block, which exits the loop if isLastInput is + // false Pointers to keep the created loop structure BasicBlock *EntryBB, *CondBB, *BodyBB; Instruction *CondStartI = cast<Instruction>(isLastInputPop); Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode(); @@ -1258,16 +1241,16 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { // If the node function calls the visc runtime call to get policy, we update // it with the counter information. 
This means we need to pass an additional // argument to the generated function, that is the iteration number, and then - // use it as an argument to the policy_getVersion call + // use it as an argument to the policy_getVersion call if (GetPolicyCI) { CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB); assert(CntI && "Counter instruction not found\n"); // Create new function type (with additional argument for iteration number) Type *NewRetTy = CGenF->getFunctionType()->getReturnType(); - std::vector<Type*> NewArgTypes; + std::vector<Type *> NewArgTypes; for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end(); - ai != ae ; ++ai) { + ai != ae; ++ai) { NewArgTypes.push_back(ai->getType()); } NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); @@ -1283,9 +1266,8 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { // Add counter to the actual parameter list, to create the new call InputArgs.push_back(CntI); - CallInst* newCI = CallInst::Create(NewCGenF, - InputArgs, - NewCGenF->getName()+".output"); + CallInst *newCI = + CallInst::Create(NewCGenF, InputArgs, NewCGenF->getName() + ".output"); ReplaceInstWithInst(CI, newCI); // Set second operand of the policy_getVersion call to the last function @@ -1300,19 +1282,19 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { return CF_Pipeline; } -void CGT_X86::codeGen(DFInternalNode* N) { +void CGT_X86::codeGen(DFInternalNode *N) { // Check if N is root node and its graph is streaming. We do not do codeGen // for Root in such a case - if(N->isRoot() && N->isChildGraphStreaming()) + if (N->isRoot() && N->isChildGraphStreaming()) return; // Check if clone already exists. If it does, it means we have visited this // function before and nothing else needs to be done for this leaf node. 
-// if(N->getGenFunc() != NULL) -// return; + // if(N->getGenFunc() != NULL) + // return; if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { - DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << - " : skipping it\n"); + DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() + << " : skipping it\n"); return; } @@ -1325,9 +1307,10 @@ void CGT_X86::codeGen(DFInternalNode* N) { // Only process if all children have a CPU x86 function // Otherwise skip to end bool codeGen = true; - for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); ci != ce; ++ci) { - DFNode* C = *ci; + for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); + ci != ce; ++ci) { + DFNode *C = *ci; // Skip dummy node call if (C->isDummyNode()) continue; @@ -1342,17 +1325,18 @@ void CGT_X86::codeGen(DFInternalNode* N) { } if (codeGen) { - Function* F = N->getFuncPointer(); + Function *F = N->getFuncPointer(); // Create of clone of F with no instructions. Only the type is the same as F // without the extra arguments. - Function* F_X86; - + Function *F_X86; + // Clone the function, if we are seeing this function for the first time. We // only need a clone in terms of type. ValueToValueMapTy VMap; - + // Create new function with the same type - F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), + F->getName(), &M); // Loop over the arguments, copying the names of arguments over. 
Function::arg_iterator dest_iterator = F_X86->arg_begin(); @@ -1365,19 +1349,19 @@ void CGT_X86::codeGen(DFInternalNode* N) { // Add a basic block to this empty function BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86); - ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), - UndefValue::get(F_X86->getReturnType()), BB); + ReturnInst *RI = ReturnInst::Create( + F_X86->getContext(), UndefValue::get(F_X86->getReturnType()), BB); - // Add Index and Dim arguments except for the root node and the child graph of - // parent node is not streaming - if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + // Add Index and Dim arguments except for the root node and the child graph + // of parent node is not streaming + if (!N->isRoot() && !N->getParent()->isChildGraphStreaming()) F_X86 = addIdxDimArgs(F_X86); BB = &*F_X86->begin(); RI = cast<ReturnInst>(BB->getTerminator()); - - //Add generated function info to DFNode -// N->setGenFunc(F_X86, visc::CPU_TARGET); + + // Add generated function info to DFNode + // N->setGenFunc(F_X86, visc::CPU_TARGET); N->addGenFunc(F_X86, visc::CPU_TARGET, true); // Loop over the arguments, to create the VMap. 
@@ -1390,59 +1374,59 @@ void CGT_X86::codeGen(DFInternalNode* N) { } // Iterate over children in topological order - for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); ci != ce; ++ci) { - DFNode* C = *ci; + for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); + ci != ce; ++ci) { + DFNode *C = *ci; // Skip dummy node call if (C->isDummyNode()) continue; - + // Create calls to CPU function of child node invokeChild_X86(C, F_X86, VMap, RI); - } - + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings // Get Exit node - DFNode* C = N->getChildGraph()->getExit(); + DFNode *C = N->getChildGraph()->getExit(); // Get OutputType of this node - StructType* OutTy = N->getOutputType(); + StructType *OutTy = N->getOutputType(); Value *retVal = UndefValue::get(F_X86->getReturnType()); // Find all the input edges to exit node - for (unsigned i=0; i < OutTy->getNumElements(); i++) { + for (unsigned i = 0; i < OutTy->getNumElements(); i++) { DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port - DFEdge* E = C->getInDFEdgeAt(i); - + DFEdge *E = C->getInDFEdgeAt(i); + assert(E && "No Binding for output element!"); // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); - - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); - + DFNode *SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() + << "\n"); + // If Source DFNode is a dummyNode, edge is from parent. 
Get the // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { + Value *inputVal; + if (SrcDF->isEntryNode()) { inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { + DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); + } else { // edge is from a internal node // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) - && "Source node call not found. Dependency violation!"); - + assert(OutputMap.count(SrcDF) && + "Source node call not found. Dependency violation!"); + // Find Output Value associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; - + Value *CI = OutputMap[SrcDF]; + // Extract element at source position from this call instruction std::vector<unsigned> IndexList; IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "",RI); + DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI + << "\n"); + ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI); inputVal = EI; } std::vector<unsigned> IdxList; @@ -1451,9 +1435,8 @@ void CGT_X86::codeGen(DFInternalNode* N) { } DEBUG(errs() << "Extracted all\n"); retVal->setName("output"); - ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal); ReplaceInstWithInst(RI, newRI); - } //-------------------------------------------------------------------------// @@ -1470,11 +1453,11 @@ void CGT_X86::codeGen(DFInternalNode* N) { bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); - DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() - << " with tag " << N->getTag() << "\n"); - DEBUG(errs() << "CPU Fun: " << (CF ? 
CF->getName() : "null" ) << "\n"); + DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " + << N->getTag() << "\n"); + DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n"); DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"); - DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"); + DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n"); DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"); if (N->getTag() == visc::None) { @@ -1482,7 +1465,7 @@ void CGT_X86::codeGen(DFInternalNode* N) { // node is a node that // - from the accelerator backends has been mapped to an intermediate // node, and thus they have not produced a genFunc - // - a child node had no CPU hint, thus no code gen for CPU could + // - a child node had no CPU hint, thus no code gen for CPU could // take place DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node " << N->getFuncPointer()->getName() << "\n"); @@ -1493,34 +1476,34 @@ void CGT_X86::codeGen(DFInternalNode* N) { // Sanity check - to be removed TODO switch (N->getTag()) { - case visc::CPU_TARGET: - assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); - assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); - break; - case visc::GPU_TARGET: - assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); - break; - default: - assert(false && "Unreachable: we checked that tag was single target!\n"); - break; + case visc::CPU_TARGET: + assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + 
assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + break; + case visc::GPU_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); + break; + default: + assert(false && "Unreachable: we checked that tag was single target!\n"); + break; } - // If device abstraction is enabled, then we may need to edit the node + // If device abstraction is enabled, then we may need to edit the node // function. In case this is a GPU or SPIR gen func, we issue a call to // the runtime that waits for the device to be available if (DeviceAbstraction) { Function *NodeGenFunc = NULL; switch (N->getTag()) { - case visc::GPU_TARGET: - NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); - break; - default: - break; + case visc::GPU_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); + break; + default: + break; } if (NodeGenFunc) { @@ -1528,12 +1511,14 @@ void CGT_X86::codeGen(DFInternalNode* N) { // its first statement BasicBlock *BB = &*NodeGenFunc->begin(); std::vector<Value *> Args; // TODO: add the device type as argument? 
- FunctionCallee RTF = - M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()); - CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); + FunctionCallee RTF = M.getOrInsertFunction( + "llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule + ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus") + ->getFunctionType()); + CallInst *RTFInst = + CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); } - } Function *Ftmp = N->getGenFuncForTarget(N->getTag()); @@ -1550,11 +1535,11 @@ void CGT_X86::codeGen(DFInternalNode* N) { GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); DEBUG(errs() << "After editing\n"); - DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() - << " with tag " << N->getTag() << "\n"); - DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"); + DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " + << N->getTag() << "\n"); + DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n"); DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"); - DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"); + DEBUG(errs() << "GPU Fun: " << (GF ? 
GF->getName() : "null") << "\n"); DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"); <<<<<<< HEAD ======= @@ -2015,4 +2000,3 @@ static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86", "Dataflow Graph to LLVM for X86 backend", false /* does not modify the CFG */, true /* transformation, not just analysis */); - diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp index 5e0d4df006cb89414d15991e99f29332f6329b99..799de784c8c4a675927fbb5a0e63ea30b668d738 100644 --- a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp +++ b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp @@ -10,112 +10,118 @@ #define DEBUG_TYPE "genvisc" #include "GenVISC/GenVISC.h" +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/Instructions.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/IR/DerivedTypes.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCUtils.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/IR/Instructions.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "SupportVISC/VISCUtils.h" - +#include "llvm/Transforms/Utils/ValueMapper.h" -#define TIMER(X) do { if (VISCTimer) { X; } } while (0) +#define TIMER(X) \ + do { \ + if (VISCTimer) { \ + X; \ + } \ + } while (0) using namespace llvm; using namespace viscUtils; - // VISC Command line option to use timer or not -static cl::opt<bool> -VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer")); +static cl::opt<bool> VISCTimer("visc-timers-gen", + cl::desc("Enable GenVISC timer")); namespace genvisc { // Helper 
Functions -static inline ConstantInt* getTimerID(Module&, enum visc_TimerID); -static Function* transformReturnTypeToStruct(Function* F); -static Type* getReturnTypeFromReturnInst(Function* F); +static inline ConstantInt *getTimerID(Module &, enum visc_TimerID); +static Function *transformReturnTypeToStruct(Function *F); +static Type *getReturnTypeFromReturnInst(Function *F); // Check if the dummy function call is a __visc__node call -#define IS_VISC_CALL(callName) \ - static bool isVISCCall_##callName(Instruction* I) { \ - if(!isa<CallInst>(I)) \ - return false; \ - CallInst* CI = cast<CallInst>(I); \ - return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \ +#define IS_VISC_CALL(callName) \ + static bool isVISCCall_##callName(Instruction *I) { \ + if (!isa<CallInst>(I)) \ + return false; \ + CallInst *CI = cast<CallInst>(I); \ + return (CI->getCalledValue()->stripPointerCasts()->getName()) \ + .equals("__visc__" #callName); \ } -static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) { +static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID, + std::vector<Instruction *> *Erase) { // Check if the instruction is Call Instruction assert(isa<CallInst>(I) && "Expecting CallInst"); - CallInst* CI = cast<CallInst>(I); + CallInst *CI = cast<CallInst>(I); DEBUG(errs() << "Found call: " << *CI << "\n"); // Find the correct intrinsic call - Module* M = CI->getParent()->getParent()->getParent(); - Function* F; - std::vector<Type*> ArgTypes; - std::vector<Value*> args; - if(Intrinsic::isOverloaded(IntrinsicID)) { + Module *M = CI->getParent()->getParent()->getParent(); + Function *F; + std::vector<Type *> ArgTypes; + std::vector<Value *> args; + if (Intrinsic::isOverloaded(IntrinsicID)) { // This is an overloaded intrinsic. The types must exactly match. 
Get the // argument types - for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { ArgTypes.push_back(CI->getArgOperand(i)->getType()); args.push_back(CI->getArgOperand(i)); } F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes); DEBUG(errs() << *F << "\n"); - } - else { // Non-overloaded intrinsic + } else { // Non-overloaded intrinsic F = Intrinsic::getDeclaration(M, IntrinsicID); - FunctionType* FTy = F->getFunctionType(); + FunctionType *FTy = F->getFunctionType(); DEBUG(errs() << *F << "\n"); // Create argument list - assert(CI->getNumArgOperands() == FTy->getNumParams() - && "Number of arguments of call do not match with Intrinsic"); - for(unsigned i=0; i < CI->getNumArgOperands(); i++) { - Value* V = CI->getArgOperand(i); + assert(CI->getNumArgOperands() == FTy->getNumParams() && + "Number of arguments of call do not match with Intrinsic"); + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { + Value *V = CI->getArgOperand(i); // Either the type should match or both should be of pointer type assert((V->getType() == FTy->getParamType(i) || - (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())) - && "Dummy function call argument does not match with Intrinsic argument!"); + (V->getType()->isPointerTy() && + FTy->getParamType(i)->isPointerTy())) && + "Dummy function call argument does not match with Intrinsic " + "argument!"); // If the types do not match, then both must be pointer type and pointer // cast needs to be performed - if(V->getType() != FTy->getParamType(i)) { + if (V->getType() != FTy->getParamType(i)) { V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); } args.push_back(V); } } // Insert call instruction - CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? "" : CI->getName(), CI); + CallInst *Inst = CallInst::Create( + F, args, F->getReturnType()->isVoidTy() ? 
"" : CI->getName(), CI); DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); CI->replaceAllUsesWith(Inst); // If the previous instruction needs to be erased, insert it in the vector // Erased - if(Erase != NULL) + if (Erase != NULL) Erase->push_back(CI); } IS_VISC_CALL(launch) /* Exists but not required */ -IS_VISC_CALL(edge) /* Exists but not required */ +IS_VISC_CALL(edge) /* Exists but not required */ IS_VISC_CALL(createNodeND) -//IS_VISC_CALL(createNode) -//IS_VISC_CALL(createNode1D) -//IS_VISC_CALL(createNode2D) -//IS_VISC_CALL(createNode3D) +// IS_VISC_CALL(createNode) +// IS_VISC_CALL(createNode1D) +// IS_VISC_CALL(createNode2D) +// IS_VISC_CALL(createNode3D) IS_VISC_CALL(bindIn) IS_VISC_CALL(bindOut) IS_VISC_CALL(push) @@ -124,7 +130,7 @@ IS_VISC_CALL(getNode) IS_VISC_CALL(getParentNode) IS_VISC_CALL(barrier) IS_VISC_CALL(malloc) -IS_VISC_CALL(return) +IS_VISC_CALL(return ) IS_VISC_CALL(getNodeInstanceID_x) IS_VISC_CALL(getNodeInstanceID_y) IS_VISC_CALL(getNodeInstanceID_z) @@ -152,7 +158,6 @@ IS_VISC_CALL(sqrt) IS_VISC_CALL(sin) IS_VISC_CALL(cos) - IS_VISC_CALL(init) IS_VISC_CALL(cleanup) IS_VISC_CALL(wait) @@ -163,94 +168,91 @@ IS_VISC_CALL(attributes) IS_VISC_CALL(hint) // Return the constant integer represented by value V -static unsigned getNumericValue(Value* V) { - assert(isa<ConstantInt>(V) - && "Value indicating the number of arguments should be a constant integer"); +static unsigned getNumericValue(Value *V) { + assert( + isa<ConstantInt>(V) && + "Value indicating the number of arguments should be a constant integer"); return cast<ConstantInt>(V)->getZExtValue(); } // Take the __visc__return instruction and generate code for combining the // values being returned into a struct and returning it. 
// The first operand is the number of returned values -static Value* genCodeForReturn(CallInst* CI) { - LLVMContext& Ctx = CI->getContext(); - assert(isVISCCall_return(CI) - && "__visc__return instruction expected!"); +static Value *genCodeForReturn(CallInst *CI) { + LLVMContext &Ctx = CI->getContext(); + assert(isVISCCall_return(CI) && "__visc__return instruction expected!"); // Parse the dummy function call here - assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n"); + assert(CI->getNumArgOperands() > 0 && + "Too few arguments for __visc_return call!\n"); unsigned numRetVals = getNumericValue(CI->getArgOperand(0)); - assert(CI->getNumArgOperands()-1 == numRetVals && + assert(CI->getNumArgOperands() - 1 == numRetVals && "Too few arguments for __visc_return call!\n"); DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n"); - std::vector<Type*> ArgTypes; - for(unsigned i=1; i < CI->getNumArgOperands(); i++) { + std::vector<Type *> ArgTypes; + for (unsigned i = 1; i < CI->getNumArgOperands(); i++) { ArgTypes.push_back(CI->getArgOperand(i)->getType()); } Twine outTyName = "struct.out." 
+ CI->getParent()->getParent()->getName(); - StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); + StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); - InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy), - CI->getArgOperand(1), - 0, - "returnStruct", - CI); + InsertValueInst *IV = InsertValueInst::Create( + UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI); DEBUG(errs() << "Code generation for return:\n"); DEBUG(errs() << *IV << "\n"); - for(unsigned i=2; i < CI->getNumArgOperands(); i++) { - IV = InsertValueInst::Create(IV, - CI->getArgOperand(i), - i-1, - IV->getName(), + for (unsigned i = 2; i < CI->getNumArgOperands(); i++) { + IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(), CI); DEBUG(errs() << *IV << "\n"); } - + return IV; } // Analyse the attribute call for this function. Add the in and out // attributes to pointer parameters. -static void handleVISCAttributes(Function* F, CallInst* CI) { - DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n"); +static void handleVISCAttributes(Function *F, CallInst *CI) { + DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" + << *F << "\n"); // Parse the dummy function call here unsigned offset = 0; // Find number of In pointers - assert(CI->getNumArgOperands() > offset - && "Too few arguments for __visc__attributes call!"); + assert(CI->getNumArgOperands() > offset && + "Too few arguments for __visc__attributes call!"); unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset)); DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n"); - for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) { - Value* V = CI->getArgOperand(i); - if(Argument* arg = dyn_cast<Argument>(V)) { - F->addAttribute(1+arg->getArgNo(), Attribute::In); - } - else { + for (unsigned i = offset + 1; i < offset + 1 + numInPtrs; i++) { + Value *V = CI->getArgOperand(i); + if (Argument 
*arg = dyn_cast<Argument>(V)) { + F->addAttribute(1 + arg->getArgNo(), Attribute::In); + } else { DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n"); - llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); + llvm_unreachable( + "Only pointer arguments can be passed to __visc__attributes call"); } } // Find number of Out Pointers offset += 1 + numInPtrs; - assert(CI->getNumArgOperands() > offset - && "Too few arguments for __visc__attributes call!"); + assert(CI->getNumArgOperands() > offset && + "Too few arguments for __visc__attributes call!"); unsigned numOutPtrs = getNumericValue(CI->getOperand(offset)); DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n"); - for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) { - Value* V = CI->getArgOperand(i); - if(Argument* arg = dyn_cast<Argument>(V)) { - F->addAttribute(1+arg->getArgNo(), Attribute::Out); - } - else { + for (unsigned i = offset + 1; i < offset + 1 + numOutPtrs; i++) { + Value *V = CI->getArgOperand(i); + if (Argument *arg = dyn_cast<Argument>(V)) { + F->addAttribute(1 + arg->getArgNo(), Attribute::Out); + } else { DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n"); - llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); + llvm_unreachable( + "Only pointer arguments can be passed to __visc__attributes call"); } } - DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n"); + DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" + << *F << "\n"); } // Public Functions of GenVISC pass @@ -261,38 +263,42 @@ bool GenVISC::runOnModule(Module &M) { // Load Runtime API Module SMDiagnostic Err; - char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); - assert(LLVM_SRC_ROOT != NULL && - "Define LLVM_SRC_ROOT environment variable!"); + char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine 
llvmSrcRoot = LLVM_SRC_ROOT; Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll"; DEBUG(errs() << llvmSrcRoot << "\n"); - std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + std::unique_ptr<Module> runtimeModule = + parseIRFile(runtimeAPI.str(), Err, M.getContext()); - if(runtimeModule == NULL) + if (runtimeModule == NULL) DEBUG(errs() << Err.getMessage()); else DEBUG(errs() << "Successfully loaded visc-rt API module\n"); - llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet", - runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType()); - //DEBUG(errs() << *llvm_visc_initializeTimerSet); + llvm_visc_initializeTimerSet = M.getOrInsertFunction( + "llvm_visc_initializeTimerSet", + runtimeModule->getFunction("llvm_visc_initializeTimerSet") + ->getFunctionType()); + // DEBUG(errs() << *llvm_visc_initializeTimerSet); - llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer", - runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType()); - // DEBUG(errs() << *llvm_visc_switchToTimer); + llvm_visc_switchToTimer = M.getOrInsertFunction( + "llvm_visc_switchToTimer", + runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType()); + // DEBUG(errs() << *llvm_visc_switchToTimer); - llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet", - runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType()); - //DEBUG(errs() << *llvm_visc_printTimerSet); + llvm_visc_printTimerSet = M.getOrInsertFunction( + "llvm_visc_printTimerSet", + runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType()); + // DEBUG(errs() << *llvm_visc_printTimerSet); // Insert init context in main DEBUG(errs() << "Locate __visc__init()\n"); - Function* VI = M.getFunction("__visc__init"); + Function *VI = M.getFunction("__visc__init"); assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); - 
Instruction* I = cast<Instruction>(*VI->user_begin()); + Instruction *I = cast<Instruction>(*VI->user_begin()); DEBUG(errs() << "Initialize Timer Set\n"); initializeTimerSet(I); @@ -300,18 +306,17 @@ bool GenVISC::runOnModule(Module &M) { // Insert print instruction at visc exit DEBUG(errs() << "Locate __visc__cleanup()\n"); - Function* VC = M.getFunction("__visc__cleanup"); + Function *VC = M.getFunction("__visc__cleanup"); assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); I = cast<Instruction>(*VC->user_begin()); printTimerSet(I); - DEBUG(errs() << "-------- Searching for launch sites ----------\n"); - std::vector<Instruction*> toBeErased; - std::vector<Function*> functions; + std::vector<Instruction *> toBeErased; + std::vector<Function *> functions; - for (auto &F : M) + for (auto &F : M) functions.push_back(&F); // Iterate over all functions in the module @@ -319,7 +324,7 @@ bool GenVISC::runOnModule(Module &M) { DEBUG(errs() << "Function: " << f->getName() << "\n"); // List with the required additions in the function's return type - std::vector<Type*> FRetTypes; + std::vector<Type *> FRetTypes; enum mutateTypeCause { mtc_None, @@ -330,98 +335,106 @@ bool GenVISC::runOnModule(Module &M) { bind = mutateTypeCause::mtc_None; // Iterate over all the instructions in this function - for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) { - Instruction* I = &*i; // Grab pointer to Instruction + for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) { + Instruction *I = &*i; // Grab pointer to Instruction // If not a call instruction, move to next instruction - if(!isa<CallInst>(I)) + if (!isa<CallInst>(I)) continue; - CallInst* CI = cast<CallInst>(I); - LLVMContext& Ctx = CI->getContext(); + CallInst *CI = cast<CallInst>(I); + LLVMContext &Ctx = CI->getContext(); - if(isVISCCall_init(I)) { + if (isVISCCall_init(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased); } - if(isVISCCall_cleanup(I)) { + 
if (isVISCCall_cleanup(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased); } - if(isVISCCall_wait(I)) { + if (isVISCCall_wait(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased); } - if(isVISCCall_trackMemory(I)) { + if (isVISCCall_trackMemory(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased); } - if(isVISCCall_untrackMemory(I)) { + if (isVISCCall_untrackMemory(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased); } - if(isVISCCall_requestMemory(I)) { + if (isVISCCall_requestMemory(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased); } - if(isVISCCall_hint(I)) { - assert(isa<ConstantInt>(CI->getArgOperand(0)) - && "Argument to hint must be constant integer!"); - ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0)); + if (isVISCCall_hint(I)) { + assert(isa<ConstantInt>(CI->getArgOperand(0)) && + "Argument to hint must be constant integer!"); + ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0)); - visc::Target t = (visc::Target) hint->getZExtValue(); + visc::Target t = (visc::Target)hint->getZExtValue(); addHint(CI->getParent()->getParent(), t); DEBUG(errs() << "Found visc hint call: " << *CI << "\n"); toBeErased.push_back(CI); } - if(isVISCCall_launch(I)) { - Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch); + if (isVISCCall_launch(I)) { + Function *LaunchF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_launch); DEBUG(errs() << *LaunchF << "\n"); // Get i8* cast to function pointer - Function* graphFunc = cast<Function>(CI->getArgOperand(1)); + Function *graphFunc = cast<Function>(CI->getArgOperand(1)); graphFunc = transformReturnTypeToStruct(graphFunc); - Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - assert(F && "Function invoked by VISC launch has to be define and constant."); - - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0)); - assert(Op && "VISC launch's 
streaming argument is a constant value."); - Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - + Constant *F = + ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + assert( + F && + "Function invoked by VISC launch has to be define and constant."); + + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0)); + assert(Op && "VISC launch's streaming argument is a constant value."); + Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType()); assert(ArgTy && "VISC launch argument should be pointer type."); Value *Arg = CI->getArgOperand(2); - if(!ArgTy->getElementType()->isIntegerTy(8)) - Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI); - Value* LaunchArgs[] = {F, Arg, isStreaming}; - CallInst* LaunchInst = CallInst::Create(LaunchF, - ArrayRef<Value*>(LaunchArgs, 3), - "graphID", CI); + if (!ArgTy->getElementType()->isIntegerTy(8)) + Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), + Type::getInt8PtrTy(Ctx), "", CI); + Value *LaunchArgs[] = {F, Arg, isStreaming}; + CallInst *LaunchInst = CallInst::Create( + LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI); DEBUG(errs() << "Found visc launch call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n"); CI->replaceAllUsesWith(LaunchInst); toBeErased.push_back(CI); } - if(isVISCCall_push(I)) { + if (isVISCCall_push(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased); } - if(isVISCCall_pop(I)) { + if (isVISCCall_pop(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased); } - if(isVISCCall_createNodeND(I)) { + if (isVISCCall_createNodeND(I)) { assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc__createNodeND call"); unsigned numDims = getNumericValue(CI->getArgOperand(0)); // We need as meny dimension argments are there are 
dimensions - assert(CI->getNumArgOperands()-2 == numDims && - "Too few arguments for __visc_createNodeND call!\n"); + assert(CI->getNumArgOperands() - 2 == numDims && + "Too few arguments for __visc_createNodeND call!\n"); - Function* CreateNodeF; + Function *CreateNodeF; switch (numDims) { case 0: - CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode); + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode); break; case 1: - CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D); + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D); break; case 2: - CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D); + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D); break; case 3: - CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D); + CreateNodeF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D); break; default: llvm_unreachable("Unsupported number of dimensions\n"); @@ -429,63 +442,57 @@ bool GenVISC::runOnModule(Module &M) { } DEBUG(errs() << *CreateNodeF << "\n"); DEBUG(errs() << *I << "\n"); - DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n"); + DEBUG(errs() << "in " << I->getParent()->getParent()->getName() + << "\n"); // Get i8* cast to function pointer - Function* graphFunc = cast<Function>(CI->getArgOperand(1)); + Function *graphFunc = cast<Function>(CI->getArgOperand(1)); graphFunc = transformReturnTypeToStruct(graphFunc); - Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + Constant *F = + ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - CallInst* CreateNodeInst; + CallInst *CreateNodeInst; switch (numDims) { case 0: - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(F), - graphFunc->getName()+".node", CI); + CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F), + graphFunc->getName() + ".node", 
CI); break; - case 1: - { + case 1: { assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 2, expected to be i64\n"); - Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)}; - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(CreateNodeArgs, 2), - graphFunc->getName()+".node", CI); - } - break; - case 2: - { + Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)}; + CreateNodeInst = CallInst::Create( + CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2), + graphFunc->getName() + ".node", CI); + } break; + case 2: { assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 2, expected to be i64\n"); assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 3, expected to be i64\n"); - Value* CreateNodeArgs[] = {F, - CI->getArgOperand(2), + Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), CI->getArgOperand(3)}; - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(CreateNodeArgs, 3), - graphFunc->getName()+".node", CI); - } - break; - case 3: - { + CreateNodeInst = CallInst::Create( + CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3), + graphFunc->getName() + ".node", CI); + } break; + case 3: { assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 2, expected to be i64\n"); assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 3, expected to be i64\n"); assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 4, expected to be i64\n"); - Value* CreateNodeArgs[] = {F, - CI->getArgOperand(2), + Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4)}; - CreateNodeInst = CallInst::Create(CreateNodeF, - ArrayRef<Value*>(CreateNodeArgs, 4), - graphFunc->getName()+".node", CI); - } - break; + CreateNodeInst = CallInst::Create( 
+ CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4), + graphFunc->getName() + ".node", CI); + } break; default: - llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n"); + llvm_unreachable( + "Impossible path: number of dimensions is 0, 1, 2, 3\n"); break; } @@ -495,99 +502,104 @@ bool GenVISC::runOnModule(Module &M) { toBeErased.push_back(CI); } - if(isVISCCall_edge(I)) { - Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); + if (isVISCCall_edge(I)) { + Function *EdgeF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); DEBUG(errs() << *EdgeF << "\n"); - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5)); - ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); - assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers."); - Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4), - isStreaming - }; - CallInst* EdgeInst = CallInst::Create(EdgeF, - ArrayRef<Value*>(EdgeArgs, 6), - "output", CI); + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(5)); + ConstantInt *EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); + assert(Op && EdgeTypeOp && + "Arguments of CreateEdge are not constant integers."); + Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *isAllToAll = EdgeTypeOp->isZero() ? 
ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + isAllToAll, CI->getArgOperand(3), + CI->getArgOperand(4), isStreaming}; + CallInst *EdgeInst = CallInst::Create( + EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI); DEBUG(errs() << "Found visc edge call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n"); CI->replaceAllUsesWith(EdgeInst); toBeErased.push_back(CI); } - if(isVISCCall_bindIn(I)) { - Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); + if (isVISCCall_bindIn(I)) { + Function *BindInF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); DEBUG(errs() << *BindInF << "\n"); // Check if this is a streaming bind or not - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind in intrinsic should be a constant integer."); - Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming - }; - CallInst* BindInInst = CallInst::Create(BindInF, - ArrayRef<Value*>(BindInArgs, 4), - "", CI); + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); + assert(Op && "Streaming argument for bind in intrinsic should be a " + "constant integer."); + Value *isStreaming = Op->isZero() ? 
ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming}; + CallInst *BindInInst = + CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI); DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n"); CI->replaceAllUsesWith(BindInInst); toBeErased.push_back(CI); } - if(isVISCCall_bindOut(I)) { - Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); + if (isVISCCall_bindOut(I)) { + Function *BindOutF = + Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); DEBUG(errs() << *BindOutF << "\n"); // Check if this is a streaming bind or not - ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind out intrinsic should be a constant integer."); - Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming - }; - CallInst* BindOutInst = CallInst::Create(BindOutF, - ArrayRef<Value*>(BindOutArgs, 4), - "", CI); + ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); + assert(Op && "Streaming argument for bind out intrinsic should be a " + "constant integer."); + Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming}; + CallInst *BindOutInst = CallInst::Create( + BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI); DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n"); DEBUG(errs() << "Fixing the return type of the function\n"); // FIXME: What if the child node function has not been visited already. // i.e., it's return type has not been fixed. 
- Function* F = I->getParent()->getParent(); + Function *F = I->getParent()->getParent(); DEBUG(errs() << F->getName() << "\n";); - IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0)); - assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic."); + IntrinsicInst *NodeIntrinsic = + cast<IntrinsicInst>(CI->getArgOperand(0)); + assert(NodeIntrinsic && + "Instruction value in bind out is not a create node intrinsic."); DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"); - assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) && - "Instruction value in bind out is not a create node intrinsic."); - Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); + assert( + (NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode || + NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D || + NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D || + NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) && + "Instruction value in bind out is not a create node intrinsic."); + Function *ChildF = cast<Function>( + NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); DEBUG(errs() << ChildF->getName() << "\n";); int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue(); int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue(); - StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType()); + StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType()); - Type* ReturnType = F->getReturnType(); + Type *ReturnType = F->getReturnType(); DEBUG(errs() << *ReturnType << "\n";); - assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) - && "Return type should either be a struct or void 
type!"); + assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) && + "Return type should either be a struct or void type!"); - FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos)); + FRetTypes.insert(FRetTypes.begin() + destpos, + ChildReturnTy->getElementType(srcpos)); assert(((bind == mutateTypeCause::mtc_BIND) || (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and visc_return detected"); + "Both bind_out and visc_return detected"); bind = mutateTypeCause::mtc_BIND; CI->replaceAllUsesWith(BindOutInst); toBeErased.push_back(CI); } - if(isVISCCall_attributes(I)) { - Function* F = CI->getParent()->getParent(); + if (isVISCCall_attributes(I)) { + Function *F = CI->getParent()->getParent(); handleVISCAttributes(F, CI); toBeErased.push_back(CI); } @@ -604,67 +616,76 @@ bool GenVISC::runOnModule(Module &M) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased); } if (isVISCCall_return(I)) { - DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n"); + DEBUG(errs() << "Function before visc return processing\n" + << *I->getParent()->getParent() << "\n"); // The operands to this call are the values to be returned by the node - Value* ReturnVal = genCodeForReturn(CI); + Value *ReturnVal = genCodeForReturn(CI); DEBUG(errs() << *ReturnVal << "\n"); - Type* ReturnType = ReturnVal->getType(); - assert(isa<StructType>(ReturnType) - && "Return type should be a struct type!"); + Type *ReturnType = ReturnVal->getType(); + assert(isa<StructType>(ReturnType) && + "Return type should be a struct type!"); assert(((bind == mutateTypeCause::mtc_RETURN) || (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and visc_return detected"); + "Both bind_out and visc_return detected"); if (bind == mutateTypeCause::mtc_None) { // If this is None, this is the first __visc__return // instruction we have come upon. 
Place the return type of the // function in the return type vector bind = mutateTypeCause::mtc_RETURN; - StructType* ReturnStructTy = cast<StructType>(ReturnType); + StructType *ReturnStructTy = cast<StructType>(ReturnType); for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++) FRetTypes.push_back(ReturnStructTy->getElementType(i)); } else { // bind == mutateTypeCause::mtc_RETURN // This is not the first __visc__return - // instruction we have come upon. + // instruction we have come upon. // Check that the return types are the same - assert((ReturnType == FRetTypes[0]) - && "Multiple returns with mismatching types"); + assert((ReturnType == FRetTypes[0]) && + "Multiple returns with mismatching types"); } - ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal); + ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal); DEBUG(errs() << "Found visc return call: " << *CI << "\n"); - Instruction* oldReturn = CI->getParent()->getTerminator(); - assert(isa<ReturnInst>(oldReturn) - && "Expecting a return to be the terminator of this BB!"); + Instruction *oldReturn = CI->getParent()->getTerminator(); + assert(isa<ReturnInst>(oldReturn) && + "Expecting a return to be the terminator of this BB!"); DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n"); DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n"); - //CI->replaceAllUsesWith(RetInst); + // CI->replaceAllUsesWith(RetInst); toBeErased.push_back(CI); ReplaceInstWithInst(oldReturn, RetInst); - DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n"); + DEBUG(errs() << "Function after visc return processing\n" + << *I->getParent()->getParent() << "\n"); } if (isVISCCall_getNodeInstanceID_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, + &toBeErased); } if (isVISCCall_getNodeInstanceID_y(I)) { - ReplaceCallWithIntrinsic(I, 
Intrinsic::visc_getNodeInstanceID_y, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, + &toBeErased); } if (isVISCCall_getNodeInstanceID_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, + &toBeErased); } if (isVISCCall_getNumNodeInstances_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, + &toBeErased); } if (isVISCCall_getNumNodeInstances_y(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, + &toBeErased); } if (isVISCCall_getNumNodeInstances_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, + &toBeErased); } if (isVISCCall_atomic_cmpxchg(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg, + &toBeErased); } if (isVISCCall_atomic_add(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased); @@ -706,7 +727,8 @@ bool GenVISC::runOnModule(Module &M) { ReplaceCallWithIntrinsic(I, Intrinsic::floor, &toBeErased); } if (isVISCCall_rsqrt(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f, &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f, + &toBeErased); } if (isVISCCall_sqrt(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::sqrt, &toBeErased); @@ -721,148 +743,155 @@ bool GenVISC::runOnModule(Module &M) { // Erase the __visc__node calls DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n"); - for(auto I: toBeErased) { + for (auto I : toBeErased) { DEBUG(errs() << *I << "\n"); } - while(!toBeErased.empty()) { - Instruction* I = toBeErased.back(); + while 
(!toBeErased.empty()) { + Instruction *I = toBeErased.back(); DEBUG(errs() << "\tErasing " << *I << "\n"); I->eraseFromParent(); - toBeErased.pop_back(); + toBeErased.pop_back(); } - if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) { - DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); - // Argument type list. - std::vector<Type*> FArgTypes; - for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); - ai != ae; ++ai) { - FArgTypes.push_back(ai->getType()); - } - - // Find new return type of function - Type* NewReturnTy; - if(bind == mutateTypeCause::mtc_BIND) { - - std::vector<Type*> TyList; - for (unsigned i = 0; i < FRetTypes.size(); i++) - TyList.push_back(FRetTypes[i]); - - NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true); - } - else { - NewReturnTy = getReturnTypeFromReturnInst(f); - assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); - } - - FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); - - // Change the function type - Function* newF = cloneFunction(f, FTy, false); - DEBUG(errs() << *newF << "\n"); - - if (bind == mutateTypeCause::mtc_BIND) { - // This is certainly an internal node, and hence just one BB with one - // return terminator instruction. Change return statement - ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator()); - ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy)); - ReplaceInstWithInst(RI, newRI); - } - if (bind == mutateTypeCause::mtc_RETURN) { - // Nothing - } - replaceNodeFunctionInIR(*f->getParent(), f, newF); - DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); + if (bind == mutateTypeCause::mtc_BIND || + bind == mutateTypeCause::mtc_RETURN) { + DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); + // Argument type list. 
+ std::vector<Type *> FArgTypes; + for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); + ai != ae; ++ai) { + FArgTypes.push_back(ai->getType()); + } + + // Find new return type of function + Type *NewReturnTy; + if (bind == mutateTypeCause::mtc_BIND) { + + std::vector<Type *> TyList; + for (unsigned i = 0; i < FRetTypes.size(); i++) + TyList.push_back(FRetTypes[i]); + + NewReturnTy = + StructType::create(f->getContext(), TyList, + Twine("struct.out." + f->getName()).str(), true); + } else { + NewReturnTy = getReturnTypeFromReturnInst(f); + assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); + } + + FunctionType *FTy = + FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); + + // Change the function type + Function *newF = cloneFunction(f, FTy, false); + DEBUG(errs() << *newF << "\n"); + + if (bind == mutateTypeCause::mtc_BIND) { + // This is certainly an internal node, and hence just one BB with one + // return terminator instruction. Change return statement + ReturnInst *RI = + cast<ReturnInst>(newF->getEntryBlock().getTerminator()); + ReturnInst *newRI = ReturnInst::Create(newF->getContext(), + UndefValue::get(NewReturnTy)); + ReplaceInstWithInst(RI, newRI); + } + if (bind == mutateTypeCause::mtc_RETURN) { + // Nothing + } + replaceNodeFunctionInIR(*f->getParent(), f, newF); + DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); } - - } - return false; //TODO: What does returning "false" mean? + return false; // TODO: What does returning "false" mean? } // Generate Code for declaring a constant string [L x i8] and return a pointer // to the start of it. 
-Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { - Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true); - Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true, - GlobalValue::InternalLinkage, SConstant, Name); - Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); - Value* GEPArgs[] = {Zero, Zero}; - GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal, - ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); +Value *GenVISC::getStringPointer(const Twine &S, Instruction *IB, + const Twine &Name) { + Constant *SConstant = + ConstantDataArray::getString(M->getContext(), S.str(), true); + Value *SGlobal = + new GlobalVariable(*M, SConstant->getType(), true, + GlobalValue::InternalLinkage, SConstant, Name); + Value *Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); + Value *GEPArgs[] = {Zero, Zero}; + GetElementPtrInst *SPtr = GetElementPtrInst::Create( + nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB); return SPtr; } -void GenVISC::initializeTimerSet(Instruction* InsertBefore) { - Value* TimerSetAddr; - StoreInst* SI; - TIMER(TimerSet = new GlobalVariable(*M, - Type::getInt8PtrTy(M->getContext()), - false, - GlobalValue::CommonLinkage, - Constant::getNullValue(Type::getInt8PtrTy(M->getContext())), - "viscTimerSet_GenVISC")); - DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n"); - //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n"); - - TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, - None, - "", +void GenVISC::initializeTimerSet(Instruction *InsertBefore) { + Value *TimerSetAddr; + StoreInst *SI; + TIMER(TimerSet = new GlobalVariable( + *M, Type::getInt8PtrTy(M->getContext()), false, + GlobalValue::CommonLinkage, + Constant::getNullValue(Type::getInt8PtrTy(M->getContext())), + "viscTimerSet_GenVISC")); + DEBUG(errs() << "Inserting GV: " << 
*TimerSet->getType() << *TimerSet + << "\n"); + // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << + // "\n"); + + TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, None, "", InsertBefore)); DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"); TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore)); DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n"); } -void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) { - Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)}; +void GenVISC::switchToTimer(enum visc_TimerID timer, + Instruction *InsertBefore) { + Value *switchArgs[] = {TimerSet, getTimerID(*M, timer)}; TIMER(CallInst::Create(llvm_visc_switchToTimer, - ArrayRef<Value*>(switchArgs, 2), - "", - InsertBefore)); + ArrayRef<Value *>(switchArgs, 2), "", InsertBefore)); } -void GenVISC::printTimerSet(Instruction* InsertBefore) { - Value* TimerName; +void GenVISC::printTimerSet(Instruction *InsertBefore) { + Value *TimerName; TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore)); - Value* printArgs[] = {TimerSet, TimerName}; + Value *printArgs[] = {TimerSet, TimerName}; TIMER(CallInst::Create(llvm_visc_printTimerSet, - ArrayRef<Value*>(printArgs, 2), - "", - InsertBefore)); + ArrayRef<Value *>(printArgs, 2), "", InsertBefore)); } -static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) { +static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); } -static Function* transformReturnTypeToStruct(Function* F) { +static Function *transformReturnTypeToStruct(Function *F) { // Currently only works for void return types - DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n"); + DEBUG(errs() << "Transforming return type of function to Struct: " + << F->getName() << "\n"); if (isa<StructType>(F->getReturnType())) { - 
DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n"); + DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " + << *F->getReturnType() << "\n"); return F; } - assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n"); + assert(F->getReturnType()->isVoidTy() && + "Unhandled case - Only void return type handled\n"); // Create the argument type list with added argument types - std::vector<Type*> ArgTypes; - for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { + std::vector<Type *> ArgTypes; + for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { ArgTypes.push_back(ai->getType()); } - - StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true); - FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); - - SmallVector<ReturnInst*, 8> Returns; - Function* newF = cloneFunction(F, FTy, false, &Returns); + + StructType *RetTy = + StructType::create(F->getContext(), None, "emptyStruct", true); + FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); + + SmallVector<ReturnInst *, 8> Returns; + Function *newF = cloneFunction(F, FTy, false, &Returns); // Replace ret void instruction with ret %RetTy undef - for(auto &RI: Returns) { - DEBUG(errs() << "Found return inst: "<< *RI << "\n"); - ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy)); + for (auto &RI : Returns) { + DEBUG(errs() << "Found return inst: " << *RI << "\n"); + ReturnInst *newRI = + ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy)); ReplaceInstWithInst(RI, newRI); } @@ -870,19 +899,20 @@ static Function* transformReturnTypeToStruct(Function* F) { return newF; } -static Type* getReturnTypeFromReturnInst(Function* F) { - for(BasicBlock &BB: *F) { - if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) { - DEBUG(errs() << 
"Return type value: " << *RI->getReturnValue()->getType() << "\n"); +static Type *getReturnTypeFromReturnInst(Function *F) { + for (BasicBlock &BB : *F) { + if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) { + DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() + << "\n"); return RI->getReturnValue()->getType(); } } } - char genvisc::GenVISC::ID = 0; -static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false); +static RegisterPass<genvisc::GenVISC> + X("genvisc", + "Pass to generate VISC IR from LLVM IR (with dummy function calls)", + false, false); } // End of namespace genvisc - - diff --git a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp index 359ee74d41a64ae0b2aeb025d9d94c55feaac7b8..7bd66b62c6c8cda589fe3e6c1e3711893aceaffb 100644 --- a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp +++ b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp @@ -8,18 +8,18 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "LocalMem" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" +#include "SupportVISC/DFG2LLVM.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IR/Module.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" +#include "llvm/Pass.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Constant.h" -#include "SupportVISC/DFG2LLVM.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; using namespace builddfg; @@ -28,7 +28,7 @@ using namespace dfg2llvm; namespace { // Helper Functions -static 
AllocationNodeProperty* isAllocationNode(DFLeafNode* N); +static AllocationNodeProperty *isAllocationNode(DFLeafNode *N); // LocalMem - The first implementation. struct LocalMem : public ModulePass { @@ -53,23 +53,22 @@ public: class AT_OCL : public CodeGenTraversal { private: - //Member variables + // Member variables - //Functions + // Functions // Virtual Functions void init() {} void initRuntimeAPI() {} - void codeGen(DFInternalNode* N); - void codeGen(DFLeafNode* N); + void codeGen(DFInternalNode *N); + void codeGen(DFLeafNode *N); public: // Constructor AT_OCL(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) { - //init(); - //initRuntimeAPI(); + // init(); + // initRuntimeAPI(); } - }; bool LocalMem::runOnModule(Module &M) { @@ -80,8 +79,8 @@ bool LocalMem::runOnModule(Module &M) { // - Maps from i8* hansles to DFNode and DFEdge BuildDFG &DFG = getAnalysis<BuildDFG>(); - //DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode *> Roots = DFG.getRoots(); // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); @@ -89,102 +88,103 @@ bool LocalMem::runOnModule(Module &M) { AT_OCL *ATVisitor = new AT_OCL(M, DFG); // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode: Roots) { + for (auto rootNode : Roots) { // Initiate code generation for root DFNode ATVisitor->visit(rootNode); - // Go ahead and replace the launch intrinsic with pthread call, otherwise return now. + // Go ahead and replace the launch intrinsic with pthread call, otherwise + // return now. 
// TODO: Later on, we might like to do this in a separate pass, which would - // allow us the flexibility to switch between complete static code generation - // for DFG or having a customized runtime+scheduler + // allow us the flexibility to switch between complete static code + // generation for DFG or having a customized runtime+scheduler } delete ATVisitor; return true; } -void AT_OCL::codeGen(DFInternalNode* N) { +void AT_OCL::codeGen(DFInternalNode *N) { DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); } // Code generation for leaf nodes -void AT_OCL::codeGen(DFLeafNode* N) { +void AT_OCL::codeGen(DFLeafNode *N) { DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); // Skip code generation if it is a dummy node - if(N->isDummyNode()) { + if (N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); return; } // Check and mark as allocation node - AllocationNodeProperty* ANP = isAllocationNode(N); - if(ANP != NULL) { + AllocationNodeProperty *ANP = isAllocationNode(N); + if (ANP != NULL) { // set Properties of the allocation node N->setProperty(DFNode::Allocation, ANP); - AllocationNodeProperty* anp = (AllocationNodeProperty*) N->getProperty(DFNode::Allocation); + AllocationNodeProperty *anp = + (AllocationNodeProperty *)N->getProperty(DFNode::Allocation); AllocationNodeProperty::AllocationListType AL = anp->getAllocationList(); DEBUG(errs() << "Total allocations = " << AL.size() << "\n"); - for(auto P: AL) { + for (auto P : AL) { DEBUG(errs() << " EdgePort: " << P.first->getDestPosition()); DEBUG(errs() << " Size: " << *P.second << "\n"); - } - + } } } -// Return pointer to property if this leaf node matches the conditions for being an allocation -// node. -// Conditions +// Return pointer to property if this leaf node matches the conditions for being +// an allocation node. Conditions // 1. No incoming memory pointer. No in/out attribute on a pointer argument // 2. 
Uses visc malloc intrinsic to allocate memory // 3. Sends it out // 2. (TODO:) Whether the allocated pointer escapes the parent node -AllocationNodeProperty* isAllocationNode(DFLeafNode* N) { +AllocationNodeProperty *isAllocationNode(DFLeafNode *N) { // Allocation node must be free from side-effects - if(N->hasSideEffects()) + if (N->hasSideEffects()) return NULL; // Allocation node must have some outgoing edges - if(N->getOutputType()->isEmptyTy()) + if (N->getOutputType()->isEmptyTy()) return NULL; - Function* F = N->getFuncPointer(); - + Function *F = N->getFuncPointer(); + // Allocation node must use visc malloc intrinsic bool usesVISCMalloc = false; - for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) { - Instruction* I = &*i; - if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) { - if(II->getIntrinsicID() == Intrinsic::visc_malloc) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) { + Instruction *I = &*i; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::visc_malloc) { usesVISCMalloc = true; break; } } - } - if(!usesVISCMalloc) + } + if (!usesVISCMalloc) return NULL; // TODO: Check if allocated pointer leaves parent node - + // This is an allocation node - AllocationNodeProperty* ANP = new AllocationNodeProperty(); + AllocationNodeProperty *ANP = new AllocationNodeProperty(); // Find the return statement. // FIXME: For now, assuming their is just one BB. Terminator instruction of // this BB is a return statement. 
The value returned is what we need - BasicBlock& BB = F->getEntryBlock(); - assert(isa<ReturnInst>(BB.getTerminator()) - && "Currently we do not handle the case where Allocation Node has multiple BB"); - ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator()); + BasicBlock &BB = F->getEntryBlock(); + assert(isa<ReturnInst>(BB.getTerminator()) && + "Currently we do not handle the case where Allocation Node has " + "multiple BB"); + ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()); // Find the returned struct - Value* val = RI->getReturnValue(); - std::vector<Value*> OutValues(6, NULL); + Value *val = RI->getReturnValue(); + std::vector<Value *> OutValues(6, NULL); unsigned numOutputs = N->getOutputType()->getNumElements(); - for(unsigned i = 0; i < numOutputs; i++) { - if(InsertValueInst* IV = dyn_cast<InsertValueInst>(val)) { - DEBUG(errs() << "Value at out edge" << numOutputs-1-i << ": " << *val << "\n"); - OutValues[numOutputs-1-i] = IV->getOperand(1); + for (unsigned i = 0; i < numOutputs; i++) { + if (InsertValueInst *IV = dyn_cast<InsertValueInst>(val)) { + DEBUG(errs() << "Value at out edge" << numOutputs - 1 - i << ": " << *val + << "\n"); + OutValues[numOutputs - 1 - i] = IV->getOperand(1); val = IV->getOperand(0); - } - else { + } else { DEBUG(errs() << "Unexpected value at out edge: " << *val << "\n"); llvm_unreachable("Expecting InsertValue instruction. 
Error!"); } @@ -192,33 +192,34 @@ AllocationNodeProperty* isAllocationNode(DFLeafNode* N) { // OutValues vector contains all the values that will go out // Assume that the Allocation node only sends the pointers and their sizes // forward - unsigned i=0; - while(i < numOutputs) { - assert(OutValues[i]->getType()->isPointerTy() - && "Expected outgoing edge to be of pointer type"); - if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(OutValues[i])) { - if(II->getIntrinsicID() == Intrinsic::visc_malloc) { + unsigned i = 0; + while (i < numOutputs) { + assert(OutValues[i]->getType()->isPointerTy() && + "Expected outgoing edge to be of pointer type"); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(OutValues[i])) { + if (II->getIntrinsicID() == Intrinsic::visc_malloc) { // Sanity check: Size passed to malloc intrinsic is same as the value // going into the next outgoing edge - DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n"); - DEBUG(errs() << "Out edge value: " << *OutValues[i+1] << "\n"); - assert(II->getArgOperand(0) == OutValues[i+1] - && "Sanity Check Failed: VISC Malloc size argument != next outgoing edge"); + DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n"); + DEBUG(errs() << "Out edge value: " << *OutValues[i + 1] << "\n"); + assert(II->getArgOperand(0) == OutValues[i + 1] && + "Sanity Check Failed: VISC Malloc size argument != next " + "outgoing edge"); ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0)); - i = i+2; + i = i + 2; continue; } } llvm_unreachable("Expecting visc malloc intrinsic instruction!"); - } + } return ANP; } } // End of namespace char LocalMem::ID = 0; -static RegisterPass<LocalMem> X("localmem", - "Pass to identifying nodes amenable to local memory allocation", - false /* does not modify the CFG */, - true /* transformation, not just analysis */); - +static RegisterPass<LocalMem> + X("localmem", + "Pass to identifying nodes amenable to local memory allocation", + false /* does not 
modify the CFG */, + true /* transformation, not just analysis */); diff --git a/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h b/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h index 4870066fe4e10a6b0a02d50c7a79639103219be6..5e59ba96f2331663289a040326ebd4e453bd1e86 100644 --- a/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h +++ b/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h @@ -331,7 +331,7 @@ enum MetadataCodes { METADATA_INDEX_OFFSET = 38, // [offset] METADATA_INDEX = 39, // [bitpos] METADATA_LABEL = 40, // [distinct, scope, name, file, line] - METADATA_COMMON_BLOCK = 44, // [distinct, scope, name, variable,...] + METADATA_COMMON_BLOCK = 44, // [distinct, scope, name, variable,...] }; // The constants block (CONSTANTS_BLOCK_ID) describes emission for each @@ -363,7 +363,7 @@ enum ConstantsCodes { CST_CODE_INLINEASM = 23, // INLINEASM: [sideeffect|alignstack| // asmdialect,asmstr,conststr] CST_CODE_CE_GEP_WITH_INRANGE_INDEX = 24, // [opty, flags, n x operands] - CST_CODE_CE_UNOP = 25, // CE_UNOP: [opcode, opval] + CST_CODE_CE_UNOP = 25, // CE_UNOP: [opcode, opval] }; /// CastOpcodes - These are values used in the bitcode files to encode which @@ -390,9 +390,7 @@ enum CastOpcodes { /// unop a CST_CODE_CE_UNOP or a XXX refers to. The values of these enums /// have no fixed relation to the LLVM IR enum values. Changing these will /// break compatibility with old files. -enum UnaryOpcodes { - UNOP_NEG = 0 -}; +enum UnaryOpcodes { UNOP_NEG = 0 }; /// BinaryOpcodes - These are values used in the bitcode files to encode which /// binop a CST_CODE_CE_BINOP or a XXX refers to. The values of these enums @@ -444,14 +442,14 @@ enum OverflowingBinaryOperatorOptionalFlags { /// This is a fixed layout derived from the bitcode emitted by LLVM 5.0 /// intended to decouple the in-memory representation from the serialization. 
enum FastMathMap { - UnsafeAlgebra = (1 << 0), // Legacy - NoNaNs = (1 << 1), - NoInfs = (1 << 2), - NoSignedZeros = (1 << 3), + UnsafeAlgebra = (1 << 0), // Legacy + NoNaNs = (1 << 1), + NoInfs = (1 << 2), + NoSignedZeros = (1 << 3), AllowReciprocal = (1 << 4), - AllowContract = (1 << 5), - ApproxFunc = (1 << 6), - AllowReassoc = (1 << 7) + AllowContract = (1 << 5), + ApproxFunc = (1 << 6), + AllowReassoc = (1 << 7) }; /// PossiblyExactOperatorOptionalFlags - Flags for serializing @@ -653,7 +651,7 @@ enum SymtabCodes { SYMTAB_BLOB = 1, }; -} // End bitc namespace -} // End llvm namespace +} // namespace bitc +} // namespace llvm #endif diff --git a/hpvm/llvm_patches/include/Support/Debug.h b/hpvm/llvm_patches/include/Support/Debug.h index 277a6d7b89336841779941ca5034e116eb14ebfc..25c031100e29d06b594966c7378d190c73ea09fb 100644 --- a/hpvm/llvm_patches/include/Support/Debug.h +++ b/hpvm/llvm_patches/include/Support/Debug.h @@ -61,15 +61,20 @@ void setCurrentDebugTypes(const char **Types, unsigned Count); /// /// This will emit the debug information if -debug is present, and -debug-only /// is not specified, or is specified as "bitset". 
-#define DEBUG_WITH_TYPE(TYPE, X) \ - do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)) { X; } \ +#define DEBUG_WITH_TYPE(TYPE, X) \ + do { \ + if (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)) { \ + X; \ + } \ } while (false) #else #define isCurrentDebugType(X) (false) #define setCurrentDebugType(X) #define setCurrentDebugTypes(X, N) -#define DEBUG_WITH_TYPE(TYPE, X) do { } while (false) +#define DEBUG_WITH_TYPE(TYPE, X) \ + do { \ + } while (false) #endif /// This boolean is set to true if the '-debug' command line option diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp index 3861f4c7a22c9bc8e473d67e118357c5706c00d8..a924405a2cac85ccd2e5e903a1ee1abb52774566 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp +++ b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp @@ -47,8 +47,8 @@ uint64_t LLLexer::atoull(const char *Buffer, const char *End) { for (; Buffer != End; Buffer++) { uint64_t OldRes = Result; Result *= 10; - Result += *Buffer-'0'; - if (Result < OldRes) { // Uh, oh, overflow detected!!! + Result += *Buffer - '0'; + if (Result < OldRes) { // Uh, oh, overflow detected!!! Error("constant bigger than 64 bits detected!"); return 0; } @@ -63,7 +63,7 @@ uint64_t LLLexer::HexIntToVal(const char *Buffer, const char *End) { Result *= 16; Result += hexDigitValue(*Buffer); - if (Result < OldRes) { // Uh, oh, overflow detected!!! + if (Result < OldRes) { // Uh, oh, overflow detected!!! Error("constant bigger than 64 bits detected!"); return 0; } @@ -93,9 +93,9 @@ void LLLexer::HexToIntPair(const char *Buffer, const char *End, /// FP80HexToIntPair - translate an 80 bit FP80 number (20 hexits) into /// { low64, high16 } as usual for an APInt. 
void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End, - uint64_t Pair[2]) { + uint64_t Pair[2]) { Pair[1] = 0; - for (int i=0; i<4 && Buffer != End; i++, Buffer++) { + for (int i = 0; i < 4 && Buffer != End; i++, Buffer++) { assert(Buffer != End); Pair[1] *= 16; Pair[1] += hexDigitValue(*Buffer); @@ -112,20 +112,21 @@ void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End, // UnEscapeLexed - Run through the specified buffer and change \xx codes to the // appropriate character. static void UnEscapeLexed(std::string &Str) { - if (Str.empty()) return; + if (Str.empty()) + return; - char *Buffer = &Str[0], *EndBuffer = Buffer+Str.size(); + char *Buffer = &Str[0], *EndBuffer = Buffer + Str.size(); char *BOut = Buffer; - for (char *BIn = Buffer; BIn != EndBuffer; ) { + for (char *BIn = Buffer; BIn != EndBuffer;) { if (BIn[0] == '\\') { - if (BIn < EndBuffer-1 && BIn[1] == '\\') { + if (BIn < EndBuffer - 1 && BIn[1] == '\\') { *BOut++ = '\\'; // Two \ becomes one BIn += 2; - } else if (BIn < EndBuffer-2 && + } else if (BIn < EndBuffer - 2 && isxdigit(static_cast<unsigned char>(BIn[1])) && isxdigit(static_cast<unsigned char>(BIn[2]))) { *BOut = hexDigitValue(BIn[1]) * 16 + hexDigitValue(BIn[2]); - BIn += 3; // Skip over handled chars + BIn += 3; // Skip over handled chars ++BOut; } else { *BOut++ = *BIn++; @@ -134,7 +135,7 @@ static void UnEscapeLexed(std::string &Str) { *BOut++ = *BIn++; } } - Str.resize(BOut-Buffer); + Str.resize(BOut - Buffer); } /// isLabelChar - Return true for [-a-zA-Z$._0-9]. @@ -146,8 +147,10 @@ static bool isLabelChar(char C) { /// isLabelTail - Return true if this pointer points to a valid end of a label. 
static const char *isLabelTail(const char *CurPtr) { while (true) { - if (CurPtr[0] == ':') return CurPtr+1; - if (!isLabelChar(CurPtr[0])) return nullptr; + if (CurPtr[0] == ':') + return CurPtr + 1; + if (!isLabelChar(CurPtr[0])) + return nullptr; ++CurPtr; } } @@ -166,15 +169,16 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err, int LLLexer::getNextChar() { char CurChar = *CurPtr++; switch (CurChar) { - default: return (unsigned char)CurChar; + default: + return (unsigned char)CurChar; case 0: // A nul character in the stream is either the end of the current buffer or // a random nul in the file. Disambiguate that here. - if (CurPtr-1 != CurBuf.end()) - return 0; // Just whitespace. + if (CurPtr - 1 != CurBuf.end()) + return 0; // Just whitespace. // Otherwise, return end of file. - --CurPtr; // Another call to lex will return EOF again. + --CurPtr; // Another call to lex will return EOF again. return EOF; } } @@ -191,7 +195,8 @@ lltok::Kind LLLexer::LexToken() { return LexIdentifier(); return lltok::Error; - case EOF: return lltok::Eof; + case EOF: + return lltok::Eof; case 0: case ' ': case '\t': @@ -199,15 +204,20 @@ lltok::Kind LLLexer::LexToken() { case '\r': // Ignore whitespace. continue; - case '+': return LexPositive(); - case '@': return LexAt(); - case '$': return LexDollar(); - case '%': return LexPercent(); - case '"': return LexQuote(); + case '+': + return LexPositive(); + case '@': + return LexAt(); + case '$': + return LexDollar(); + case '%': + return LexPercent(); + case '"': + return LexQuote(); case '.': if (const char *Ptr = isLabelTail(CurPtr)) { CurPtr = Ptr; - StrVal.assign(TokStart, CurPtr-1); + StrVal.assign(TokStart, CurPtr - 1); return lltok::LabelStr; } if (CurPtr[0] == '.' 
&& CurPtr[1] == '.') { @@ -218,28 +228,50 @@ lltok::Kind LLLexer::LexToken() { case ';': SkipLineComment(); continue; - case '!': return LexExclaim(); + case '!': + return LexExclaim(); case '^': return LexCaret(); case ':': return lltok::colon; - case '#': return LexHash(); - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': + case '#': + return LexHash(); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': case '-': return LexDigitOrNegative(); - case '=': return lltok::equal; - case '[': return lltok::lsquare; - case ']': return lltok::rsquare; - case '{': return lltok::lbrace; - case '}': return lltok::rbrace; - case '<': return lltok::less; - case '>': return lltok::greater; - case '(': return lltok::lparen; - case ')': return lltok::rparen; - case ',': return lltok::comma; - case '*': return lltok::star; - case '|': return lltok::bar; + case '=': + return lltok::equal; + case '[': + return lltok::lsquare; + case ']': + return lltok::rsquare; + case '{': + return lltok::lbrace; + case '}': + return lltok::rbrace; + case '<': + return lltok::less; + case '>': + return lltok::greater; + case '(': + return lltok::lparen; + case ')': + return lltok::rparen; + case ',': + return lltok::comma; + case '*': + return lltok::star; + case '|': + return lltok::bar; } } } @@ -307,7 +339,7 @@ lltok::Kind LLLexer::ReadString(lltok::Kind kind) { return lltok::Error; } if (CurChar == '"') { - StrVal.assign(Start, CurPtr-1); + StrVal.assign(Start, CurPtr - 1); UnEscapeLexed(StrVal); return kind; } @@ -317,13 +349,11 @@ lltok::Kind LLLexer::ReadString(lltok::Kind kind) { /// ReadVarName - Read the rest of a token containing a variable name. bool LLLexer::ReadVarName() { const char *NameStart = CurPtr; - if (isalpha(static_cast<unsigned char>(CurPtr[0])) || - CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' 
|| CurPtr[0] == '_') { + if (isalpha(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' || + CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_') { ++CurPtr; - while (isalnum(static_cast<unsigned char>(CurPtr[0])) || - CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_') + while (isalnum(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' || + CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_') ++CurPtr; StrVal.assign(NameStart, CurPtr); @@ -361,7 +391,7 @@ lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) { return lltok::Error; } if (CurChar == '"') { - StrVal.assign(TokStart+2, CurPtr-1); + StrVal.assign(TokStart + 2, CurPtr - 1); UnEscapeLexed(StrVal); if (StringRef(StrVal).find_first_of(0) != StringRef::npos) { Error("Null bytes are not allowed in names"); @@ -414,16 +444,16 @@ lltok::Kind LLLexer::LexQuote() { /// ! lltok::Kind LLLexer::LexExclaim() { // Lex a metadata name as a MetadataVar. - if (isalpha(static_cast<unsigned char>(CurPtr[0])) || - CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') { + if (isalpha(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' || + CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_' || + CurPtr[0] == '\\') { ++CurPtr; - while (isalnum(static_cast<unsigned char>(CurPtr[0])) || - CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') + while (isalnum(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' || + CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_' || + CurPtr[0] == '\\') ++CurPtr; - StrVal.assign(TokStart+1, CurPtr); // Skip ! + StrVal.assign(TokStart + 1, CurPtr); // Skip ! UnEscapeLexed(StrVal); return lltok::MetadataVar; } @@ -466,13 +496,14 @@ lltok::Kind LLLexer::LexIdentifier() { // If we stopped due to a colon, unless we were directed to ignore it, // this really is a label. 
if (!IgnoreColonInIdentifiers && *CurPtr == ':') { - StrVal.assign(StartChar-1, CurPtr++); + StrVal.assign(StartChar - 1, CurPtr++); return lltok::LabelStr; } // Otherwise, this wasn't a label. If this was valid as an integer type, // return it. - if (!IntEnd) IntEnd = CurPtr; + if (!IntEnd) + IntEnd = CurPtr; if (IntEnd != StartChar) { CurPtr = IntEnd; uint64_t NumBits = atoull(StartChar, CurPtr); @@ -486,7 +517,8 @@ lltok::Kind LLLexer::LexIdentifier() { } // Otherwise, this was a letter sequence. See which keyword this is. - if (!KeywordEnd) KeywordEnd = CurPtr; + if (!KeywordEnd) + KeywordEnd = CurPtr; CurPtr = KeywordEnd; --StartChar; StringRef Keyword(StartChar, CurPtr - StartChar); @@ -497,9 +529,12 @@ lltok::Kind LLLexer::LexIdentifier() { return lltok::kw_##STR; \ } while (false) - KEYWORD(true); KEYWORD(false); - KEYWORD(declare); KEYWORD(define); - KEYWORD(global); KEYWORD(constant); + KEYWORD(true); + KEYWORD(false); + KEYWORD(declare); + KEYWORD(define); + KEYWORD(global); + KEYWORD(constant); KEYWORD(dso_local); KEYWORD(dso_preemptable); @@ -542,7 +577,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(triple); KEYWORD(source_filename); KEYWORD(unwind); - KEYWORD(deplibs); // FIXME: Remove in 4.0. + KEYWORD(deplibs); // FIXME: Remove in 4.0. 
KEYWORD(datalayout); KEYWORD(volatile); KEYWORD(atomic); @@ -703,12 +738,32 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(noduplicates); KEYWORD(samesize); - KEYWORD(eq); KEYWORD(ne); KEYWORD(slt); KEYWORD(sgt); KEYWORD(sle); - KEYWORD(sge); KEYWORD(ult); KEYWORD(ugt); KEYWORD(ule); KEYWORD(uge); - KEYWORD(oeq); KEYWORD(one); KEYWORD(olt); KEYWORD(ogt); KEYWORD(ole); - KEYWORD(oge); KEYWORD(ord); KEYWORD(uno); KEYWORD(ueq); KEYWORD(une); - - KEYWORD(xchg); KEYWORD(nand); KEYWORD(max); KEYWORD(min); KEYWORD(umax); + KEYWORD(eq); + KEYWORD(ne); + KEYWORD(slt); + KEYWORD(sgt); + KEYWORD(sle); + KEYWORD(sge); + KEYWORD(ult); + KEYWORD(ugt); + KEYWORD(ule); + KEYWORD(uge); + KEYWORD(oeq); + KEYWORD(one); + KEYWORD(olt); + KEYWORD(ogt); + KEYWORD(ole); + KEYWORD(oge); + KEYWORD(ord); + KEYWORD(uno); + KEYWORD(ueq); + KEYWORD(une); + + KEYWORD(xchg); + KEYWORD(nand); + KEYWORD(max); + KEYWORD(min); + KEYWORD(umax); KEYWORD(umin); KEYWORD(vscale); @@ -800,10 +855,10 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(bit); KEYWORD(varFlags); -// VISC parameter attributes - KEYWORD(in); - KEYWORD(out); - KEYWORD(inout); + // VISC parameter attributes + KEYWORD(in); + KEYWORD(out); + KEYWORD(inout); #undef KEYWORD @@ -816,17 +871,17 @@ lltok::Kind LLLexer::LexIdentifier() { } \ } while (false) - TYPEKEYWORD("void", Type::getVoidTy(Context)); - TYPEKEYWORD("half", Type::getHalfTy(Context)); - TYPEKEYWORD("float", Type::getFloatTy(Context)); - TYPEKEYWORD("double", Type::getDoubleTy(Context)); - TYPEKEYWORD("x86_fp80", Type::getX86_FP80Ty(Context)); - TYPEKEYWORD("fp128", Type::getFP128Ty(Context)); + TYPEKEYWORD("void", Type::getVoidTy(Context)); + TYPEKEYWORD("half", Type::getHalfTy(Context)); + TYPEKEYWORD("float", Type::getFloatTy(Context)); + TYPEKEYWORD("double", Type::getDoubleTy(Context)); + TYPEKEYWORD("x86_fp80", Type::getX86_FP80Ty(Context)); + TYPEKEYWORD("fp128", Type::getFP128Ty(Context)); TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context)); - 
TYPEKEYWORD("label", Type::getLabelTy(Context)); - TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); - TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); - TYPEKEYWORD("token", Type::getTokenTy(Context)); + TYPEKEYWORD("label", Type::getLabelTy(Context)); + TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); + TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); + TYPEKEYWORD("token", Type::getTokenTy(Context)); #undef TYPEKEYWORD @@ -839,62 +894,74 @@ lltok::Kind LLLexer::LexIdentifier() { } \ } while (false) - INSTKEYWORD(fneg, FNeg); - - INSTKEYWORD(add, Add); INSTKEYWORD(fadd, FAdd); - INSTKEYWORD(sub, Sub); INSTKEYWORD(fsub, FSub); - INSTKEYWORD(mul, Mul); INSTKEYWORD(fmul, FMul); - INSTKEYWORD(udiv, UDiv); INSTKEYWORD(sdiv, SDiv); INSTKEYWORD(fdiv, FDiv); - INSTKEYWORD(urem, URem); INSTKEYWORD(srem, SRem); INSTKEYWORD(frem, FRem); - INSTKEYWORD(shl, Shl); INSTKEYWORD(lshr, LShr); INSTKEYWORD(ashr, AShr); - INSTKEYWORD(and, And); INSTKEYWORD(or, Or); INSTKEYWORD(xor, Xor); - INSTKEYWORD(icmp, ICmp); INSTKEYWORD(fcmp, FCmp); - - INSTKEYWORD(phi, PHI); - INSTKEYWORD(call, Call); - INSTKEYWORD(trunc, Trunc); - INSTKEYWORD(zext, ZExt); - INSTKEYWORD(sext, SExt); - INSTKEYWORD(fptrunc, FPTrunc); - INSTKEYWORD(fpext, FPExt); - INSTKEYWORD(uitofp, UIToFP); - INSTKEYWORD(sitofp, SIToFP); - INSTKEYWORD(fptoui, FPToUI); - INSTKEYWORD(fptosi, FPToSI); - INSTKEYWORD(inttoptr, IntToPtr); - INSTKEYWORD(ptrtoint, PtrToInt); - INSTKEYWORD(bitcast, BitCast); + INSTKEYWORD(fneg, FNeg); + + INSTKEYWORD(add, Add); + INSTKEYWORD(fadd, FAdd); + INSTKEYWORD(sub, Sub); + INSTKEYWORD(fsub, FSub); + INSTKEYWORD(mul, Mul); + INSTKEYWORD(fmul, FMul); + INSTKEYWORD(udiv, UDiv); + INSTKEYWORD(sdiv, SDiv); + INSTKEYWORD(fdiv, FDiv); + INSTKEYWORD(urem, URem); + INSTKEYWORD(srem, SRem); + INSTKEYWORD(frem, FRem); + INSTKEYWORD(shl, Shl); + INSTKEYWORD(lshr, LShr); + INSTKEYWORD(ashr, AShr); + INSTKEYWORD(and, And); + INSTKEYWORD(or, Or); + INSTKEYWORD(xor, Xor); + INSTKEYWORD(icmp, 
ICmp); + INSTKEYWORD(fcmp, FCmp); + + INSTKEYWORD(phi, PHI); + INSTKEYWORD(call, Call); + INSTKEYWORD(trunc, Trunc); + INSTKEYWORD(zext, ZExt); + INSTKEYWORD(sext, SExt); + INSTKEYWORD(fptrunc, FPTrunc); + INSTKEYWORD(fpext, FPExt); + INSTKEYWORD(uitofp, UIToFP); + INSTKEYWORD(sitofp, SIToFP); + INSTKEYWORD(fptoui, FPToUI); + INSTKEYWORD(fptosi, FPToSI); + INSTKEYWORD(inttoptr, IntToPtr); + INSTKEYWORD(ptrtoint, PtrToInt); + INSTKEYWORD(bitcast, BitCast); INSTKEYWORD(addrspacecast, AddrSpaceCast); - INSTKEYWORD(select, Select); - INSTKEYWORD(va_arg, VAArg); - INSTKEYWORD(ret, Ret); - INSTKEYWORD(br, Br); - INSTKEYWORD(switch, Switch); - INSTKEYWORD(indirectbr, IndirectBr); - INSTKEYWORD(invoke, Invoke); - INSTKEYWORD(resume, Resume); + INSTKEYWORD(select, Select); + INSTKEYWORD(va_arg, VAArg); + INSTKEYWORD(ret, Ret); + INSTKEYWORD(br, Br); + INSTKEYWORD(switch, Switch); + INSTKEYWORD(indirectbr, IndirectBr); + INSTKEYWORD(invoke, Invoke); + INSTKEYWORD(resume, Resume); INSTKEYWORD(unreachable, Unreachable); - INSTKEYWORD(callbr, CallBr); - - INSTKEYWORD(alloca, Alloca); - INSTKEYWORD(load, Load); - INSTKEYWORD(store, Store); - INSTKEYWORD(cmpxchg, AtomicCmpXchg); - INSTKEYWORD(atomicrmw, AtomicRMW); - INSTKEYWORD(fence, Fence); + INSTKEYWORD(callbr, CallBr); + + INSTKEYWORD(alloca, Alloca); + INSTKEYWORD(load, Load); + INSTKEYWORD(store, Store); + INSTKEYWORD(cmpxchg, AtomicCmpXchg); + INSTKEYWORD(atomicrmw, AtomicRMW); + INSTKEYWORD(fence, Fence); INSTKEYWORD(getelementptr, GetElementPtr); INSTKEYWORD(extractelement, ExtractElement); - INSTKEYWORD(insertelement, InsertElement); - INSTKEYWORD(shufflevector, ShuffleVector); - INSTKEYWORD(extractvalue, ExtractValue); - INSTKEYWORD(insertvalue, InsertValue); - INSTKEYWORD(landingpad, LandingPad); - INSTKEYWORD(cleanupret, CleanupRet); - INSTKEYWORD(catchret, CatchRet); - INSTKEYWORD(catchswitch, CatchSwitch); - INSTKEYWORD(catchpad, CatchPad); - INSTKEYWORD(cleanuppad, CleanupPad); + INSTKEYWORD(insertelement, 
InsertElement); + INSTKEYWORD(shufflevector, ShuffleVector); + INSTKEYWORD(extractvalue, ExtractValue); + INSTKEYWORD(insertvalue, InsertValue); + INSTKEYWORD(landingpad, LandingPad); + INSTKEYWORD(cleanupret, CleanupRet); + INSTKEYWORD(catchret, CatchRet); + INSTKEYWORD(catchswitch, CatchSwitch); + INSTKEYWORD(catchpad, CatchPad); + INSTKEYWORD(cleanuppad, CleanupPad); #undef INSTKEYWORD @@ -944,15 +1011,14 @@ lltok::Kind LLLexer::LexIdentifier() { // Check for [us]0x[0-9A-Fa-f]+ which are Hexadecimal constant generated by // the CFE to avoid forcing it to deal with 64-bit numbers. - if ((TokStart[0] == 'u' || TokStart[0] == 's') && - TokStart[1] == '0' && TokStart[2] == 'x' && - isxdigit(static_cast<unsigned char>(TokStart[3]))) { - int len = CurPtr-TokStart-3; + if ((TokStart[0] == 'u' || TokStart[0] == 's') && TokStart[1] == '0' && + TokStart[2] == 'x' && isxdigit(static_cast<unsigned char>(TokStart[3]))) { + int len = CurPtr - TokStart - 3; uint32_t bits = len * 4; StringRef HexStr(TokStart + 3, len); if (!all_of(HexStr, isxdigit)) { // Bad token, return it as an error. - CurPtr = TokStart+3; + CurPtr = TokStart + 3; return lltok::Error; } APInt Tmp(bits, HexStr, 16); @@ -965,12 +1031,12 @@ lltok::Kind LLLexer::LexIdentifier() { // If this is "cc1234", return this as just "cc". if (TokStart[0] == 'c' && TokStart[1] == 'c') { - CurPtr = TokStart+2; + CurPtr = TokStart + 2; return lltok::kw_cc; } // Finally, if this isn't known, return an error. - CurPtr = TokStart+1; + CurPtr = TokStart + 1; return lltok::Error; } @@ -993,7 +1059,7 @@ lltok::Kind LLLexer::Lex0x() { if (!isxdigit(static_cast<unsigned char>(CurPtr[0]))) { // Bad token, return it as an error. 
- CurPtr = TokStart+1; + CurPtr = TokStart + 1; return lltok::Error; } @@ -1011,25 +1077,26 @@ lltok::Kind LLLexer::Lex0x() { uint64_t Pair[2]; switch (Kind) { - default: llvm_unreachable("Unknown kind!"); + default: + llvm_unreachable("Unknown kind!"); case 'K': // F80HexFPConstant - x87 long double in hexadecimal format (10 bytes) - FP80HexToIntPair(TokStart+3, CurPtr, Pair); + FP80HexToIntPair(TokStart + 3, CurPtr, Pair); APFloatVal = APFloat(APFloat::x87DoubleExtended(), APInt(80, Pair)); return lltok::APFloat; case 'L': // F128HexFPConstant - IEEE 128-bit in hexadecimal format (16 bytes) - HexToIntPair(TokStart+3, CurPtr, Pair); + HexToIntPair(TokStart + 3, CurPtr, Pair); APFloatVal = APFloat(APFloat::IEEEquad(), APInt(128, Pair)); return lltok::APFloat; case 'M': // PPC128HexFPConstant - PowerPC 128-bit in hexadecimal format (16 bytes) - HexToIntPair(TokStart+3, CurPtr, Pair); + HexToIntPair(TokStart + 3, CurPtr, Pair); APFloatVal = APFloat(APFloat::PPCDoubleDouble(), APInt(128, Pair)); return lltok::APFloat; case 'H': APFloatVal = APFloat(APFloat::IEEEhalf(), - APInt(16,HexIntToVal(TokStart+3, CurPtr))); + APInt(16, HexIntToVal(TokStart + 3, CurPtr))); return lltok::APFloat; } } @@ -1049,7 +1116,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() { !isdigit(static_cast<unsigned char>(CurPtr[0]))) { // Okay, this is not a number after the -, it's probably a label. if (const char *End = isLabelTail(CurPtr)) { - StrVal.assign(TokStart, End-1); + StrVal.assign(TokStart, End - 1); CurPtr = End; return lltok::LabelStr; } @@ -1076,7 +1143,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() { // Check to see if this really is a string label, e.g. "-1:". if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') { if (const char *End = isLabelTail(CurPtr)) { - StrVal.assign(TokStart, End-1); + StrVal.assign(TokStart, End - 1); CurPtr = End; return lltok::LabelStr; } @@ -1094,19 +1161,21 @@ lltok::Kind LLLexer::LexDigitOrNegative() { ++CurPtr; // Skip over [0-9]*([eE][-+]?[0-9]+)? 
- while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) + ++CurPtr; if (CurPtr[0] == 'e' || CurPtr[0] == 'E') { if (isdigit(static_cast<unsigned char>(CurPtr[1])) || ((CurPtr[1] == '-' || CurPtr[1] == '+') && - isdigit(static_cast<unsigned char>(CurPtr[2])))) { + isdigit(static_cast<unsigned char>(CurPtr[2])))) { CurPtr += 2; - while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) + ++CurPtr; } } - APFloatVal = APFloat(APFloat::IEEEdouble(), - StringRef(TokStart, CurPtr - TokStart)); + APFloatVal = + APFloat(APFloat::IEEEdouble(), StringRef(TokStart, CurPtr - TokStart)); return lltok::APFloat; } @@ -1124,25 +1193,27 @@ lltok::Kind LLLexer::LexPositive() { // At this point, we need a '.'. if (CurPtr[0] != '.') { - CurPtr = TokStart+1; + CurPtr = TokStart + 1; return lltok::Error; } ++CurPtr; // Skip over [0-9]*([eE][-+]?[0-9]+)? - while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) + ++CurPtr; if (CurPtr[0] == 'e' || CurPtr[0] == 'E') { if (isdigit(static_cast<unsigned char>(CurPtr[1])) || ((CurPtr[1] == '-' || CurPtr[1] == '+') && - isdigit(static_cast<unsigned char>(CurPtr[2])))) { + isdigit(static_cast<unsigned char>(CurPtr[2])))) { CurPtr += 2; - while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) + ++CurPtr; } } - APFloatVal = APFloat(APFloat::IEEEdouble(), - StringRef(TokStart, CurPtr - TokStart)); + APFloatVal = + APFloat(APFloat::IEEEdouble(), StringRef(TokStart, CurPtr - TokStart)); return lltok::APFloat; } diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.h b/hpvm/llvm_patches/lib/AsmParser/LLLexer.h index 4d3a2920e937475ece2c2878a7476ad30647d7c1..c37b0dbaf14a1a890b5911c53ea2f3a026f4ecc0 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.h +++ 
b/hpvm/llvm_patches/lib/AsmParser/LLLexer.h @@ -20,85 +20,81 @@ #include <string> namespace llvm { - class MemoryBuffer; - class Type; - class SMDiagnostic; - class LLVMContext; - - class LLLexer { - const char *CurPtr; - StringRef CurBuf; - SMDiagnostic &ErrorInfo; - SourceMgr &SM; - LLVMContext &Context; - - // Information about the current token. - const char *TokStart; - lltok::Kind CurKind; - std::string StrVal; - unsigned UIntVal; - Type *TyVal; - APFloat APFloatVal; - APSInt APSIntVal; - - // When false (default), an identifier ending in ':' is a label token. - // When true, the ':' is treated as a separate token. - bool IgnoreColonInIdentifiers; - - public: - explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &, - LLVMContext &C); - - lltok::Kind Lex() { - return CurKind = LexToken(); - } - - typedef SMLoc LocTy; - LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); } - lltok::Kind getKind() const { return CurKind; } - const std::string &getStrVal() const { return StrVal; } - Type *getTyVal() const { return TyVal; } - unsigned getUIntVal() const { return UIntVal; } - const APSInt &getAPSIntVal() const { return APSIntVal; } - const APFloat &getAPFloatVal() const { return APFloatVal; } - - void setIgnoreColonInIdentifiers(bool val) { - IgnoreColonInIdentifiers = val; - } - - bool Error(LocTy ErrorLoc, const Twine &Msg) const; - bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); } - - void Warning(LocTy WarningLoc, const Twine &Msg) const; - void Warning(const Twine &Msg) const { return Warning(getLoc(), Msg); } - - private: - lltok::Kind LexToken(); - - int getNextChar(); - void SkipLineComment(); - lltok::Kind ReadString(lltok::Kind kind); - bool ReadVarName(); - - lltok::Kind LexIdentifier(); - lltok::Kind LexDigitOrNegative(); - lltok::Kind LexPositive(); - lltok::Kind LexAt(); - lltok::Kind LexDollar(); - lltok::Kind LexExclaim(); - lltok::Kind LexPercent(); - lltok::Kind LexUIntID(lltok::Kind Token); - lltok::Kind 
LexVar(lltok::Kind Var, lltok::Kind VarID); - lltok::Kind LexQuote(); - lltok::Kind Lex0x(); - lltok::Kind LexHash(); - lltok::Kind LexCaret(); - - uint64_t atoull(const char *Buffer, const char *End); - uint64_t HexIntToVal(const char *Buffer, const char *End); - void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]); - void FP80HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]); - }; +class MemoryBuffer; +class Type; +class SMDiagnostic; +class LLVMContext; + +class LLLexer { + const char *CurPtr; + StringRef CurBuf; + SMDiagnostic &ErrorInfo; + SourceMgr &SM; + LLVMContext &Context; + + // Information about the current token. + const char *TokStart; + lltok::Kind CurKind; + std::string StrVal; + unsigned UIntVal; + Type *TyVal; + APFloat APFloatVal; + APSInt APSIntVal; + + // When false (default), an identifier ending in ':' is a label token. + // When true, the ':' is treated as a separate token. + bool IgnoreColonInIdentifiers; + +public: + explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &, + LLVMContext &C); + + lltok::Kind Lex() { return CurKind = LexToken(); } + + typedef SMLoc LocTy; + LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); } + lltok::Kind getKind() const { return CurKind; } + const std::string &getStrVal() const { return StrVal; } + Type *getTyVal() const { return TyVal; } + unsigned getUIntVal() const { return UIntVal; } + const APSInt &getAPSIntVal() const { return APSIntVal; } + const APFloat &getAPFloatVal() const { return APFloatVal; } + + void setIgnoreColonInIdentifiers(bool val) { IgnoreColonInIdentifiers = val; } + + bool Error(LocTy ErrorLoc, const Twine &Msg) const; + bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); } + + void Warning(LocTy WarningLoc, const Twine &Msg) const; + void Warning(const Twine &Msg) const { return Warning(getLoc(), Msg); } + +private: + lltok::Kind LexToken(); + + int getNextChar(); + void SkipLineComment(); + lltok::Kind 
ReadString(lltok::Kind kind); + bool ReadVarName(); + + lltok::Kind LexIdentifier(); + lltok::Kind LexDigitOrNegative(); + lltok::Kind LexPositive(); + lltok::Kind LexAt(); + lltok::Kind LexDollar(); + lltok::Kind LexExclaim(); + lltok::Kind LexPercent(); + lltok::Kind LexUIntID(lltok::Kind Token); + lltok::Kind LexVar(lltok::Kind Var, lltok::Kind VarID); + lltok::Kind LexQuote(); + lltok::Kind Lex0x(); + lltok::Kind LexHash(); + lltok::Kind LexCaret(); + + uint64_t atoull(const char *Buffer, const char *End); + uint64_t HexIntToVal(const char *Buffer, const char *End); + void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]); + void FP80HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]); +}; } // end namespace llvm #endif diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp index bee1fc89014a43dfa3e13925def21e8be1aad58c..f5ce44e2a920405f7e3790fcb1d9eb7fba28d636 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp +++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp @@ -174,7 +174,7 @@ bool LLParser::ValidateEndOfModule() { } else if (auto *GV = dyn_cast<GlobalVariable>(V)) { AttrBuilder Attrs(GV->getAttributes()); Attrs.merge(B); - GV->setAttributes(AttributeSet::get(Context,Attrs)); + GV->setAttributes(AttributeSet::get(Context, Attrs)); } else { llvm_unreachable("invalid object with forward attribute group reference"); } @@ -191,8 +191,9 @@ bool LLParser::ValidateEndOfModule() { return Error(NT.second.second, "use of undefined type '%" + Twine(NT.first) + "'"); - for (StringMap<std::pair<Type*, LocTy> >::iterator I = - NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I) + for (StringMap<std::pair<Type *, LocTy>>::iterator I = NamedTypes.begin(), + E = NamedTypes.end(); + I != E; ++I) if (I->second.second.isValid()) return Error(I->second.second, "use of undefined type named '" + I->getKey() + "'"); @@ -205,17 +206,17 @@ bool LLParser::ValidateEndOfModule() { if 
(!ForwardRefVals.empty()) return Error(ForwardRefVals.begin()->second.second, "use of undefined value '@" + ForwardRefVals.begin()->first + - "'"); + "'"); if (!ForwardRefValIDs.empty()) return Error(ForwardRefValIDs.begin()->second.second, "use of undefined value '@" + - Twine(ForwardRefValIDs.begin()->first) + "'"); + Twine(ForwardRefValIDs.begin()->first) + "'"); if (!ForwardRefMDNodes.empty()) return Error(ForwardRefMDNodes.begin()->second.second, "use of undefined metadata '!" + - Twine(ForwardRefMDNodes.begin()->first) + "'"); + Twine(ForwardRefMDNodes.begin()->first) + "'"); // Resolve metadata cycles. for (auto &N : NumberedMetadata) { @@ -232,13 +233,13 @@ bool LLParser::ValidateEndOfModule() { } // Look for intrinsic functions and CallInst that need to be upgraded - for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ) + for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE;) UpgradeCallsToIntrinsic(&*FI++); // must be post-increment, as we remove // Some types could be renamed during loading if several modules are // loaded in the same LLVMContext (LTO scenario). In this case we should // remangle intrinsics names as well. 
- for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ) { + for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE;) { Function *F = &*FI++; if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F)) { F->replaceAllUsesWith(Remangled.getValue()); @@ -317,30 +318,74 @@ bool LLParser::ParseTopLevelEntities() { } while (true) { switch (Lex.getKind()) { - default: return TokError("expected top-level entity"); - case lltok::Eof: return false; - case lltok::kw_declare: if (ParseDeclare()) return true; break; - case lltok::kw_define: if (ParseDefine()) return true; break; - case lltok::kw_module: if (ParseModuleAsm()) return true; break; - case lltok::kw_target: if (ParseTargetDefinition()) return true; break; + default: + return TokError("expected top-level entity"); + case lltok::Eof: + return false; + case lltok::kw_declare: + if (ParseDeclare()) + return true; + break; + case lltok::kw_define: + if (ParseDefine()) + return true; + break; + case lltok::kw_module: + if (ParseModuleAsm()) + return true; + break; + case lltok::kw_target: + if (ParseTargetDefinition()) + return true; + break; case lltok::kw_source_filename: if (ParseSourceFileName()) return true; break; - case lltok::kw_deplibs: if (ParseDepLibs()) return true; break; - case lltok::LocalVarID: if (ParseUnnamedType()) return true; break; - case lltok::LocalVar: if (ParseNamedType()) return true; break; - case lltok::GlobalID: if (ParseUnnamedGlobal()) return true; break; - case lltok::GlobalVar: if (ParseNamedGlobal()) return true; break; - case lltok::ComdatVar: if (parseComdat()) return true; break; - case lltok::exclaim: if (ParseStandaloneMetadata()) return true; break; + case lltok::kw_deplibs: + if (ParseDepLibs()) + return true; + break; + case lltok::LocalVarID: + if (ParseUnnamedType()) + return true; + break; + case lltok::LocalVar: + if (ParseNamedType()) + return true; + break; + case lltok::GlobalID: + if (ParseUnnamedGlobal()) + return true; + break; + case 
lltok::GlobalVar: + if (ParseNamedGlobal()) + return true; + break; + case lltok::ComdatVar: + if (parseComdat()) + return true; + break; + case lltok::exclaim: + if (ParseStandaloneMetadata()) + return true; + break; case lltok::SummaryID: if (ParseSummaryEntry()) return true; break; - case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break; - case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break; - case lltok::kw_uselistorder: if (ParseUseListOrder()) return true; break; + case lltok::MetadataVar: + if (ParseNamedMetadata()) + return true; + break; + case lltok::kw_attributes: + if (ParseUnnamedAttrGrp()) + return true; + break; + case lltok::kw_uselistorder: + if (ParseUseListOrder()) + return true; + break; case lltok::kw_uselistorder_bb: if (ParseUseListOrderBB()) return true; @@ -357,7 +402,8 @@ bool LLParser::ParseModuleAsm() { std::string AsmStr; if (ParseToken(lltok::kw_asm, "expected 'module asm'") || - ParseStringConstant(AsmStr)) return true; + ParseStringConstant(AsmStr)) + return true; M->appendModuleInlineAsm(AsmStr); return false; @@ -370,7 +416,8 @@ bool LLParser::ParseTargetDefinition() { assert(Lex.getKind() == lltok::kw_target); std::string Str; switch (Lex.Lex()) { - default: return TokError("unknown target property"); + default: + return TokError("unknown target property"); case lltok::kw_triple: Lex.Lex(); if (ParseToken(lltok::equal, "expected '=' after target triple") || @@ -418,7 +465,8 @@ bool LLParser::ParseDepLibs() { do { std::string Str; - if (ParseStringConstant(Str)) return true; + if (ParseStringConstant(Str)) + return true; } while (EatIfPresent(lltok::comma)); return ParseToken(lltok::rsquare, "expected ']' at end of list"); @@ -436,11 +484,11 @@ bool LLParser::ParseUnnamedType() { return true; Type *Result = nullptr; - if (ParseStructDefinition(TypeLoc, "", - NumberedTypes[TypeID], Result)) return true; + if (ParseStructDefinition(TypeLoc, "", NumberedTypes[TypeID], Result)) + return true; if 
(!isa<StructType>(Result)) { - std::pair<Type*, LocTy> &Entry = NumberedTypes[TypeID]; + std::pair<Type *, LocTy> &Entry = NumberedTypes[TypeID]; if (Entry.first) return Error(TypeLoc, "non-struct types may not be recursive"); Entry.first = Result; @@ -455,18 +503,18 @@ bool LLParser::ParseUnnamedType() { bool LLParser::ParseNamedType() { std::string Name = Lex.getStrVal(); LocTy NameLoc = Lex.getLoc(); - Lex.Lex(); // eat LocalVar. + Lex.Lex(); // eat LocalVar. if (ParseToken(lltok::equal, "expected '=' after name") || ParseToken(lltok::kw_type, "expected 'type' after name")) return true; Type *Result = nullptr; - if (ParseStructDefinition(NameLoc, Name, - NamedTypes[Name], Result)) return true; + if (ParseStructDefinition(NameLoc, Name, NamedTypes[Name], Result)) + return true; if (!isa<StructType>(Result)) { - std::pair<Type*, LocTy> &Entry = NamedTypes[Name]; + std::pair<Type *, LocTy> &Entry = NamedTypes[Name]; if (Entry.first) return Error(NameLoc, "non-struct types may not be recursive"); Entry.first = Result; @@ -506,8 +554,7 @@ bool LLParser::ParseDefine() { Lex.Lex(); Function *F; - return ParseFunctionHeader(F, true) || - ParseOptionalFunctionMetadata(*F) || + return ParseFunctionHeader(F, true) || ParseOptionalFunctionMetadata(*F) || ParseFunctionBody(*F); } @@ -544,7 +591,8 @@ bool LLParser::ParseOptionalUnnamedAddr( /// OptionalDLLStorageClass /// ... -> global variable /// GlobalID '=' OptionalVisibility (ALIAS | IFUNC) ... -/// GlobalID '=' OptionalLinkage OptionalPreemptionSpecifier OptionalVisibility +/// GlobalID '=' OptionalLinkage OptionalPreemptionSpecifier +/// OptionalVisibility /// OptionalDLLStorageClass /// ... -> global variable bool LLParser::ParseUnnamedGlobal() { @@ -555,8 +603,8 @@ bool LLParser::ParseUnnamedGlobal() { // Handle the GlobalID form. 
if (Lex.getKind() == lltok::GlobalID) { if (Lex.getUIntVal() != VarID) - return Error(Lex.getLoc(), "variable expected to be numbered '%" + - Twine(VarID) + "'"); + return Error(Lex.getLoc(), + "variable expected to be numbered '%" + Twine(VarID) + "'"); Lex.Lex(); // eat GlobalID; if (ParseToken(lltok::equal, "expected '=' after name")) @@ -665,7 +713,8 @@ bool LLParser::parseComdat() { // ::= '!' STRINGCONSTANT bool LLParser::ParseMDString(MDString *&Result) { std::string Str; - if (ParseStringConstant(Str)) return true; + if (ParseStringConstant(Str)) + return true; Result = MDString::get(Context, Str); return false; } @@ -735,8 +784,7 @@ bool LLParser::ParseStandaloneMetadata() { unsigned MetadataID = 0; MDNode *Init; - if (ParseUInt32(MetadataID) || - ParseToken(lltok::equal, "expected '=' here")) + if (ParseUInt32(MetadataID) || ParseToken(lltok::equal, "expected '=' here")) return true; // Detect common error, from old metadata syntax. @@ -883,9 +931,9 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc, llvm_unreachable("Not an alias or ifunc!"); Lex.Lex(); - GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes) L; + GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes)L; - if(IsAlias && !GlobalAlias::isValidLinkage(Linkage)) + if (IsAlias && !GlobalAlias::isValidLinkage(Linkage)) return Error(NameLoc, "invalid linkage type for alias"); if (!isValidVisibilityForLinkage(Visibility, L)) @@ -923,14 +971,12 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc, unsigned AddrSpace = PTy->getAddressSpace(); if (IsAlias && Ty != PTy->getElementType()) - return Error( - ExplicitTypeLoc, - "explicit pointee type doesn't match operand's pointee type"); + return Error(ExplicitTypeLoc, + "explicit pointee type doesn't match operand's pointee type"); if (!IsAlias && !PTy->getElementType()->isFunctionTy()) - return Error( - ExplicitTypeLoc, - "explicit pointee type should be a function type"); + return 
Error(ExplicitTypeLoc, + "explicit pointee type should be a function type"); GlobalValue *GVal = nullptr; @@ -1042,16 +1088,14 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, ParseOptionalToken(lltok::kw_externally_initialized, IsExternallyInitialized, &IsExternallyInitializedLoc) || - ParseGlobalType(IsConstant) || - ParseType(Ty, TyLoc)) + ParseGlobalType(IsConstant) || ParseType(Ty, TyLoc)) return true; // If the linkage is specified and is external, then no initializer is // present. Constant *Init = nullptr; - if (!HasLinkage || - !GlobalValue::isValidDeclarationLinkage( - (GlobalValue::LinkageTypes)Linkage)) { + if (!HasLinkage || !GlobalValue::isValidDeclarationLinkage( + (GlobalValue::LinkageTypes)Linkage)) { if (ParseGlobalValue(Ty, Init)) return true; } @@ -1078,13 +1122,14 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, GlobalVariable *GV; if (!GVal) { - GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, nullptr, - Name, nullptr, GlobalVariable::NotThreadLocal, - AddrSpace); + GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, + nullptr, Name, nullptr, + GlobalVariable::NotThreadLocal, AddrSpace); } else { if (GVal->getValueType() != Ty) - return Error(TyLoc, - "forward reference and definition of global have different types"); + return Error( + TyLoc, + "forward reference and definition of global have different types"); GV = cast<GlobalVariable>(GVal); @@ -1123,7 +1168,8 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, return true; } else if (Lex.getKind() == lltok::kw_align) { unsigned Alignment; - if (ParseOptionalAlignment(Alignment)) return true; + if (ParseOptionalAlignment(Alignment)) + return true; GV->setAlignment(Alignment); } else if (Lex.getKind() == lltok::MetadataVar) { if (ParseGlobalObjectMetadataAttachment(*GV)) @@ -1195,7 +1241,8 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, BuiltinLoc = Lex.getLoc(); switch (Token) { 
default: - if (!inAttrGrp) return HaveError; + if (!inAttrGrp) + return HaveError; return Error(Lex.getLoc(), "unterminated attribute group"); case lltok::rbrace: // Finished. @@ -1206,12 +1253,13 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, // // define void @foo() #1 { ... } if (inAttrGrp) - HaveError |= - Error(Lex.getLoc(), - "cannot have an attribute group reference in an attribute group"); + HaveError |= Error( + Lex.getLoc(), + "cannot have an attribute group reference in an attribute group"); unsigned AttrGrpNum = Lex.getUIntVal(); - if (inAttrGrp) break; + if (inAttrGrp) + break; // Save the reference to the attribute group. We'll fill it in later. FwdRefAttrGrps.push_back(AttrGrpNum); @@ -1265,73 +1313,148 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, B.addAllocSizeAttr(ElemSizeArg, NumElemsArg); continue; } - case lltok::kw_alwaysinline: B.addAttribute(Attribute::AlwaysInline); break; - case lltok::kw_argmemonly: B.addAttribute(Attribute::ArgMemOnly); break; - case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break; - case lltok::kw_cold: B.addAttribute(Attribute::Cold); break; - case lltok::kw_convergent: B.addAttribute(Attribute::Convergent); break; + case lltok::kw_alwaysinline: + B.addAttribute(Attribute::AlwaysInline); + break; + case lltok::kw_argmemonly: + B.addAttribute(Attribute::ArgMemOnly); + break; + case lltok::kw_builtin: + B.addAttribute(Attribute::Builtin); + break; + case lltok::kw_cold: + B.addAttribute(Attribute::Cold); + break; + case lltok::kw_convergent: + B.addAttribute(Attribute::Convergent); + break; case lltok::kw_inaccessiblememonly: - B.addAttribute(Attribute::InaccessibleMemOnly); break; + B.addAttribute(Attribute::InaccessibleMemOnly); + break; case lltok::kw_inaccessiblemem_or_argmemonly: - B.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); break; - case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break; - case lltok::kw_jumptable: 
B.addAttribute(Attribute::JumpTable); break; - case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break; - case lltok::kw_naked: B.addAttribute(Attribute::Naked); break; - case lltok::kw_nobuiltin: B.addAttribute(Attribute::NoBuiltin); break; - case lltok::kw_noduplicate: B.addAttribute(Attribute::NoDuplicate); break; - case lltok::kw_nofree: B.addAttribute(Attribute::NoFree); break; + B.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); + break; + case lltok::kw_inlinehint: + B.addAttribute(Attribute::InlineHint); + break; + case lltok::kw_jumptable: + B.addAttribute(Attribute::JumpTable); + break; + case lltok::kw_minsize: + B.addAttribute(Attribute::MinSize); + break; + case lltok::kw_naked: + B.addAttribute(Attribute::Naked); + break; + case lltok::kw_nobuiltin: + B.addAttribute(Attribute::NoBuiltin); + break; + case lltok::kw_noduplicate: + B.addAttribute(Attribute::NoDuplicate); + break; + case lltok::kw_nofree: + B.addAttribute(Attribute::NoFree); + break; case lltok::kw_noimplicitfloat: - B.addAttribute(Attribute::NoImplicitFloat); break; - case lltok::kw_noinline: B.addAttribute(Attribute::NoInline); break; - case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break; - case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break; - case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break; - case lltok::kw_nosync: B.addAttribute(Attribute::NoSync); break; - case lltok::kw_nocf_check: B.addAttribute(Attribute::NoCfCheck); break; - case lltok::kw_norecurse: B.addAttribute(Attribute::NoRecurse); break; - case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break; + B.addAttribute(Attribute::NoImplicitFloat); + break; + case lltok::kw_noinline: + B.addAttribute(Attribute::NoInline); + break; + case lltok::kw_nonlazybind: + B.addAttribute(Attribute::NonLazyBind); + break; + case lltok::kw_noredzone: + B.addAttribute(Attribute::NoRedZone); + break; + case lltok::kw_noreturn: + 
B.addAttribute(Attribute::NoReturn); + break; + case lltok::kw_nosync: + B.addAttribute(Attribute::NoSync); + break; + case lltok::kw_nocf_check: + B.addAttribute(Attribute::NoCfCheck); + break; + case lltok::kw_norecurse: + B.addAttribute(Attribute::NoRecurse); + break; + case lltok::kw_nounwind: + B.addAttribute(Attribute::NoUnwind); + break; case lltok::kw_optforfuzzing: - B.addAttribute(Attribute::OptForFuzzing); break; - case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break; - case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break; - case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break; - case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break; + B.addAttribute(Attribute::OptForFuzzing); + break; + case lltok::kw_optnone: + B.addAttribute(Attribute::OptimizeNone); + break; + case lltok::kw_optsize: + B.addAttribute(Attribute::OptimizeForSize); + break; + case lltok::kw_readnone: + B.addAttribute(Attribute::ReadNone); + break; + case lltok::kw_readonly: + B.addAttribute(Attribute::ReadOnly); + break; case lltok::kw_returns_twice: - B.addAttribute(Attribute::ReturnsTwice); break; - case lltok::kw_speculatable: B.addAttribute(Attribute::Speculatable); break; - case lltok::kw_ssp: B.addAttribute(Attribute::StackProtect); break; - case lltok::kw_sspreq: B.addAttribute(Attribute::StackProtectReq); break; + B.addAttribute(Attribute::ReturnsTwice); + break; + case lltok::kw_speculatable: + B.addAttribute(Attribute::Speculatable); + break; + case lltok::kw_ssp: + B.addAttribute(Attribute::StackProtect); + break; + case lltok::kw_sspreq: + B.addAttribute(Attribute::StackProtectReq); + break; case lltok::kw_sspstrong: - B.addAttribute(Attribute::StackProtectStrong); break; - case lltok::kw_safestack: B.addAttribute(Attribute::SafeStack); break; + B.addAttribute(Attribute::StackProtectStrong); + break; + case lltok::kw_safestack: + B.addAttribute(Attribute::SafeStack); + break; case lltok::kw_shadowcallstack: 
- B.addAttribute(Attribute::ShadowCallStack); break; + B.addAttribute(Attribute::ShadowCallStack); + break; case lltok::kw_sanitize_address: - B.addAttribute(Attribute::SanitizeAddress); break; + B.addAttribute(Attribute::SanitizeAddress); + break; case lltok::kw_sanitize_hwaddress: - B.addAttribute(Attribute::SanitizeHWAddress); break; + B.addAttribute(Attribute::SanitizeHWAddress); + break; case lltok::kw_sanitize_memtag: - B.addAttribute(Attribute::SanitizeMemTag); break; + B.addAttribute(Attribute::SanitizeMemTag); + break; case lltok::kw_sanitize_thread: - B.addAttribute(Attribute::SanitizeThread); break; + B.addAttribute(Attribute::SanitizeThread); + break; case lltok::kw_sanitize_memory: - B.addAttribute(Attribute::SanitizeMemory); break; + B.addAttribute(Attribute::SanitizeMemory); + break; case lltok::kw_speculative_load_hardening: B.addAttribute(Attribute::SpeculativeLoadHardening); break; - case lltok::kw_strictfp: B.addAttribute(Attribute::StrictFP); break; - case lltok::kw_uwtable: B.addAttribute(Attribute::UWTable); break; - case lltok::kw_willreturn: B.addAttribute(Attribute::WillReturn); break; - case lltok::kw_writeonly: B.addAttribute(Attribute::WriteOnly); break; + case lltok::kw_strictfp: + B.addAttribute(Attribute::StrictFP); + break; + case lltok::kw_uwtable: + B.addAttribute(Attribute::UWTable); + break; + case lltok::kw_willreturn: + B.addAttribute(Attribute::WillReturn); + break; + case lltok::kw_writeonly: + B.addAttribute(Attribute::WriteOnly); + break; // Error handling. 
case lltok::kw_inreg: case lltok::kw_signext: case lltok::kw_zeroext: HaveError |= - Error(Lex.getLoc(), - "invalid use of attribute on a function"); + Error(Lex.getLoc(), "invalid use of attribute on a function"); break; case lltok::kw_byval: case lltok::kw_dereferenceable: @@ -1346,14 +1469,14 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_swifterror: case lltok::kw_swiftself: case lltok::kw_immarg: - + // VISC Parameter only attributes case lltok::kw_in: case lltok::kw_out: case lltok::kw_inout: HaveError |= - Error(Lex.getLoc(), - "invalid use of parameter-only attribute on a function"); + Error(Lex.getLoc(), + "invalid use of parameter-only attribute on a function"); break; } @@ -1412,7 +1535,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, // Look this name up in the normal function symbol table. GlobalValue *Val = - cast_or_null<GlobalValue>(M->getValueSymbolTable().lookup(Name)); + cast_or_null<GlobalValue>(M->getValueSymbolTable().lookup(Name)); // If this is a forward reference for the value, see if we already created a // forward ref record. 
@@ -1507,7 +1630,7 @@ bool LLParser::ParseStringConstant(std::string &Result) { bool LLParser::ParseUInt32(uint32_t &Val) { if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned()) return TokError("expected integer"); - uint64_t Val64 = Lex.getAPSIntVal().getLimitedValue(0xFFFFFFFFULL+1); + uint64_t Val64 = Lex.getAPSIntVal().getLimitedValue(0xFFFFFFFFULL + 1); if (Val64 != unsigned(Val64)) return TokError("expected 32-bit integer (too large)"); Val = Val64; @@ -1531,17 +1654,17 @@ bool LLParser::ParseUInt64(uint64_t &Val) { /// := 'localexec' bool LLParser::ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM) { switch (Lex.getKind()) { - default: - return TokError("expected localdynamic, initialexec or localexec"); - case lltok::kw_localdynamic: - TLM = GlobalVariable::LocalDynamicTLSModel; - break; - case lltok::kw_initialexec: - TLM = GlobalVariable::InitialExecTLSModel; - break; - case lltok::kw_localexec: - TLM = GlobalVariable::LocalExecTLSModel; - break; + default: + return TokError("expected localdynamic, initialexec or localexec"); + case lltok::kw_localdynamic: + TLM = GlobalVariable::LocalDynamicTLSModel; + break; + case lltok::kw_initialexec: + TLM = GlobalVariable::InitialExecTLSModel; + break; + case lltok::kw_localexec: + TLM = GlobalVariable::LocalExecTLSModel; + break; } Lex.Lex(); @@ -1561,7 +1684,7 @@ bool LLParser::ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM) { if (Lex.getKind() == lltok::lparen) { Lex.Lex(); return ParseTLSModel(TLM) || - ParseToken(lltok::rparen, "expected ')' after thread local model"); + ParseToken(lltok::rparen, "expected ')' after thread local model"); } return false; } @@ -1591,7 +1714,8 @@ bool LLParser::ParseStringAttribute(AttrBuilder &B) { return false; } -/// ParseOptionalParamAttrs - Parse a potentially empty list of parameter attributes. +/// ParseOptionalParamAttrs - Parse a potentially empty list of parameter +/// attributes. 
bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { bool HaveError = false; @@ -1600,7 +1724,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { while (true) { lltok::Kind Token = Lex.getKind(); switch (Token) { - default: // End of attributes. + default: // End of attributes. return HaveError; case lltok::StringConstant: { if (ParseStringAttribute(B)) @@ -1635,27 +1759,65 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { B.addDereferenceableOrNullAttr(Bytes); continue; } - case lltok::kw_inalloca: B.addAttribute(Attribute::InAlloca); break; - case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break; - case lltok::kw_nest: B.addAttribute(Attribute::Nest); break; - case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break; - case lltok::kw_nocapture: B.addAttribute(Attribute::NoCapture); break; - case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break; - case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break; - case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break; - case lltok::kw_returned: B.addAttribute(Attribute::Returned); break; - case lltok::kw_signext: B.addAttribute(Attribute::SExt); break; - case lltok::kw_sret: B.addAttribute(Attribute::StructRet); break; - case lltok::kw_swifterror: B.addAttribute(Attribute::SwiftError); break; - case lltok::kw_swiftself: B.addAttribute(Attribute::SwiftSelf); break; - case lltok::kw_writeonly: B.addAttribute(Attribute::WriteOnly); break; - case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break; - case lltok::kw_immarg: B.addAttribute(Attribute::ImmArg); break; + case lltok::kw_inalloca: + B.addAttribute(Attribute::InAlloca); + break; + case lltok::kw_inreg: + B.addAttribute(Attribute::InReg); + break; + case lltok::kw_nest: + B.addAttribute(Attribute::Nest); + break; + case lltok::kw_noalias: + B.addAttribute(Attribute::NoAlias); + break; + case lltok::kw_nocapture: + B.addAttribute(Attribute::NoCapture); + break; + case lltok::kw_nonnull: 
+ B.addAttribute(Attribute::NonNull); + break; + case lltok::kw_readnone: + B.addAttribute(Attribute::ReadNone); + break; + case lltok::kw_readonly: + B.addAttribute(Attribute::ReadOnly); + break; + case lltok::kw_returned: + B.addAttribute(Attribute::Returned); + break; + case lltok::kw_signext: + B.addAttribute(Attribute::SExt); + break; + case lltok::kw_sret: + B.addAttribute(Attribute::StructRet); + break; + case lltok::kw_swifterror: + B.addAttribute(Attribute::SwiftError); + break; + case lltok::kw_swiftself: + B.addAttribute(Attribute::SwiftSelf); + break; + case lltok::kw_writeonly: + B.addAttribute(Attribute::WriteOnly); + break; + case lltok::kw_zeroext: + B.addAttribute(Attribute::ZExt); + break; + case lltok::kw_immarg: + B.addAttribute(Attribute::ImmArg); + break; // VISC parameter attributes - case lltok::kw_in: B.addAttribute(Attribute::In); break; - case lltok::kw_out: B.addAttribute(Attribute::Out); break; - case lltok::kw_inout: B.addAttribute(Attribute::InOut); break; + case lltok::kw_in: + B.addAttribute(Attribute::In); + break; + case lltok::kw_out: + B.addAttribute(Attribute::Out); + break; + case lltok::kw_inout: + B.addAttribute(Attribute::InOut); + break; case lltok::kw_alignstack: case lltok::kw_alwaysinline: @@ -1691,7 +1853,8 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { case lltok::kw_shadowcallstack: case lltok::kw_strictfp: case lltok::kw_uwtable: - HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute"); + HaveError |= + Error(Lex.getLoc(), "invalid use of function-only attribute"); break; } @@ -1699,7 +1862,8 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { } } -/// ParseOptionalReturnAttrs - Parse a potentially empty list of return attributes. +/// ParseOptionalReturnAttrs - Parse a potentially empty list of return +/// attributes. 
bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { bool HaveError = false; @@ -1708,7 +1872,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { while (true) { lltok::Kind Token = Lex.getKind(); switch (Token) { - default: // End of attributes. + default: // End of attributes. return HaveError; case lltok::StringConstant: { if (ParseStringAttribute(B)) @@ -1736,11 +1900,21 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { B.addAlignmentAttr(Alignment); continue; } - case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break; - case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break; - case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break; - case lltok::kw_signext: B.addAttribute(Attribute::SExt); break; - case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break; + case lltok::kw_inreg: + B.addAttribute(Attribute::InReg); + break; + case lltok::kw_noalias: + B.addAttribute(Attribute::NoAlias); + break; + case lltok::kw_nonnull: + B.addAttribute(Attribute::NonNull); + break; + case lltok::kw_signext: + B.addAttribute(Attribute::SExt); + break; + case lltok::kw_zeroext: + B.addAttribute(Attribute::ZExt); + break; // Error handling. 
case lltok::kw_byval: @@ -1757,7 +1931,8 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_in: case lltok::kw_out: case lltok::kw_inout: - HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute"); + HaveError |= + Error(Lex.getLoc(), "invalid use of parameter-only attribute"); break; case lltok::kw_alignstack: @@ -1795,12 +1970,14 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_shadowcallstack: case lltok::kw_strictfp: case lltok::kw_uwtable: - HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute"); + HaveError |= + Error(Lex.getLoc(), "invalid use of function-only attribute"); break; case lltok::kw_readnone: case lltok::kw_readonly: - HaveError |= Error(Lex.getLoc(), "invalid use of attribute on return type"); + HaveError |= + Error(Lex.getLoc(), "invalid use of attribute on return type"); } Lex.Lex(); @@ -1853,8 +2030,7 @@ static unsigned parseOptionalLinkageAux(lltok::Kind Kind, bool &HasLinkage) { /// ::= 'external' bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage, unsigned &Visibility, - unsigned &DLLStorageClass, - bool &DSOLocal) { + unsigned &DLLStorageClass, bool &DSOLocal) { Res = parseOptionalLinkageAux(Lex.getKind(), HasLinkage); if (HasLinkage) Lex.Lex(); @@ -1974,51 +2150,133 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { switch (Lex.getKind()) { - default: CC = CallingConv::C; return false; - case lltok::kw_ccc: CC = CallingConv::C; break; - case lltok::kw_fastcc: CC = CallingConv::Fast; break; - case lltok::kw_coldcc: CC = CallingConv::Cold; break; - case lltok::kw_x86_stdcallcc: CC = CallingConv::X86_StdCall; break; - case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break; - case lltok::kw_x86_regcallcc: CC = CallingConv::X86_RegCall; break; - case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break; - case lltok::kw_x86_vectorcallcc:CC 
= CallingConv::X86_VectorCall; break; - case lltok::kw_arm_apcscc: CC = CallingConv::ARM_APCS; break; - case lltok::kw_arm_aapcscc: CC = CallingConv::ARM_AAPCS; break; - case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break; - case lltok::kw_aarch64_vector_pcs:CC = CallingConv::AArch64_VectorCall; break; - case lltok::kw_msp430_intrcc: CC = CallingConv::MSP430_INTR; break; - case lltok::kw_avr_intrcc: CC = CallingConv::AVR_INTR; break; - case lltok::kw_avr_signalcc: CC = CallingConv::AVR_SIGNAL; break; - case lltok::kw_ptx_kernel: CC = CallingConv::PTX_Kernel; break; - case lltok::kw_ptx_device: CC = CallingConv::PTX_Device; break; - case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; - case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; - case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; - case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; - case lltok::kw_win64cc: CC = CallingConv::Win64; break; - case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; - case lltok::kw_anyregcc: CC = CallingConv::AnyReg; break; - case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break; - case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break; - case lltok::kw_ghccc: CC = CallingConv::GHC; break; - case lltok::kw_swiftcc: CC = CallingConv::Swift; break; - case lltok::kw_x86_intrcc: CC = CallingConv::X86_INTR; break; - case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break; - case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break; - case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break; - case lltok::kw_amdgpu_vs: CC = CallingConv::AMDGPU_VS; break; - case lltok::kw_amdgpu_ls: CC = CallingConv::AMDGPU_LS; break; - case lltok::kw_amdgpu_hs: CC = CallingConv::AMDGPU_HS; break; - case lltok::kw_amdgpu_es: CC = CallingConv::AMDGPU_ES; break; - case lltok::kw_amdgpu_gs: CC = CallingConv::AMDGPU_GS; break; - case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; 
break; - case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break; - case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break; + default: + CC = CallingConv::C; + return false; + case lltok::kw_ccc: + CC = CallingConv::C; + break; + case lltok::kw_fastcc: + CC = CallingConv::Fast; + break; + case lltok::kw_coldcc: + CC = CallingConv::Cold; + break; + case lltok::kw_x86_stdcallcc: + CC = CallingConv::X86_StdCall; + break; + case lltok::kw_x86_fastcallcc: + CC = CallingConv::X86_FastCall; + break; + case lltok::kw_x86_regcallcc: + CC = CallingConv::X86_RegCall; + break; + case lltok::kw_x86_thiscallcc: + CC = CallingConv::X86_ThisCall; + break; + case lltok::kw_x86_vectorcallcc: + CC = CallingConv::X86_VectorCall; + break; + case lltok::kw_arm_apcscc: + CC = CallingConv::ARM_APCS; + break; + case lltok::kw_arm_aapcscc: + CC = CallingConv::ARM_AAPCS; + break; + case lltok::kw_arm_aapcs_vfpcc: + CC = CallingConv::ARM_AAPCS_VFP; + break; + case lltok::kw_aarch64_vector_pcs: + CC = CallingConv::AArch64_VectorCall; + break; + case lltok::kw_msp430_intrcc: + CC = CallingConv::MSP430_INTR; + break; + case lltok::kw_avr_intrcc: + CC = CallingConv::AVR_INTR; + break; + case lltok::kw_avr_signalcc: + CC = CallingConv::AVR_SIGNAL; + break; + case lltok::kw_ptx_kernel: + CC = CallingConv::PTX_Kernel; + break; + case lltok::kw_ptx_device: + CC = CallingConv::PTX_Device; + break; + case lltok::kw_spir_kernel: + CC = CallingConv::SPIR_KERNEL; + break; + case lltok::kw_spir_func: + CC = CallingConv::SPIR_FUNC; + break; + case lltok::kw_intel_ocl_bicc: + CC = CallingConv::Intel_OCL_BI; + break; + case lltok::kw_x86_64_sysvcc: + CC = CallingConv::X86_64_SysV; + break; + case lltok::kw_win64cc: + CC = CallingConv::Win64; + break; + case lltok::kw_webkit_jscc: + CC = CallingConv::WebKit_JS; + break; + case lltok::kw_anyregcc: + CC = CallingConv::AnyReg; + break; + case lltok::kw_preserve_mostcc: + CC = CallingConv::PreserveMost; + break; + case lltok::kw_preserve_allcc: 
+ CC = CallingConv::PreserveAll; + break; + case lltok::kw_ghccc: + CC = CallingConv::GHC; + break; + case lltok::kw_swiftcc: + CC = CallingConv::Swift; + break; + case lltok::kw_x86_intrcc: + CC = CallingConv::X86_INTR; + break; + case lltok::kw_hhvmcc: + CC = CallingConv::HHVM; + break; + case lltok::kw_hhvm_ccc: + CC = CallingConv::HHVM_C; + break; + case lltok::kw_cxx_fast_tlscc: + CC = CallingConv::CXX_FAST_TLS; + break; + case lltok::kw_amdgpu_vs: + CC = CallingConv::AMDGPU_VS; + break; + case lltok::kw_amdgpu_ls: + CC = CallingConv::AMDGPU_LS; + break; + case lltok::kw_amdgpu_hs: + CC = CallingConv::AMDGPU_HS; + break; + case lltok::kw_amdgpu_es: + CC = CallingConv::AMDGPU_ES; + break; + case lltok::kw_amdgpu_gs: + CC = CallingConv::AMDGPU_GS; + break; + case lltok::kw_amdgpu_ps: + CC = CallingConv::AMDGPU_PS; + break; + case lltok::kw_amdgpu_cs: + CC = CallingConv::AMDGPU_CS; + break; + case lltok::kw_amdgpu_kernel: + CC = CallingConv::AMDGPU_KERNEL; + break; case lltok::kw_cc: { - Lex.Lex(); - return ParseUInt32(CC); - } + Lex.Lex(); + return ParseUInt32(CC); + } } Lex.Lex(); @@ -2087,7 +2345,8 @@ bool LLParser::ParseOptionalAlignment(unsigned &Alignment) { if (!EatIfPresent(lltok::kw_align)) return false; LocTy AlignLoc = Lex.getLoc(); - if (ParseUInt32(Alignment)) return true; + if (ParseUInt32(Alignment)) + return true; if (!isPowerOf2_32(Alignment)) return Error(AlignLoc, "alignment is not a power of two"); if (Alignment > Value::MaximumAlignment) @@ -2113,7 +2372,8 @@ bool LLParser::ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, if (!EatIfPresent(lltok::lparen)) return Error(ParenLoc, "expected '('"); LocTy DerefLoc = Lex.getLoc(); - if (ParseUInt64(Bytes)) return true; + if (ParseUInt64(Bytes)) + return true; ParenLoc = Lex.getLoc(); if (!EatIfPresent(lltok::rparen)) return Error(ParenLoc, "expected ')'"); @@ -2141,7 +2401,8 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment, if (Lex.getKind() != lltok::kw_align) return 
Error(Lex.getLoc(), "expected metadata or 'align'"); - if (ParseOptionalAlignment(Alignment)) return true; + if (ParseOptionalAlignment(Alignment)) + return true; } return false; @@ -2153,8 +2414,7 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment, /// /// This returns with AteExtraComma set to true if it ate an excess comma at the /// end. -bool LLParser::ParseOptionalCommaAddrSpace(unsigned &AddrSpace, - LocTy &Loc, +bool LLParser::ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc, bool &AteExtraComma) { AteExtraComma = false; while (EatIfPresent(lltok::comma)) { @@ -2249,14 +2509,25 @@ bool LLParser::ParseScope(SyncScope::ID &SSID) { /// This sets Ordering to the parsed value. bool LLParser::ParseOrdering(AtomicOrdering &Ordering) { switch (Lex.getKind()) { - default: return TokError("Expected ordering on atomic instruction"); - case lltok::kw_unordered: Ordering = AtomicOrdering::Unordered; break; - case lltok::kw_monotonic: Ordering = AtomicOrdering::Monotonic; break; + default: + return TokError("Expected ordering on atomic instruction"); + case lltok::kw_unordered: + Ordering = AtomicOrdering::Unordered; + break; + case lltok::kw_monotonic: + Ordering = AtomicOrdering::Monotonic; + break; // Not specified yet: // case lltok::kw_consume: Ordering = AtomicOrdering::Consume; break; - case lltok::kw_acquire: Ordering = AtomicOrdering::Acquire; break; - case lltok::kw_release: Ordering = AtomicOrdering::Release; break; - case lltok::kw_acq_rel: Ordering = AtomicOrdering::AcquireRelease; break; + case lltok::kw_acquire: + Ordering = AtomicOrdering::Acquire; + break; + case lltok::kw_release: + Ordering = AtomicOrdering::Release; + break; + case lltok::kw_acq_rel: + Ordering = AtomicOrdering::AcquireRelease; + break; case lltok::kw_seq_cst: Ordering = AtomicOrdering::SequentiallyConsistent; break; @@ -2276,7 +2547,8 @@ bool LLParser::ParseOptionalStackAlignment(unsigned &Alignment) { if (!EatIfPresent(lltok::lparen)) return Error(ParenLoc, 
"expected '('"); LocTy AlignLoc = Lex.getLoc(); - if (ParseUInt32(Alignment)) return true; + if (ParseUInt32(Alignment)) + return true; ParenLoc = Lex.getLoc(); if (!EatIfPresent(lltok::rparen)) return Error(ParenLoc, "expected ')'"); @@ -2303,12 +2575,14 @@ bool LLParser::ParseIndexList(SmallVectorImpl<unsigned> &Indices, while (EatIfPresent(lltok::comma)) { if (Lex.getKind() == lltok::MetadataVar) { - if (Indices.empty()) return TokError("expected index"); + if (Indices.empty()) + return TokError("expected index"); AteExtraComma = true; return false; } unsigned Idx = 0; - if (ParseUInt32(Idx)) return true; + if (ParseUInt32(Idx)) + return true; Indices.push_back(Idx); } @@ -2353,7 +2627,7 @@ bool LLParser::ParseType(Type *&Result, const Twine &Msg, bool AllowVoid) { break; case lltok::LocalVar: { // Type ::= %foo - std::pair<Type*, LocTy> &Entry = NamedTypes[Lex.getStrVal()]; + std::pair<Type *, LocTy> &Entry = NamedTypes[Lex.getStrVal()]; // If the type hasn't been defined yet, create a forward definition and // remember where that forward def'n was seen (in case it never is defined). @@ -2368,7 +2642,7 @@ bool LLParser::ParseType(Type *&Result, const Twine &Msg, bool AllowVoid) { case lltok::LocalVarID: { // Type ::= %4 - std::pair<Type*, LocTy> &Entry = NumberedTypes[Lex.getUIntVal()]; + std::pair<Type *, LocTy> &Entry = NumberedTypes[Lex.getUIntVal()]; // If the type hasn't been defined yet, create a forward definition and // remember where that forward def'n was seen (in case it never is defined). @@ -2453,7 +2727,7 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, return TokError(Twine(Msg) + "non-musttail call"); if (!InVarArgsFunc) return TokError(Twine(Msg) + "musttail call in non-varargs function"); - Lex.Lex(); // Lex the '...', it is purely for readability. + Lex.Lex(); // Lex the '...', it is purely for readability. 
return ParseToken(lltok::rparen, "expected ')' at end of argument list"); } @@ -2473,15 +2747,15 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS)) return true; } - ArgList.push_back(ParamInfo( - ArgLoc, V, AttributeSet::get(V->getContext(), ArgAttrs))); + ArgList.push_back( + ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(), ArgAttrs))); } if (IsMustTailCall && InVarArgsFunc) return TokError("expected '...' at end of argument list for musttail call " "in varargs function"); - Lex.Lex(); // Lex the ')'. + Lex.Lex(); // Lex the ')'. return false; } @@ -2565,7 +2839,7 @@ bool LLParser::ParseOptionalOperandBundles( /// ::= ArgType (',' ArgType)* /// bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, - bool &isVarArg){ + bool &isVarArg) { isVarArg = false; assert(Lex.getKind() == lltok::lparen); Lex.Lex(); // eat the (. @@ -2581,8 +2855,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, AttrBuilder Attrs; std::string Name; - if (ParseType(ArgTy) || - ParseOptionalParamAttrs(Attrs)) return true; + if (ParseType(ArgTy) || ParseOptionalParamAttrs(Attrs)) + return true; if (ArgTy->isVoidTy()) return Error(TypeLoc, "argument can not have void type"); @@ -2608,7 +2882,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, // Otherwise must be an argument type. 
TypeLoc = Lex.getLoc(); - if (ParseType(ArgTy) || ParseOptionalParamAttrs(Attrs)) return true; + if (ParseType(ArgTy) || ParseOptionalParamAttrs(Attrs)) + return true; if (ArgTy->isVoidTy()) return Error(TypeLoc, "argument can not have void type"); @@ -2654,7 +2929,7 @@ bool LLParser::ParseFunctionType(Type *&Result) { "argument attributes invalid in function type"); } - SmallVector<Type*, 16> ArgListTy; + SmallVector<Type *, 16> ArgListTy; for (unsigned i = 0, e = ArgList.size(); i != e; ++i) ArgListTy.push_back(ArgList[i].Ty); @@ -2665,8 +2940,9 @@ bool LLParser::ParseFunctionType(Type *&Result) { /// ParseAnonStructType - Parse an anonymous struct type, which is inlined into /// other structs. bool LLParser::ParseAnonStructType(Type *&Result, bool Packed) { - SmallVector<Type*, 8> Elts; - if (ParseStructBody(Elts)) return true; + SmallVector<Type *, 8> Elts; + if (ParseStructBody(Elts)) + return true; Result = StructType::get(Context, Elts, Packed); return false; @@ -2674,7 +2950,7 @@ bool LLParser::ParseAnonStructType(Type *&Result, bool Packed) { /// ParseStructDefinition - Parse a struct in a 'type' definition. bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name, - std::pair<Type*, LocTy> &Entry, + std::pair<Type *, LocTy> &Entry, Type *&ResultTy) { // If the type was already defined, diagnose the redefinition. 
if (Entry.first && !Entry.second.isValid()) @@ -2718,7 +2994,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name, StructType *STy = cast<StructType>(Entry.first); - SmallVector<Type*, 8> Body; + SmallVector<Type *, 8> Body; if (ParseStructBody(Body) || (isPacked && ParseToken(lltok::greater, "expected '>' in packed struct"))) return true; @@ -2734,7 +3010,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name, /// ::= '{' Type (',' Type)* '}' /// ::= '<' '{' '}' '>' /// ::= '<' '{' Type (',' Type)* '}' '>' -bool LLParser::ParseStructBody(SmallVectorImpl<Type*> &Body) { +bool LLParser::ParseStructBody(SmallVectorImpl<Type *> &Body) { assert(Lex.getKind() == lltok::lbrace); Lex.Lex(); // Consume the '{' @@ -2744,7 +3020,8 @@ bool LLParser::ParseStructBody(SmallVectorImpl<Type*> &Body) { LocTy EltTyLoc = Lex.getLoc(); Type *Ty = nullptr; - if (ParseType(Ty)) return true; + if (ParseType(Ty)) + return true; Body.push_back(Ty); if (!StructType::isValidElementType(Ty)) @@ -2752,7 +3029,8 @@ bool LLParser::ParseStructBody(SmallVectorImpl<Type*> &Body) { while (EatIfPresent(lltok::comma)) { EltTyLoc = Lex.getLoc(); - if (ParseType(Ty)) return true; + if (ParseType(Ty)) + return true; if (!StructType::isValidElementType(Ty)) return Error(EltTyLoc, "invalid element type for struct"); @@ -2789,11 +3067,12 @@ bool LLParser::ParseArrayVectorType(Type *&Result, bool isVector) { Lex.Lex(); if (ParseToken(lltok::kw_x, "expected 'x' after element count")) - return true; + return true; LocTy TypeLoc = Lex.getLoc(); Type *EltTy = nullptr; - if (ParseType(EltTy)) return true; + if (ParseType(EltTy)) + return true; if (ParseToken(isVector ? 
lltok::greater : lltok::rsquare, "expected end of sequential type")) @@ -2821,7 +3100,7 @@ bool LLParser::ParseArrayVectorType(Type *&Result, bool isVector) { LLParser::PerFunctionState::PerFunctionState(LLParser &p, Function &f, int functionNumber) - : P(p), F(f), FunctionNumber(functionNumber) { + : P(p), F(f), FunctionNumber(functionNumber) { // Insert unnamed arguments into the NumberedVals list. for (Argument &A : F.args()) @@ -2853,11 +3132,11 @@ bool LLParser::PerFunctionState::FinishFunction() { if (!ForwardRefVals.empty()) return P.Error(ForwardRefVals.begin()->second.second, "use of undefined value '%" + ForwardRefVals.begin()->first + - "'"); + "'"); if (!ForwardRefValIDs.empty()) return P.Error(ForwardRefValIDs.begin()->second.second, "use of undefined value '%" + - Twine(ForwardRefValIDs.begin()->first) + "'"); + Twine(ForwardRefValIDs.begin()->first) + "'"); return false; } @@ -2954,14 +3233,15 @@ bool LLParser::PerFunctionState::SetInstName(int NameID, if (unsigned(NameID) != NumberedVals.size()) return P.Error(NameLoc, "instruction expected to be numbered '%" + - Twine(NumberedVals.size()) + "'"); + Twine(NumberedVals.size()) + "'"); auto FI = ForwardRefValIDs.find(NameID); if (FI != ForwardRefValIDs.end()) { Value *Sentinel = FI->second.first; if (Sentinel->getType() != Inst->getType()) return P.Error(NameLoc, "instruction forward referenced with type '" + - getTypeString(FI->second.first->getType()) + "'"); + getTypeString(FI->second.first->getType()) + + "'"); Sentinel->replaceAllUsesWith(Inst); Sentinel->deleteValue(); @@ -2978,7 +3258,8 @@ bool LLParser::PerFunctionState::SetInstName(int NameID, Value *Sentinel = FI->second.first; if (Sentinel->getType() != Inst->getType()) return P.Error(NameLoc, "instruction forward referenced with type '" + - getTypeString(FI->second.first->getType()) + "'"); + getTypeString(FI->second.first->getType()) + + "'"); Sentinel->replaceAllUsesWith(Inst); Sentinel->deleteValue(); @@ -2990,7 +3271,7 @@ bool 
LLParser::PerFunctionState::SetInstName(int NameID, if (Inst->getName() != NameStr) return P.Error(NameLoc, "multiple definition of local value named '" + - NameStr + "'"); + NameStr + "'"); return false; } @@ -3062,20 +3343,21 @@ BasicBlock *LLParser::PerFunctionState::DefineBB(const std::string &Name, bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ID.Loc = Lex.getLoc(); switch (Lex.getKind()) { - default: return TokError("expected value token"); - case lltok::GlobalID: // @42 + default: + return TokError("expected value token"); + case lltok::GlobalID: // @42 ID.UIntVal = Lex.getUIntVal(); ID.Kind = ValID::t_GlobalID; break; - case lltok::GlobalVar: // @foo + case lltok::GlobalVar: // @foo ID.StrVal = Lex.getStrVal(); ID.Kind = ValID::t_GlobalName; break; - case lltok::LocalVarID: // %42 + case lltok::LocalVarID: // %42 ID.UIntVal = Lex.getUIntVal(); ID.Kind = ValID::t_LocalID; break; - case lltok::LocalVar: // %foo + case lltok::LocalVar: // %foo ID.StrVal = Lex.getStrVal(); ID.Kind = ValID::t_LocalName; break; @@ -3095,15 +3377,23 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ID.ConstantVal = ConstantInt::getFalse(Context); ID.Kind = ValID::t_Constant; break; - case lltok::kw_null: ID.Kind = ValID::t_Null; break; - case lltok::kw_undef: ID.Kind = ValID::t_Undef; break; - case lltok::kw_zeroinitializer: ID.Kind = ValID::t_Zero; break; - case lltok::kw_none: ID.Kind = ValID::t_None; break; + case lltok::kw_null: + ID.Kind = ValID::t_Null; + break; + case lltok::kw_undef: + ID.Kind = ValID::t_Undef; + break; + case lltok::kw_zeroinitializer: + ID.Kind = ValID::t_Zero; + break; + case lltok::kw_none: + ID.Kind = ValID::t_None; + break; case lltok::lbrace: { // ValID ::= '{' ConstVector '}' Lex.Lex(); - SmallVector<Constant*, 16> Elts; + SmallVector<Constant *, 16> Elts; if (ParseGlobalValueVector(Elts) || ParseToken(lltok::rbrace, "expected end of struct constant")) return true; @@ -3121,7 +3411,7 @@ bool LLParser::ParseValID(ValID 
&ID, PerFunctionState *PFS) { Lex.Lex(); bool isPackedStruct = EatIfPresent(lltok::lbrace); - SmallVector<Constant*, 16> Elts; + SmallVector<Constant *, 16> Elts; LocTy FirstEltLoc = Lex.getLoc(); if (ParseGlobalValueVector(Elts) || (isPackedStruct && @@ -3144,23 +3434,24 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { if (!Elts[0]->getType()->isIntegerTy() && !Elts[0]->getType()->isFloatingPointTy() && !Elts[0]->getType()->isPointerTy()) - return Error(FirstEltLoc, - "vector elements must have integer, pointer or floating point type"); + return Error( + FirstEltLoc, + "vector elements must have integer, pointer or floating point type"); // Verify that all the vector elements have the same type. for (unsigned i = 1, e = Elts.size(); i != e; ++i) if (Elts[i]->getType() != Elts[0]->getType()) - return Error(FirstEltLoc, - "vector element #" + Twine(i) + - " is not of type '" + getTypeString(Elts[0]->getType())); + return Error(FirstEltLoc, "vector element #" + Twine(i) + + " is not of type '" + + getTypeString(Elts[0]->getType())); ID.ConstantVal = ConstantVector::get(Elts); ID.Kind = ValID::t_Constant; return false; } - case lltok::lsquare: { // Array Constant + case lltok::lsquare: { // Array Constant Lex.Lex(); - SmallVector<Constant*, 16> Elts; + SmallVector<Constant *, 16> Elts; LocTy FirstEltLoc = Lex.getLoc(); if (ParseGlobalValueVector(Elts) || ParseToken(lltok::rsquare, "expected end of array constant")) @@ -3176,27 +3467,28 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { if (!Elts[0]->getType()->isFirstClassType()) return Error(FirstEltLoc, "invalid array element type: " + - getTypeString(Elts[0]->getType())); + getTypeString(Elts[0]->getType())); ArrayType *ATy = ArrayType::get(Elts[0]->getType(), Elts.size()); // Verify all elements are correct type! 
for (unsigned i = 0, e = Elts.size(); i != e; ++i) { if (Elts[i]->getType() != Elts[0]->getType()) - return Error(FirstEltLoc, - "array element #" + Twine(i) + - " is not of type '" + getTypeString(Elts[0]->getType())); + return Error(FirstEltLoc, "array element #" + Twine(i) + + " is not of type '" + + getTypeString(Elts[0]->getType())); } ID.ConstantVal = ConstantArray::get(ATy, Elts); ID.Kind = ValID::t_Constant; return false; } - case lltok::kw_c: // c "foo" + case lltok::kw_c: // c "foo" Lex.Lex(); - ID.ConstantVal = ConstantDataArray::getString(Context, Lex.getStrVal(), - false); - if (ParseToken(lltok::StringConstant, "expected string")) return true; + ID.ConstantVal = + ConstantDataArray::getString(Context, Lex.getStrVal(), false); + if (ParseToken(lltok::StringConstant, "expected string")) + return true; ID.Kind = ValID::t_Constant; return false; @@ -3213,8 +3505,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ParseToken(lltok::StringConstant, "expected constraint string")) return true; ID.StrVal2 = Lex.getStrVal(); - ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack)<<1) | - (unsigned(AsmDialect)<<2); + ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack) << 1) | + (unsigned(AsmDialect) << 2); ID.Kind = ValID::t_InlineAsm; return false; } @@ -3227,7 +3519,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { if (ParseToken(lltok::lparen, "expected '(' in block address expression") || ParseValID(Fn) || - ParseToken(lltok::comma, "expected comma in block address expression")|| + ParseToken(lltok::comma, + "expected comma in block address expression") || ParseValID(Label) || ParseToken(lltok::rparen, "expected ')' in block address expression")) return true; @@ -3258,9 +3551,9 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { if (!F) { // Make a global variable as a placeholder for this reference. 
GlobalValue *&FwdRef = - ForwardRefBlockAddresses.insert(std::make_pair( - std::move(Fn), - std::map<ValID, GlobalValue *>())) + ForwardRefBlockAddresses + .insert(std::make_pair(std::move(Fn), + std::map<ValID, GlobalValue *>())) .first->second.insert(std::make_pair(std::move(Label), nullptr)) .first->second; if (!FwdRef) @@ -3321,10 +3614,10 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { return true; if (!CastInst::castIsValid((Instruction::CastOps)Opc, SrcVal, DestTy)) return Error(ID.Loc, "invalid cast opcode for cast from '" + - getTypeString(SrcVal->getType()) + "' to '" + - getTypeString(DestTy) + "'"); - ID.ConstantVal = ConstantExpr::getCast((Instruction::CastOps)Opc, - SrcVal, DestTy); + getTypeString(SrcVal->getType()) + "' to '" + + getTypeString(DestTy) + "'"); + ID.ConstantVal = + ConstantExpr::getCast((Instruction::CastOps)Opc, SrcVal, DestTy); ID.Kind = ValID::t_Constant; return false; } @@ -3332,9 +3625,9 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { Lex.Lex(); Constant *Val; SmallVector<unsigned, 4> Indices; - if (ParseToken(lltok::lparen, "expected '(' in extractvalue constantexpr")|| - ParseGlobalTypeAndValue(Val) || - ParseIndexList(Indices) || + if (ParseToken(lltok::lparen, + "expected '(' in extractvalue constantexpr") || + ParseGlobalTypeAndValue(Val) || ParseIndexList(Indices) || ParseToken(lltok::rparen, "expected ')' in extractvalue constantexpr")) return true; @@ -3350,11 +3643,11 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { Lex.Lex(); Constant *Val0, *Val1; SmallVector<unsigned, 4> Indices; - if (ParseToken(lltok::lparen, "expected '(' in insertvalue constantexpr")|| + if (ParseToken(lltok::lparen, "expected '(' in insertvalue constantexpr") || ParseGlobalTypeAndValue(Val0) || - ParseToken(lltok::comma, "expected comma in insertvalue constantexpr")|| - ParseGlobalTypeAndValue(Val1) || - ParseIndexList(Indices) || + ParseToken(lltok::comma, + "expected comma in insertvalue 
constantexpr") || + ParseGlobalTypeAndValue(Val1) || ParseIndexList(Indices) || ParseToken(lltok::rparen, "expected ')' in insertvalue constantexpr")) return true; if (!Val0->getType()->isAggregateType()) @@ -3404,7 +3697,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ID.Kind = ValID::t_Constant; return false; } - + // Unary Operators. case lltok::kw_fneg: { unsigned Opc = Lex.getUIntVal(); @@ -3414,14 +3707,15 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ParseGlobalTypeAndValue(Val) || ParseToken(lltok::rparen, "expected ')' in unary constantexpr")) return true; - + // Check that the type is valid for the operator. switch (Opc) { case Instruction::FNeg: if (!Val->getType()->isFPOrFPVectorTy()) return Error(ID.Loc, "constexpr requires fp operands"); break; - default: llvm_unreachable("Unknown unary operator!"); + default: + llvm_unreachable("Unknown unary operator!"); } unsigned Flags = 0; Constant *C = ConstantExpr::get(Opc, Val, Flags); @@ -3496,12 +3790,16 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { if (!Val0->getType()->isFPOrFPVectorTy()) return Error(ID.Loc, "constexpr requires fp operands"); break; - default: llvm_unreachable("Unknown binary operator!"); + default: + llvm_unreachable("Unknown binary operator!"); } unsigned Flags = 0; - if (NUW) Flags |= OverflowingBinaryOperator::NoUnsignedWrap; - if (NSW) Flags |= OverflowingBinaryOperator::NoSignedWrap; - if (Exact) Flags |= PossiblyExactOperator::IsExact; + if (NUW) + Flags |= OverflowingBinaryOperator::NoUnsignedWrap; + if (NSW) + Flags |= OverflowingBinaryOperator::NoSignedWrap; + if (Exact) + Flags |= PossiblyExactOperator::IsExact; Constant *C = ConstantExpr::get(Opc, Val0, Val1, Flags); ID.ConstantVal = C; ID.Kind = ValID::t_Constant; @@ -3537,7 +3835,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { case lltok::kw_extractelement: case lltok::kw_select: { unsigned Opc = Lex.getUIntVal(); - SmallVector<Constant*, 16> Elts; + 
SmallVector<Constant *, 16> Elts; bool InBounds = false; Type *Ty; Lex.Lex(); @@ -3562,8 +3860,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { return true; if (Opc == Instruction::GetElementPtr) { - if (Elts.size() == 0 || - !Elts[0]->getType()->isPtrOrPtrVectorTy()) + if (Elts.size() == 0 || !Elts[0]->getType()->isPtrOrPtrVectorTy()) return Error(ID.Loc, "base of getelementptr must be a pointer"); Type *BaseType = Elts[0]->getType(); @@ -3593,7 +3890,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { } } - SmallPtrSet<Type*, 4> Visited; + SmallPtrSet<Type *, 4> Visited; if (!Indices.empty() && !Ty->isSized(&Visited)) return Error(ID.Loc, "base element of getelementptr must be sized"); @@ -3612,8 +3909,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { } else if (Opc == Instruction::Select) { if (Elts.size() != 3) return Error(ID.Loc, "expected three operands to select"); - if (const char *Reason = SelectInst::areInvalidOperands(Elts[0], Elts[1], - Elts[2])) + if (const char *Reason = + SelectInst::areInvalidOperands(Elts[0], Elts[1], Elts[2])) return Error(ID.Loc, Reason); ID.ConstantVal = ConstantExpr::getSelect(Elts[0], Elts[1], Elts[2]); } else if (Opc == Instruction::ShuffleVector) { @@ -3622,7 +3919,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { if (!ShuffleVectorInst::isValidOperands(Elts[0], Elts[1], Elts[2])) return Error(ID.Loc, "invalid operands to shufflevector"); ID.ConstantVal = - ConstantExpr::getShuffleVector(Elts[0], Elts[1],Elts[2]); + ConstantExpr::getShuffleVector(Elts[0], Elts[1], Elts[2]); } else if (Opc == Instruction::ExtractElement) { if (Elts.size() != 2) return Error(ID.Loc, "expected two operands to extractelement"); @@ -3632,11 +3929,11 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { } else { assert(Opc == Instruction::InsertElement && "Unknown opcode"); if (Elts.size() != 3) - return Error(ID.Loc, "expected three operands to insertelement"); + 
return Error(ID.Loc, "expected three operands to insertelement"); if (!InsertElementInst::isValidOperands(Elts[0], Elts[1], Elts[2])) return Error(ID.Loc, "invalid insertelement operands"); ID.ConstantVal = - ConstantExpr::getInsertElement(Elts[0], Elts[1],Elts[2]); + ConstantExpr::getInsertElement(Elts[0], Elts[1], Elts[2]); } ID.Kind = ValID::t_Constant; @@ -3662,8 +3959,7 @@ bool LLParser::ParseGlobalValue(Type *Ty, Constant *&C) { bool LLParser::ParseGlobalTypeAndValue(Constant *&V) { Type *Ty = nullptr; - return ParseType(Ty) || - ParseGlobalValue(Ty, V); + return ParseType(Ty) || ParseGlobalValue(Ty, V); } bool LLParser::parseOptionalComdat(StringRef GlobalName, Comdat *&C) { @@ -3695,10 +3991,8 @@ bool LLParser::parseOptionalComdat(StringRef GlobalName, Comdat *&C) { bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts, Optional<unsigned> *InRangeOp) { // Empty list. - if (Lex.getKind() == lltok::rbrace || - Lex.getKind() == lltok::rsquare || - Lex.getKind() == lltok::greater || - Lex.getKind() == lltok::rparen) + if (Lex.getKind() == lltok::rbrace || Lex.getKind() == lltok::rsquare || + Lex.getKind() == lltok::greater || Lex.getKind() == lltok::rparen) return false; do { @@ -3706,7 +4000,8 @@ bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts, *InRangeOp = Elts.size(); Constant *C; - if (ParseGlobalTypeAndValue(C)) return true; + if (ParseGlobalTypeAndValue(C)) + return true; Elts.push_back(C); } while (EatIfPresent(lltok::comma)); @@ -3730,8 +4025,7 @@ bool LLParser::ParseMDNode(MDNode *&N) { if (Lex.getKind() == lltok::MetadataVar) return ParseSpecializedMDNode(N); - return ParseToken(lltok::exclaim, "expected '!' here") || - ParseMDNodeTail(N); + return ParseToken(lltok::exclaim, "expected '!' 
here") || ParseMDNodeTail(N); } bool LLParser::ParseMDNodeTail(MDNode *&N) { @@ -3770,11 +4064,7 @@ template <class FieldTypeA, class FieldTypeB> struct MDEitherFieldImpl { FieldTypeB B; bool Seen; - enum { - IsInvalid = 0, - IsTypeA = 1, - IsTypeB = 2 - } WhatIs; + enum { IsInvalid = 0, IsTypeA = 1, IsTypeB = 2 } WhatIs; void assign(FieldTypeA A) { Seen = true; @@ -3817,7 +4107,7 @@ struct DwarfTagField : public MDUnsignedField { struct DwarfMacinfoTypeField : public MDUnsignedField { DwarfMacinfoTypeField() : MDUnsignedField(0, dwarf::DW_MACINFO_vendor_ext) {} DwarfMacinfoTypeField(dwarf::MacinfoRecordType DefaultType) - : MDUnsignedField(DefaultType, dwarf::DW_MACINFO_vendor_ext) {} + : MDUnsignedField(DefaultType, dwarf::DW_MACINFO_vendor_ext) {} }; struct DwarfAttEncodingField : public MDUnsignedField { @@ -3987,8 +4277,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, unsigned Macinfo = dwarf::getMacinfo(Lex.getStrVal()); if (Macinfo == dwarf::DW_MACINFO_invalid) - return TokError( - "invalid DWARF macinfo type" + Twine(" '") + Lex.getStrVal() + "'"); + return TokError("invalid DWARF macinfo type" + Twine(" '") + + Lex.getStrVal() + "'"); assert(Macinfo <= Result.Max && "Expected valid DWARF macinfo type"); Result.assign(Macinfo); @@ -4043,8 +4333,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) { unsigned CC = dwarf::getCallingConvention(Lex.getStrVal()); if (!CC) - return TokError("invalid DWARF calling convention" + Twine(" '") + Lex.getStrVal() + - "'"); + return TokError("invalid DWARF calling convention" + Twine(" '") + + Lex.getStrVal() + "'"); assert(CC <= Result.Max && "Expected valid DWARF calling convention"); Result.assign(CC); Lex.Lex(); @@ -4052,7 +4342,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) { } template <> -bool LLParser::ParseMDField(LocTy Loc, StringRef Name, EmissionKindField &Result) { +bool LLParser::ParseMDField(LocTy Loc, StringRef Name, + 
EmissionKindField &Result) { if (Lex.getKind() == lltok::APSInt) return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); @@ -4188,8 +4479,7 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DISPFlagField &Result) { } template <> -bool LLParser::ParseMDField(LocTy Loc, StringRef Name, - MDSignedField &Result) { +bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDSignedField &Result) { if (Lex.getKind() != lltok::APSInt) return TokError("expected signed integer"); @@ -4316,8 +4606,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DIFile::getChecksumKind(Lex.getStrVal()); if (Lex.getKind() != lltok::ChecksumKind || !CSKind) - return TokError( - "invalid checksum kind" + Twine(" '") + Lex.getStrVal() + "'"); + return TokError("invalid checksum kind" + Twine(" '") + Lex.getStrVal() + + "'"); Result.assign(*CSKind); Lex.Lex(); @@ -4387,10 +4677,13 @@ bool LLParser::ParseSpecializedMDNode(MDNode *&N, bool IsDistinct) { VISIT_MD_FIELDS(DECLARE_FIELD, DECLARE_FIELD) \ do { \ LocTy ClosingLoc; \ - if (ParseMDFieldsImpl([&]() -> bool { \ - VISIT_MD_FIELDS(PARSE_MD_FIELD, PARSE_MD_FIELD) \ - return TokError(Twine("invalid field '") + Lex.getStrVal() + "'"); \ - }, ClosingLoc)) \ + if (ParseMDFieldsImpl( \ + [&]() -> bool { \ + VISIT_MD_FIELDS(PARSE_MD_FIELD, PARSE_MD_FIELD) \ + return TokError(Twine("invalid field '") + Lex.getStrVal() + \ + "'"); \ + }, \ + ClosingLoc)) \ return true; \ VISIT_MD_FIELDS(NOP_FIELD, REQUIRE_FIELD) \ } while (false) @@ -4520,11 +4813,10 @@ bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) { if (dwarfAddressSpace.Val != UINT32_MAX) DWARFAddressSpace = dwarfAddressSpace.Val; - Result = GET_OR_DISTINCT(DIDerivedType, - (Context, tag.Val, name.Val, file.Val, line.Val, - scope.Val, baseType.Val, size.Val, align.Val, - offset.Val, DWARFAddressSpace, flags.Val, - extraData.Val)); + Result = GET_OR_DISTINCT( + DIDerivedType, (Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, + 
baseType.Val, size.Val, align.Val, offset.Val, + DWARFAddressSpace, flags.Val, extraData.Val)); return false; } @@ -4554,8 +4846,8 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) { if (auto *CT = DICompositeType::buildODRType( Context, *identifier.Val, tag.Val, name.Val, file.Val, line.Val, scope.Val, baseType.Val, size.Val, align.Val, offset.Val, flags.Val, - elements.Val, runtimeLang.Val, vtableHolder.Val, - templateParams.Val, discriminator.Val)) { + elements.Val, runtimeLang.Val, vtableHolder.Val, templateParams.Val, + discriminator.Val)) { Result = CT; return false; } @@ -4611,8 +4903,8 @@ bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) { Optional<MDString *> OptSource; if (source.Seen) OptSource = source.Val; - Result = GET_OR_DISTINCT(DIFile, (Context, filename.Val, directory.Val, - OptChecksum, OptSource)); + Result = GET_OR_DISTINCT( + DIFile, (Context, filename.Val, directory.Val, OptChecksum, OptSource)); return false; } @@ -4750,13 +5042,12 @@ bool LLParser::ParseDICommonBlock(MDNode *&Result, bool IsDistinct) { OPTIONAL(declaration, MDField, ); \ OPTIONAL(name, MDStringField, ); \ OPTIONAL(file, MDField, ); \ - OPTIONAL(line, LineField, ); + OPTIONAL(line, LineField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DICommonBlock, - (Context, scope.Val, declaration.Val, name.Val, - file.Val, line.Val)); + Result = GET_OR_DISTINCT(DICommonBlock, (Context, scope.Val, declaration.Val, + name.Val, file.Val, line.Val)); return false; } @@ -4776,7 +5067,8 @@ bool LLParser::ParseDINamespace(MDNode *&Result, bool IsDistinct) { } /// ParseDIMacro: -/// ::= !DIMacro(macinfo: type, line: 9, name: "SomeMacro", value: "SomeValue") +/// ::= !DIMacro(macinfo: type, line: 9, name: "SomeMacro", value: +/// "SomeValue") bool LLParser::ParseDIMacro(MDNode *&Result, bool IsDistinct) { #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ REQUIRED(type, DwarfMacinfoTypeField, ); \ @@ -4820,8 +5112,9 @@ bool 
LLParser::ParseDIModule(MDNode *&Result, bool IsDistinct) { PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIModule, (Context, scope.Val, name.Val, - configMacros.Val, includePath.Val, isysroot.Val)); + Result = + GET_OR_DISTINCT(DIModule, (Context, scope.Val, name.Val, configMacros.Val, + includePath.Val, isysroot.Val)); return false; } @@ -4954,7 +5247,8 @@ bool LLParser::ParseDIExpression(MDNode *&Result, bool IsDistinct) { Elements.push_back(Op); continue; } - return TokError(Twine("invalid DWARF attribute encoding '") + Lex.getStrVal() + "'"); + return TokError(Twine("invalid DWARF attribute encoding '") + + Lex.getStrVal() + "'"); } if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned()) @@ -5130,11 +5424,13 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, switch (ID.Kind) { case ValID::t_LocalID: - if (!PFS) return Error(ID.Loc, "invalid use of function-local name"); + if (!PFS) + return Error(ID.Loc, "invalid use of function-local name"); V = PFS->GetVal(ID.UIntVal, Ty, ID.Loc, IsCall); return V == nullptr; case ValID::t_LocalName: - if (!PFS) return Error(ID.Loc, "invalid use of function-local name"); + if (!PFS) + return Error(ID.Loc, "invalid use of function-local name"); V = PFS->GetVal(ID.StrVal, Ty, ID.Loc, IsCall); return V == nullptr; case ValID::t_InlineAsm: { @@ -5170,14 +5466,14 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, ID.APFloatVal.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored); else if (Ty->isFloatTy()) - ID.APFloatVal.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, - &Ignored); + ID.APFloatVal.convert(APFloat::IEEEsingle(), + APFloat::rmNearestTiesToEven, &Ignored); } V = ConstantFP::get(Context, ID.APFloatVal); if (V->getType() != Ty) return Error(ID.Loc, "floating point constant does not have type '" + - getTypeString(Ty) + "'"); + getTypeString(Ty) + "'"); return false; case ValID::t_Null: @@ -5225,8 +5521,10 @@ bool 
LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, // Verify that the elements are compatible with the structtype. for (unsigned i = 0, e = ID.UIntVal; i != e; ++i) if (ID.ConstantStructElts[i]->getType() != ST->getElementType(i)) - return Error(ID.Loc, "element " + Twine(i) + - " of struct initializer doesn't match struct element type"); + return Error( + ID.Loc, + "element " + Twine(i) + + " of struct initializer doesn't match struct element type"); V = ConstantStruct::get( ST, makeArrayRef(ID.ConstantStructElts.get(), ID.UIntVal)); @@ -5274,15 +5572,15 @@ bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) { bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState *PFS) { Type *Ty = nullptr; - return ParseType(Ty) || - ParseValue(Ty, V, PFS); + return ParseType(Ty) || ParseValue(Ty, V, PFS); } bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc, PerFunctionState &PFS) { Value *V; Loc = Lex.getLoc(); - if (ParseTypeAndValue(V, PFS)) return true; + if (ParseTypeAndValue(V, PFS)) + return true; if (!isa<BasicBlock>(V)) return Error(Loc, "expected a basic block"); BB = cast<BasicBlock>(V); @@ -5347,7 +5645,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { std::string FunctionName; if (Lex.getKind() == lltok::GlobalVar) { FunctionName = Lex.getStrVal(); - } else if (Lex.getKind() == lltok::GlobalID) { // @42 is ok. + } else if (Lex.getKind() == lltok::GlobalID) { // @42 is ok. 
unsigned NameID = Lex.getUIntVal(); if (NameID != NumberedVals.size()) @@ -5383,18 +5681,13 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { ParseOptionalProgramAddrSpace(AddrSpace) || ParseFnAttributeValuePairs(FuncAttrs, FwdRefAttrGrps, false, BuiltinLoc) || - (EatIfPresent(lltok::kw_section) && - ParseStringConstant(Section)) || - (EatIfPresent(lltok::kw_partition) && - ParseStringConstant(Partition)) || + (EatIfPresent(lltok::kw_section) && ParseStringConstant(Section)) || + (EatIfPresent(lltok::kw_partition) && ParseStringConstant(Partition)) || parseOptionalComdat(FunctionName, C) || ParseOptionalAlignment(Alignment) || - (EatIfPresent(lltok::kw_gc) && - ParseStringConstant(GC)) || - (EatIfPresent(lltok::kw_prefix) && - ParseGlobalTypeAndValue(Prefix)) || - (EatIfPresent(lltok::kw_prologue) && - ParseGlobalTypeAndValue(Prologue)) || + (EatIfPresent(lltok::kw_gc) && ParseStringConstant(GC)) || + (EatIfPresent(lltok::kw_prefix) && ParseGlobalTypeAndValue(Prefix)) || + (EatIfPresent(lltok::kw_prologue) && ParseGlobalTypeAndValue(Prologue)) || (EatIfPresent(lltok::kw_personality) && ParseGlobalTypeAndValue(PersonalityFn))) return true; @@ -5410,7 +5703,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { // Okay, if we got here, the function is syntactically valid. Convert types // and do semantic checks. 
- std::vector<Type*> ParamTypeList; + std::vector<Type *> ParamTypeList; SmallVector<AttributeSet, 8> Attrs; for (unsigned i = 0, e = ArgList.size(); i != e; ++i) { @@ -5425,8 +5718,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy()) return Error(RetTypeLoc, "functions with 'sret' argument must return void"); - FunctionType *FT = - FunctionType::get(RetType, ParamTypeList, isVarArg); + FunctionType *FT = FunctionType::get(RetType, ParamTypeList, isVarArg); PointerType *PFT = PointerType::get(FT, AddrSpace); Fn = nullptr; @@ -5438,17 +5730,21 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { Fn = M->getFunction(FunctionName); if (!Fn) return Error(FRVI->second.second, "invalid forward reference to " - "function as global value!"); + "function as global value!"); if (Fn->getType() != PFT) - return Error(FRVI->second.second, "invalid forward reference to " - "function '" + FunctionName + "' with wrong type: " - "expected '" + getTypeString(PFT) + "' but was '" + - getTypeString(Fn->getType()) + "'"); + return Error(FRVI->second.second, + "invalid forward reference to " + "function '" + + FunctionName + + "' with wrong type: " + "expected '" + + getTypeString(PFT) + "' but was '" + + getTypeString(Fn->getType()) + "'"); ForwardRefVals.erase(FRVI); } else if ((Fn = M->getFunction(FunctionName))) { // Reject redefinitions. 
- return Error(NameLoc, "invalid redefinition of function '" + - FunctionName + "'"); + return Error(NameLoc, + "invalid redefinition of function '" + FunctionName + "'"); } else if (M->getNamedValue(FunctionName)) { return Error(NameLoc, "redefinition of function '@" + FunctionName + "'"); } @@ -5461,9 +5757,11 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { Fn = cast<Function>(I->second.first); if (Fn->getType() != PFT) return Error(NameLoc, "type of definition and forward reference of '@" + - Twine(NumberedVals.size()) + "' disagree: " - "expected '" + getTypeString(PFT) + "' but was '" + - getTypeString(Fn->getType()) + "'"); + Twine(NumberedVals.size()) + + "' disagree: " + "expected '" + + getTypeString(PFT) + "' but was '" + + getTypeString(Fn->getType()) + "'"); ForwardRefValIDs.erase(I); } } @@ -5491,7 +5789,8 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { Fn->setPartition(Partition); Fn->setComdat(C); Fn->setPersonalityFn(PersonalityFn); - if (!GC.empty()) Fn->setGC(GC); + if (!GC.empty()) + Fn->setGC(GC); Fn->setPrefixData(Prefix); Fn->setPrologueData(Prologue); ForwardRefAttrGroups[Fn] = FwdRefAttrGrps; @@ -5500,14 +5799,15 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { Function::arg_iterator ArgIt = Fn->arg_begin(); for (unsigned i = 0, e = ArgList.size(); i != e; ++i, ++ArgIt) { // If the argument has a name, insert it into the argument symbol table. - if (ArgList[i].Name.empty()) continue; + if (ArgList[i].Name.empty()) + continue; // Set the name, if it conflicted, it will be auto-renamed. 
ArgIt->setName(ArgList[i].Name); if (ArgIt->getName() != ArgList[i].Name) - return Error(ArgList[i].Loc, "redefinition of argument '%" + - ArgList[i].Name + "'"); + return Error(ArgList[i].Loc, + "redefinition of argument '%" + ArgList[i].Name + "'"); } if (isDefine) @@ -5570,10 +5870,11 @@ bool LLParser::PerFunctionState::resolveForwardRefBlockAddresses() { bool LLParser::ParseFunctionBody(Function &Fn) { if (Lex.getKind() != lltok::lbrace) return TokError("expected '{' in function body"); - Lex.Lex(); // eat the {. + Lex.Lex(); // eat the {. int FunctionNumber = -1; - if (!Fn.hasName()) FunctionNumber = NumberedVals.size()-1; + if (!Fn.hasName()) + FunctionNumber = NumberedVals.size() - 1; PerFunctionState PFS(*this, Fn, FunctionNumber); @@ -5589,7 +5890,8 @@ bool LLParser::ParseFunctionBody(Function &Fn) { while (Lex.getKind() != lltok::rbrace && Lex.getKind() != lltok::kw_uselistorder) - if (ParseBasicBlock(PFS)) return true; + if (ParseBasicBlock(PFS)) + return true; while (Lex.getKind() != lltok::rbrace) if (ParseUseListOrder(&PFS)) @@ -5645,8 +5947,10 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) { } switch (ParseInstruction(Inst, BB, PFS)) { - default: llvm_unreachable("Unknown ParseInstruction result!"); - case InstError: return true; + default: + llvm_unreachable("Unknown ParseInstruction result!"); + case InstError: + return true; case InstNormal: BB->getInstList().push_back(Inst); @@ -5667,7 +5971,8 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) { } // Set the name on the instruction. - if (PFS.SetInstName(NameID, NameStr, NameLoc, Inst)) return true; + if (PFS.SetInstName(NameID, NameStr, NameLoc, Inst)) + return true; } while (!Inst->isTerminator()); return false; @@ -5686,28 +5991,43 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, return TokError("found end of file when expecting more instructions"); LocTy Loc = Lex.getLoc(); unsigned KeywordVal = Lex.getUIntVal(); - Lex.Lex(); // Eat the keyword. 
+ Lex.Lex(); // Eat the keyword. switch (Token) { - default: return Error(Loc, "expected instruction opcode"); + default: + return Error(Loc, "expected instruction opcode"); // Terminator Instructions. - case lltok::kw_unreachable: Inst = new UnreachableInst(Context); return false; - case lltok::kw_ret: return ParseRet(Inst, BB, PFS); - case lltok::kw_br: return ParseBr(Inst, PFS); - case lltok::kw_switch: return ParseSwitch(Inst, PFS); - case lltok::kw_indirectbr: return ParseIndirectBr(Inst, PFS); - case lltok::kw_invoke: return ParseInvoke(Inst, PFS); - case lltok::kw_resume: return ParseResume(Inst, PFS); - case lltok::kw_cleanupret: return ParseCleanupRet(Inst, PFS); - case lltok::kw_catchret: return ParseCatchRet(Inst, PFS); - case lltok::kw_catchswitch: return ParseCatchSwitch(Inst, PFS); - case lltok::kw_catchpad: return ParseCatchPad(Inst, PFS); - case lltok::kw_cleanuppad: return ParseCleanupPad(Inst, PFS); - case lltok::kw_callbr: return ParseCallBr(Inst, PFS); + case lltok::kw_unreachable: + Inst = new UnreachableInst(Context); + return false; + case lltok::kw_ret: + return ParseRet(Inst, BB, PFS); + case lltok::kw_br: + return ParseBr(Inst, PFS); + case lltok::kw_switch: + return ParseSwitch(Inst, PFS); + case lltok::kw_indirectbr: + return ParseIndirectBr(Inst, PFS); + case lltok::kw_invoke: + return ParseInvoke(Inst, PFS); + case lltok::kw_resume: + return ParseResume(Inst, PFS); + case lltok::kw_cleanupret: + return ParseCleanupRet(Inst, PFS); + case lltok::kw_catchret: + return ParseCatchRet(Inst, PFS); + case lltok::kw_catchswitch: + return ParseCatchSwitch(Inst, PFS); + case lltok::kw_catchpad: + return ParseCatchPad(Inst, PFS); + case lltok::kw_cleanuppad: + return ParseCleanupPad(Inst, PFS); + case lltok::kw_callbr: + return ParseCallBr(Inst, PFS); // Unary Operators. 
case lltok::kw_fneg: { FastMathFlags FMF = EatFastMathFlagsIfPresent(); - int Res = ParseUnaryOp(Inst, PFS, KeywordVal, /*IsFP*/true); + int Res = ParseUnaryOp(Inst, PFS, KeywordVal, /*IsFP*/ true); if (Res != 0) return Res; if (FMF.any()) @@ -5721,12 +6041,16 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_shl: { bool NUW = EatIfPresent(lltok::kw_nuw); bool NSW = EatIfPresent(lltok::kw_nsw); - if (!NUW) NUW = EatIfPresent(lltok::kw_nuw); + if (!NUW) + NUW = EatIfPresent(lltok::kw_nuw); - if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/false)) return true; + if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/ false)) + return true; - if (NUW) cast<BinaryOperator>(Inst)->setHasNoUnsignedWrap(true); - if (NSW) cast<BinaryOperator>(Inst)->setHasNoSignedWrap(true); + if (NUW) + cast<BinaryOperator>(Inst)->setHasNoUnsignedWrap(true); + if (NSW) + cast<BinaryOperator>(Inst)->setHasNoSignedWrap(true); return false; } case lltok::kw_fadd: @@ -5735,7 +6059,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_fdiv: case lltok::kw_frem: { FastMathFlags FMF = EatFastMathFlagsIfPresent(); - int Res = ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/true); + int Res = ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/ true); if (Res != 0) return Res; if (FMF.any()) @@ -5749,18 +6073,23 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_ashr: { bool Exact = EatIfPresent(lltok::kw_exact); - if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/false)) return true; - if (Exact) cast<BinaryOperator>(Inst)->setIsExact(true); + if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/ false)) + return true; + if (Exact) + cast<BinaryOperator>(Inst)->setIsExact(true); return false; } case lltok::kw_urem: - case lltok::kw_srem: return ParseArithmetic(Inst, PFS, KeywordVal, - /*IsFP*/false); + case lltok::kw_srem: + return ParseArithmetic(Inst, PFS, KeywordVal, + /*IsFP*/ false); case 
lltok::kw_and: case lltok::kw_or: - case lltok::kw_xor: return ParseLogical(Inst, PFS, KeywordVal); - case lltok::kw_icmp: return ParseCompare(Inst, PFS, KeywordVal); + case lltok::kw_xor: + return ParseLogical(Inst, PFS, KeywordVal); + case lltok::kw_icmp: + return ParseCompare(Inst, PFS, KeywordVal); case lltok::kw_fcmp: { FastMathFlags FMF = EatFastMathFlagsIfPresent(); int Res = ParseCompare(Inst, PFS, KeywordVal); @@ -5784,7 +6113,8 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_fptoui: case lltok::kw_fptosi: case lltok::kw_inttoptr: - case lltok::kw_ptrtoint: return ParseCast(Inst, PFS, KeywordVal); + case lltok::kw_ptrtoint: + return ParseCast(Inst, PFS, KeywordVal); // Other. case lltok::kw_select: { FastMathFlags FMF = EatFastMathFlagsIfPresent(); @@ -5799,27 +6129,46 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, } return 0; } - case lltok::kw_va_arg: return ParseVA_Arg(Inst, PFS); - case lltok::kw_extractelement: return ParseExtractElement(Inst, PFS); - case lltok::kw_insertelement: return ParseInsertElement(Inst, PFS); - case lltok::kw_shufflevector: return ParseShuffleVector(Inst, PFS); - case lltok::kw_phi: return ParsePHI(Inst, PFS); - case lltok::kw_landingpad: return ParseLandingPad(Inst, PFS); + case lltok::kw_va_arg: + return ParseVA_Arg(Inst, PFS); + case lltok::kw_extractelement: + return ParseExtractElement(Inst, PFS); + case lltok::kw_insertelement: + return ParseInsertElement(Inst, PFS); + case lltok::kw_shufflevector: + return ParseShuffleVector(Inst, PFS); + case lltok::kw_phi: + return ParsePHI(Inst, PFS); + case lltok::kw_landingpad: + return ParseLandingPad(Inst, PFS); // Call. 
- case lltok::kw_call: return ParseCall(Inst, PFS, CallInst::TCK_None); - case lltok::kw_tail: return ParseCall(Inst, PFS, CallInst::TCK_Tail); - case lltok::kw_musttail: return ParseCall(Inst, PFS, CallInst::TCK_MustTail); - case lltok::kw_notail: return ParseCall(Inst, PFS, CallInst::TCK_NoTail); + case lltok::kw_call: + return ParseCall(Inst, PFS, CallInst::TCK_None); + case lltok::kw_tail: + return ParseCall(Inst, PFS, CallInst::TCK_Tail); + case lltok::kw_musttail: + return ParseCall(Inst, PFS, CallInst::TCK_MustTail); + case lltok::kw_notail: + return ParseCall(Inst, PFS, CallInst::TCK_NoTail); // Memory. - case lltok::kw_alloca: return ParseAlloc(Inst, PFS); - case lltok::kw_load: return ParseLoad(Inst, PFS); - case lltok::kw_store: return ParseStore(Inst, PFS); - case lltok::kw_cmpxchg: return ParseCmpXchg(Inst, PFS); - case lltok::kw_atomicrmw: return ParseAtomicRMW(Inst, PFS); - case lltok::kw_fence: return ParseFence(Inst, PFS); - case lltok::kw_getelementptr: return ParseGetElementPtr(Inst, PFS); - case lltok::kw_extractvalue: return ParseExtractValue(Inst, PFS); - case lltok::kw_insertvalue: return ParseInsertValue(Inst, PFS); + case lltok::kw_alloca: + return ParseAlloc(Inst, PFS); + case lltok::kw_load: + return ParseLoad(Inst, PFS); + case lltok::kw_store: + return ParseStore(Inst, PFS); + case lltok::kw_cmpxchg: + return ParseCmpXchg(Inst, PFS); + case lltok::kw_atomicrmw: + return ParseAtomicRMW(Inst, PFS); + case lltok::kw_fence: + return ParseFence(Inst, PFS); + case lltok::kw_getelementptr: + return ParseGetElementPtr(Inst, PFS); + case lltok::kw_extractvalue: + return ParseExtractValue(Inst, PFS); + case lltok::kw_insertvalue: + return ParseInsertValue(Inst, PFS); } } @@ -5827,37 +6176,91 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, bool LLParser::ParseCmpPredicate(unsigned &P, unsigned Opc) { if (Opc == Instruction::FCmp) { switch (Lex.getKind()) { - default: return TokError("expected fcmp predicate (e.g. 
'oeq')"); - case lltok::kw_oeq: P = CmpInst::FCMP_OEQ; break; - case lltok::kw_one: P = CmpInst::FCMP_ONE; break; - case lltok::kw_olt: P = CmpInst::FCMP_OLT; break; - case lltok::kw_ogt: P = CmpInst::FCMP_OGT; break; - case lltok::kw_ole: P = CmpInst::FCMP_OLE; break; - case lltok::kw_oge: P = CmpInst::FCMP_OGE; break; - case lltok::kw_ord: P = CmpInst::FCMP_ORD; break; - case lltok::kw_uno: P = CmpInst::FCMP_UNO; break; - case lltok::kw_ueq: P = CmpInst::FCMP_UEQ; break; - case lltok::kw_une: P = CmpInst::FCMP_UNE; break; - case lltok::kw_ult: P = CmpInst::FCMP_ULT; break; - case lltok::kw_ugt: P = CmpInst::FCMP_UGT; break; - case lltok::kw_ule: P = CmpInst::FCMP_ULE; break; - case lltok::kw_uge: P = CmpInst::FCMP_UGE; break; - case lltok::kw_true: P = CmpInst::FCMP_TRUE; break; - case lltok::kw_false: P = CmpInst::FCMP_FALSE; break; + default: + return TokError("expected fcmp predicate (e.g. 'oeq')"); + case lltok::kw_oeq: + P = CmpInst::FCMP_OEQ; + break; + case lltok::kw_one: + P = CmpInst::FCMP_ONE; + break; + case lltok::kw_olt: + P = CmpInst::FCMP_OLT; + break; + case lltok::kw_ogt: + P = CmpInst::FCMP_OGT; + break; + case lltok::kw_ole: + P = CmpInst::FCMP_OLE; + break; + case lltok::kw_oge: + P = CmpInst::FCMP_OGE; + break; + case lltok::kw_ord: + P = CmpInst::FCMP_ORD; + break; + case lltok::kw_uno: + P = CmpInst::FCMP_UNO; + break; + case lltok::kw_ueq: + P = CmpInst::FCMP_UEQ; + break; + case lltok::kw_une: + P = CmpInst::FCMP_UNE; + break; + case lltok::kw_ult: + P = CmpInst::FCMP_ULT; + break; + case lltok::kw_ugt: + P = CmpInst::FCMP_UGT; + break; + case lltok::kw_ule: + P = CmpInst::FCMP_ULE; + break; + case lltok::kw_uge: + P = CmpInst::FCMP_UGE; + break; + case lltok::kw_true: + P = CmpInst::FCMP_TRUE; + break; + case lltok::kw_false: + P = CmpInst::FCMP_FALSE; + break; } } else { switch (Lex.getKind()) { - default: return TokError("expected icmp predicate (e.g. 
'eq')"); - case lltok::kw_eq: P = CmpInst::ICMP_EQ; break; - case lltok::kw_ne: P = CmpInst::ICMP_NE; break; - case lltok::kw_slt: P = CmpInst::ICMP_SLT; break; - case lltok::kw_sgt: P = CmpInst::ICMP_SGT; break; - case lltok::kw_sle: P = CmpInst::ICMP_SLE; break; - case lltok::kw_sge: P = CmpInst::ICMP_SGE; break; - case lltok::kw_ult: P = CmpInst::ICMP_ULT; break; - case lltok::kw_ugt: P = CmpInst::ICMP_UGT; break; - case lltok::kw_ule: P = CmpInst::ICMP_ULE; break; - case lltok::kw_uge: P = CmpInst::ICMP_UGE; break; + default: + return TokError("expected icmp predicate (e.g. 'eq')"); + case lltok::kw_eq: + P = CmpInst::ICMP_EQ; + break; + case lltok::kw_ne: + P = CmpInst::ICMP_NE; + break; + case lltok::kw_slt: + P = CmpInst::ICMP_SLT; + break; + case lltok::kw_sgt: + P = CmpInst::ICMP_SGT; + break; + case lltok::kw_sle: + P = CmpInst::ICMP_SLE; + break; + case lltok::kw_sge: + P = CmpInst::ICMP_SGE; + break; + case lltok::kw_ult: + P = CmpInst::ICMP_ULT; + break; + case lltok::kw_ugt: + P = CmpInst::ICMP_UGT; + break; + case lltok::kw_ule: + P = CmpInst::ICMP_ULE; + break; + case lltok::kw_uge: + P = CmpInst::ICMP_UGE; + break; } } Lex.Lex(); @@ -5875,25 +6278,27 @@ bool LLParser::ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS) { SMLoc TypeLoc = Lex.getLoc(); Type *Ty = nullptr; - if (ParseType(Ty, true /*void allowed*/)) return true; + if (ParseType(Ty, true /*void allowed*/)) + return true; Type *ResType = PFS.getFunction().getReturnType(); if (Ty->isVoidTy()) { if (!ResType->isVoidTy()) return Error(TypeLoc, "value doesn't match function result type '" + - getTypeString(ResType) + "'"); + getTypeString(ResType) + "'"); Inst = ReturnInst::Create(Context); return false; } Value *RV; - if (ParseValue(Ty, RV, PFS)) return true; + if (ParseValue(Ty, RV, PFS)) + return true; if (ResType != RV->getType()) return Error(TypeLoc, "value doesn't match function result type '" + - getTypeString(ResType) + "'"); + getTypeString(ResType) + "'"); Inst = 
ReturnInst::Create(Context, RV); return false; @@ -5906,7 +6311,8 @@ bool LLParser::ParseBr(Instruction *&Inst, PerFunctionState &PFS) { LocTy Loc, Loc2; Value *Op0; BasicBlock *Op1, *Op2; - if (ParseTypeAndValue(Op0, Loc, PFS)) return true; + if (ParseTypeAndValue(Op0, Loc, PFS)) + return true; if (BasicBlock *BB = dyn_cast<BasicBlock>(Op0)) { Inst = BranchInst::Create(BB); @@ -5945,8 +6351,8 @@ bool LLParser::ParseSwitch(Instruction *&Inst, PerFunctionState &PFS) { return Error(CondLoc, "switch condition must have integer type"); // Parse the jump table pairs. - SmallPtrSet<Value*, 32> SeenCases; - SmallVector<std::pair<ConstantInt*, BasicBlock*>, 32> Table; + SmallPtrSet<Value *, 32> SeenCases; + SmallVector<std::pair<ConstantInt *, BasicBlock *>, 32> Table; while (Lex.getKind() != lltok::rsquare) { Value *Constant; BasicBlock *DestBB; @@ -5964,7 +6370,7 @@ bool LLParser::ParseSwitch(Instruction *&Inst, PerFunctionState &PFS) { Table.push_back(std::make_pair(cast<ConstantInt>(Constant), DestBB)); } - Lex.Lex(); // Eat the ']'. + Lex.Lex(); // Eat the ']'. SwitchInst *SI = SwitchInst::Create(Cond, DefaultBB, Table.size()); for (unsigned i = 0, e = Table.size(); i != e; ++i) @@ -5988,7 +6394,7 @@ bool LLParser::ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) { return Error(AddrLoc, "indirectbr address must have pointer type"); // Parse the destination list. - SmallVector<BasicBlock*, 16> DestList; + SmallVector<BasicBlock *, 16> DestList; if (Lex.getKind() != lltok::rsquare) { BasicBlock *DestBB; @@ -6049,7 +6455,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { FunctionType *Ty = dyn_cast<FunctionType>(RetType); if (!Ty) { // Pull out the types of all of the arguments... 
- std::vector<Type*> ParamTypes; + std::vector<Type *> ParamTypes; for (unsigned i = 0, e = ArgList.size(); i != e; ++i) ParamTypes.push_back(ArgList[i].V->getType()); @@ -6085,7 +6491,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { if (ExpectedTy && ExpectedTy != ArgList[i].V->getType()) return Error(ArgList[i].Loc, "argument is not of expected type '" + - getTypeString(ExpectedTy) + "'"); + getTypeString(ExpectedTy) + "'"); Args.push_back(ArgList[i].V); ArgAttrs.push_back(ArgList[i].Attrs); } @@ -6113,7 +6519,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { /// ParseResume /// ::= 'resume' TypeAndValue bool LLParser::ParseResume(Instruction *&Inst, PerFunctionState &PFS) { - Value *Exn; LocTy ExnLoc; + Value *Exn; + LocTy ExnLoc; if (ParseTypeAndValue(Exn, ExnLoc, PFS)) return true; @@ -6150,7 +6557,7 @@ bool LLParser::ParseExceptionArgs(SmallVectorImpl<Value *> &Args, Args.push_back(V); } - Lex.Lex(); // Lex the ']'. + Lex.Lex(); // Lex the ']'. return false; } @@ -6197,7 +6604,7 @@ bool LLParser::ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS) { BasicBlock *BB; if (ParseToken(lltok::kw_to, "expected 'to' in catchret") || ParseTypeAndBasicBlock(BB, PFS)) - return true; + return true; Inst = CatchReturnInst::Create(CatchPad, BB); return false; @@ -6232,8 +6639,7 @@ bool LLParser::ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS) { if (ParseToken(lltok::rsquare, "expected ']' after catchswitch labels")) return true; - if (ParseToken(lltok::kw_unwind, - "expected 'unwind' after catchswitch scope")) + if (ParseToken(lltok::kw_unwind, "expected 'unwind' after catchswitch scope")) return true; BasicBlock *UnwindBB = nullptr; @@ -6309,7 +6715,8 @@ bool LLParser::ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS) { /// operand is allowed. 
bool LLParser::ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, bool IsFP) { - LocTy Loc; Value *LHS; + LocTy Loc; + Value *LHS; if (ParseTypeAndValue(LHS, Loc, PFS)) return true; @@ -6431,9 +6838,8 @@ bool LLParser::ParseCallBr(Instruction *&Inst, PerFunctionState &PFS) { AttributeList::get(Context, AttributeSet::get(Context, FnAttrs), AttributeSet::get(Context, RetAttrs), ArgAttrs); - CallBrInst *CBI = - CallBrInst::Create(Ty, Callee, DefaultDest, IndirectDests, Args, - BundleList); + CallBrInst *CBI = CallBrInst::Create(Ty, Callee, DefaultDest, IndirectDests, + Args, BundleList); CBI->setCallingConv(CC); CBI->setAttributes(PAL); ForwardRefAttrGroups[CBI] = FwdRefAttrGrps; @@ -6452,7 +6858,8 @@ bool LLParser::ParseCallBr(Instruction *&Inst, PerFunctionState &PFS) { /// operand is allowed. bool LLParser::ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, bool IsFP) { - LocTy Loc; Value *LHS, *RHS; + LocTy Loc; + Value *LHS, *RHS; if (ParseTypeAndValue(LHS, Loc, PFS) || ParseToken(lltok::comma, "expected ',' in arithmetic operation") || ParseValue(LHS->getType(), RHS, PFS)) @@ -6472,14 +6879,16 @@ bool LLParser::ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, /// ::= ArithmeticOps TypeAndValue ',' Value { bool LLParser::ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc) { - LocTy Loc; Value *LHS, *RHS; + LocTy Loc; + Value *LHS, *RHS; if (ParseTypeAndValue(LHS, Loc, PFS) || ParseToken(lltok::comma, "expected ',' in logical operation") || ParseValue(LHS->getType(), RHS, PFS)) return true; if (!LHS->getType()->isIntOrIntVectorTy()) - return Error(Loc,"instruction requires integer or integer vector operands"); + return Error(Loc, + "instruction requires integer or integer vector operands"); Inst = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); return false; @@ -6494,8 +6903,7 @@ bool LLParser::ParseCompare(Instruction *&Inst, PerFunctionState &PFS, LocTy Loc; unsigned Pred; Value 
*LHS, *RHS; - if (ParseCmpPredicate(Pred, Opc) || - ParseTypeAndValue(LHS, Loc, PFS) || + if (ParseCmpPredicate(Pred, Opc) || ParseTypeAndValue(LHS, Loc, PFS) || ParseToken(lltok::comma, "expected ',' after compare value") || ParseValue(LHS->getType(), RHS, PFS)) return true; @@ -6518,7 +6926,6 @@ bool LLParser::ParseCompare(Instruction *&Inst, PerFunctionState &PFS, // Other Instructions. //===----------------------------------------------------------------------===// - /// ParseCast /// ::= CastOpc TypeAndValue 'to' Type bool LLParser::ParseCast(Instruction *&Inst, PerFunctionState &PFS, @@ -6534,8 +6941,8 @@ bool LLParser::ParseCast(Instruction *&Inst, PerFunctionState &PFS, if (!CastInst::castIsValid((Instruction::CastOps)Opc, Op, DestTy)) { CastInst::castIsValid((Instruction::CastOps)Opc, Op, DestTy); return Error(Loc, "invalid cast opcode for cast from '" + - getTypeString(Op->getType()) + "' to '" + - getTypeString(DestTy) + "'"); + getTypeString(Op->getType()) + "' to '" + + getTypeString(DestTy) + "'"); } Inst = CastInst::Create((Instruction::CastOps)Opc, Op, DestTy); return false; @@ -6636,7 +7043,8 @@ bool LLParser::ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS) { /// ParsePHI /// ::= 'phi' Type '[' Value ',' Value ']' (',' '[' Value ',' Value ']')* int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) { - Type *Ty = nullptr; LocTy TypeLoc; + Type *Ty = nullptr; + LocTy TypeLoc; Value *Op0, *Op1; if (ParseType(Ty, TypeLoc) || @@ -6648,7 +7056,7 @@ int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) { return true; bool AteExtraComma = false; - SmallVector<std::pair<Value*, BasicBlock*>, 16> PHIVals; + SmallVector<std::pair<Value *, BasicBlock *>, 16> PHIVals; while (true) { PHIVals.push_back(std::make_pair(Op0, cast<BasicBlock>(Op1))); @@ -6686,7 +7094,8 @@ int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) { /// ::= 'filter' /// ::= 'filter' TypeAndValue ( ',' TypeAndValue )* bool 
LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { - Type *Ty = nullptr; LocTy TyLoc; + Type *Ty = nullptr; + LocTy TyLoc; if (ParseType(Ty, TyLoc)) return true; @@ -6694,7 +7103,8 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { std::unique_ptr<LandingPadInst> LP(LandingPadInst::Create(Ty, 0)); LP->setCleanup(EatIfPresent(lltok::kw_cleanup)); - while (Lex.getKind() == lltok::kw_catch || Lex.getKind() == lltok::kw_filter){ + while (Lex.getKind() == lltok::kw_catch || + Lex.getKind() == lltok::kw_filter) { LandingPadInst::ClauseType CT; if (EatIfPresent(lltok::kw_catch)) CT = LandingPadInst::Catch; @@ -6778,7 +7188,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, FunctionType *Ty = dyn_cast<FunctionType>(RetType); if (!Ty) { // Pull out the types of all of the arguments... - std::vector<Type*> ParamTypes; + std::vector<Type *> ParamTypes; for (unsigned i = 0, e = ArgList.size(); i != e; ++i) ParamTypes.push_back(ArgList[i].V->getType()); @@ -6799,7 +7209,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, // Set up the Attribute for the function. SmallVector<AttributeSet, 8> Attrs; - SmallVector<Value*, 8> Args; + SmallVector<Value *, 8> Args; // Loop through FunctionType's arguments and ensure they are specified // correctly. Also, gather any parameter attributes. 
@@ -6815,7 +7225,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, if (ExpectedTy && ExpectedTy != ArgList[i].V->getType()) return Error(ArgList[i].Loc, "argument is not of expected type '" + - getTypeString(ExpectedTy) + "'"); + getTypeString(ExpectedTy) + "'"); Args.push_back(ArgList[i].V); Attrs.push_back(ArgList[i].Attrs); } @@ -6859,7 +7269,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) { bool IsInAlloca = EatIfPresent(lltok::kw_inalloca); bool IsSwiftError = EatIfPresent(lltok::kw_swifterror); - if (ParseType(Ty, TyLoc)) return true; + if (ParseType(Ty, TyLoc)) + return true; if (Ty->isFunctionTy() || !PointerType::isValidElementType(Ty)) return Error(TyLoc, "invalid type for alloca"); @@ -6912,7 +7323,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) { /// ::= 'load' 'atomic' 'volatile'? TypeAndValue /// 'singlethread'? AtomicOrdering (',' 'align' i32)? int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) { - Value *Val; LocTy Loc; + Value *Val; + LocTy Loc; unsigned Alignment = 0; bool AteExtraComma = false; bool isAtomic = false; @@ -6961,7 +7373,8 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) { /// ::= 'store' 'atomic' 'volatile'? TypeAndValue ',' TypeAndValue /// 'singlethread'? AtomicOrdering (',' 'align' i32)? int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) { - Value *Val, *Ptr; LocTy Loc, PtrLoc; + Value *Val, *Ptr; + LocTy Loc, PtrLoc; unsigned Alignment = 0; bool AteExtraComma = false; bool isAtomic = false; @@ -7006,7 +7419,8 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) { /// ::= 'cmpxchg' 'weak'? 'volatile'? TypeAndValue ',' TypeAndValue ',' /// TypeAndValue 'singlethread'? 
AtomicOrdering AtomicOrdering int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) { - Value *Ptr, *Cmp, *New; LocTy PtrLoc, CmpLoc, NewLoc; + Value *Ptr, *Cmp, *New; + LocTy PtrLoc, CmpLoc, NewLoc; bool AteExtraComma = false; AtomicOrdering SuccessOrdering = AtomicOrdering::NotAtomic; AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; @@ -7047,8 +7461,8 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) { return Error(NewLoc, "new value and pointer type do not match"); if (!New->getType()->isFirstClassType()) return Error(NewLoc, "cmpxchg operand must be a first class value"); - AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst( - Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SSID); + AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering, + FailureOrdering, SSID); CXI->setVolatile(isVolatile); CXI->setWeak(isWeak); Inst = CXI; @@ -7059,7 +7473,8 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) { /// ::= 'atomicrmw' 'volatile'? BinOp TypeAndValue ',' TypeAndValue /// 'singlethread'? 
AtomicOrdering int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { - Value *Ptr, *Val; LocTy PtrLoc, ValLoc; + Value *Ptr, *Val; + LocTy PtrLoc, ValLoc; bool AteExtraComma = false; AtomicOrdering Ordering = AtomicOrdering::NotAtomic; SyncScope::ID SSID = SyncScope::System; @@ -7071,18 +7486,41 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { isVolatile = true; switch (Lex.getKind()) { - default: return TokError("expected binary operation in atomicrmw"); - case lltok::kw_xchg: Operation = AtomicRMWInst::Xchg; break; - case lltok::kw_add: Operation = AtomicRMWInst::Add; break; - case lltok::kw_sub: Operation = AtomicRMWInst::Sub; break; - case lltok::kw_and: Operation = AtomicRMWInst::And; break; - case lltok::kw_nand: Operation = AtomicRMWInst::Nand; break; - case lltok::kw_or: Operation = AtomicRMWInst::Or; break; - case lltok::kw_xor: Operation = AtomicRMWInst::Xor; break; - case lltok::kw_max: Operation = AtomicRMWInst::Max; break; - case lltok::kw_min: Operation = AtomicRMWInst::Min; break; - case lltok::kw_umax: Operation = AtomicRMWInst::UMax; break; - case lltok::kw_umin: Operation = AtomicRMWInst::UMin; break; + default: + return TokError("expected binary operation in atomicrmw"); + case lltok::kw_xchg: + Operation = AtomicRMWInst::Xchg; + break; + case lltok::kw_add: + Operation = AtomicRMWInst::Add; + break; + case lltok::kw_sub: + Operation = AtomicRMWInst::Sub; + break; + case lltok::kw_and: + Operation = AtomicRMWInst::And; + break; + case lltok::kw_nand: + Operation = AtomicRMWInst::Nand; + break; + case lltok::kw_or: + Operation = AtomicRMWInst::Or; + break; + case lltok::kw_xor: + Operation = AtomicRMWInst::Xor; + break; + case lltok::kw_max: + Operation = AtomicRMWInst::Max; + break; + case lltok::kw_min: + Operation = AtomicRMWInst::Min; + break; + case lltok::kw_umax: + Operation = AtomicRMWInst::UMax; + break; + case lltok::kw_umin: + Operation = AtomicRMWInst::UMin; + break; case lltok::kw_fadd: 
Operation = AtomicRMWInst::FAdd; IsFP = true; @@ -7092,7 +7530,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { IsFP = true; break; } - Lex.Lex(); // Eat the operation. + Lex.Lex(); // Eat the operation. if (ParseTypeAndValue(Ptr, PtrLoc, PFS) || ParseToken(lltok::comma, "expected ',' after atomicrmw address") || @@ -7110,21 +7548,21 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { if (Operation == AtomicRMWInst::Xchg) { if (!Val->getType()->isIntegerTy() && !Val->getType()->isFloatingPointTy()) { - return Error(ValLoc, "atomicrmw " + - AtomicRMWInst::getOperationName(Operation) + - " operand must be an integer or floating point type"); + return Error(ValLoc, + "atomicrmw " + AtomicRMWInst::getOperationName(Operation) + + " operand must be an integer or floating point type"); } } else if (IsFP) { if (!Val->getType()->isFloatingPointTy()) { return Error(ValLoc, "atomicrmw " + - AtomicRMWInst::getOperationName(Operation) + - " operand must be a floating point type"); + AtomicRMWInst::getOperationName(Operation) + + " operand must be a floating point type"); } } else { if (!Val->getType()->isIntegerTy()) { return Error(ValLoc, "atomicrmw " + - AtomicRMWInst::getOperationName(Operation) + - " operand must be an integer"); + AtomicRMWInst::getOperationName(Operation) + + " operand must be an integer"); } } @@ -7133,8 +7571,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized" " integer"); - AtomicRMWInst *RMWI = - new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID); + AtomicRMWInst *RMWI = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID); RMWI->setVolatile(isVolatile); Inst = RMWI; return AteExtraComma ? 
InstExtraComma : InstNormal; @@ -7174,7 +7611,8 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) { return true; Type *BaseType = Ptr->getType(); - PointerType *BasePointerType = dyn_cast<PointerType>(BaseType->getScalarType()); + PointerType *BasePointerType = + dyn_cast<PointerType>(BaseType->getScalarType()); if (!BasePointerType) return Error(Loc, "base of getelementptr must be a pointer"); @@ -7182,33 +7620,35 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) { return Error(ExplicitTypeLoc, "explicit pointee type doesn't match operand's pointee type"); - SmallVector<Value*, 16> Indices; + SmallVector<Value *, 16> Indices; bool AteExtraComma = false; // GEP returns a vector of pointers if at least one of parameters is a vector. // All vector parameters should have the same vector width. - unsigned GEPWidth = BaseType->isVectorTy() ? - BaseType->getVectorNumElements() : 0; + unsigned GEPWidth = + BaseType->isVectorTy() ? BaseType->getVectorNumElements() : 0; while (EatIfPresent(lltok::comma)) { if (Lex.getKind() == lltok::MetadataVar) { AteExtraComma = true; break; } - if (ParseTypeAndValue(Val, EltLoc, PFS)) return true; + if (ParseTypeAndValue(Val, EltLoc, PFS)) + return true; if (!Val->getType()->isIntOrIntVectorTy()) return Error(EltLoc, "getelementptr index must be an integer"); if (Val->getType()->isVectorTy()) { unsigned ValNumEl = Val->getType()->getVectorNumElements(); if (GEPWidth && GEPWidth != ValNumEl) - return Error(EltLoc, - "getelementptr vector index has a wrong number of elements"); + return Error( + EltLoc, + "getelementptr vector index has a wrong number of elements"); GEPWidth = ValNumEl; } Indices.push_back(Val); } - SmallPtrSet<Type*, 4> Visited; + SmallPtrSet<Type *, 4> Visited; if (!Indices.empty() && !Ty->isSized(&Visited)) return Error(Loc, "base element of getelementptr must be sized"); @@ -7223,7 +7663,8 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState 
&PFS) { /// ParseExtractValue /// ::= 'extractvalue' TypeAndValue (',' uint32)+ int LLParser::ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS) { - Value *Val; LocTy Loc; + Value *Val; + LocTy Loc; SmallVector<unsigned, 4> Indices; bool AteExtraComma; if (ParseTypeAndValue(Val, Loc, PFS) || @@ -7242,7 +7683,8 @@ int LLParser::ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS) { /// ParseInsertValue /// ::= 'insertvalue' TypeAndValue ',' TypeAndValue (',' uint32)+ int LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) { - Value *Val0, *Val1; LocTy Loc0, Loc1; + Value *Val0, *Val1; + LocTy Loc0, Loc1; SmallVector<unsigned, 4> Indices; bool AteExtraComma; if (ParseTypeAndValue(Val0, Loc0, PFS) || @@ -7254,7 +7696,8 @@ int LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) { if (!Val0->getType()->isAggregateType()) return Error(Loc0, "insertvalue operand must be aggregate type"); - Type *IndexedType = ExtractValueInst::getIndexedType(Val0->getType(), Indices); + Type *IndexedType = + ExtractValueInst::getIndexedType(Val0->getType(), Indices); if (!IndexedType) return Error(Loc0, "invalid indices for insertvalue"); if (IndexedType != Val1->getType()) @@ -7359,7 +7802,8 @@ bool LLParser::ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes) { if (Indexes.size() < 2) return Error(Loc, "expected >= 2 uselistorder indexes"); if (Offset != 0 || Max >= Indexes.size()) - return Error(Loc, "expected distinct uselistorder indexes in range [0, size)"); + return Error(Loc, + "expected distinct uselistorder indexes in range [0, size)"); if (IsOrdered) return Error(Loc, "expected uselistorder indexes to change the order"); @@ -7408,7 +7852,8 @@ bool LLParser::ParseUseListOrderBB() { else return Error(Fn.Loc, "expected function name in uselistorder_bb"); if (!GV) - return Error(Fn.Loc, "invalid function forward reference in uselistorder_bb"); + return Error(Fn.Loc, + "invalid function forward reference in 
uselistorder_bb"); auto *F = dyn_cast<Function>(GV); if (!F) return Error(Fn.Loc, "expected function name in uselistorder_bb"); diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.h b/hpvm/llvm_patches/lib/AsmParser/LLParser.h index 610e2e262008190fc3102c4833846c2f70abe712..bc1983232f0570d816a23bc1f92ce490a44dee59 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLParser.h +++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.h @@ -26,587 +26,606 @@ #include <map> namespace llvm { - class Module; - class OpaqueType; - class Function; - class Value; - class BasicBlock; - class Instruction; - class Constant; - class GlobalValue; - class Comdat; - class MDString; - class MDNode; - struct SlotMapping; - class StructType; - - /// ValID - Represents a reference of a definition of some sort with no type. - /// There are several cases where we have to parse the value but where the - /// type can depend on later context. This may either be a numeric reference - /// or a symbolic (%var) reference. This is just a discriminated union. - struct ValID { - enum { - t_LocalID, t_GlobalID, // ID in UIntVal. - t_LocalName, t_GlobalName, // Name in StrVal. - t_APSInt, t_APFloat, // Value in APSIntVal/APFloatVal. - t_Null, t_Undef, t_Zero, t_None, // No value. - t_EmptyArray, // No value: [] - t_Constant, // Value in ConstantVal. - t_InlineAsm, // Value in FTy/StrVal/StrVal2/UIntVal. - t_ConstantStruct, // Value in ConstantStructElts. - t_PackedConstantStruct // Value in ConstantStructElts. 
- } Kind = t_LocalID; - - LLLexer::LocTy Loc; - unsigned UIntVal; - FunctionType *FTy = nullptr; - std::string StrVal, StrVal2; - APSInt APSIntVal; - APFloat APFloatVal{0.0}; - Constant *ConstantVal; - std::unique_ptr<Constant *[]> ConstantStructElts; - - ValID() = default; - ValID(const ValID &RHS) - : Kind(RHS.Kind), Loc(RHS.Loc), UIntVal(RHS.UIntVal), FTy(RHS.FTy), - StrVal(RHS.StrVal), StrVal2(RHS.StrVal2), APSIntVal(RHS.APSIntVal), - APFloatVal(RHS.APFloatVal), ConstantVal(RHS.ConstantVal) { - assert(!RHS.ConstantStructElts); - } +class Module; +class OpaqueType; +class Function; +class Value; +class BasicBlock; +class Instruction; +class Constant; +class GlobalValue; +class Comdat; +class MDString; +class MDNode; +struct SlotMapping; +class StructType; + +/// ValID - Represents a reference of a definition of some sort with no type. +/// There are several cases where we have to parse the value but where the +/// type can depend on later context. This may either be a numeric reference +/// or a symbolic (%var) reference. This is just a discriminated union. +struct ValID { + enum { + t_LocalID, + t_GlobalID, // ID in UIntVal. + t_LocalName, + t_GlobalName, // Name in StrVal. + t_APSInt, + t_APFloat, // Value in APSIntVal/APFloatVal. + t_Null, + t_Undef, + t_Zero, + t_None, // No value. + t_EmptyArray, // No value: [] + t_Constant, // Value in ConstantVal. + t_InlineAsm, // Value in FTy/StrVal/StrVal2/UIntVal. + t_ConstantStruct, // Value in ConstantStructElts. + t_PackedConstantStruct // Value in ConstantStructElts. 
+ } Kind = t_LocalID; + + LLLexer::LocTy Loc; + unsigned UIntVal; + FunctionType *FTy = nullptr; + std::string StrVal, StrVal2; + APSInt APSIntVal; + APFloat APFloatVal{0.0}; + Constant *ConstantVal; + std::unique_ptr<Constant *[]> ConstantStructElts; + + ValID() = default; + ValID(const ValID &RHS) + : Kind(RHS.Kind), Loc(RHS.Loc), UIntVal(RHS.UIntVal), FTy(RHS.FTy), + StrVal(RHS.StrVal), StrVal2(RHS.StrVal2), APSIntVal(RHS.APSIntVal), + APFloatVal(RHS.APFloatVal), ConstantVal(RHS.ConstantVal) { + assert(!RHS.ConstantStructElts); + } + + bool operator<(const ValID &RHS) const { + if (Kind == t_LocalID || Kind == t_GlobalID) + return UIntVal < RHS.UIntVal; + assert((Kind == t_LocalName || Kind == t_GlobalName || + Kind == t_ConstantStruct || Kind == t_PackedConstantStruct) && + "Ordering not defined for this ValID kind yet"); + return StrVal < RHS.StrVal; + } +}; + +class LLParser { +public: + typedef LLLexer::LocTy LocTy; + +private: + LLVMContext &Context; + LLLexer Lex; + // Module being parsed, null if we are only parsing summary index. + Module *M; + // Summary index being parsed, null if we are only parsing Module. + ModuleSummaryIndex *Index; + SlotMapping *Slots; + + // Instruction metadata resolution. Each instruction can have a list of + // MDRef info associated with them. + // + // The simpler approach of just creating temporary MDNodes and then calling + // RAUW on them when the definition is processed doesn't work because some + // instruction metadata kinds, such as dbg, get stored in the IR in an + // "optimized" format which doesn't participate in the normal value use + // lists. This means that RAUW doesn't work, even on temporary MDNodes + // which otherwise support RAUW. Instead, we defer resolving MDNode + // references until the definitions have been processed. 
+ struct MDRef { + SMLoc Loc; + unsigned MDKind, MDSlot; + }; - bool operator<(const ValID &RHS) const { - if (Kind == t_LocalID || Kind == t_GlobalID) - return UIntVal < RHS.UIntVal; - assert((Kind == t_LocalName || Kind == t_GlobalName || - Kind == t_ConstantStruct || Kind == t_PackedConstantStruct) && - "Ordering not defined for this ValID kind yet"); - return StrVal < RHS.StrVal; + SmallVector<Instruction *, 64> InstsWithTBAATag; + + // Type resolution handling data structures. The location is set when we + // have processed a use of the type but not a definition yet. + StringMap<std::pair<Type *, LocTy>> NamedTypes; + std::map<unsigned, std::pair<Type *, LocTy>> NumberedTypes; + + std::map<unsigned, TrackingMDNodeRef> NumberedMetadata; + std::map<unsigned, std::pair<TempMDTuple, LocTy>> ForwardRefMDNodes; + + // Global Value reference information. + std::map<std::string, std::pair<GlobalValue *, LocTy>> ForwardRefVals; + std::map<unsigned, std::pair<GlobalValue *, LocTy>> ForwardRefValIDs; + std::vector<GlobalValue *> NumberedVals; + + // Comdat forward reference information. + std::map<std::string, LocTy> ForwardRefComdats; + + // References to blockaddress. The key is the function ValID, the value is + // a list of references to blocks in that function. + std::map<ValID, std::map<ValID, GlobalValue *>> ForwardRefBlockAddresses; + class PerFunctionState; + /// Reference to per-function state to allow basic blocks to be + /// forward-referenced by blockaddress instructions within the same + /// function. + PerFunctionState *BlockAddressPFS; + + // Attribute builder reference information. + std::map<Value *, std::vector<unsigned>> ForwardRefAttrGroups; + std::map<unsigned, AttrBuilder> NumberedAttrBuilders; + + // Summary global value reference information. 
+ std::map<unsigned, std::vector<std::pair<ValueInfo *, LocTy>>> + ForwardRefValueInfos; + std::map<unsigned, std::vector<std::pair<AliasSummary *, LocTy>>> + ForwardRefAliasees; + std::vector<ValueInfo> NumberedValueInfos; + + // Summary type id reference information. + std::map<unsigned, std::vector<std::pair<GlobalValue::GUID *, LocTy>>> + ForwardRefTypeIds; + + // Map of module ID to path. + std::map<unsigned, StringRef> ModuleIdMap; + + /// Only the llvm-as tool may set this to false to bypass + /// UpgradeDebuginfo so it can generate broken bitcode. + bool UpgradeDebugInfo; + + /// DataLayout string to override that in LLVM assembly. + StringRef DataLayoutStr; + + std::string SourceFileName; + +public: + LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M, + ModuleSummaryIndex *Index, LLVMContext &Context, + SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true, + StringRef DataLayoutString = "") + : Context(Context), Lex(F, SM, Err, Context), M(M), Index(Index), + Slots(Slots), BlockAddressPFS(nullptr), + UpgradeDebugInfo(UpgradeDebugInfo), DataLayoutStr(DataLayoutString) { + if (!DataLayoutStr.empty()) + M->setDataLayout(DataLayoutStr); + } + bool Run(); + + bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots); + + bool parseTypeAtBeginning(Type *&Ty, unsigned &Read, + const SlotMapping *Slots); + + LLVMContext &getContext() { return Context; } + +private: + bool Error(LocTy L, const Twine &Msg) const { return Lex.Error(L, Msg); } + bool TokError(const Twine &Msg) const { return Error(Lex.getLoc(), Msg); } + + /// Restore the internal name and slot mappings using the mappings that + /// were created at an earlier parsing stage. + void restoreParsingState(const SlotMapping *Slots); + + /// GetGlobalVal - Get a value with the specified name or ID, creating a + /// forward reference record if needed. This can return null if the value + /// exists but does not have the right type. 
+ GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc, + bool IsCall); + GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall); + + /// Get a Comdat with the specified name, creating a forward reference + /// record if needed. + Comdat *getComdat(const std::string &Name, LocTy Loc); + + // Helper Routines. + bool ParseToken(lltok::Kind T, const char *ErrMsg); + bool EatIfPresent(lltok::Kind T) { + if (Lex.getKind() != T) + return false; + Lex.Lex(); + return true; + } + + FastMathFlags EatFastMathFlagsIfPresent() { + FastMathFlags FMF; + while (true) + switch (Lex.getKind()) { + case lltok::kw_fast: + FMF.setFast(); + Lex.Lex(); + continue; + case lltok::kw_nnan: + FMF.setNoNaNs(); + Lex.Lex(); + continue; + case lltok::kw_ninf: + FMF.setNoInfs(); + Lex.Lex(); + continue; + case lltok::kw_nsz: + FMF.setNoSignedZeros(); + Lex.Lex(); + continue; + case lltok::kw_arcp: + FMF.setAllowReciprocal(); + Lex.Lex(); + continue; + case lltok::kw_contract: + FMF.setAllowContract(true); + Lex.Lex(); + continue; + case lltok::kw_reassoc: + FMF.setAllowReassoc(); + Lex.Lex(); + continue; + case lltok::kw_afn: + FMF.setApproxFunc(); + Lex.Lex(); + continue; + default: + return FMF; + } + return FMF; + } + + bool ParseOptionalToken(lltok::Kind T, bool &Present, LocTy *Loc = nullptr) { + if (Lex.getKind() != T) { + Present = false; + } else { + if (Loc) + *Loc = Lex.getLoc(); + Lex.Lex(); + Present = true; } + return false; + } + bool ParseStringConstant(std::string &Result); + bool ParseUInt32(unsigned &Val); + bool ParseUInt32(unsigned &Val, LocTy &Loc) { + Loc = Lex.getLoc(); + return ParseUInt32(Val); + } + bool ParseUInt64(uint64_t &Val); + bool ParseUInt64(uint64_t &Val, LocTy &Loc) { + Loc = Lex.getLoc(); + return ParseUInt64(Val); + } + bool ParseFlag(unsigned &Val); + + bool ParseStringAttribute(AttrBuilder &B); + + bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); + bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode 
&TLM); + bool ParseOptionalUnnamedAddr(GlobalVariable::UnnamedAddr &UnnamedAddr); + bool ParseOptionalAddrSpace(unsigned &AddrSpace, unsigned DefaultAS = 0); + bool ParseOptionalProgramAddrSpace(unsigned &AddrSpace) { + return ParseOptionalAddrSpace(AddrSpace, + M->getDataLayout().getProgramAddressSpace()); }; - - class LLParser { - public: - typedef LLLexer::LocTy LocTy; - private: - LLVMContext &Context; - LLLexer Lex; - // Module being parsed, null if we are only parsing summary index. - Module *M; - // Summary index being parsed, null if we are only parsing Module. - ModuleSummaryIndex *Index; - SlotMapping *Slots; - - // Instruction metadata resolution. Each instruction can have a list of - // MDRef info associated with them. - // - // The simpler approach of just creating temporary MDNodes and then calling - // RAUW on them when the definition is processed doesn't work because some - // instruction metadata kinds, such as dbg, get stored in the IR in an - // "optimized" format which doesn't participate in the normal value use - // lists. This means that RAUW doesn't work, even on temporary MDNodes - // which otherwise support RAUW. Instead, we defer resolving MDNode - // references until the definitions have been processed. - struct MDRef { - SMLoc Loc; - unsigned MDKind, MDSlot; - }; - - SmallVector<Instruction*, 64> InstsWithTBAATag; - - // Type resolution handling data structures. The location is set when we - // have processed a use of the type but not a definition yet. - StringMap<std::pair<Type*, LocTy> > NamedTypes; - std::map<unsigned, std::pair<Type*, LocTy> > NumberedTypes; - - std::map<unsigned, TrackingMDNodeRef> NumberedMetadata; - std::map<unsigned, std::pair<TempMDTuple, LocTy>> ForwardRefMDNodes; - - // Global Value reference information. 
- std::map<std::string, std::pair<GlobalValue*, LocTy> > ForwardRefVals; - std::map<unsigned, std::pair<GlobalValue*, LocTy> > ForwardRefValIDs; - std::vector<GlobalValue*> NumberedVals; - - // Comdat forward reference information. - std::map<std::string, LocTy> ForwardRefComdats; - - // References to blockaddress. The key is the function ValID, the value is - // a list of references to blocks in that function. - std::map<ValID, std::map<ValID, GlobalValue *>> ForwardRefBlockAddresses; - class PerFunctionState; - /// Reference to per-function state to allow basic blocks to be - /// forward-referenced by blockaddress instructions within the same - /// function. - PerFunctionState *BlockAddressPFS; - - // Attribute builder reference information. - std::map<Value*, std::vector<unsigned> > ForwardRefAttrGroups; - std::map<unsigned, AttrBuilder> NumberedAttrBuilders; - - // Summary global value reference information. - std::map<unsigned, std::vector<std::pair<ValueInfo *, LocTy>>> - ForwardRefValueInfos; - std::map<unsigned, std::vector<std::pair<AliasSummary *, LocTy>>> - ForwardRefAliasees; - std::vector<ValueInfo> NumberedValueInfos; - - // Summary type id reference information. - std::map<unsigned, std::vector<std::pair<GlobalValue::GUID *, LocTy>>> - ForwardRefTypeIds; - - // Map of module ID to path. - std::map<unsigned, StringRef> ModuleIdMap; - - /// Only the llvm-as tool may set this to false to bypass - /// UpgradeDebuginfo so it can generate broken bitcode. - bool UpgradeDebugInfo; - - /// DataLayout string to override that in LLVM assembly. 
- StringRef DataLayoutStr; - - std::string SourceFileName; + bool ParseOptionalParamAttrs(AttrBuilder &B); + bool ParseOptionalReturnAttrs(AttrBuilder &B); + bool ParseOptionalLinkage(unsigned &Res, bool &HasLinkage, + unsigned &Visibility, unsigned &DLLStorageClass, + bool &DSOLocal); + void ParseOptionalDSOLocal(bool &DSOLocal); + void ParseOptionalVisibility(unsigned &Res); + void ParseOptionalDLLStorageClass(unsigned &Res); + bool ParseOptionalCallingConv(unsigned &CC); + bool ParseOptionalAlignment(unsigned &Alignment); + bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes); + bool ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID, + AtomicOrdering &Ordering); + bool ParseScope(SyncScope::ID &SSID); + bool ParseOrdering(AtomicOrdering &Ordering); + bool ParseOptionalStackAlignment(unsigned &Alignment); + bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma); + bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc, + bool &AteExtraComma); + bool ParseOptionalCommaInAlloca(bool &IsInAlloca); + bool parseAllocSizeArguments(unsigned &BaseSizeArg, + Optional<unsigned> &HowManyArg); + bool ParseIndexList(SmallVectorImpl<unsigned> &Indices, bool &AteExtraComma); + bool ParseIndexList(SmallVectorImpl<unsigned> &Indices) { + bool AteExtraComma; + if (ParseIndexList(Indices, AteExtraComma)) + return true; + if (AteExtraComma) + return TokError("expected index"); + return false; + } + + // Top-Level Entities + bool ParseTopLevelEntities(); + bool ValidateEndOfModule(); + bool ValidateEndOfIndex(); + bool ParseTargetDefinition(); + bool ParseModuleAsm(); + bool ParseSourceFileName(); + bool ParseDepLibs(); // FIXME: Remove in 4.0. 
+ bool ParseUnnamedType(); + bool ParseNamedType(); + bool ParseDeclare(); + bool ParseDefine(); + + bool ParseGlobalType(bool &IsConstant); + bool ParseUnnamedGlobal(); + bool ParseNamedGlobal(); + bool ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage, + bool HasLinkage, unsigned Visibility, + unsigned DLLStorageClass, bool DSOLocal, + GlobalVariable::ThreadLocalMode TLM, + GlobalVariable::UnnamedAddr UnnamedAddr); + bool parseIndirectSymbol(const std::string &Name, LocTy NameLoc, unsigned L, + unsigned Visibility, unsigned DLLStorageClass, + bool DSOLocal, GlobalVariable::ThreadLocalMode TLM, + GlobalVariable::UnnamedAddr UnnamedAddr); + bool parseComdat(); + bool ParseStandaloneMetadata(); + bool ParseNamedMetadata(); + bool ParseMDString(MDString *&Result); + bool ParseMDNodeID(MDNode *&Result); + bool ParseUnnamedAttrGrp(); + bool ParseFnAttributeValuePairs(AttrBuilder &B, + std::vector<unsigned> &FwdRefAttrGrps, + bool inAttrGrp, LocTy &BuiltinLoc); + bool ParseByValWithOptionalType(Type *&Result); + + // Module Summary Index Parsing. 
+ bool SkipModuleSummaryEntry(); + bool ParseSummaryEntry(); + bool ParseModuleEntry(unsigned ID); + bool ParseModuleReference(StringRef &ModulePath); + bool ParseGVReference(ValueInfo &VI, unsigned &GVId); + bool ParseGVEntry(unsigned ID); + bool ParseFunctionSummary(std::string Name, GlobalValue::GUID, unsigned ID); + bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID); + bool ParseAliasSummary(std::string Name, GlobalValue::GUID, unsigned ID); + bool ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags); + bool ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags); + bool ParseOptionalFFlags(FunctionSummary::FFlags &FFlags); + bool ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls); + bool ParseHotness(CalleeInfo::HotnessType &Hotness); + bool ParseOptionalTypeIdInfo(FunctionSummary::TypeIdInfo &TypeIdInfo); + bool ParseTypeTests(std::vector<GlobalValue::GUID> &TypeTests); + bool ParseVFuncIdList(lltok::Kind Kind, + std::vector<FunctionSummary::VFuncId> &VFuncIdList); + bool + ParseConstVCallList(lltok::Kind Kind, + std::vector<FunctionSummary::ConstVCall> &ConstVCallList); + using IdToIndexMapType = + std::map<unsigned, std::vector<std::pair<unsigned, LocTy>>>; + bool ParseConstVCall(FunctionSummary::ConstVCall &ConstVCall, + IdToIndexMapType &IdToIndexMap, unsigned Index); + bool ParseVFuncId(FunctionSummary::VFuncId &VFuncId, + IdToIndexMapType &IdToIndexMap, unsigned Index); + bool ParseOptionalVTableFuncs(VTableFuncList &VTableFuncs); + bool ParseOptionalRefs(std::vector<ValueInfo> &Refs); + bool ParseTypeIdEntry(unsigned ID); + bool ParseTypeIdSummary(TypeIdSummary &TIS); + bool ParseTypeIdCompatibleVtableEntry(unsigned ID); + bool ParseTypeTestResolution(TypeTestResolution &TTRes); + bool ParseOptionalWpdResolutions( + std::map<uint64_t, WholeProgramDevirtResolution> &WPDResMap); + bool ParseWpdRes(WholeProgramDevirtResolution &WPDRes); + bool ParseOptionalResByArg( + std::map<std::vector<uint64_t>, 
WholeProgramDevirtResolution::ByArg> + &ResByArg); + bool ParseArgs(std::vector<uint64_t> &Args); + void AddGlobalValueToIndex(std::string Name, GlobalValue::GUID, + GlobalValue::LinkageTypes Linkage, unsigned ID, + std::unique_ptr<GlobalValueSummary> Summary); + + // Type Parsing. + bool ParseType(Type *&Result, const Twine &Msg, bool AllowVoid = false); + bool ParseType(Type *&Result, bool AllowVoid = false) { + return ParseType(Result, "expected type", AllowVoid); + } + bool ParseType(Type *&Result, const Twine &Msg, LocTy &Loc, + bool AllowVoid = false) { + Loc = Lex.getLoc(); + return ParseType(Result, Msg, AllowVoid); + } + bool ParseType(Type *&Result, LocTy &Loc, bool AllowVoid = false) { + Loc = Lex.getLoc(); + return ParseType(Result, AllowVoid); + } + bool ParseAnonStructType(Type *&Result, bool Packed); + bool ParseStructBody(SmallVectorImpl<Type *> &Body); + bool ParseStructDefinition(SMLoc TypeLoc, StringRef Name, + std::pair<Type *, LocTy> &Entry, Type *&ResultTy); + + bool ParseArrayVectorType(Type *&Result, bool isVector); + bool ParseFunctionType(Type *&Result); + + // Function Semantic Analysis. + class PerFunctionState { + LLParser &P; + Function &F; + std::map<std::string, std::pair<Value *, LocTy>> ForwardRefVals; + std::map<unsigned, std::pair<Value *, LocTy>> ForwardRefValIDs; + std::vector<Value *> NumberedVals; + + /// FunctionNumber - If this is an unnamed function, this is the slot + /// number of it, otherwise it is -1. 
+ int FunctionNumber; public: - LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M, - ModuleSummaryIndex *Index, LLVMContext &Context, - SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true, - StringRef DataLayoutString = "") - : Context(Context), Lex(F, SM, Err, Context), M(M), Index(Index), - Slots(Slots), BlockAddressPFS(nullptr), - UpgradeDebugInfo(UpgradeDebugInfo), DataLayoutStr(DataLayoutString) { - if (!DataLayoutStr.empty()) - M->setDataLayout(DataLayoutStr); - } - bool Run(); - - bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots); + PerFunctionState(LLParser &p, Function &f, int functionNumber); + ~PerFunctionState(); - bool parseTypeAtBeginning(Type *&Ty, unsigned &Read, - const SlotMapping *Slots); + Function &getFunction() const { return F; } - LLVMContext &getContext() { return Context; } + bool FinishFunction(); - private: - - bool Error(LocTy L, const Twine &Msg) const { - return Lex.Error(L, Msg); - } - bool TokError(const Twine &Msg) const { - return Error(Lex.getLoc(), Msg); - } - - /// Restore the internal name and slot mappings using the mappings that - /// were created at an earlier parsing stage. - void restoreParsingState(const SlotMapping *Slots); - - /// GetGlobalVal - Get a value with the specified name or ID, creating a + /// GetVal - Get a value with the specified name or ID, creating a /// forward reference record if needed. This can return null if the value /// exists but does not have the right type. - GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc, - bool IsCall); - GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall); - - /// Get a Comdat with the specified name, creating a forward reference - /// record if needed. - Comdat *getComdat(const std::string &Name, LocTy Loc); - - // Helper Routines. 
- bool ParseToken(lltok::Kind T, const char *ErrMsg); - bool EatIfPresent(lltok::Kind T) { - if (Lex.getKind() != T) return false; - Lex.Lex(); - return true; - } - - FastMathFlags EatFastMathFlagsIfPresent() { - FastMathFlags FMF; - while (true) - switch (Lex.getKind()) { - case lltok::kw_fast: FMF.setFast(); Lex.Lex(); continue; - case lltok::kw_nnan: FMF.setNoNaNs(); Lex.Lex(); continue; - case lltok::kw_ninf: FMF.setNoInfs(); Lex.Lex(); continue; - case lltok::kw_nsz: FMF.setNoSignedZeros(); Lex.Lex(); continue; - case lltok::kw_arcp: FMF.setAllowReciprocal(); Lex.Lex(); continue; - case lltok::kw_contract: - FMF.setAllowContract(true); - Lex.Lex(); - continue; - case lltok::kw_reassoc: FMF.setAllowReassoc(); Lex.Lex(); continue; - case lltok::kw_afn: FMF.setApproxFunc(); Lex.Lex(); continue; - default: return FMF; - } - return FMF; - } - - bool ParseOptionalToken(lltok::Kind T, bool &Present, - LocTy *Loc = nullptr) { - if (Lex.getKind() != T) { - Present = false; - } else { - if (Loc) - *Loc = Lex.getLoc(); - Lex.Lex(); - Present = true; - } - return false; - } - bool ParseStringConstant(std::string &Result); - bool ParseUInt32(unsigned &Val); - bool ParseUInt32(unsigned &Val, LocTy &Loc) { - Loc = Lex.getLoc(); - return ParseUInt32(Val); - } - bool ParseUInt64(uint64_t &Val); - bool ParseUInt64(uint64_t &Val, LocTy &Loc) { - Loc = Lex.getLoc(); - return ParseUInt64(Val); - } - bool ParseFlag(unsigned &Val); - - bool ParseStringAttribute(AttrBuilder &B); - - bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); - bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); - bool ParseOptionalUnnamedAddr(GlobalVariable::UnnamedAddr &UnnamedAddr); - bool ParseOptionalAddrSpace(unsigned &AddrSpace, unsigned DefaultAS = 0); - bool ParseOptionalProgramAddrSpace(unsigned &AddrSpace) { - return ParseOptionalAddrSpace( - AddrSpace, M->getDataLayout().getProgramAddressSpace()); - }; - bool ParseOptionalParamAttrs(AttrBuilder &B); - bool 
ParseOptionalReturnAttrs(AttrBuilder &B); - bool ParseOptionalLinkage(unsigned &Res, bool &HasLinkage, - unsigned &Visibility, unsigned &DLLStorageClass, - bool &DSOLocal); - void ParseOptionalDSOLocal(bool &DSOLocal); - void ParseOptionalVisibility(unsigned &Res); - void ParseOptionalDLLStorageClass(unsigned &Res); - bool ParseOptionalCallingConv(unsigned &CC); - bool ParseOptionalAlignment(unsigned &Alignment); - bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes); - bool ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID, - AtomicOrdering &Ordering); - bool ParseScope(SyncScope::ID &SSID); - bool ParseOrdering(AtomicOrdering &Ordering); - bool ParseOptionalStackAlignment(unsigned &Alignment); - bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma); - bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc, - bool &AteExtraComma); - bool ParseOptionalCommaInAlloca(bool &IsInAlloca); - bool parseAllocSizeArguments(unsigned &BaseSizeArg, - Optional<unsigned> &HowManyArg); - bool ParseIndexList(SmallVectorImpl<unsigned> &Indices, - bool &AteExtraComma); - bool ParseIndexList(SmallVectorImpl<unsigned> &Indices) { - bool AteExtraComma; - if (ParseIndexList(Indices, AteExtraComma)) return true; - if (AteExtraComma) - return TokError("expected index"); - return false; - } + Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc, bool IsCall); + Value *GetVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall); - // Top-Level Entities - bool ParseTopLevelEntities(); - bool ValidateEndOfModule(); - bool ValidateEndOfIndex(); - bool ParseTargetDefinition(); - bool ParseModuleAsm(); - bool ParseSourceFileName(); - bool ParseDepLibs(); // FIXME: Remove in 4.0. 
- bool ParseUnnamedType(); - bool ParseNamedType(); - bool ParseDeclare(); - bool ParseDefine(); - - bool ParseGlobalType(bool &IsConstant); - bool ParseUnnamedGlobal(); - bool ParseNamedGlobal(); - bool ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage, - bool HasLinkage, unsigned Visibility, - unsigned DLLStorageClass, bool DSOLocal, - GlobalVariable::ThreadLocalMode TLM, - GlobalVariable::UnnamedAddr UnnamedAddr); - bool parseIndirectSymbol(const std::string &Name, LocTy NameLoc, - unsigned L, unsigned Visibility, - unsigned DLLStorageClass, bool DSOLocal, - GlobalVariable::ThreadLocalMode TLM, - GlobalVariable::UnnamedAddr UnnamedAddr); - bool parseComdat(); - bool ParseStandaloneMetadata(); - bool ParseNamedMetadata(); - bool ParseMDString(MDString *&Result); - bool ParseMDNodeID(MDNode *&Result); - bool ParseUnnamedAttrGrp(); - bool ParseFnAttributeValuePairs(AttrBuilder &B, - std::vector<unsigned> &FwdRefAttrGrps, - bool inAttrGrp, LocTy &BuiltinLoc); - bool ParseByValWithOptionalType(Type *&Result); - - // Module Summary Index Parsing. 
- bool SkipModuleSummaryEntry(); - bool ParseSummaryEntry(); - bool ParseModuleEntry(unsigned ID); - bool ParseModuleReference(StringRef &ModulePath); - bool ParseGVReference(ValueInfo &VI, unsigned &GVId); - bool ParseGVEntry(unsigned ID); - bool ParseFunctionSummary(std::string Name, GlobalValue::GUID, unsigned ID); - bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID); - bool ParseAliasSummary(std::string Name, GlobalValue::GUID, unsigned ID); - bool ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags); - bool ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags); - bool ParseOptionalFFlags(FunctionSummary::FFlags &FFlags); - bool ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls); - bool ParseHotness(CalleeInfo::HotnessType &Hotness); - bool ParseOptionalTypeIdInfo(FunctionSummary::TypeIdInfo &TypeIdInfo); - bool ParseTypeTests(std::vector<GlobalValue::GUID> &TypeTests); - bool ParseVFuncIdList(lltok::Kind Kind, - std::vector<FunctionSummary::VFuncId> &VFuncIdList); - bool ParseConstVCallList( - lltok::Kind Kind, - std::vector<FunctionSummary::ConstVCall> &ConstVCallList); - using IdToIndexMapType = - std::map<unsigned, std::vector<std::pair<unsigned, LocTy>>>; - bool ParseConstVCall(FunctionSummary::ConstVCall &ConstVCall, - IdToIndexMapType &IdToIndexMap, unsigned Index); - bool ParseVFuncId(FunctionSummary::VFuncId &VFuncId, - IdToIndexMapType &IdToIndexMap, unsigned Index); - bool ParseOptionalVTableFuncs(VTableFuncList &VTableFuncs); - bool ParseOptionalRefs(std::vector<ValueInfo> &Refs); - bool ParseTypeIdEntry(unsigned ID); - bool ParseTypeIdSummary(TypeIdSummary &TIS); - bool ParseTypeIdCompatibleVtableEntry(unsigned ID); - bool ParseTypeTestResolution(TypeTestResolution &TTRes); - bool ParseOptionalWpdResolutions( - std::map<uint64_t, WholeProgramDevirtResolution> &WPDResMap); - bool ParseWpdRes(WholeProgramDevirtResolution &WPDRes); - bool ParseOptionalResByArg( - std::map<std::vector<uint64_t>, 
WholeProgramDevirtResolution::ByArg> - &ResByArg); - bool ParseArgs(std::vector<uint64_t> &Args); - void AddGlobalValueToIndex(std::string Name, GlobalValue::GUID, - GlobalValue::LinkageTypes Linkage, unsigned ID, - std::unique_ptr<GlobalValueSummary> Summary); - - // Type Parsing. - bool ParseType(Type *&Result, const Twine &Msg, bool AllowVoid = false); - bool ParseType(Type *&Result, bool AllowVoid = false) { - return ParseType(Result, "expected type", AllowVoid); - } - bool ParseType(Type *&Result, const Twine &Msg, LocTy &Loc, - bool AllowVoid = false) { - Loc = Lex.getLoc(); - return ParseType(Result, Msg, AllowVoid); - } - bool ParseType(Type *&Result, LocTy &Loc, bool AllowVoid = false) { - Loc = Lex.getLoc(); - return ParseType(Result, AllowVoid); - } - bool ParseAnonStructType(Type *&Result, bool Packed); - bool ParseStructBody(SmallVectorImpl<Type*> &Body); - bool ParseStructDefinition(SMLoc TypeLoc, StringRef Name, - std::pair<Type*, LocTy> &Entry, - Type *&ResultTy); - - bool ParseArrayVectorType(Type *&Result, bool isVector); - bool ParseFunctionType(Type *&Result); - - // Function Semantic Analysis. - class PerFunctionState { - LLParser &P; - Function &F; - std::map<std::string, std::pair<Value*, LocTy> > ForwardRefVals; - std::map<unsigned, std::pair<Value*, LocTy> > ForwardRefValIDs; - std::vector<Value*> NumberedVals; - - /// FunctionNumber - If this is an unnamed function, this is the slot - /// number of it, otherwise it is -1. - int FunctionNumber; - public: - PerFunctionState(LLParser &p, Function &f, int functionNumber); - ~PerFunctionState(); - - Function &getFunction() const { return F; } - - bool FinishFunction(); - - /// GetVal - Get a value with the specified name or ID, creating a - /// forward reference record if needed. This can return null if the value - /// exists but does not have the right type. 
- Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc, bool IsCall); - Value *GetVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall); - - /// SetInstName - After an instruction is parsed and inserted into its - /// basic block, this installs its name. - bool SetInstName(int NameID, const std::string &NameStr, LocTy NameLoc, - Instruction *Inst); - - /// GetBB - Get a basic block with the specified name or ID, creating a - /// forward reference record if needed. This can return null if the value - /// is not a BasicBlock. - BasicBlock *GetBB(const std::string &Name, LocTy Loc); - BasicBlock *GetBB(unsigned ID, LocTy Loc); - - /// DefineBB - Define the specified basic block, which is either named or - /// unnamed. If there is an error, this returns null otherwise it returns - /// the block being defined. - BasicBlock *DefineBB(const std::string &Name, int NameID, LocTy Loc); - - bool resolveForwardRefBlockAddresses(); - }; - - bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, - PerFunctionState *PFS, bool IsCall); - - Value *checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty, - Value *Val, bool IsCall); - - bool parseConstantValue(Type *Ty, Constant *&C); - bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS); - bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) { - return ParseValue(Ty, V, &PFS); - } + /// SetInstName - After an instruction is parsed and inserted into its + /// basic block, this installs its name. + bool SetInstName(int NameID, const std::string &NameStr, LocTy NameLoc, + Instruction *Inst); - bool ParseValue(Type *Ty, Value *&V, LocTy &Loc, - PerFunctionState &PFS) { - Loc = Lex.getLoc(); - return ParseValue(Ty, V, &PFS); - } + /// GetBB - Get a basic block with the specified name or ID, creating a + /// forward reference record if needed. This can return null if the value + /// is not a BasicBlock. 
+ BasicBlock *GetBB(const std::string &Name, LocTy Loc); + BasicBlock *GetBB(unsigned ID, LocTy Loc); - bool ParseTypeAndValue(Value *&V, PerFunctionState *PFS); - bool ParseTypeAndValue(Value *&V, PerFunctionState &PFS) { - return ParseTypeAndValue(V, &PFS); - } - bool ParseTypeAndValue(Value *&V, LocTy &Loc, PerFunctionState &PFS) { - Loc = Lex.getLoc(); - return ParseTypeAndValue(V, PFS); - } - bool ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc, - PerFunctionState &PFS); - bool ParseTypeAndBasicBlock(BasicBlock *&BB, PerFunctionState &PFS) { - LocTy Loc; - return ParseTypeAndBasicBlock(BB, Loc, PFS); - } + /// DefineBB - Define the specified basic block, which is either named or + /// unnamed. If there is an error, this returns null otherwise it returns + /// the block being defined. + BasicBlock *DefineBB(const std::string &Name, int NameID, LocTy Loc); + bool resolveForwardRefBlockAddresses(); + }; - struct ParamInfo { - LocTy Loc; - Value *V; - AttributeSet Attrs; - ParamInfo(LocTy loc, Value *v, AttributeSet attrs) - : Loc(loc), V(v), Attrs(attrs) {} - }; - bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, - PerFunctionState &PFS, - bool IsMustTailCall = false, - bool InVarArgsFunc = false); - - bool - ParseOptionalOperandBundles(SmallVectorImpl<OperandBundleDef> &BundleList, - PerFunctionState &PFS); - - bool ParseExceptionArgs(SmallVectorImpl<Value *> &Args, - PerFunctionState &PFS); - - // Constant Parsing. 
- bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr); - bool ParseGlobalValue(Type *Ty, Constant *&C); - bool ParseGlobalTypeAndValue(Constant *&V); - bool ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts, - Optional<unsigned> *InRangeOp = nullptr); - bool parseOptionalComdat(StringRef GlobalName, Comdat *&C); - bool ParseMetadataAsValue(Value *&V, PerFunctionState &PFS); - bool ParseValueAsMetadata(Metadata *&MD, const Twine &TypeMsg, - PerFunctionState *PFS); - bool ParseMetadata(Metadata *&MD, PerFunctionState *PFS); - bool ParseMDTuple(MDNode *&MD, bool IsDistinct = false); - bool ParseMDNode(MDNode *&N); - bool ParseMDNodeTail(MDNode *&N); - bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &Elts); - bool ParseMetadataAttachment(unsigned &Kind, MDNode *&MD); - bool ParseInstructionMetadata(Instruction &Inst); - bool ParseGlobalObjectMetadataAttachment(GlobalObject &GO); - bool ParseOptionalFunctionMetadata(Function &F); - - template <class FieldTy> - bool ParseMDField(LocTy Loc, StringRef Name, FieldTy &Result); - template <class FieldTy> bool ParseMDField(StringRef Name, FieldTy &Result); - template <class ParserTy> - bool ParseMDFieldsImplBody(ParserTy parseField); - template <class ParserTy> - bool ParseMDFieldsImpl(ParserTy parseField, LocTy &ClosingLoc); - bool ParseSpecializedMDNode(MDNode *&N, bool IsDistinct = false); + bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, + PerFunctionState *PFS, bool IsCall); + + Value *checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty, + Value *Val, bool IsCall); + + bool parseConstantValue(Type *Ty, Constant *&C); + bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS); + bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) { + return ParseValue(Ty, V, &PFS); + } + + bool ParseValue(Type *Ty, Value *&V, LocTy &Loc, PerFunctionState &PFS) { + Loc = Lex.getLoc(); + return ParseValue(Ty, V, &PFS); + } + + bool ParseTypeAndValue(Value *&V, PerFunctionState *PFS); + bool 
ParseTypeAndValue(Value *&V, PerFunctionState &PFS) { + return ParseTypeAndValue(V, &PFS); + } + bool ParseTypeAndValue(Value *&V, LocTy &Loc, PerFunctionState &PFS) { + Loc = Lex.getLoc(); + return ParseTypeAndValue(V, PFS); + } + bool ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc, + PerFunctionState &PFS); + bool ParseTypeAndBasicBlock(BasicBlock *&BB, PerFunctionState &PFS) { + LocTy Loc; + return ParseTypeAndBasicBlock(BB, Loc, PFS); + } + + struct ParamInfo { + LocTy Loc; + Value *V; + AttributeSet Attrs; + ParamInfo(LocTy loc, Value *v, AttributeSet attrs) + : Loc(loc), V(v), Attrs(attrs) {} + }; + bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, + PerFunctionState &PFS, bool IsMustTailCall = false, + bool InVarArgsFunc = false); + + bool + ParseOptionalOperandBundles(SmallVectorImpl<OperandBundleDef> &BundleList, + PerFunctionState &PFS); + + bool ParseExceptionArgs(SmallVectorImpl<Value *> &Args, + PerFunctionState &PFS); + + // Constant Parsing. + bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr); + bool ParseGlobalValue(Type *Ty, Constant *&C); + bool ParseGlobalTypeAndValue(Constant *&V); + bool ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts, + Optional<unsigned> *InRangeOp = nullptr); + bool parseOptionalComdat(StringRef GlobalName, Comdat *&C); + bool ParseMetadataAsValue(Value *&V, PerFunctionState &PFS); + bool ParseValueAsMetadata(Metadata *&MD, const Twine &TypeMsg, + PerFunctionState *PFS); + bool ParseMetadata(Metadata *&MD, PerFunctionState *PFS); + bool ParseMDTuple(MDNode *&MD, bool IsDistinct = false); + bool ParseMDNode(MDNode *&N); + bool ParseMDNodeTail(MDNode *&N); + bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &Elts); + bool ParseMetadataAttachment(unsigned &Kind, MDNode *&MD); + bool ParseInstructionMetadata(Instruction &Inst); + bool ParseGlobalObjectMetadataAttachment(GlobalObject &GO); + bool ParseOptionalFunctionMetadata(Function &F); + + template <class FieldTy> + bool ParseMDField(LocTy 
Loc, StringRef Name, FieldTy &Result); + template <class FieldTy> bool ParseMDField(StringRef Name, FieldTy &Result); + template <class ParserTy> bool ParseMDFieldsImplBody(ParserTy parseField); + template <class ParserTy> + bool ParseMDFieldsImpl(ParserTy parseField, LocTy &ClosingLoc); + bool ParseSpecializedMDNode(MDNode *&N, bool IsDistinct = false); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) \ bool Parse##CLASS(MDNode *&Result, bool IsDistinct); #include "llvm/IR/Metadata.def" - // Function Parsing. - struct ArgInfo { - LocTy Loc; - Type *Ty; - AttributeSet Attrs; - std::string Name; - ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N) - : Loc(L), Ty(ty), Attrs(Attr), Name(N) {} - }; - bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg); - bool ParseFunctionHeader(Function *&Fn, bool isDefine); - bool ParseFunctionBody(Function &Fn); - bool ParseBasicBlock(PerFunctionState &PFS); - - enum TailCallType { TCT_None, TCT_Tail, TCT_MustTail }; - - // Instruction Parsing. Each instruction parsing routine can return with a - // normal result, an error result, or return having eaten an extra comma. 
- enum InstResult { InstNormal = 0, InstError = 1, InstExtraComma = 2 }; - int ParseInstruction(Instruction *&Inst, BasicBlock *BB, - PerFunctionState &PFS); - bool ParseCmpPredicate(unsigned &P, unsigned Opc); - - bool ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS); - bool ParseBr(Instruction *&Inst, PerFunctionState &PFS); - bool ParseSwitch(Instruction *&Inst, PerFunctionState &PFS); - bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS); - bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS); - bool ParseResume(Instruction *&Inst, PerFunctionState &PFS); - bool ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS); - bool ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS); - bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS); - bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS); - bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); - bool ParseCallBr(Instruction *&Inst, PerFunctionState &PFS); - - bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, - bool IsFP); - bool ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, - bool IsFP); - bool ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc); - bool ParseCompare(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc); - bool ParseCast(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc); - bool ParseSelect(Instruction *&Inst, PerFunctionState &PFS); - bool ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS); - bool ParseExtractElement(Instruction *&Inst, PerFunctionState &PFS); - bool ParseInsertElement(Instruction *&Inst, PerFunctionState &PFS); - bool ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS); - int ParsePHI(Instruction *&Inst, PerFunctionState &PFS); - bool ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS); - bool ParseCall(Instruction *&Inst, PerFunctionState &PFS, - CallInst::TailCallKind TCK); - int ParseAlloc(Instruction 
*&Inst, PerFunctionState &PFS); - int ParseLoad(Instruction *&Inst, PerFunctionState &PFS); - int ParseStore(Instruction *&Inst, PerFunctionState &PFS); - int ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS); - int ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS); - int ParseFence(Instruction *&Inst, PerFunctionState &PFS); - int ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS); - int ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS); - int ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS); - - // Use-list order directives. - bool ParseUseListOrder(PerFunctionState *PFS = nullptr); - bool ParseUseListOrderBB(); - bool ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes); - bool sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes, SMLoc Loc); + // Function Parsing. + struct ArgInfo { + LocTy Loc; + Type *Ty; + AttributeSet Attrs; + std::string Name; + ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N) + : Loc(L), Ty(ty), Attrs(Attr), Name(N) {} }; -} // End llvm namespace + bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg); + bool ParseFunctionHeader(Function *&Fn, bool isDefine); + bool ParseFunctionBody(Function &Fn); + bool ParseBasicBlock(PerFunctionState &PFS); + + enum TailCallType { TCT_None, TCT_Tail, TCT_MustTail }; + + // Instruction Parsing. Each instruction parsing routine can return with a + // normal result, an error result, or return having eaten an extra comma. 
+ enum InstResult { InstNormal = 0, InstError = 1, InstExtraComma = 2 }; + int ParseInstruction(Instruction *&Inst, BasicBlock *BB, + PerFunctionState &PFS); + bool ParseCmpPredicate(unsigned &P, unsigned Opc); + + bool ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS); + bool ParseBr(Instruction *&Inst, PerFunctionState &PFS); + bool ParseSwitch(Instruction *&Inst, PerFunctionState &PFS); + bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS); + bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS); + bool ParseResume(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCallBr(Instruction *&Inst, PerFunctionState &PFS); + + bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, + bool IsFP); + bool ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, + bool IsFP); + bool ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc); + bool ParseCompare(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc); + bool ParseCast(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc); + bool ParseSelect(Instruction *&Inst, PerFunctionState &PFS); + bool ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS); + bool ParseExtractElement(Instruction *&Inst, PerFunctionState &PFS); + bool ParseInsertElement(Instruction *&Inst, PerFunctionState &PFS); + bool ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS); + int ParsePHI(Instruction *&Inst, PerFunctionState &PFS); + bool ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCall(Instruction *&Inst, PerFunctionState &PFS, + CallInst::TailCallKind TCK); + int ParseAlloc(Instruction 
*&Inst, PerFunctionState &PFS); + int ParseLoad(Instruction *&Inst, PerFunctionState &PFS); + int ParseStore(Instruction *&Inst, PerFunctionState &PFS); + int ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS); + int ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS); + int ParseFence(Instruction *&Inst, PerFunctionState &PFS); + int ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS); + int ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS); + int ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS); + + // Use-list order directives. + bool ParseUseListOrder(PerFunctionState *PFS = nullptr); + bool ParseUseListOrderBB(); + bool ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes); + bool sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes, SMLoc Loc); +}; +} // namespace llvm #endif diff --git a/hpvm/llvm_patches/lib/AsmParser/LLToken.h b/hpvm/llvm_patches/lib/AsmParser/LLToken.h index 3c2eade04f928d737ad8d2408ab4c56825e34ab0..7f9816965b2a21ae3d23873ca789a22481b575fa 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLToken.h +++ b/hpvm/llvm_patches/lib/AsmParser/LLToken.h @@ -351,10 +351,10 @@ enum Kind { kw_insertvalue, kw_blockaddress, - // VISC parameter attributes - kw_in, - kw_out, - kw_inout, + // VISC parameter attributes + kw_in, + kw_out, + kw_inout, // Metadata types. 
kw_distinct, diff --git a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp index c6530f992bab22d36a1273f1b1c454970270c928..7eb289d5872713ef826174b1e691c6440d4dd43e 100644 --- a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp @@ -20,8 +20,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" -#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" +#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -211,9 +211,9 @@ static Expected<std::string> readIdentificationBlock(BitstreamCursor &Stream) { case bitc::IDENTIFICATION_CODE_EPOCH: { // EPOCH: [epoch#] unsigned epoch = (unsigned)Record[0]; if (epoch != bitc::BITCODE_CURRENT_EPOCH) { - return error( - Twine("Incompatible epoch: Bitcode '") + Twine(epoch) + - "' vs current: '" + Twine(bitc::BITCODE_CURRENT_EPOCH) + "'"); + return error(Twine("Incompatible epoch: Bitcode '") + Twine(epoch) + + "' vs current: '" + Twine(bitc::BITCODE_CURRENT_EPOCH) + + "'"); } } } @@ -367,8 +367,9 @@ static Expected<std::string> readModuleTriple(BitstreamCursor &Stream) { if (!MaybeRecord) return MaybeRecord.takeError(); switch (MaybeRecord.get()) { - default: break; // Default behavior, ignore unknown content. - case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] + default: + break; // Default behavior, ignore unknown content. 
+ case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] std::string S; if (convertToString(Record, 0, S)) return error("Invalid record"); @@ -493,7 +494,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { std::vector<std::string> SectionTable; std::vector<std::string> GCTable; - std::vector<Type*> TypeList; + std::vector<Type *> TypeList; DenseMap<Function *, FunctionType *> FunctionTypes; BitcodeReaderValueList ValueList; Optional<MetadataLoader> MDLoader; @@ -515,11 +516,11 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { /// While parsing a function body, this is a list of the basic blocks for the /// function. - std::vector<BasicBlock*> FunctionBBs; + std::vector<BasicBlock *> FunctionBBs; // When reading the module header, this list is populated with functions that // have bodies later in the file. - std::vector<Function*> FunctionsWithBodies; + std::vector<Function *> FunctionsWithBodies; // When intrinsic functions are encountered which require upgrading they are // stored here with their replacement function. @@ -535,7 +536,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { /// When function bodies are initially scanned, this map contains info about /// where to find deferred function body in the stream. - DenseMap<Function*, uint64_t> DeferredFunctionInfo; + DenseMap<Function *, uint64_t> DeferredFunctionInfo; /// When Metadata block is initially scanned when parsing the module, we may /// choose to defer parsing of the metadata. This vector contains info about @@ -597,9 +598,7 @@ private: /// type in the same address space if opaque pointers are being /// used, otherwise nop. This converts a bitcode-reader internal /// type into one suitable for use in a Value. - Type *flattenPointerTypes(Type *Ty) { - return Ty; - } + Type *flattenPointerTypes(Type *Ty) { return Ty; } /// Given a fully structured pointer type (i.e. 
not opaque), return /// the flattened form of its element, suitable for use in a Value. @@ -636,13 +635,14 @@ private: } BasicBlock *getBasicBlock(unsigned ID) const { - if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID + if (ID >= FunctionBBs.size()) + return nullptr; // Invalid ID return FunctionBBs[ID]; } AttributeList getAttributes(unsigned i) const { - if (i-1 < MAttributes.size()) - return MAttributes[i-1]; + if (i - 1 < MAttributes.size()) + return MAttributes[i - 1]; return AttributeList(); } @@ -652,7 +652,8 @@ private: bool getValueTypePair(SmallVectorImpl<uint64_t> &Record, unsigned &Slot, unsigned InstNum, Value *&ResVal, Type **FullTy = nullptr) { - if (Slot == Record.size()) return true; + if (Slot == Record.size()) + return true; unsigned ValNo = (unsigned)Record[Slot++]; // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) @@ -696,7 +697,8 @@ private: /// error. Value *getValue(SmallVectorImpl<uint64_t> &Record, unsigned Slot, unsigned InstNum, Type *Ty) { - if (Slot == Record.size()) return nullptr; + if (Slot == Record.size()) + return nullptr; unsigned ValNo = (unsigned)Record[Slot]; // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) @@ -707,7 +709,8 @@ private: /// Like getValue, but decodes signed VBRs. Value *getValueSigned(SmallVectorImpl<uint64_t> &Record, unsigned Slot, unsigned InstNum, Type *Ty) { - if (Slot == Record.size()) return nullptr; + if (Slot == Record.size()) + return nullptr; unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]); // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) @@ -938,7 +941,7 @@ static GlobalValue::LinkageTypes getDecodedLinkage(unsigned Val) { return GlobalValue::PrivateLinkage; // Obsolete LinkerPrivateWeakLinkage case 15: return GlobalValue::ExternalLinkage; // Obsolete LinkOnceODRAutoHideLinkage - case 1: // Old value with implicit comdat. + case 1: // Old value with implicit comdat. 
case 16: return GlobalValue::WeakAnyLinkage; case 10: // Old value with implicit comdat. @@ -979,7 +982,8 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, bool Local = (RawFlags & 0x4); bool AutoHide = (RawFlags & 0x8); - return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local, AutoHide); + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local, + AutoHide); } // Decode the flags for GlobalVariable in the summary @@ -991,9 +995,12 @@ static GlobalVarSummary::GVarFlags getDecodedGVarFlags(uint64_t RawFlags) { static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { switch (Val) { default: // Map unknown visibilities to default. - case 0: return GlobalValue::DefaultVisibility; - case 1: return GlobalValue::HiddenVisibility; - case 2: return GlobalValue::ProtectedVisibility; + case 0: + return GlobalValue::DefaultVisibility; + case 1: + return GlobalValue::HiddenVisibility; + case 2: + return GlobalValue::ProtectedVisibility; } } @@ -1001,56 +1008,83 @@ static GlobalValue::DLLStorageClassTypes getDecodedDLLStorageClass(unsigned Val) { switch (Val) { default: // Map unknown values to default. - case 0: return GlobalValue::DefaultStorageClass; - case 1: return GlobalValue::DLLImportStorageClass; - case 2: return GlobalValue::DLLExportStorageClass; + case 0: + return GlobalValue::DefaultStorageClass; + case 1: + return GlobalValue::DLLImportStorageClass; + case 2: + return GlobalValue::DLLExportStorageClass; } } static bool getDecodedDSOLocal(unsigned Val) { - switch(Val) { + switch (Val) { default: // Map unknown values to preemptable. - case 0: return false; - case 1: return true; + case 0: + return false; + case 1: + return true; } } static GlobalVariable::ThreadLocalMode getDecodedThreadLocalMode(unsigned Val) { switch (Val) { - case 0: return GlobalVariable::NotThreadLocal; - default: // Map unknown non-zero value to general dynamic. 
- case 1: return GlobalVariable::GeneralDynamicTLSModel; - case 2: return GlobalVariable::LocalDynamicTLSModel; - case 3: return GlobalVariable::InitialExecTLSModel; - case 4: return GlobalVariable::LocalExecTLSModel; + case 0: + return GlobalVariable::NotThreadLocal; + default: // Map unknown non-zero value to general dynamic. + case 1: + return GlobalVariable::GeneralDynamicTLSModel; + case 2: + return GlobalVariable::LocalDynamicTLSModel; + case 3: + return GlobalVariable::InitialExecTLSModel; + case 4: + return GlobalVariable::LocalExecTLSModel; } } static GlobalVariable::UnnamedAddr getDecodedUnnamedAddrType(unsigned Val) { switch (Val) { - default: // Map unknown to UnnamedAddr::None. - case 0: return GlobalVariable::UnnamedAddr::None; - case 1: return GlobalVariable::UnnamedAddr::Global; - case 2: return GlobalVariable::UnnamedAddr::Local; + default: // Map unknown to UnnamedAddr::None. + case 0: + return GlobalVariable::UnnamedAddr::None; + case 1: + return GlobalVariable::UnnamedAddr::Global; + case 2: + return GlobalVariable::UnnamedAddr::Local; } } static int getDecodedCastOpcode(unsigned Val) { switch (Val) { - default: return -1; - case bitc::CAST_TRUNC : return Instruction::Trunc; - case bitc::CAST_ZEXT : return Instruction::ZExt; - case bitc::CAST_SEXT : return Instruction::SExt; - case bitc::CAST_FPTOUI : return Instruction::FPToUI; - case bitc::CAST_FPTOSI : return Instruction::FPToSI; - case bitc::CAST_UITOFP : return Instruction::UIToFP; - case bitc::CAST_SITOFP : return Instruction::SIToFP; - case bitc::CAST_FPTRUNC : return Instruction::FPTrunc; - case bitc::CAST_FPEXT : return Instruction::FPExt; - case bitc::CAST_PTRTOINT: return Instruction::PtrToInt; - case bitc::CAST_INTTOPTR: return Instruction::IntToPtr; - case bitc::CAST_BITCAST : return Instruction::BitCast; - case bitc::CAST_ADDRSPACECAST: return Instruction::AddrSpaceCast; + default: + return -1; + case bitc::CAST_TRUNC: + return Instruction::Trunc; + case bitc::CAST_ZEXT: + return 
Instruction::ZExt; + case bitc::CAST_SEXT: + return Instruction::SExt; + case bitc::CAST_FPTOUI: + return Instruction::FPToUI; + case bitc::CAST_FPTOSI: + return Instruction::FPToSI; + case bitc::CAST_UITOFP: + return Instruction::UIToFP; + case bitc::CAST_SITOFP: + return Instruction::SIToFP; + case bitc::CAST_FPTRUNC: + return Instruction::FPTrunc; + case bitc::CAST_FPEXT: + return Instruction::FPExt; + case bitc::CAST_PTRTOINT: + return Instruction::PtrToInt; + case bitc::CAST_INTTOPTR: + return Instruction::IntToPtr; + case bitc::CAST_BITCAST: + return Instruction::BitCast; + case bitc::CAST_ADDRSPACECAST: + return Instruction::AddrSpaceCast; } } @@ -1108,33 +1142,54 @@ static int getDecodedBinaryOpcode(unsigned Val, Type *Ty) { static AtomicRMWInst::BinOp getDecodedRMWOperation(unsigned Val) { switch (Val) { - default: return AtomicRMWInst::BAD_BINOP; - case bitc::RMW_XCHG: return AtomicRMWInst::Xchg; - case bitc::RMW_ADD: return AtomicRMWInst::Add; - case bitc::RMW_SUB: return AtomicRMWInst::Sub; - case bitc::RMW_AND: return AtomicRMWInst::And; - case bitc::RMW_NAND: return AtomicRMWInst::Nand; - case bitc::RMW_OR: return AtomicRMWInst::Or; - case bitc::RMW_XOR: return AtomicRMWInst::Xor; - case bitc::RMW_MAX: return AtomicRMWInst::Max; - case bitc::RMW_MIN: return AtomicRMWInst::Min; - case bitc::RMW_UMAX: return AtomicRMWInst::UMax; - case bitc::RMW_UMIN: return AtomicRMWInst::UMin; - case bitc::RMW_FADD: return AtomicRMWInst::FAdd; - case bitc::RMW_FSUB: return AtomicRMWInst::FSub; + default: + return AtomicRMWInst::BAD_BINOP; + case bitc::RMW_XCHG: + return AtomicRMWInst::Xchg; + case bitc::RMW_ADD: + return AtomicRMWInst::Add; + case bitc::RMW_SUB: + return AtomicRMWInst::Sub; + case bitc::RMW_AND: + return AtomicRMWInst::And; + case bitc::RMW_NAND: + return AtomicRMWInst::Nand; + case bitc::RMW_OR: + return AtomicRMWInst::Or; + case bitc::RMW_XOR: + return AtomicRMWInst::Xor; + case bitc::RMW_MAX: + return AtomicRMWInst::Max; + case bitc::RMW_MIN: + 
return AtomicRMWInst::Min; + case bitc::RMW_UMAX: + return AtomicRMWInst::UMax; + case bitc::RMW_UMIN: + return AtomicRMWInst::UMin; + case bitc::RMW_FADD: + return AtomicRMWInst::FAdd; + case bitc::RMW_FSUB: + return AtomicRMWInst::FSub; } } static AtomicOrdering getDecodedOrdering(unsigned Val) { switch (Val) { - case bitc::ORDERING_NOTATOMIC: return AtomicOrdering::NotAtomic; - case bitc::ORDERING_UNORDERED: return AtomicOrdering::Unordered; - case bitc::ORDERING_MONOTONIC: return AtomicOrdering::Monotonic; - case bitc::ORDERING_ACQUIRE: return AtomicOrdering::Acquire; - case bitc::ORDERING_RELEASE: return AtomicOrdering::Release; - case bitc::ORDERING_ACQREL: return AtomicOrdering::AcquireRelease; + case bitc::ORDERING_NOTATOMIC: + return AtomicOrdering::NotAtomic; + case bitc::ORDERING_UNORDERED: + return AtomicOrdering::Unordered; + case bitc::ORDERING_MONOTONIC: + return AtomicOrdering::Monotonic; + case bitc::ORDERING_ACQUIRE: + return AtomicOrdering::Acquire; + case bitc::ORDERING_RELEASE: + return AtomicOrdering::Release; + case bitc::ORDERING_ACQREL: + return AtomicOrdering::AcquireRelease; default: // Map unknown orderings to sequentially-consistent. 
- case bitc::ORDERING_SEQCST: return AtomicOrdering::SequentiallyConsistent; + case bitc::ORDERING_SEQCST: + return AtomicOrdering::SequentiallyConsistent; } } @@ -1177,8 +1232,12 @@ static FastMathFlags getDecodedFastMathFlags(unsigned Val) { static void upgradeDLLImportExportLinkage(GlobalValue *GV, unsigned Val) { switch (Val) { - case 5: GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass); break; - case 6: GV->setDLLStorageClass(GlobalValue::DLLExportStorageClass); break; + case 5: + GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass); + break; + case 6: + GV->setDLLStorageClass(GlobalValue::DLLExportStorageClass); + break; } } @@ -1217,61 +1276,116 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) { case Attribute::EndAttrKinds: llvm_unreachable("Synthetic enumerators which should never get here"); - case Attribute::None: return 0; - case Attribute::ZExt: return 1 << 0; - case Attribute::SExt: return 1 << 1; - case Attribute::NoReturn: return 1 << 2; - case Attribute::InReg: return 1 << 3; - case Attribute::StructRet: return 1 << 4; - case Attribute::NoUnwind: return 1 << 5; - case Attribute::NoAlias: return 1 << 6; - case Attribute::ByVal: return 1 << 7; - case Attribute::Nest: return 1 << 8; - case Attribute::ReadNone: return 1 << 9; - case Attribute::ReadOnly: return 1 << 10; - case Attribute::NoInline: return 1 << 11; - case Attribute::AlwaysInline: return 1 << 12; - case Attribute::OptimizeForSize: return 1 << 13; - case Attribute::StackProtect: return 1 << 14; - case Attribute::StackProtectReq: return 1 << 15; - case Attribute::Alignment: return 31 << 16; - case Attribute::NoCapture: return 1 << 21; - case Attribute::NoRedZone: return 1 << 22; - case Attribute::NoImplicitFloat: return 1 << 23; - case Attribute::Naked: return 1 << 24; - case Attribute::InlineHint: return 1 << 25; - case Attribute::StackAlignment: return 7 << 26; - case Attribute::ReturnsTwice: return 1 << 29; - case Attribute::UWTable: return 1 << 30; - case 
Attribute::NonLazyBind: return 1U << 31; - case Attribute::SanitizeAddress: return 1ULL << 32; - case Attribute::MinSize: return 1ULL << 33; - case Attribute::NoDuplicate: return 1ULL << 34; - case Attribute::StackProtectStrong: return 1ULL << 35; - case Attribute::SanitizeThread: return 1ULL << 36; - case Attribute::SanitizeMemory: return 1ULL << 37; - case Attribute::NoBuiltin: return 1ULL << 38; - case Attribute::Returned: return 1ULL << 39; - case Attribute::Cold: return 1ULL << 40; - case Attribute::Builtin: return 1ULL << 41; - case Attribute::OptimizeNone: return 1ULL << 42; - case Attribute::InAlloca: return 1ULL << 43; - case Attribute::NonNull: return 1ULL << 44; - case Attribute::JumpTable: return 1ULL << 45; - case Attribute::Convergent: return 1ULL << 46; - case Attribute::SafeStack: return 1ULL << 47; - case Attribute::NoRecurse: return 1ULL << 48; - case Attribute::InaccessibleMemOnly: return 1ULL << 49; - case Attribute::InaccessibleMemOrArgMemOnly: return 1ULL << 50; - case Attribute::SwiftSelf: return 1ULL << 51; - case Attribute::SwiftError: return 1ULL << 52; - case Attribute::WriteOnly: return 1ULL << 53; - case Attribute::Speculatable: return 1ULL << 54; - case Attribute::StrictFP: return 1ULL << 55; - case Attribute::SanitizeHWAddress: return 1ULL << 56; - case Attribute::NoCfCheck: return 1ULL << 57; - case Attribute::OptForFuzzing: return 1ULL << 58; - case Attribute::ShadowCallStack: return 1ULL << 59; + case Attribute::None: + return 0; + case Attribute::ZExt: + return 1 << 0; + case Attribute::SExt: + return 1 << 1; + case Attribute::NoReturn: + return 1 << 2; + case Attribute::InReg: + return 1 << 3; + case Attribute::StructRet: + return 1 << 4; + case Attribute::NoUnwind: + return 1 << 5; + case Attribute::NoAlias: + return 1 << 6; + case Attribute::ByVal: + return 1 << 7; + case Attribute::Nest: + return 1 << 8; + case Attribute::ReadNone: + return 1 << 9; + case Attribute::ReadOnly: + return 1 << 10; + case Attribute::NoInline: + 
return 1 << 11; + case Attribute::AlwaysInline: + return 1 << 12; + case Attribute::OptimizeForSize: + return 1 << 13; + case Attribute::StackProtect: + return 1 << 14; + case Attribute::StackProtectReq: + return 1 << 15; + case Attribute::Alignment: + return 31 << 16; + case Attribute::NoCapture: + return 1 << 21; + case Attribute::NoRedZone: + return 1 << 22; + case Attribute::NoImplicitFloat: + return 1 << 23; + case Attribute::Naked: + return 1 << 24; + case Attribute::InlineHint: + return 1 << 25; + case Attribute::StackAlignment: + return 7 << 26; + case Attribute::ReturnsTwice: + return 1 << 29; + case Attribute::UWTable: + return 1 << 30; + case Attribute::NonLazyBind: + return 1U << 31; + case Attribute::SanitizeAddress: + return 1ULL << 32; + case Attribute::MinSize: + return 1ULL << 33; + case Attribute::NoDuplicate: + return 1ULL << 34; + case Attribute::StackProtectStrong: + return 1ULL << 35; + case Attribute::SanitizeThread: + return 1ULL << 36; + case Attribute::SanitizeMemory: + return 1ULL << 37; + case Attribute::NoBuiltin: + return 1ULL << 38; + case Attribute::Returned: + return 1ULL << 39; + case Attribute::Cold: + return 1ULL << 40; + case Attribute::Builtin: + return 1ULL << 41; + case Attribute::OptimizeNone: + return 1ULL << 42; + case Attribute::InAlloca: + return 1ULL << 43; + case Attribute::NonNull: + return 1ULL << 44; + case Attribute::JumpTable: + return 1ULL << 45; + case Attribute::Convergent: + return 1ULL << 46; + case Attribute::SafeStack: + return 1ULL << 47; + case Attribute::NoRecurse: + return 1ULL << 48; + case Attribute::InaccessibleMemOnly: + return 1ULL << 49; + case Attribute::InaccessibleMemOrArgMemOnly: + return 1ULL << 50; + case Attribute::SwiftSelf: + return 1ULL << 51; + case Attribute::SwiftError: + return 1ULL << 52; + case Attribute::WriteOnly: + return 1ULL << 53; + case Attribute::Speculatable: + return 1ULL << 54; + case Attribute::StrictFP: + return 1ULL << 55; + case Attribute::SanitizeHWAddress: + return 
1ULL << 56; + case Attribute::NoCfCheck: + return 1ULL << 57; + case Attribute::OptForFuzzing: + return 1ULL << 58; + case Attribute::ShadowCallStack: + return 1ULL << 59; case Attribute::SpeculativeLoadHardening: return 1ULL << 60; case Attribute::ImmArg: @@ -1281,10 +1395,13 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) { case Attribute::NoFree: return 1ULL << 63; - // VISC Attributes - case Attribute::In: return 3ULL << 0; - case Attribute::Out: return 3ULL << 1; - case Attribute::InOut: return 3ULL << 2; + // VISC Attributes + case Attribute::In: + return 3ULL << 0; + case Attribute::Out: + return 3ULL << 1; + case Attribute::InOut: + return 3ULL << 2; case Attribute::NoSync: llvm_unreachable("nosync attribute not supported in raw format"); @@ -1310,22 +1427,20 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) { } static void addRawAttributeValue(AttrBuilder &B, uint64_t Val) { - if (!Val) return; + if (!Val) + return; for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds; I = Attribute::AttrKind(I + 1)) { - if (I == Attribute::SanitizeMemTag || - I == Attribute::Dereferenceable || - I == Attribute::DereferenceableOrNull || - I == Attribute::ArgMemOnly || - I == Attribute::AllocSize || - I == Attribute::NoSync) + if (I == Attribute::SanitizeMemTag || I == Attribute::Dereferenceable || + I == Attribute::DereferenceableOrNull || I == Attribute::ArgMemOnly || + I == Attribute::AllocSize || I == Attribute::NoSync) continue; if (uint64_t A = (Val & getRawAttributeMask(I))) { if (I == Attribute::Alignment) B.addAlignmentAttr(1ULL << ((A >> 16) - 1)); else if (I == Attribute::StackAlignment) - B.addStackAlignmentAttr(1ULL << ((A >> 26)-1)); + B.addStackAlignmentAttr(1ULL << ((A >> 26) - 1)); else B.addAttribute(I); } @@ -1348,7 +1463,7 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B, if (Alignment) B.addAlignmentAttr(Alignment); addRawAttributeValue(B, ((EncodedAttrs & (0xfffffULL << 32)) >> 11) | - 
(EncodedAttrs & 0xffff)); + (EncodedAttrs & 0xffff)); } Error BitcodeReader::parseAttributeBlock() { @@ -1386,7 +1501,7 @@ Error BitcodeReader::parseAttributeBlock() { if (!MaybeRecord) return MaybeRecord.takeError(); switch (MaybeRecord.get()) { - default: // Default behavior: ignore. + default: // Default behavior: ignore. break; case bitc::PARAMATTR_CODE_ENTRY_OLD: // ENTRY: [paramidx0, attr0, ...] // FIXME: Remove in 4.0. @@ -1395,7 +1510,7 @@ Error BitcodeReader::parseAttributeBlock() { for (unsigned i = 0, e = Record.size(); i != e; i += 2) { AttrBuilder B; - decodeLLVMAttributesForBitcode(B, Record[i+1]); + decodeLLVMAttributesForBitcode(B, Record[i + 1]); Attrs.push_back(AttributeList::get(Context, Record[i], B)); } @@ -1599,7 +1714,7 @@ Error BitcodeReader::parseAttributeGroupBlock() { if (!MaybeRecord) return MaybeRecord.takeError(); switch (MaybeRecord.get()) { - default: // Default behavior: ignore. + default: // Default behavior: ignore. break; case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...] 
if (Record.size() < 3) @@ -1610,7 +1725,7 @@ Error BitcodeReader::parseAttributeGroupBlock() { AttrBuilder B; for (unsigned i = 2, e = Record.size(); i != e; ++i) { - if (Record[i] == 0) { // Enum attribute + if (Record[i] == 0) { // Enum attribute Attribute::AttrKind Kind; if (Error Err = parseAttrKind(Record[++i], &Kind)) return Err; @@ -1725,37 +1840,37 @@ Error BitcodeReader::parseTypeTableBody() { return error("Invalid record"); TypeList.resize(Record[0]); continue; - case bitc::TYPE_CODE_VOID: // VOID + case bitc::TYPE_CODE_VOID: // VOID ResultTy = Type::getVoidTy(Context); break; - case bitc::TYPE_CODE_HALF: // HALF + case bitc::TYPE_CODE_HALF: // HALF ResultTy = Type::getHalfTy(Context); break; - case bitc::TYPE_CODE_FLOAT: // FLOAT + case bitc::TYPE_CODE_FLOAT: // FLOAT ResultTy = Type::getFloatTy(Context); break; - case bitc::TYPE_CODE_DOUBLE: // DOUBLE + case bitc::TYPE_CODE_DOUBLE: // DOUBLE ResultTy = Type::getDoubleTy(Context); break; - case bitc::TYPE_CODE_X86_FP80: // X86_FP80 + case bitc::TYPE_CODE_X86_FP80: // X86_FP80 ResultTy = Type::getX86_FP80Ty(Context); break; - case bitc::TYPE_CODE_FP128: // FP128 + case bitc::TYPE_CODE_FP128: // FP128 ResultTy = Type::getFP128Ty(Context); break; case bitc::TYPE_CODE_PPC_FP128: // PPC_FP128 ResultTy = Type::getPPC_FP128Ty(Context); break; - case bitc::TYPE_CODE_LABEL: // LABEL + case bitc::TYPE_CODE_LABEL: // LABEL ResultTy = Type::getLabelTy(Context); break; - case bitc::TYPE_CODE_METADATA: // METADATA + case bitc::TYPE_CODE_METADATA: // METADATA ResultTy = Type::getMetadataTy(Context); break; - case bitc::TYPE_CODE_X86_MMX: // X86_MMX + case bitc::TYPE_CODE_X86_MMX: // X86_MMX ResultTy = Type::getX86_MMXTy(Context); break; - case bitc::TYPE_CODE_TOKEN: // TOKEN + case bitc::TYPE_CODE_TOKEN: // TOKEN ResultTy = Type::getTokenTy(Context); break; case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width] @@ -1777,8 +1892,7 @@ Error BitcodeReader::parseTypeTableBody() { if (Record.size() == 2) AddressSpace = 
Record[1]; ResultTy = getTypeByID(Record[0]); - if (!ResultTy || - !PointerType::isValidElementType(ResultTy)) + if (!ResultTy || !PointerType::isValidElementType(ResultTy)) return error("Invalid type"); ResultTy = PointerType::get(ResultTy, AddressSpace); break; @@ -1788,7 +1902,7 @@ Error BitcodeReader::parseTypeTableBody() { // FUNCTION: [vararg, attrid, retty, paramty x N] if (Record.size() < 3) return error("Invalid record"); - SmallVector<Type*, 8> ArgTys; + SmallVector<Type *, 8> ArgTys; for (unsigned i = 3, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) ArgTys.push_back(T); @@ -1797,7 +1911,7 @@ Error BitcodeReader::parseTypeTableBody() { } ResultTy = getTypeByID(Record[2]); - if (!ResultTy || ArgTys.size() < Record.size()-3) + if (!ResultTy || ArgTys.size() < Record.size() - 3) return error("Invalid type"); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); @@ -1807,40 +1921,39 @@ Error BitcodeReader::parseTypeTableBody() { // FUNCTION: [vararg, retty, paramty x N] if (Record.size() < 2) return error("Invalid record"); - SmallVector<Type*, 8> ArgTys; + SmallVector<Type *, 8> ArgTys; for (unsigned i = 2, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) { if (!FunctionType::isValidArgumentType(T)) return error("Invalid function argument type"); ArgTys.push_back(T); - } - else + } else break; } ResultTy = getTypeByID(Record[1]); - if (!ResultTy || ArgTys.size() < Record.size()-2) + if (!ResultTy || ArgTys.size() < Record.size() - 2) return error("Invalid type"); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); break; } - case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N] + case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N] if (Record.size() < 1) return error("Invalid record"); - SmallVector<Type*, 8> EltTys; + SmallVector<Type *, 8> EltTys; for (unsigned i = 1, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) EltTys.push_back(T); else 
break; } - if (EltTys.size() != Record.size()-1) + if (EltTys.size() != Record.size() - 1) return error("Invalid type"); ResultTy = StructType::get(Context, EltTys, Record[0]); break; } - case bitc::TYPE_CODE_STRUCT_NAME: // STRUCT_NAME: [strchr x N] + case bitc::TYPE_CODE_STRUCT_NAME: // STRUCT_NAME: [strchr x N] if (convertToString(Record, 0, TypeName)) return error("Invalid record"); continue; @@ -1857,24 +1970,24 @@ Error BitcodeReader::parseTypeTableBody() { if (Res) { Res->setName(TypeName); TypeList[NumRecords] = nullptr; - } else // Otherwise, create a new struct. + } else // Otherwise, create a new struct. Res = createIdentifiedStructType(Context, TypeName); TypeName.clear(); - SmallVector<Type*, 8> EltTys; + SmallVector<Type *, 8> EltTys; for (unsigned i = 1, e = Record.size(); i != e; ++i) { if (Type *T = getTypeByID(Record[i])) EltTys.push_back(T); else break; } - if (EltTys.size() != Record.size()-1) + if (EltTys.size() != Record.size() - 1) return error("Invalid record"); Res->setBody(EltTys, Record[0]); ResultTy = Res; break; } - case bitc::TYPE_CODE_OPAQUE: { // OPAQUE: [] + case bitc::TYPE_CODE_OPAQUE: { // OPAQUE: [] if (Record.size() != 1) return error("Invalid record"); @@ -1886,13 +1999,13 @@ Error BitcodeReader::parseTypeTableBody() { if (Res) { Res->setName(TypeName); TypeList[NumRecords] = nullptr; - } else // Otherwise, create a new struct with no body. + } else // Otherwise, create a new struct with no body. 
Res = createIdentifiedStructType(Context, TypeName); TypeName.clear(); ResultTy = Res; break; } - case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty] + case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty] if (Record.size() < 2) return error("Invalid record"); ResultTy = getTypeByID(Record[1]); @@ -1900,8 +2013,8 @@ Error BitcodeReader::parseTypeTableBody() { return error("Invalid type"); ResultTy = ArrayType::get(ResultTy, Record[0]); break; - case bitc::TYPE_CODE_VECTOR: // VECTOR: [numelts, eltty] or - // [numelts, eltty, scalable] + case bitc::TYPE_CODE_VECTOR: // VECTOR: [numelts, eltty] or + // [numelts, eltty, scalable] if (Record.size() < 2) return error("Invalid record"); if (Record[0] == 0) @@ -2183,9 +2296,9 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) { if (!MaybeRecord) return MaybeRecord.takeError(); switch (MaybeRecord.get()) { - default: // Default behavior: unknown type. + default: // Default behavior: unknown type. break; - case bitc::VST_CODE_ENTRY: { // VST_CODE_ENTRY: [valueid, namechar x N] + case bitc::VST_CODE_ENTRY: { // VST_CODE_ENTRY: [valueid, namechar x N] Expected<Value *> ValOrErr = recordValue(Record, 1, TT); if (Error Err = ValOrErr.takeError()) return Err; @@ -2320,8 +2433,7 @@ Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() { static APInt readWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) { SmallVector<uint64_t, 8> Words(Vals.size()); - transform(Vals, Words.begin(), - BitcodeReader::decodeSignRotatedValue); + transform(Vals, Words.begin(), BitcodeReader::decodeSignRotatedValue); return APInt(TypeBits, Words); } @@ -2368,11 +2480,11 @@ Error BitcodeReader::parseConstants() { if (!MaybeBitCode) return MaybeBitCode.takeError(); switch (unsigned BitCode = MaybeBitCode.get()) { - default: // Default behavior: unknown constant - case bitc::CST_CODE_UNDEF: // UNDEF + default: // Default behavior: unknown constant + case bitc::CST_CODE_UNDEF: // UNDEF V = UndefValue::get(CurTy); break; - case 
bitc::CST_CODE_SETTYPE: // SETTYPE: [typeid] + case bitc::CST_CODE_SETTYPE: // SETTYPE: [typeid] if (Record.empty()) return error("Invalid record"); if (Record[0] >= TypeList.size() || !TypeList[Record[0]]) @@ -2381,16 +2493,16 @@ Error BitcodeReader::parseConstants() { return error("Invalid constant type"); CurFullTy = TypeList[Record[0]]; CurTy = flattenPointerTypes(CurFullTy); - continue; // Skip the ValueList manipulation. - case bitc::CST_CODE_NULL: // NULL + continue; // Skip the ValueList manipulation. + case bitc::CST_CODE_NULL: // NULL V = Constant::getNullValue(CurTy); break; - case bitc::CST_CODE_INTEGER: // INTEGER: [intval] + case bitc::CST_CODE_INTEGER: // INTEGER: [intval] if (!CurTy->isIntegerTy() || Record.empty()) return error("Invalid record"); V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0])); break; - case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval] + case bitc::CST_CODE_WIDE_INTEGER: { // WIDE_INTEGER: [n x intval] if (!CurTy->isIntegerTy() || Record.empty()) return error("Invalid record"); @@ -2400,7 +2512,7 @@ Error BitcodeReader::parseConstants() { break; } - case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] + case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] if (Record.empty()) return error("Invalid record"); if (CurTy->isHalfTy()) @@ -2410,8 +2522,8 @@ Error BitcodeReader::parseConstants() { V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(), APInt(32, (uint32_t)Record[0]))); else if (CurTy->isDoubleTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble(), - APInt(64, Record[0]))); + V = ConstantFP::get( + Context, APFloat(APFloat::IEEEdouble(), APInt(64, Record[0]))); else if (CurTy->isX86_FP80Ty()) { // Bits are not stored the same way as a normal i80 APInt, compensate. 
uint64_t Rearrange[2]; @@ -2420,27 +2532,27 @@ Error BitcodeReader::parseConstants() { V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended(), APInt(80, Rearrange))); } else if (CurTy->isFP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad(), - APInt(128, Record))); + V = ConstantFP::get(Context, + APFloat(APFloat::IEEEquad(), APInt(128, Record))); else if (CurTy->isPPC_FP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble(), - APInt(128, Record))); + V = ConstantFP::get( + Context, APFloat(APFloat::PPCDoubleDouble(), APInt(128, Record))); else V = UndefValue::get(CurTy); break; } - case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number] + case bitc::CST_CODE_AGGREGATE: { // AGGREGATE: [n x value number] if (Record.empty()) return error("Invalid record"); unsigned Size = Record.size(); - SmallVector<Constant*, 16> Elts; + SmallVector<Constant *, 16> Elts; if (StructType *STy = dyn_cast<StructType>(CurTy)) { for (unsigned i = 0; i != Size; ++i) - Elts.push_back(ValueList.getConstantFwdRef(Record[i], - STy->getElementType(i))); + Elts.push_back( + ValueList.getConstantFwdRef(Record[i], STy->getElementType(i))); V = ConstantStruct::get(STy, Elts); } else if (ArrayType *ATy = dyn_cast<ArrayType>(CurTy)) { Type *EltTy = ATy->getElementType(); @@ -2467,7 +2579,7 @@ Error BitcodeReader::parseConstants() { BitCode == bitc::CST_CODE_CSTRING); break; } - case bitc::CST_CODE_DATA: {// DATA: [n x value] + case bitc::CST_CODE_DATA: { // DATA: [n x value] if (Record.empty()) return error("Invalid record"); @@ -2519,12 +2631,12 @@ Error BitcodeReader::parseConstants() { } break; } - case bitc::CST_CODE_CE_UNOP: { // CE_UNOP: [opcode, opval] + case bitc::CST_CODE_CE_UNOP: { // CE_UNOP: [opcode, opval] if (Record.size() < 2) return error("Invalid record"); int Opc = getDecodedUnaryOpcode(Record[0], CurTy); if (Opc < 0) { - V = UndefValue::get(CurTy); // Unknown unop. + V = UndefValue::get(CurTy); // Unknown unop. 
} else { Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); unsigned Flags = 0; @@ -2532,29 +2644,25 @@ Error BitcodeReader::parseConstants() { } break; } - case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval] + case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval] if (Record.size() < 3) return error("Invalid record"); int Opc = getDecodedBinaryOpcode(Record[0], CurTy); if (Opc < 0) { - V = UndefValue::get(CurTy); // Unknown binop. + V = UndefValue::get(CurTy); // Unknown binop. } else { Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy); unsigned Flags = 0; if (Record.size() >= 4) { - if (Opc == Instruction::Add || - Opc == Instruction::Sub || - Opc == Instruction::Mul || - Opc == Instruction::Shl) { + if (Opc == Instruction::Add || Opc == Instruction::Sub || + Opc == Instruction::Mul || Opc == Instruction::Shl) { if (Record[3] & (1 << bitc::OBO_NO_SIGNED_WRAP)) Flags |= OverflowingBinaryOperator::NoSignedWrap; if (Record[3] & (1 << bitc::OBO_NO_UNSIGNED_WRAP)) Flags |= OverflowingBinaryOperator::NoUnsignedWrap; - } else if (Opc == Instruction::SDiv || - Opc == Instruction::UDiv || - Opc == Instruction::LShr || - Opc == Instruction::AShr) { + } else if (Opc == Instruction::SDiv || Opc == Instruction::UDiv || + Opc == Instruction::LShr || Opc == Instruction::AShr) { if (Record[3] & (1 << bitc::PEO_EXACT)) Flags |= SDivOperator::IsExact; } @@ -2563,24 +2671,25 @@ Error BitcodeReader::parseConstants() { } break; } - case bitc::CST_CODE_CE_CAST: { // CE_CAST: [opcode, opty, opval] + case bitc::CST_CODE_CE_CAST: { // CE_CAST: [opcode, opty, opval] if (Record.size() < 3) return error("Invalid record"); int Opc = getDecodedCastOpcode(Record[0]); if (Opc < 0) { - V = UndefValue::get(CurTy); // Unknown cast. + V = UndefValue::get(CurTy); // Unknown cast. 
} else { Type *OpTy = getTypeByID(Record[1]); if (!OpTy) return error("Invalid record"); Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy); V = UpgradeBitCastExpr(Opc, Op, CurTy); - if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy); + if (!V) + V = ConstantExpr::getCast(Opc, Op, CurTy); } break; } - case bitc::CST_CODE_CE_INBOUNDS_GEP: // [ty, n x operands] - case bitc::CST_CODE_CE_GEP: // [ty, n x operands] + case bitc::CST_CODE_CE_INBOUNDS_GEP: // [ty, n x operands] + case bitc::CST_CODE_CE_GEP: // [ty, n x operands] case bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX: { // [ty, flags, n x // operands] unsigned OpNum = 0; @@ -2598,7 +2707,7 @@ Error BitcodeReader::parseConstants() { } else if (BitCode == bitc::CST_CODE_CE_INBOUNDS_GEP) InBounds = true; - SmallVector<Constant*, 16> Elts; + SmallVector<Constant *, 16> Elts; Type *Elt0FullTy = nullptr; while (OpNum != Record.size()) { if (!Elt0FullTy) @@ -2625,7 +2734,7 @@ Error BitcodeReader::parseConstants() { InBounds, InRangeIndex); break; } - case bitc::CST_CODE_CE_SELECT: { // CE_SELECT: [opval#, opval#, opval#] + case bitc::CST_CODE_CE_SELECT: { // CE_SELECT: [opval#, opval#, opval#] if (Record.size() < 3) return error("Invalid record"); @@ -2638,18 +2747,17 @@ Error BitcodeReader::parseConstants() { if (SelectorTy != V->getType()) SelectorTy = VectorType::get(SelectorTy, VTy->getNumElements()); - V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0], - SelectorTy), - ValueList.getConstantFwdRef(Record[1],CurTy), - ValueList.getConstantFwdRef(Record[2],CurTy)); + V = ConstantExpr::getSelect( + ValueList.getConstantFwdRef(Record[0], SelectorTy), + ValueList.getConstantFwdRef(Record[1], CurTy), + ValueList.getConstantFwdRef(Record[2], CurTy)); break; } - case bitc::CST_CODE_CE_EXTRACTELT - : { // CE_EXTRACTELT: [opty, opval, opty, opval] + case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opty, + // opval] if (Record.size() < 3) return error("Invalid record"); - VectorType 
*OpTy = - dyn_cast_or_null<VectorType>(getTypeByID(Record[0])); + VectorType *OpTy = dyn_cast_or_null<VectorType>(getTypeByID(Record[0])); if (!OpTy) return error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); @@ -2666,14 +2774,14 @@ Error BitcodeReader::parseConstants() { V = ConstantExpr::getExtractElement(Op0, Op1); break; } - case bitc::CST_CODE_CE_INSERTELT - : { // CE_INSERTELT: [opval, opval, opty, opval] + case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opty, + // opval] VectorType *OpTy = dyn_cast<VectorType>(CurTy); if (Record.size() < 3 || !OpTy) return error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); - Constant *Op1 = ValueList.getConstantFwdRef(Record[1], - OpTy->getElementType()); + Constant *Op1 = + ValueList.getConstantFwdRef(Record[1], OpTy->getElementType()); Constant *Op2 = nullptr; if (Record.size() == 4) { Type *IdxTy = getTypeByID(Record[2]); @@ -2693,27 +2801,26 @@ Error BitcodeReader::parseConstants() { return error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy); - Type *ShufTy = VectorType::get(Type::getInt32Ty(Context), - OpTy->getNumElements()); + Type *ShufTy = + VectorType::get(Type::getInt32Ty(Context), OpTy->getNumElements()); Constant *Op2 = ValueList.getConstantFwdRef(Record[2], ShufTy); V = ConstantExpr::getShuffleVector(Op0, Op1, Op2); break; } case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval] VectorType *RTy = dyn_cast<VectorType>(CurTy); - VectorType *OpTy = - dyn_cast_or_null<VectorType>(getTypeByID(Record[0])); + VectorType *OpTy = dyn_cast_or_null<VectorType>(getTypeByID(Record[0])); if (Record.size() < 4 || !RTy || !OpTy) return error("Invalid record"); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy); - Type *ShufTy = 
VectorType::get(Type::getInt32Ty(Context), - RTy->getNumElements()); + Type *ShufTy = + VectorType::get(Type::getInt32Ty(Context), RTy->getNumElements()); Constant *Op2 = ValueList.getConstantFwdRef(Record[3], ShufTy); V = ConstantExpr::getShuffleVector(Op0, Op1, Op2); break; } - case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred] + case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred] if (Record.size() < 4) return error("Invalid record"); Type *OpTy = getTypeByID(Record[0]); @@ -2737,16 +2844,16 @@ Error BitcodeReader::parseConstants() { bool HasSideEffects = Record[0] & 1; bool IsAlignStack = Record[0] >> 1; unsigned AsmStrSize = Record[1]; - if (2+AsmStrSize >= Record.size()) + if (2 + AsmStrSize >= Record.size()) return error("Invalid record"); - unsigned ConstStrSize = Record[2+AsmStrSize]; - if (3+AsmStrSize+ConstStrSize > Record.size()) + unsigned ConstStrSize = Record[2 + AsmStrSize]; + if (3 + AsmStrSize + ConstStrSize > Record.size()) return error("Invalid record"); for (unsigned i = 0; i != AsmStrSize; ++i) - AsmStr += (char)Record[2+i]; + AsmStr += (char)Record[2 + i]; for (unsigned i = 0; i != ConstStrSize; ++i) - ConstrStr += (char)Record[3+AsmStrSize+i]; + ConstrStr += (char)Record[3 + AsmStrSize + i]; UpgradeInlineAsmString(&AsmStr); V = InlineAsm::get( cast<FunctionType>(getPointerElementFlatType(CurFullTy)), AsmStr, @@ -2763,16 +2870,16 @@ Error BitcodeReader::parseConstants() { bool IsAlignStack = (Record[0] >> 1) & 1; unsigned AsmDialect = Record[0] >> 2; unsigned AsmStrSize = Record[1]; - if (2+AsmStrSize >= Record.size()) + if (2 + AsmStrSize >= Record.size()) return error("Invalid record"); - unsigned ConstStrSize = Record[2+AsmStrSize]; - if (3+AsmStrSize+ConstStrSize > Record.size()) + unsigned ConstStrSize = Record[2 + AsmStrSize]; + if (3 + AsmStrSize + ConstStrSize > Record.size()) return error("Invalid record"); for (unsigned i = 0; i != AsmStrSize; ++i) - AsmStr += (char)Record[2+i]; + AsmStr += 
(char)Record[2 + i]; for (unsigned i = 0; i != ConstStrSize; ++i) - ConstrStr += (char)Record[3+AsmStrSize+i]; + ConstrStr += (char)Record[3 + AsmStrSize + i]; UpgradeInlineAsmString(&AsmStr); V = InlineAsm::get( cast<FunctionType>(getPointerElementFlatType(CurFullTy)), AsmStr, @@ -2780,14 +2887,14 @@ Error BitcodeReader::parseConstants() { InlineAsm::AsmDialect(AsmDialect)); break; } - case bitc::CST_CODE_BLOCKADDRESS:{ + case bitc::CST_CODE_BLOCKADDRESS: { if (Record.size() < 3) return error("Invalid record"); Type *FnTy = getTypeByID(Record[0]); if (!FnTy) return error("Invalid record"); - Function *Fn = - dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy)); + Function *Fn = dyn_cast_or_null<Function>( + ValueList.getConstantFwdRef(Record[1], FnTy)); if (!Fn) return error("Invalid record"); @@ -2861,7 +2968,7 @@ Error BitcodeReader::parseUseLists() { if (!MaybeRecord) return MaybeRecord.takeError(); switch (MaybeRecord.get()) { - default: // Default behavior: unknown type. + default: // Default behavior: unknown type. break; case bitc::USELIST_CODE_BB: IsBB = true; @@ -3011,7 +3118,8 @@ Error BitcodeReader::rememberAndSkipFunctionBodies() { return error("Could not find function in stream"); if (!SeenFirstFunctionBody) - return error("Trying to materialize functions before seeing function blocks"); + return error( + "Trying to materialize functions before seeing function blocks"); // An old bitcode file with the symbol table at the end would have // finished the parse greedily. @@ -3428,7 +3536,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, case BitstreamEntry::SubBlock: switch (Entry.ID) { - default: // Skip unknown content. + default: // Skip unknown content. 
if (Error Err = Stream.SkipBlock()) return Err; break; @@ -3565,7 +3673,8 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, if (!MaybeBitCode) return MaybeBitCode.takeError(); switch (unsigned BitCode = MaybeBitCode.get()) { - default: break; // Default behavior, ignore unknown content. + default: + break; // Default behavior, ignore unknown content. case bitc::MODULE_CODE_VERSION: { Expected<unsigned> VersionOrErr = parseVersionRecord(Record); if (!VersionOrErr) @@ -3573,28 +3682,28 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, UseRelativeIDs = *VersionOrErr >= 1; break; } - case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] + case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] std::string S; if (convertToString(Record, 0, S)) return error("Invalid record"); TheModule->setTargetTriple(S); break; } - case bitc::MODULE_CODE_DATALAYOUT: { // DATALAYOUT: [strchr x N] + case bitc::MODULE_CODE_DATALAYOUT: { // DATALAYOUT: [strchr x N] std::string S; if (convertToString(Record, 0, S)) return error("Invalid record"); TheModule->setDataLayout(S); break; } - case bitc::MODULE_CODE_ASM: { // ASM: [strchr x N] + case bitc::MODULE_CODE_ASM: { // ASM: [strchr x N] std::string S; if (convertToString(Record, 0, S)) return error("Invalid record"); TheModule->setModuleInlineAsm(S); break; } - case bitc::MODULE_CODE_DEPLIB: { // DEPLIB: [strchr x N] + case bitc::MODULE_CODE_DEPLIB: { // DEPLIB: [strchr x N] // FIXME: Remove in 4.0. std::string S; if (convertToString(Record, 0, S)) @@ -3602,14 +3711,14 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, // Ignore value. 
break; } - case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N] + case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N] std::string S; if (convertToString(Record, 0, S)) return error("Invalid record"); SectionTable.push_back(S); break; } - case bitc::MODULE_CODE_GCNAME: { // SECTIONNAME: [strchr x N] + case bitc::MODULE_CODE_GCNAME: { // SECTIONNAME: [strchr x N] std::string S; if (convertToString(Record, 0, S)) return error("Invalid record"); @@ -3683,9 +3792,9 @@ void BitcodeReader::propagateByValTypes(CallBase *CB, continue; CB->removeParamAttr(i, Attribute::ByVal); - CB->addParamAttr( - i, Attribute::getWithByValType( - Context, getPointerElementFlatType(ArgsFullTys[i]))); + CB->addParamAttr(i, + Attribute::getWithByValType( + Context, getPointerElementFlatType(ArgsFullTys[i]))); } } @@ -3743,7 +3852,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case BitstreamEntry::SubBlock: switch (Entry.ID) { - default: // Skip unknown content. + default: // Skip unknown content. if (Error Err = Stream.SkipBlock()) return Err; break; @@ -3788,7 +3897,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { switch (unsigned BitCode = MaybeBitCode.get()) { default: // Default behavior: reject return error("Invalid value"); - case bitc::FUNC_CODE_DECLAREBLOCKS: { // DECLAREBLOCKS: [nblocks] + case bitc::FUNC_CODE_DECLAREBLOCKS: { // DECLAREBLOCKS: [nblocks] if (Record.size() < 1 || Record[0] == 0) return error("Invalid record"); // Create all the basic blocks for the function. @@ -3823,7 +3932,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { continue; } - case bitc::FUNC_CODE_DEBUG_LOC_AGAIN: // DEBUG_LOC_AGAIN + case bitc::FUNC_CODE_DEBUG_LOC_AGAIN: // DEBUG_LOC_AGAIN // This record indicates that the last instruction is at the same // location as the previous instruction with a location. 
I = getLastInstruction(); @@ -3834,7 +3943,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = nullptr; continue; - case bitc::FUNC_CODE_DEBUG_LOC: { // DEBUG_LOC: [line, col, scope, ia] + case bitc::FUNC_CODE_DEBUG_LOC: { // DEBUG_LOC: [line, col, scope, ia] I = getLastInstruction(); if (!I || Record.size() < 4) return error("Invalid record"); @@ -3861,11 +3970,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = nullptr; continue; } - case bitc::FUNC_CODE_INST_UNOP: { // UNOP: [opval, ty, opcode] + case bitc::FUNC_CODE_INST_UNOP: { // UNOP: [opval, ty, opcode] unsigned OpNum = 0; Value *LHS; if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || - OpNum+1 > Record.size()) + OpNum + 1 > Record.size()) return error("Invalid record"); int Opc = getDecodedUnaryOpcode(Record[OpNum++], LHS->getType()); @@ -3882,12 +3991,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } break; } - case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode] + case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode] unsigned OpNum = 0; Value *LHS, *RHS; if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) || - OpNum+1 > Record.size()) + OpNum + 1 > Record.size()) return error("Invalid record"); int Opc = getDecodedBinaryOpcode(Record[OpNum++], LHS->getType()); @@ -3896,18 +4005,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); InstructionList.push_back(I); if (OpNum < Record.size()) { - if (Opc == Instruction::Add || - Opc == Instruction::Sub || - Opc == Instruction::Mul || - Opc == Instruction::Shl) { + if (Opc == Instruction::Add || Opc == Instruction::Sub || + Opc == Instruction::Mul || Opc == Instruction::Shl) { if (Record[OpNum] & (1 << bitc::OBO_NO_SIGNED_WRAP)) cast<BinaryOperator>(I)->setHasNoSignedWrap(true); if (Record[OpNum] & (1 << bitc::OBO_NO_UNSIGNED_WRAP)) 
cast<BinaryOperator>(I)->setHasNoUnsignedWrap(true); - } else if (Opc == Instruction::SDiv || - Opc == Instruction::UDiv || - Opc == Instruction::LShr || - Opc == Instruction::AShr) { + } else if (Opc == Instruction::SDiv || Opc == Instruction::UDiv || + Opc == Instruction::LShr || Opc == Instruction::AShr) { if (Record[OpNum] & (1 << bitc::PEO_EXACT)) cast<BinaryOperator>(I)->setIsExact(true); } else if (isa<FPMathOperator>(I)) { @@ -3915,15 +4020,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (FMF.any()) I->setFastMathFlags(FMF); } - } break; } - case bitc::FUNC_CODE_INST_CAST: { // CAST: [opval, opty, destty, castopc] + case bitc::FUNC_CODE_INST_CAST: { // CAST: [opval, opty, destty, castopc] unsigned OpNum = 0; Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op) || - OpNum+2 != Record.size()) + OpNum + 2 != Record.size()) return error("Invalid record"); FullTy = getFullyStructuredTypeByID(Record[OpNum]); @@ -3975,7 +4079,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error( "Explicit gep type does not match pointee type of pointer operand"); - SmallVector<Value*, 16> GEPIdx; + SmallVector<Value *, 16> GEPIdx; while (OpNum != Record.size()) { Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op)) @@ -3993,7 +4097,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } case bitc::FUNC_CODE_INST_EXTRACTVAL: { - // EXTRACTVAL: [opty, opval, n x indices] + // EXTRACTVAL: [opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; if (getValueTypePair(Record, OpNum, NextValueNo, Agg, &FullTy)) @@ -4031,7 +4135,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } case bitc::FUNC_CODE_INST_INSERTVAL: { - // INSERTVAL: [opty, opval, opty, opval, n x indices] + // INSERTVAL: [opty, opval, opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; if (getValueTypePair(Record, OpNum, NextValueNo, Agg, &FullTy)) @@ -4090,7 +4194,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { break; } - case 
bitc::FUNC_CODE_INST_VSELECT: {// VSELECT: [ty,opval,opval,predty,pred] + case bitc::FUNC_CODE_INST_VSELECT: { // VSELECT: + // [ty,opval,opval,predty,pred] // new form of select // handles select i1 or select [N x i1] unsigned OpNum = 0; @@ -4101,8 +4206,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Invalid record"); // select condition can be either i1 or [N x i1] - if (VectorType* vector_type = - dyn_cast<VectorType>(Cond->getType())) { + if (VectorType *vector_type = dyn_cast<VectorType>(Cond->getType())) { // expect <n x i1> if (vector_type->getElementType() != Type::getInt1Ty(Context)) return error("Invalid type for value"); @@ -4152,7 +4256,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { break; } - case bitc::FUNC_CODE_INST_SHUFFLEVEC: {// SHUFFLEVEC: [opval,ty,opval,opval] + case bitc::FUNC_CODE_INST_SHUFFLEVEC: { // SHUFFLEVEC: + // [opval,ty,opval,opval] unsigned OpNum = 0; Value *Vec1, *Vec2, *Mask; if (getValueTypePair(Record, OpNum, NextValueNo, Vec1, &FullTy) || @@ -4170,10 +4275,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { break; } - case bitc::FUNC_CODE_INST_CMP: // CMP: [opty, opval, opval, pred] - // Old form of ICmp/FCmp returning bool - // Existed to differentiate between icmp/fcmp and vicmp/vfcmp which were - // both legal on vectors but had different behaviour. + case bitc::FUNC_CODE_INST_CMP: // CMP: [opty, opval, opval, pred] + // Old form of ICmp/FCmp returning bool + // Existed to differentiate between icmp/fcmp + // and vicmp/vfcmp which were both legal on + // vectors but had different behaviour. 
case bitc::FUNC_CODE_INST_CMP2: { // CMP2: [opty, opval, opval, pred] // FCmp/ICmp returning bool or vector of bool @@ -4190,10 +4296,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned PredVal = Record[OpNum]; bool IsFP = LHS->getType()->isFPOrFPVectorTy(); FastMathFlags FMF; - if (IsFP && Record.size() > OpNum+1) + if (IsFP && Record.size() > OpNum + 1) FMF = getDecodedFastMathFlags(Record[++OpNum]); - if (OpNum+1 != Record.size()) + if (OpNum + 1 != Record.size()) return error("Invalid record"); if (LHS->getType()->isFPOrFPVectorTy()) @@ -4208,25 +4314,25 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } case bitc::FUNC_CODE_INST_RET: // RET: [opty,opval<optional>] - { - unsigned Size = Record.size(); - if (Size == 0) { - I = ReturnInst::Create(Context); - InstructionList.push_back(I); - break; - } - - unsigned OpNum = 0; - Value *Op = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) - return error("Invalid record"); - if (OpNum != Record.size()) - return error("Invalid record"); - - I = ReturnInst::Create(Context, Op); + { + unsigned Size = Record.size(); + if (Size == 0) { + I = ReturnInst::Create(Context); InstructionList.push_back(I); break; } + + unsigned OpNum = 0; + Value *Op = nullptr; + if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + return error("Invalid record"); + if (OpNum != Record.size()) + return error("Invalid record"); + + I = ReturnInst::Create(Context, Op); + InstructionList.push_back(I); + break; + } case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#] if (Record.size() != 1 && Record.size() != 3) return error("Invalid record"); @@ -4237,11 +4343,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() == 1) { I = BranchInst::Create(TrueDest); InstructionList.push_back(I); - } - else { + } else { BasicBlock *FalseDest = getBasicBlock(Record[1]); - Value *Cond = getValue(Record, 2, NextValueNo, - Type::getInt1Ty(Context)); + Value *Cond = + getValue(Record, 2, 
NextValueNo, Type::getInt1Ty(Context)); if (!FalseDest || !Cond) return error("Invalid record"); I = BranchInst::Create(TrueDest, FalseDest, Cond); @@ -4376,7 +4481,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned CurIdx = 5; for (unsigned i = 0; i != NumCases; ++i) { - SmallVector<ConstantInt*, 1> CaseVals; + SmallVector<ConstantInt *, 1> CaseVals; unsigned NumItems = Record[CurIdx++]; for (unsigned ci = 0; ci != NumItems; ++ci) { bool isSingleNumber = Record[CurIdx++]; @@ -4401,14 +4506,15 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // compared as signed or unsigned values. The partially // implemented changes that used this format in the past used // unsigned comparisons. - for ( ; Low.ule(High); ++Low) + for (; Low.ule(High); ++Low) CaseVals.push_back(ConstantInt::get(Context, Low)); } else CaseVals.push_back(ConstantInt::get(Context, Low)); } BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]); - for (SmallVector<ConstantInt*, 1>::iterator cvi = CaseVals.begin(), - cve = CaseVals.end(); cvi != cve; ++cvi) + for (SmallVector<ConstantInt *, 1>::iterator cvi = CaseVals.begin(), + cve = CaseVals.end(); + cvi != cve; ++cvi) SI->addCase(*cvi, DestBB); } I = SI; @@ -4424,13 +4530,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { BasicBlock *Default = getBasicBlock(Record[2]); if (!OpTy || !Cond || !Default) return error("Invalid record"); - unsigned NumCases = (Record.size()-3)/2; + unsigned NumCases = (Record.size() - 3) / 2; SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); InstructionList.push_back(SI); for (unsigned i = 0, e = NumCases; i != e; ++i) { - ConstantInt *CaseVal = - dyn_cast_or_null<ConstantInt>(getFnValueByID(Record[3+i*2], OpTy)); - BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]); + ConstantInt *CaseVal = dyn_cast_or_null<ConstantInt>( + getFnValueByID(Record[3 + i * 2], OpTy)); + BasicBlock *DestBB = getBasicBlock(Record[1 + 3 + i * 2]); if (!CaseVal || !DestBB) { delete SI; return 
error("Invalid record"); @@ -4447,11 +4553,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Value *Address = getValue(Record, 1, NextValueNo, OpTy); if (!OpTy || !Address) return error("Invalid record"); - unsigned NumDests = Record.size()-2; + unsigned NumDests = Record.size() - 2; IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests); InstructionList.push_back(IBI); for (unsigned i = 0, e = NumDests; i != e; ++i) { - if (BasicBlock *DestBB = getBasicBlock(Record[2+i])) { + if (BasicBlock *DestBB = getBasicBlock(Record[2 + i])) { IBI->addDestination(DestBB); } else { delete IBI; @@ -4501,11 +4607,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() < FTy->getNumParams() + OpNum) return error("Insufficient operands to call"); - SmallVector<Value*, 16> Ops; + SmallVector<Value *, 16> Ops; SmallVector<Type *, 16> ArgsFullTys; for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { - Ops.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); + Ops.push_back( + getValue(Record, OpNum, NextValueNo, FTy->getParamType(i))); ArgsFullTys.push_back(FullFTy->getParamType(i)); if (!Ops.back()) return error("Invalid record"); @@ -4588,14 +4694,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() < FTy->getNumParams() + OpNum) return error("Insufficient operands to call"); - SmallVector<Value*, 16> Args; + SmallVector<Value *, 16> Args; // Read the fixed params. 
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { if (FTy->getParamType(i)->isLabelTy()) Args.push_back(getBasicBlock(Record[OpNum])); else - Args.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); + Args.push_back( + getValue(Record, OpNum, NextValueNo, FTy->getParamType(i))); if (!Args.back()) return error("Invalid record"); } @@ -4628,26 +4734,26 @@ Error BitcodeReader::parseFunctionBody(Function *F) { InstructionList.push_back(I); break; case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] - if (Record.size() < 1 || ((Record.size()-1)&1)) + if (Record.size() < 1 || ((Record.size() - 1) & 1)) return error("Invalid record"); FullTy = getFullyStructuredTypeByID(Record[0]); Type *Ty = flattenPointerTypes(FullTy); if (!Ty) return error("Invalid record"); - PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2); + PHINode *PN = PHINode::Create(Ty, (Record.size() - 1) / 2); InstructionList.push_back(PN); - for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) { + for (unsigned i = 0, e = Record.size() - 1; i != e; i += 2) { Value *V; // With the new function encoding, it is possible that operands have // negative IDs (for forward references). Use a signed VBR // representation to keep the encoding small. 
if (UseRelativeIDs) - V = getValueSigned(Record, 1+i, NextValueNo, Ty); + V = getValueSigned(Record, 1 + i, NextValueNo, Ty); else - V = getValue(Record, 1+i, NextValueNo, Ty); - BasicBlock *BB = getBasicBlock(Record[2+i]); + V = getValue(Record, 1 + i, NextValueNo, Ty); + BasicBlock *BB = getBasicBlock(Record[2 + i]); if (!V || !BB) return error("Invalid record"); PN->addIncoming(V, BB); @@ -4689,7 +4795,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { LP->setCleanup(IsCleanup); for (unsigned J = 0; J != NumClauses; ++J) { LandingPadInst::ClauseType CT = - LandingPadInst::ClauseType(Record[Idx++]); (void)CT; + LandingPadInst::ClauseType(Record[Idx++]); + (void)CT; Value *Val; if (getValueTypePair(Record, Idx, NextValueNo, Val)) { @@ -4697,12 +4804,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Invalid record"); } - assert((CT != LandingPadInst::Catch || - !isa<ArrayType>(Val->getType())) && - "Catch clause has a invalid type!"); - assert((CT != LandingPadInst::Filter || - isa<ArrayType>(Val->getType())) && - "Filter clause has invalid type!"); + assert( + (CT != LandingPadInst::Catch || !isa<ArrayType>(Val->getType())) && + "Catch clause has a invalid type!"); + assert( + (CT != LandingPadInst::Filter || isa<ArrayType>(Val->getType())) && + "Filter clause has invalid type!"); LP->addClause(cast<Constant>(Val)); } @@ -4718,8 +4825,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { const uint64_t InAllocaMask = uint64_t(1) << 5; const uint64_t ExplicitTypeMask = uint64_t(1) << 6; const uint64_t SwiftErrorMask = uint64_t(1) << 7; - const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask | - SwiftErrorMask; + const uint64_t FlagMask = + InAllocaMask | ExplicitTypeMask | SwiftErrorMask; bool InAlloca = AlignRecord & InAllocaMask; bool SwiftError = AlignRecord & SwiftErrorMask; FullTy = getFullyStructuredTypeByID(Record[0]); @@ -4779,7 +4886,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { break; } case 
bitc::FUNC_CODE_INST_LOADATOMIC: { - // LOADATOMIC: [opty, op, align, vol, ordering, ssid] + // LOADATOMIC: [opty, op, align, vol, ordering, ssid] unsigned OpNum = 0; Value *Op; if (getValueTypePair(Record, OpNum, NextValueNo, Op, &FullTy) || @@ -4816,7 +4923,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { break; } case bitc::FUNC_CODE_INST_STORE: - case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol] + case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, + // vol] unsigned OpNum = 0; Value *Val, *Ptr; Type *FullTy; @@ -4833,7 +4941,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned Align; if (Error Err = parseAlignmentValue(Record[OpNum], Align)) return Err; - I = new StoreInst(Val, Ptr, Record[OpNum+1], Align); + I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align); InstructionList.push_back(I); break; } @@ -4866,7 +4974,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned Align; if (Error Err = parseAlignmentValue(Record[OpNum], Align)) return Err; - I = new StoreInst(Val, Ptr, Record[OpNum+1], Align, Ordering, SSID); + I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align, Ordering, SSID); InstructionList.push_back(I); break; } @@ -4923,7 +5031,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = ExtractValueInst::Create(I, 0); FullTy = cast<StructType>(FullTy)->getElementType(0); } else { - cast<AtomicCmpXchgInst>(I)->setWeak(Record[OpNum+4]); + cast<AtomicCmpXchgInst>(I)->setWeak(Record[OpNum + 4]); } InstructionList.push_back(I); @@ -4950,7 +5058,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]); I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID); FullTy = getPointerElementFlatType(FullTy); - cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]); + cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum + 1]); InstructionList.push_back(I); break; } @@ -5012,15 +5120,15 @@ Error 
BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() < FTy->getNumParams() + OpNum) return error("Insufficient operands to call"); - SmallVector<Value*, 16> Args; - SmallVector<Type*, 16> ArgsFullTys; + SmallVector<Value *, 16> Args; + SmallVector<Type *, 16> ArgsFullTys; // Read the fixed params. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { if (FTy->getParamType(i)->isLabelTy()) Args.push_back(getBasicBlock(Record[OpNum])); else - Args.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); + Args.push_back( + getValue(Record, OpNum, NextValueNo, FTy->getParamType(i))); ArgsFullTys.push_back(FullFTy->getParamType(i)); if (!Args.back()) return error("Invalid record"); @@ -5149,7 +5257,8 @@ OutOfRecordLoop: if (Argument *A = dyn_cast<Argument>(ValueList.back())) { if (!A->getParent()) { // We found at least one unresolved value. Nuke them all to avoid leaks. - for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; ++i){ + for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; + ++i) { if ((A = dyn_cast_or_null<Argument>(ValueList[i])) && !A->getParent()) { A->replaceAllUsesWith(UndefValue::get(A->getType())); delete A; @@ -5166,7 +5275,7 @@ OutOfRecordLoop: // Trim the value list down to the size it was before we parsed this function. 
ValueList.shrinkTo(ModuleValueListSize); MDLoader->shrinkTo(ModuleMDLoaderSize); - std::vector<BasicBlock*>().swap(FunctionBBs); + std::vector<BasicBlock *>().swap(FunctionBBs); return Error::success(); } @@ -5206,7 +5315,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) { if (!F || !F->isMaterializable()) return Error::success(); - DenseMap<Function*, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F); + DenseMap<Function *, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F); assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!"); // If its position is recorded as 0, its body is somewhere in the stream // but we haven't seen it yet. @@ -5367,8 +5476,7 @@ void ModuleSummaryIndexBitcodeReader::setValueGUID( // the index so that the value name can be recorded. ValueIdToValueInfoMap[ValueID] = std::make_pair( TheIndex.getOrInsertValueInfo( - ValueGUID, - UseStrtab ? ValueName : TheIndex.saveString(ValueName)), + ValueGUID, UseStrtab ? ValueName : TheIndex.saveString(ValueName)), OriginalNameID); } @@ -5537,71 +5645,71 @@ Error ModuleSummaryIndexBitcodeReader::parseModule() { continue; case BitstreamEntry::Record: { - Record.clear(); - Expected<unsigned> MaybeBitCode = Stream.readRecord(Entry.ID, Record); - if (!MaybeBitCode) - return MaybeBitCode.takeError(); - switch (MaybeBitCode.get()) { - default: - break; // Default behavior, ignore unknown content. 
- case bitc::MODULE_CODE_VERSION: { - if (Error Err = parseVersionRecord(Record).takeError()) - return Err; - break; - } - /// MODULE_CODE_SOURCE_FILENAME: [namechar x N] - case bitc::MODULE_CODE_SOURCE_FILENAME: { - SmallString<128> ValueName; - if (convertToString(Record, 0, ValueName)) - return error("Invalid record"); - SourceFileName = ValueName.c_str(); - break; + Record.clear(); + Expected<unsigned> MaybeBitCode = Stream.readRecord(Entry.ID, Record); + if (!MaybeBitCode) + return MaybeBitCode.takeError(); + switch (MaybeBitCode.get()) { + default: + break; // Default behavior, ignore unknown content. + case bitc::MODULE_CODE_VERSION: { + if (Error Err = parseVersionRecord(Record).takeError()) + return Err; + break; + } + /// MODULE_CODE_SOURCE_FILENAME: [namechar x N] + case bitc::MODULE_CODE_SOURCE_FILENAME: { + SmallString<128> ValueName; + if (convertToString(Record, 0, ValueName)) + return error("Invalid record"); + SourceFileName = ValueName.c_str(); + break; + } + /// MODULE_CODE_HASH: [5*i32] + case bitc::MODULE_CODE_HASH: { + if (Record.size() != 5) + return error("Invalid hash length " + Twine(Record.size()).str()); + auto &Hash = getThisModule()->second.second; + int Pos = 0; + for (auto &Val : Record) { + assert(!(Val >> 32) && "Unexpected high bits set"); + Hash[Pos++] = Val; } - /// MODULE_CODE_HASH: [5*i32] - case bitc::MODULE_CODE_HASH: { - if (Record.size() != 5) - return error("Invalid hash length " + Twine(Record.size()).str()); - auto &Hash = getThisModule()->second.second; - int Pos = 0; - for (auto &Val : Record) { - assert(!(Val >> 32) && "Unexpected high bits set"); - Hash[Pos++] = Val; - } + break; + } + /// MODULE_CODE_VSTOFFSET: [offset] + case bitc::MODULE_CODE_VSTOFFSET: + if (Record.size() < 1) + return error("Invalid record"); + // Note that we subtract 1 here because the offset is relative to one + // word before the start of the identification or module block, which + // was historically always the start of the regular bitcode 
header. + VSTOffset = Record[0] - 1; + break; + // v1 GLOBALVAR: [pointer type, isconst, initid, linkage, ...] + // v1 FUNCTION: [type, callingconv, isproto, linkage, ...] + // v1 ALIAS: [alias type, addrspace, aliasee val#, linkage, ...] + // v2: [strtab offset, strtab size, v1] + case bitc::MODULE_CODE_GLOBALVAR: + case bitc::MODULE_CODE_FUNCTION: + case bitc::MODULE_CODE_ALIAS: { + StringRef Name; + ArrayRef<uint64_t> GVRecord; + std::tie(Name, GVRecord) = readNameFromStrtab(Record); + if (GVRecord.size() <= 3) + return error("Invalid record"); + uint64_t RawLinkage = GVRecord[3]; + GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage); + if (!UseStrtab) { + ValueIdToLinkageMap[ValueId++] = Linkage; break; } - /// MODULE_CODE_VSTOFFSET: [offset] - case bitc::MODULE_CODE_VSTOFFSET: - if (Record.size() < 1) - return error("Invalid record"); - // Note that we subtract 1 here because the offset is relative to one - // word before the start of the identification or module block, which - // was historically always the start of the regular bitcode header. - VSTOffset = Record[0] - 1; - break; - // v1 GLOBALVAR: [pointer type, isconst, initid, linkage, ...] - // v1 FUNCTION: [type, callingconv, isproto, linkage, ...] - // v1 ALIAS: [alias type, addrspace, aliasee val#, linkage, ...] 
- // v2: [strtab offset, strtab size, v1] - case bitc::MODULE_CODE_GLOBALVAR: - case bitc::MODULE_CODE_FUNCTION: - case bitc::MODULE_CODE_ALIAS: { - StringRef Name; - ArrayRef<uint64_t> GVRecord; - std::tie(Name, GVRecord) = readNameFromStrtab(Record); - if (GVRecord.size() <= 3) - return error("Invalid record"); - uint64_t RawLinkage = GVRecord[3]; - GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage); - if (!UseStrtab) { - ValueIdToLinkageMap[ValueId++] = Linkage; - break; - } - setValueGUID(ValueId++, Name, Linkage, SourceFileName); - break; - } - } + setValueGUID(ValueId++, Name, Linkage, SourceFileName); + break; + } } + } continue; } } @@ -5796,7 +5904,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { switch (unsigned BitCode = MaybeBitCode.get()) { default: // Default behavior: ignore. break; - case bitc::FS_FLAGS: { // [flags] + case bitc::FS_FLAGS: { // [flags] uint64_t Flags = Record[0]; // Scan flags. assert(Flags <= 0x1f && "Unexpected bits in flag"); @@ -5915,7 +6023,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { AS->setModulePath(getThisModule()->first()); auto AliaseeVI = getValueInfoFromValueId(AliaseeID).first; - auto AliaseeInModule = TheIndex.findSummaryInModule(AliaseeVI, ModulePath); + auto AliaseeInModule = + TheIndex.findSummaryInModule(AliaseeVI, ModulePath); if (!AliaseeInModule) return error("Alias expects aliasee summary to be parsed"); AS->setAliasee(AliaseeVI, AliaseeInModule); @@ -6057,7 +6166,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { AS->setModulePath(ModuleIdMap[ModuleId]); auto AliaseeVI = getValueInfoFromValueId(AliaseeValueId).first; - auto AliaseeInModule = TheIndex.findSummaryInModule(AliaseeVI, AS->modulePath()); + auto AliaseeInModule = + TheIndex.findSummaryInModule(AliaseeVI, AS->modulePath()); AS->setAliasee(AliaseeVI, AliaseeInModule); ValueInfo VI = getValueInfoFromValueId(ValueID).first; @@ -6110,13 +6220,13 @@ 
Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { case bitc::FS_TYPE_TEST_ASSUME_VCALLS: assert(PendingTypeTestAssumeVCalls.empty()); for (unsigned I = 0; I != Record.size(); I += 2) - PendingTypeTestAssumeVCalls.push_back({Record[I], Record[I+1]}); + PendingTypeTestAssumeVCalls.push_back({Record[I], Record[I + 1]}); break; case bitc::FS_TYPE_CHECKED_LOAD_VCALLS: assert(PendingTypeCheckedLoadVCalls.empty()); for (unsigned I = 0; I != Record.size(); I += 2) - PendingTypeCheckedLoadVCalls.push_back({Record[I], Record[I+1]}); + PendingTypeCheckedLoadVCalls.push_back({Record[I], Record[I + 1]}); break; case bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL: @@ -6231,9 +6341,7 @@ namespace { // will be removed once this transition is complete. Clients should prefer to // deal with the Error value directly, rather than converting to error_code. class BitcodeErrorCategoryType : public std::error_category { - const char *name() const noexcept override { - return "llvm.bitcode"; - } + const char *name() const noexcept override { return "llvm.bitcode"; } std::string message(int IE) const override { BitcodeError E = static_cast<BitcodeError>(IE); diff --git a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp index 5bd970432159c6bc7eee1cb2061e2891f07ddc7b..55e7415efbea2b37d85f20b1d123ce9a80efe67e 100644 --- a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -24,9 +24,9 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/Bitstream/BitCodes.h" #include "llvm/Bitstream/BitstreamWriter.h" -#include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -339,8 +339,8 @@ private: unsigned Abbrev); void writeDILocalVariable(const DILocalVariable *N, 
SmallVectorImpl<uint64_t> &Record, unsigned Abbrev); - void writeDILabel(const DILabel *N, - SmallVectorImpl<uint64_t> &Record, unsigned Abbrev); + void writeDILabel(const DILabel *N, SmallVectorImpl<uint64_t> &Record, + unsigned Abbrev); void writeDIExpression(const DIExpression *N, SmallVectorImpl<uint64_t> &Record, unsigned Abbrev); void writeDIGlobalVariableExpression(const DIGlobalVariableExpression *N, @@ -391,9 +391,7 @@ private: void writeBlockInfo(); void writeModuleHash(size_t BlockStartPos); - unsigned getEncodedSyncScopeID(SyncScope::ID SSID) { - return unsigned(SSID); - } + unsigned getEncodedSyncScopeID(SyncScope::ID SSID) { return unsigned(SSID); } }; /// Class to manage the bitcode writing for a combined index. @@ -426,9 +424,8 @@ public: // in writing out the call graph edges. Save the mapping from GUID // to the new global value id to use when writing those edges, which // are currently saved in the index in terms of GUID. - forEachSummary([&](GVInfo I, bool) { - GUIDToValueIdMap[I.first] = ++GlobalValueId; - }); + forEachSummary( + [&](GVInfo I, bool) { GUIDToValueIdMap[I.first] = ++GlobalValueId; }); } /// The below iterator returns the GUID and associated summary. @@ -437,8 +434,7 @@ public: /// Calls the callback for each value GUID and summary to be written to /// bitcode. This hides the details of whether they are being pulled from the /// entire index or just those in a provided ModuleToSummariesForIndex map. 
- template<typename Functor> - void forEachSummary(Functor Callback) { + template <typename Functor> void forEachSummary(Functor Callback) { if (ModuleToSummariesForIndex) { for (auto &M : *ModuleToSummariesForIndex) for (auto &Summary : M.second) { @@ -500,82 +496,133 @@ private: static unsigned getEncodedCastOpcode(unsigned Opcode) { switch (Opcode) { - default: llvm_unreachable("Unknown cast instruction!"); - case Instruction::Trunc : return bitc::CAST_TRUNC; - case Instruction::ZExt : return bitc::CAST_ZEXT; - case Instruction::SExt : return bitc::CAST_SEXT; - case Instruction::FPToUI : return bitc::CAST_FPTOUI; - case Instruction::FPToSI : return bitc::CAST_FPTOSI; - case Instruction::UIToFP : return bitc::CAST_UITOFP; - case Instruction::SIToFP : return bitc::CAST_SITOFP; - case Instruction::FPTrunc : return bitc::CAST_FPTRUNC; - case Instruction::FPExt : return bitc::CAST_FPEXT; - case Instruction::PtrToInt: return bitc::CAST_PTRTOINT; - case Instruction::IntToPtr: return bitc::CAST_INTTOPTR; - case Instruction::BitCast : return bitc::CAST_BITCAST; - case Instruction::AddrSpaceCast: return bitc::CAST_ADDRSPACECAST; + default: + llvm_unreachable("Unknown cast instruction!"); + case Instruction::Trunc: + return bitc::CAST_TRUNC; + case Instruction::ZExt: + return bitc::CAST_ZEXT; + case Instruction::SExt: + return bitc::CAST_SEXT; + case Instruction::FPToUI: + return bitc::CAST_FPTOUI; + case Instruction::FPToSI: + return bitc::CAST_FPTOSI; + case Instruction::UIToFP: + return bitc::CAST_UITOFP; + case Instruction::SIToFP: + return bitc::CAST_SITOFP; + case Instruction::FPTrunc: + return bitc::CAST_FPTRUNC; + case Instruction::FPExt: + return bitc::CAST_FPEXT; + case Instruction::PtrToInt: + return bitc::CAST_PTRTOINT; + case Instruction::IntToPtr: + return bitc::CAST_INTTOPTR; + case Instruction::BitCast: + return bitc::CAST_BITCAST; + case Instruction::AddrSpaceCast: + return bitc::CAST_ADDRSPACECAST; } } static unsigned getEncodedUnaryOpcode(unsigned 
Opcode) { switch (Opcode) { - default: llvm_unreachable("Unknown binary instruction!"); - case Instruction::FNeg: return bitc::UNOP_NEG; + default: + llvm_unreachable("Unknown binary instruction!"); + case Instruction::FNeg: + return bitc::UNOP_NEG; } } static unsigned getEncodedBinaryOpcode(unsigned Opcode) { switch (Opcode) { - default: llvm_unreachable("Unknown binary instruction!"); + default: + llvm_unreachable("Unknown binary instruction!"); case Instruction::Add: - case Instruction::FAdd: return bitc::BINOP_ADD; + case Instruction::FAdd: + return bitc::BINOP_ADD; case Instruction::Sub: - case Instruction::FSub: return bitc::BINOP_SUB; + case Instruction::FSub: + return bitc::BINOP_SUB; case Instruction::Mul: - case Instruction::FMul: return bitc::BINOP_MUL; - case Instruction::UDiv: return bitc::BINOP_UDIV; + case Instruction::FMul: + return bitc::BINOP_MUL; + case Instruction::UDiv: + return bitc::BINOP_UDIV; case Instruction::FDiv: - case Instruction::SDiv: return bitc::BINOP_SDIV; - case Instruction::URem: return bitc::BINOP_UREM; + case Instruction::SDiv: + return bitc::BINOP_SDIV; + case Instruction::URem: + return bitc::BINOP_UREM; case Instruction::FRem: - case Instruction::SRem: return bitc::BINOP_SREM; - case Instruction::Shl: return bitc::BINOP_SHL; - case Instruction::LShr: return bitc::BINOP_LSHR; - case Instruction::AShr: return bitc::BINOP_ASHR; - case Instruction::And: return bitc::BINOP_AND; - case Instruction::Or: return bitc::BINOP_OR; - case Instruction::Xor: return bitc::BINOP_XOR; + case Instruction::SRem: + return bitc::BINOP_SREM; + case Instruction::Shl: + return bitc::BINOP_SHL; + case Instruction::LShr: + return bitc::BINOP_LSHR; + case Instruction::AShr: + return bitc::BINOP_ASHR; + case Instruction::And: + return bitc::BINOP_AND; + case Instruction::Or: + return bitc::BINOP_OR; + case Instruction::Xor: + return bitc::BINOP_XOR; } } static unsigned getEncodedRMWOperation(AtomicRMWInst::BinOp Op) { switch (Op) { - default: 
llvm_unreachable("Unknown RMW operation!"); - case AtomicRMWInst::Xchg: return bitc::RMW_XCHG; - case AtomicRMWInst::Add: return bitc::RMW_ADD; - case AtomicRMWInst::Sub: return bitc::RMW_SUB; - case AtomicRMWInst::And: return bitc::RMW_AND; - case AtomicRMWInst::Nand: return bitc::RMW_NAND; - case AtomicRMWInst::Or: return bitc::RMW_OR; - case AtomicRMWInst::Xor: return bitc::RMW_XOR; - case AtomicRMWInst::Max: return bitc::RMW_MAX; - case AtomicRMWInst::Min: return bitc::RMW_MIN; - case AtomicRMWInst::UMax: return bitc::RMW_UMAX; - case AtomicRMWInst::UMin: return bitc::RMW_UMIN; - case AtomicRMWInst::FAdd: return bitc::RMW_FADD; - case AtomicRMWInst::FSub: return bitc::RMW_FSUB; + default: + llvm_unreachable("Unknown RMW operation!"); + case AtomicRMWInst::Xchg: + return bitc::RMW_XCHG; + case AtomicRMWInst::Add: + return bitc::RMW_ADD; + case AtomicRMWInst::Sub: + return bitc::RMW_SUB; + case AtomicRMWInst::And: + return bitc::RMW_AND; + case AtomicRMWInst::Nand: + return bitc::RMW_NAND; + case AtomicRMWInst::Or: + return bitc::RMW_OR; + case AtomicRMWInst::Xor: + return bitc::RMW_XOR; + case AtomicRMWInst::Max: + return bitc::RMW_MAX; + case AtomicRMWInst::Min: + return bitc::RMW_MIN; + case AtomicRMWInst::UMax: + return bitc::RMW_UMAX; + case AtomicRMWInst::UMin: + return bitc::RMW_UMIN; + case AtomicRMWInst::FAdd: + return bitc::RMW_FADD; + case AtomicRMWInst::FSub: + return bitc::RMW_FSUB; } } static unsigned getEncodedOrdering(AtomicOrdering Ordering) { switch (Ordering) { - case AtomicOrdering::NotAtomic: return bitc::ORDERING_NOTATOMIC; - case AtomicOrdering::Unordered: return bitc::ORDERING_UNORDERED; - case AtomicOrdering::Monotonic: return bitc::ORDERING_MONOTONIC; - case AtomicOrdering::Acquire: return bitc::ORDERING_ACQUIRE; - case AtomicOrdering::Release: return bitc::ORDERING_RELEASE; - case AtomicOrdering::AcquireRelease: return bitc::ORDERING_ACQREL; - case AtomicOrdering::SequentiallyConsistent: return bitc::ORDERING_SEQCST; + case 
AtomicOrdering::NotAtomic: + return bitc::ORDERING_NOTATOMIC; + case AtomicOrdering::Unordered: + return bitc::ORDERING_UNORDERED; + case AtomicOrdering::Monotonic: + return bitc::ORDERING_MONOTONIC; + case AtomicOrdering::Acquire: + return bitc::ORDERING_ACQUIRE; + case AtomicOrdering::Release: + return bitc::ORDERING_RELEASE; + case AtomicOrdering::AcquireRelease: + return bitc::ORDERING_ACQREL; + case AtomicOrdering::SequentiallyConsistent: + return bitc::ORDERING_SEQCST; } llvm_unreachable("Invalid ordering"); } @@ -746,7 +793,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { void ModuleBitcodeWriter::writeAttributeGroupTable() { const std::vector<ValueEnumerator::IndexAndAttrSet> &AttrGrps = VE.getAttributeGroups(); - if (AttrGrps.empty()) return; + if (AttrGrps.empty()) + return; Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3); @@ -795,7 +843,8 @@ void ModuleBitcodeWriter::writeAttributeGroupTable() { void ModuleBitcodeWriter::writeAttributeTable() { const std::vector<AttributeList> &Attrs = VE.getAttributeLists(); - if (Attrs.empty()) return; + if (Attrs.empty()) + return; Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3); @@ -828,13 +877,13 @@ void ModuleBitcodeWriter::writeTypeTable() { auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0 + Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0 unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_FUNCTION. 
Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); unsigned FunctionAbbrev = Stream.EmitAbbrev(std::move(Abbv)); @@ -842,7 +891,7 @@ void ModuleBitcodeWriter::writeTypeTable() { // Abbrev for TYPE_CODE_STRUCT_ANON. Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); unsigned StructAnonAbbrev = Stream.EmitAbbrev(std::move(Abbv)); @@ -857,7 +906,7 @@ void ModuleBitcodeWriter::writeTypeTable() { // Abbrev for TYPE_CODE_STRUCT_NAMED. Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); unsigned StructNamedAbbrev = Stream.EmitAbbrev(std::move(Abbv)); @@ -865,7 +914,7 @@ void ModuleBitcodeWriter::writeTypeTable() { // Abbrev for TYPE_CODE_ARRAY. 
Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); unsigned ArrayAbbrev = Stream.EmitAbbrev(std::move(Abbv)); @@ -881,17 +930,39 @@ void ModuleBitcodeWriter::writeTypeTable() { unsigned Code = 0; switch (T->getTypeID()) { - case Type::VoidTyID: Code = bitc::TYPE_CODE_VOID; break; - case Type::HalfTyID: Code = bitc::TYPE_CODE_HALF; break; - case Type::FloatTyID: Code = bitc::TYPE_CODE_FLOAT; break; - case Type::DoubleTyID: Code = bitc::TYPE_CODE_DOUBLE; break; - case Type::X86_FP80TyID: Code = bitc::TYPE_CODE_X86_FP80; break; - case Type::FP128TyID: Code = bitc::TYPE_CODE_FP128; break; - case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break; - case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break; - case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; - case Type::X86_MMXTyID: Code = bitc::TYPE_CODE_X86_MMX; break; - case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break; + case Type::VoidTyID: + Code = bitc::TYPE_CODE_VOID; + break; + case Type::HalfTyID: + Code = bitc::TYPE_CODE_HALF; + break; + case Type::FloatTyID: + Code = bitc::TYPE_CODE_FLOAT; + break; + case Type::DoubleTyID: + Code = bitc::TYPE_CODE_DOUBLE; + break; + case Type::X86_FP80TyID: + Code = bitc::TYPE_CODE_X86_FP80; + break; + case Type::FP128TyID: + Code = bitc::TYPE_CODE_FP128; + break; + case Type::PPC_FP128TyID: + Code = bitc::TYPE_CODE_PPC_FP128; + break; + case Type::LabelTyID: + Code = bitc::TYPE_CODE_LABEL; + break; + case Type::MetadataTyID: + Code = bitc::TYPE_CODE_METADATA; + break; + case Type::X86_MMXTyID: + Code = bitc::TYPE_CODE_X86_MMX; + break; + case Type::TokenTyID: + Code = bitc::TYPE_CODE_TOKEN; + break; case Type::IntegerTyID: // INTEGER: [width] Code = bitc::TYPE_CODE_INTEGER; @@ -904,7 +975,8 @@ void 
ModuleBitcodeWriter::writeTypeTable() { TypeVals.push_back(VE.getTypeID(PTy->getElementType())); unsigned AddressSpace = PTy->getAddressSpace(); TypeVals.push_back(AddressSpace); - if (AddressSpace == 0) AbbrevToUse = PtrAbbrev; + if (AddressSpace == 0) + AbbrevToUse = PtrAbbrev; break; } case Type::FunctionTyID: { @@ -924,7 +996,8 @@ void ModuleBitcodeWriter::writeTypeTable() { TypeVals.push_back(ST->isPacked()); // Output all of the element types. for (StructType::element_iterator I = ST->element_begin(), - E = ST->element_end(); I != E; ++I) + E = ST->element_end(); + I != E; ++I) TypeVals.push_back(VE.getTypeID(*I)); if (ST->isLiteral()) { @@ -1041,29 +1114,40 @@ static uint64_t getEncodedGVarFlags(GlobalVarSummary::GVarFlags Flags) { static unsigned getEncodedVisibility(const GlobalValue &GV) { switch (GV.getVisibility()) { - case GlobalValue::DefaultVisibility: return 0; - case GlobalValue::HiddenVisibility: return 1; - case GlobalValue::ProtectedVisibility: return 2; + case GlobalValue::DefaultVisibility: + return 0; + case GlobalValue::HiddenVisibility: + return 1; + case GlobalValue::ProtectedVisibility: + return 2; } llvm_unreachable("Invalid visibility"); } static unsigned getEncodedDLLStorageClass(const GlobalValue &GV) { switch (GV.getDLLStorageClass()) { - case GlobalValue::DefaultStorageClass: return 0; - case GlobalValue::DLLImportStorageClass: return 1; - case GlobalValue::DLLExportStorageClass: return 2; + case GlobalValue::DefaultStorageClass: + return 0; + case GlobalValue::DLLImportStorageClass: + return 1; + case GlobalValue::DLLExportStorageClass: + return 2; } llvm_unreachable("Invalid DLL storage class"); } static unsigned getEncodedThreadLocalMode(const GlobalValue &GV) { switch (GV.getThreadLocalMode()) { - case GlobalVariable::NotThreadLocal: return 0; - case GlobalVariable::GeneralDynamicTLSModel: return 1; - case GlobalVariable::LocalDynamicTLSModel: return 2; - case GlobalVariable::InitialExecTLSModel: return 3; - case 
GlobalVariable::LocalExecTLSModel: return 4; + case GlobalVariable::NotThreadLocal: + return 0; + case GlobalVariable::GeneralDynamicTLSModel: + return 1; + case GlobalVariable::LocalDynamicTLSModel: + return 2; + case GlobalVariable::InitialExecTLSModel: + return 3; + case GlobalVariable::LocalExecTLSModel: + return 4; } llvm_unreachable("Invalid TLS model"); } @@ -1086,9 +1170,12 @@ static unsigned getEncodedComdatSelectionKind(const Comdat &C) { static unsigned getEncodedUnnamedAddr(const GlobalValue &GV) { switch (GV.getUnnamedAddr()) { - case GlobalValue::UnnamedAddr::None: return 0; - case GlobalValue::UnnamedAddr::Local: return 2; - case GlobalValue::UnnamedAddr::Global: return 1; + case GlobalValue::UnnamedAddr::None: + return 0; + case GlobalValue::UnnamedAddr::Local: + return 2; + case GlobalValue::UnnamedAddr::Global: + return 1; } llvm_unreachable("Invalid unnamed_addr"); } @@ -1182,8 +1269,8 @@ void ModuleBitcodeWriter::writeModuleInfo() { // Give section names unique ID's. unsigned &Entry = SectionMap[GV.getSection()]; if (!Entry) { - writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME, GV.getSection(), - 0 /*TODO*/); + writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME, + GV.getSection(), 0 /*TODO*/); Entry = SectionMap.size(); } } @@ -1219,7 +1306,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - Log2_32_Ceil(MaxGlobalType+1))); + Log2_32_Ceil(MaxGlobalType + 1))); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // AddrSpace << 2 //| explicitType << 1 //| constant @@ -1228,15 +1315,15 @@ void ModuleBitcodeWriter::writeModuleInfo() { if (MaxAlignment == 0) // Alignment. 
Abbv->Add(BitCodeAbbrevOp(0)); else { - unsigned MaxEncAlignment = Log2_32(MaxAlignment)+1; + unsigned MaxEncAlignment = Log2_32(MaxAlignment) + 1; Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - Log2_32_Ceil(MaxEncAlignment+1))); + Log2_32_Ceil(MaxEncAlignment + 1))); } - if (SectionMap.empty()) // Section. + if (SectionMap.empty()) // Section. Abbv->Add(BitCodeAbbrevOp(0)); else Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - Log2_32_Ceil(SectionMap.size()+1))); + Log2_32_Ceil(SectionMap.size() + 1))); // Don't bother emitting vis + thread local. SimpleGVarAbbrev = Stream.EmitAbbrev(std::move(Abbv)); } @@ -1278,19 +1365,17 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(GV.getName().size()); Vals.push_back(VE.getTypeID(GV.getValueType())); Vals.push_back(GV.getType()->getAddressSpace() << 2 | 2 | GV.isConstant()); - Vals.push_back(GV.isDeclaration() ? 0 : - (VE.getValueID(GV.getInitializer()) + 1)); + Vals.push_back( + GV.isDeclaration() ? 0 : (VE.getValueID(GV.getInitializer()) + 1)); Vals.push_back(getEncodedLinkage(GV)); - Vals.push_back(Log2_32(GV.getAlignment())+1); + Vals.push_back(Log2_32(GV.getAlignment()) + 1); Vals.push_back(GV.hasSection() ? 
SectionMap[GV.getSection()] : 0); if (GV.isThreadLocal() || GV.getVisibility() != GlobalValue::DefaultVisibility || GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None || GV.isExternallyInitialized() || GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass || - GV.hasComdat() || - GV.hasAttributes() || - GV.isDSOLocal() || + GV.hasComdat() || GV.hasAttributes() || GV.isDSOLocal() || GV.hasPartition()) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); @@ -1326,13 +1411,13 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(F.isDeclaration()); Vals.push_back(getEncodedLinkage(F)); Vals.push_back(VE.getAttributeListID(F.getAttributes())); - Vals.push_back(Log2_32(F.getAlignment())+1); + Vals.push_back(Log2_32(F.getAlignment()) + 1); Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0); Vals.push_back(getEncodedVisibility(F)); Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0); Vals.push_back(getEncodedUnnamedAddr(F)); - Vals.push_back(F.hasPrologueData() ? (VE.getValueID(F.getPrologueData()) + 1) - : 0); + Vals.push_back( + F.hasPrologueData() ? (VE.getValueID(F.getPrologueData()) + 1) : 0); Vals.push_back(getEncodedDLLStorageClass(F)); Vals.push_back(F.hasComdat() ? VE.getComdatID(F.getComdat()) : 0); Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1) @@ -1864,9 +1949,9 @@ void ModuleBitcodeWriter::writeDILocalVariable( Record.clear(); } -void ModuleBitcodeWriter::writeDILabel( - const DILabel *N, SmallVectorImpl<uint64_t> &Record, - unsigned Abbrev) { +void ModuleBitcodeWriter::writeDILabel(const DILabel *N, + SmallVectorImpl<uint64_t> &Record, + unsigned Abbrev) { Record.push_back((uint64_t)N->isDistinct()); Record.push_back(VE.getMetadataOrNullID(N->getScope())); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); @@ -2016,7 +2101,7 @@ void ModuleBitcodeWriter::writeMetadataRecords( if (MDs.empty()) return; - // Initialize MDNode abbreviations. 
+ // Initialize MDNode abbreviations. #define HANDLE_MDNODE_LEAF(CLASS) unsigned CLASS##Abbrev = 0; #include "llvm/IR/Metadata.def" @@ -2181,7 +2266,8 @@ void ModuleBitcodeWriter::writeFunctionMetadataAttachment(const Function &F) { I.getAllMetadataOtherThanDebugLoc(MDs); // If no metadata, ignore instruction. - if (MDs.empty()) continue; + if (MDs.empty()) + continue; Record.push_back(VE.getInstructionID(&I)); @@ -2204,7 +2290,8 @@ void ModuleBitcodeWriter::writeModuleMetadataKinds() { SmallVector<StringRef, 8> Names; M.getMDKindNames(Names); - if (Names.empty()) return; + if (Names.empty()) + return; Stream.EnterSubblock(bitc::METADATA_KIND_BLOCK_ID, 3); @@ -2274,7 +2361,8 @@ static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) { void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, bool isGlobal) { - if (FirstVal == LastVal) return; + if (FirstVal == LastVal) + return; Stream.EnterSubblock(bitc::CONSTANTS_BLOCK_ID, 4); @@ -2288,7 +2376,8 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal+1))); + Abbv->Add( + BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal + 1))); AggregateAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_STRING. @@ -2329,7 +2418,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) { Record.push_back(unsigned(IA->hasSideEffects()) | unsigned(IA->isAlignStack()) << 1 | - unsigned(IA->getDialect()&1) << 2); + unsigned(IA->getDialect() & 1) << 2); // Add the asm string. 
const std::string &AsmStr = IA->getAsmString(); @@ -2357,7 +2446,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, emitSignedInt64(Record, V); Code = bitc::CST_CODE_INTEGER; AbbrevToUse = CONSTANTS_INTEGER_ABBREV; - } else { // Wide integers, > 64 bits in size. + } else { // Wide integers, > 64 bits in size. // We have an arbitrary precision integer value to write whose // bit width is > 64. However, in canonical unsigned integer // format it is likely that the high bits are going to be zero. @@ -2397,7 +2486,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, // If this is a null-terminated string, use the denser CSTRING encoding. if (Str->isCString()) { Code = bitc::CST_CODE_CSTRING; - --NumElts; // Don't encode the null, which isn't allowed by char6. + --NumElts; // Don't encode the null, which isn't allowed by char6. } else { Code = bitc::CST_CODE_STRING; AbbrevToUse = String8Abbrev; @@ -2417,7 +2506,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, else if (isCStr7) AbbrevToUse = CString7Abbrev; } else if (const ConstantDataSequential *CDS = - dyn_cast<ConstantDataSequential>(C)) { + dyn_cast<ConstantDataSequential>(C)) { Code = bitc::CST_CODE_DATA; Type *EltTy = CDS->getType()->getElementType(); if (isa<IntegerType>(EltTy)) { @@ -2713,45 +2802,39 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, break; } - case Instruction::Ret: - { - Code = bitc::FUNC_CODE_INST_RET; - unsigned NumOperands = I.getNumOperands(); - if (NumOperands == 0) - AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV; - else if (NumOperands == 1) { - if (!pushValueAndType(I.getOperand(0), InstID, Vals)) - AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV; - } else { - for (unsigned i = 0, e = NumOperands; i != e; ++i) - pushValueAndType(I.getOperand(i), InstID, Vals); - } + case Instruction::Ret: { + Code = bitc::FUNC_CODE_INST_RET; + unsigned NumOperands = I.getNumOperands(); + if 
(NumOperands == 0) + AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV; + else if (NumOperands == 1) { + if (!pushValueAndType(I.getOperand(0), InstID, Vals)) + AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV; + } else { + for (unsigned i = 0, e = NumOperands; i != e; ++i) + pushValueAndType(I.getOperand(i), InstID, Vals); } - break; - case Instruction::Br: - { - Code = bitc::FUNC_CODE_INST_BR; - const BranchInst &II = cast<BranchInst>(I); - Vals.push_back(VE.getValueID(II.getSuccessor(0))); - if (II.isConditional()) { - Vals.push_back(VE.getValueID(II.getSuccessor(1))); - pushValue(II.getCondition(), InstID, Vals); - } + } break; + case Instruction::Br: { + Code = bitc::FUNC_CODE_INST_BR; + const BranchInst &II = cast<BranchInst>(I); + Vals.push_back(VE.getValueID(II.getSuccessor(0))); + if (II.isConditional()) { + Vals.push_back(VE.getValueID(II.getSuccessor(1))); + pushValue(II.getCondition(), InstID, Vals); } - break; - case Instruction::Switch: - { - Code = bitc::FUNC_CODE_INST_SWITCH; - const SwitchInst &SI = cast<SwitchInst>(I); - Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); - pushValue(SI.getCondition(), InstID, Vals); - Vals.push_back(VE.getValueID(SI.getDefaultDest())); - for (auto Case : SI.cases()) { - Vals.push_back(VE.getValueID(Case.getCaseValue())); - Vals.push_back(VE.getValueID(Case.getCaseSuccessor())); - } + } break; + case Instruction::Switch: { + Code = bitc::FUNC_CODE_INST_SWITCH; + const SwitchInst &SI = cast<SwitchInst>(I); + Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); + pushValue(SI.getCondition(), InstID, Vals); + Vals.push_back(VE.getValueID(SI.getDefaultDest())); + for (auto Case : SI.cases()) { + Vals.push_back(VE.getValueID(Case.getCaseValue())); + Vals.push_back(VE.getValueID(Case.getCaseSuccessor())); } - break; + } break; case Instruction::IndirectBr: Code = bitc::FUNC_CODE_INST_INDIRECTBR; Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); @@ -2938,7 +3021,7 @@ void 
ModuleBitcodeWriter::writeInstruction(const Instruction &I, AbbrevToUse = FUNCTION_INST_LOAD_ABBREV; } Vals.push_back(VE.getTypeID(I.getType())); - Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1); + Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment()) + 1); Vals.push_back(cast<LoadInst>(I).isVolatile()); if (cast<LoadInst>(I).isAtomic()) { Vals.push_back(getEncodedOrdering(cast<LoadInst>(I).getOrdering())); @@ -2952,7 +3035,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_STORE; pushValueAndType(I.getOperand(1), InstID, Vals); // ptrty + ptr pushValueAndType(I.getOperand(0), InstID, Vals); // valty + val - Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1); + Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment()) + 1); Vals.push_back(cast<StoreInst>(I).isVolatile()); if (cast<StoreInst>(I).isAtomic()) { Vals.push_back(getEncodedOrdering(cast<StoreInst>(I).getOrdering())); @@ -3025,17 +3108,17 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, // Emit type/value pairs for varargs params. if (FTy->isVarArg()) { - for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands(); - i != e; ++i) + for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands(); i != e; + ++i) pushValueAndType(CI.getArgOperand(i), InstID, Vals); // varargs } break; } case Instruction::VAArg: Code = bitc::FUNC_CODE_INST_VAARG; - Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); // valistty - pushValue(I.getOperand(0), InstID, Vals); // valist. - Vals.push_back(VE.getTypeID(I.getType())); // restype. + Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); // valistty + pushValue(I.getOperand(0), InstID, Vals); // valist. + Vals.push_back(VE.getTypeID(I.getType())); // restype. break; } @@ -3046,7 +3129,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, /// Write a GlobalValue VST to the module. 
The purpose of this data structure is /// to allow clients to efficiently find the function body. void ModuleBitcodeWriter::writeGlobalValueSymbolTable( - DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex) { + DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex) { // Get the offset of the VST we are writing, and backpatch it into // the VST forward declaration record. uint64_t VSTOffset = Stream.GetCurrentBitNo(); @@ -3201,8 +3284,8 @@ void ModuleBitcodeWriter::writeFunction( DILocation *LastDL = nullptr; // Finally, emit all the instructions, in order. for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; + ++I) { writeInstruction(*I, InstID, Vals); if (!I->getType()->isVoidTy()) @@ -3316,10 +3399,10 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // CE_CAST abbrev for CONSTANTS_BLOCK. auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid VE.computeBitsRequiredForTypeIndicies())); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != CONSTANTS_CE_CAST_Abbrev) @@ -3341,7 +3424,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty VE.computeBitsRequiredForTypeIndicies())); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align 
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_LOAD_ABBREV) @@ -3350,7 +3433,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // INST_UNOP abbrev for FUNCTION_BLOCK. auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_UNOP_ABBREV) @@ -3359,7 +3442,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // INST_UNOP_FLAGS abbrev for FUNCTION_BLOCK. auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != @@ -3369,8 +3452,8 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // INST_BINOP abbrev for FUNCTION_BLOCK. auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_BINOP_ABBREV) @@ -3379,8 +3462,8 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK. 
auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != @@ -3390,10 +3473,10 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // INST_CAST abbrev for FUNCTION_BLOCK. auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty VE.computeBitsRequiredForTypeIndicies())); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_CAST_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); @@ -3764,13 +3847,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { // Abbrev for FS_PERMODULE_PROFILE. 
auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3782,13 +3865,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_RELBF)); else Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags + 
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt // numrefs x valueid, n x (valueid [, rel_block_freq]) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3817,9 +3900,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { // Abbrev for FS_ALIAS. Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_ALIAS)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_TYPE_ID_METADATA @@ -3913,15 +3996,15 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // Abbrev for FS_COMBINED. 
auto Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3930,15 +4013,15 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // Abbrev for FS_COMBINED_PROFILE. 
Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3947,20 +4030,20 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // Abbrev for FS_COMBINED_GLOBALVAR_INIT_REFS. 
Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_GLOBALVAR_INIT_REFS)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_ALIAS. Abbv = std::make_shared<BitCodeAbbrev>(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALIAS)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // The aliases are emitted as a post-pass, and will point to the value @@ -4311,10 +4394,10 @@ static void emitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer, // number from /usr/include/mach/machine.h. It is ok to reproduce the // specific constants here because they are implicitly part of the Darwin ABI. 
enum { - DARWIN_CPU_ARCH_ABI64 = 0x01000000, - DARWIN_CPU_TYPE_X86 = 7, - DARWIN_CPU_TYPE_ARM = 12, - DARWIN_CPU_TYPE_POWERPC = 18 + DARWIN_CPU_ARCH_ABI64 = 0x01000000, + DARWIN_CPU_TYPE_X86 = 7, + DARWIN_CPU_TYPE_ARM = 12, + DARWIN_CPU_TYPE_POWERPC = 18 }; Triple::ArchType Arch = TT.getArch(); @@ -4463,7 +4546,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out, const ModuleSummaryIndex *Index, bool GenerateHash, ModuleHash *ModHash) { SmallVector<char, 0> Buffer; - Buffer.reserve(256*1024); + Buffer.reserve(256 * 1024); // If this is darwin or another generic macho target, reserve space for the // header. @@ -4481,7 +4564,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out, emitDarwinBCHeaderAndTrailer(Buffer, TT); // Write the generated bitstream to "Out". - Out.write((char*)&Buffer.front(), Buffer.size()); + Out.write((char *)&Buffer.front(), Buffer.size()); } void IndexBitcodeWriter::write() { diff --git a/hpvm/llvm_patches/lib/IR/Attributes.cpp b/hpvm/llvm_patches/lib/IR/Attributes.cpp index 264d0fe498b7f424686017d12759bf1d7d0ae2f6..3cc95b3102fdf6c7062fffe1f9486cfa094bba9b 100644 --- a/hpvm/llvm_patches/lib/IR/Attributes.cpp +++ b/hpvm/llvm_patches/lib/IR/Attributes.cpp @@ -82,7 +82,8 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, LLVMContextImpl *pImpl = Context.pImpl; FoldingSetNodeID ID; ID.AddInteger(Kind); - if (Val) ID.AddInteger(Val); + if (Val) + ID.AddInteger(Val); void *InsertPoint; AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint); @@ -105,7 +106,8 @@ Attribute Attribute::get(LLVMContext &Context, StringRef Kind, StringRef Val) { LLVMContextImpl *pImpl = Context.pImpl; FoldingSetNodeID ID; ID.AddString(Kind); - if (!Val.empty()) ID.AddString(Val); + if (!Val.empty()) + ID.AddString(Val); void *InsertPoint; AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint); @@ -156,7 +158,7 @@ Attribute Attribute::getWithStackAlignment(LLVMContext &Context, } 
Attribute Attribute::getWithDereferenceableBytes(LLVMContext &Context, - uint64_t Bytes) { + uint64_t Bytes) { assert(Bytes && "Bytes must be non-zero."); return get(Context, Dereferenceable, Bytes); } @@ -200,47 +202,52 @@ bool Attribute::isTypeAttribute() const { } Attribute::AttrKind Attribute::getKindAsEnum() const { - if (!pImpl) return None; + if (!pImpl) + return None; assert((isEnumAttribute() || isIntAttribute() || isTypeAttribute()) && "Invalid attribute type to get the kind as an enum!"); return pImpl->getKindAsEnum(); } uint64_t Attribute::getValueAsInt() const { - if (!pImpl) return 0; + if (!pImpl) + return 0; assert(isIntAttribute() && "Expected the attribute to be an integer attribute!"); return pImpl->getValueAsInt(); } StringRef Attribute::getKindAsString() const { - if (!pImpl) return {}; + if (!pImpl) + return {}; assert(isStringAttribute() && "Invalid attribute type to get the kind as a string!"); return pImpl->getKindAsString(); } StringRef Attribute::getValueAsString() const { - if (!pImpl) return {}; + if (!pImpl) + return {}; assert(isStringAttribute() && "Invalid attribute type to get the value as a string!"); return pImpl->getValueAsString(); } Type *Attribute::getValueAsType() const { - if (!pImpl) return {}; + if (!pImpl) + return {}; assert(isTypeAttribute() && "Invalid attribute type to get the value as a type!"); return pImpl->getValueAsType(); } - bool Attribute::hasAttribute(AttrKind Kind) const { return (pImpl && pImpl->hasAttribute(Kind)) || (!pImpl && Kind == None); } bool Attribute::hasAttribute(StringRef Kind) const { - if (!isStringAttribute()) return false; + if (!isStringAttribute()) + return false; return pImpl && pImpl->hasAttribute(Kind); } @@ -277,7 +284,8 @@ std::pair<unsigned, Optional<unsigned>> Attribute::getAllocSizeArgs() const { } std::string Attribute::getAsString(bool InAttrGrp) const { - if (!pImpl) return {}; + if (!pImpl) + return {}; if (hasAttribute(Attribute::SanitizeAddress)) return "sanitize_address"; 
@@ -478,7 +486,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const { Result += (Twine('"') + getKindAsString() + Twine('"')).str(); std::string AttrVal = pImpl->getValueAsString(); - if (AttrVal.empty()) return Result; + if (AttrVal.empty()) + return Result; // Since some attribute strings contain special characters that cannot be // printable, those have to be escaped to make the attribute value printable @@ -496,9 +505,12 @@ std::string Attribute::getAsString(bool InAttrGrp) const { } bool Attribute::operator<(Attribute A) const { - if (!pImpl && !A.pImpl) return false; - if (!pImpl) return true; - if (!A.pImpl) return false; + if (!pImpl && !A.pImpl) + return false; + if (!pImpl) + return true; + if (!A.pImpl) + return false; return *pImpl < *A.pImpl; } @@ -518,12 +530,14 @@ void StringAttributeImpl::anchor() {} void TypeAttributeImpl::anchor() {} bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const { - if (isStringAttribute()) return false; + if (isStringAttribute()) + return false; return getKindAsEnum() == A; } bool AttributeImpl::hasAttribute(StringRef Kind) const { - if (!isStringAttribute()) return false; + if (!isStringAttribute()) + return false; return getKindAsString() == Kind; } @@ -556,38 +570,51 @@ bool AttributeImpl::operator<(const AttributeImpl &AI) const { // This sorts the attributes with Attribute::AttrKinds coming first (sorted // relative to their enum value) and then strings. 
if (isEnumAttribute()) { - if (AI.isEnumAttribute()) return getKindAsEnum() < AI.getKindAsEnum(); - if (AI.isIntAttribute()) return true; - if (AI.isStringAttribute()) return true; - if (AI.isTypeAttribute()) return true; + if (AI.isEnumAttribute()) + return getKindAsEnum() < AI.getKindAsEnum(); + if (AI.isIntAttribute()) + return true; + if (AI.isStringAttribute()) + return true; + if (AI.isTypeAttribute()) + return true; } if (isTypeAttribute()) { - if (AI.isEnumAttribute()) return false; + if (AI.isEnumAttribute()) + return false; if (AI.isTypeAttribute()) { assert(getKindAsEnum() != AI.getKindAsEnum() && "Comparison of types would be unstable"); return getKindAsEnum() < AI.getKindAsEnum(); } - if (AI.isIntAttribute()) return true; - if (AI.isStringAttribute()) return true; + if (AI.isIntAttribute()) + return true; + if (AI.isStringAttribute()) + return true; } if (isIntAttribute()) { - if (AI.isEnumAttribute()) return false; - if (AI.isTypeAttribute()) return false; + if (AI.isEnumAttribute()) + return false; + if (AI.isTypeAttribute()) + return false; if (AI.isIntAttribute()) { if (getKindAsEnum() == AI.getKindAsEnum()) return getValueAsInt() < AI.getValueAsInt(); return getKindAsEnum() < AI.getKindAsEnum(); } - if (AI.isStringAttribute()) return true; + if (AI.isStringAttribute()) + return true; } assert(isStringAttribute()); - if (AI.isEnumAttribute()) return false; - if (AI.isTypeAttribute()) return false; - if (AI.isIntAttribute()) return false; + if (AI.isEnumAttribute()) + return false; + if (AI.isTypeAttribute()) + return false; + if (AI.isIntAttribute()) + return false; if (getKindAsString() == AI.getKindAsString()) return getValueAsString() < AI.getValueAsString(); return getKindAsString() < AI.getKindAsString(); @@ -607,7 +634,8 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) { AttributeSet AttributeSet::addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const { - if (hasAttribute(Kind)) return *this; + if 
(hasAttribute(Kind)) + return *this; AttrBuilder B; B.addAttribute(Kind); return addAttributes(C, AttributeSet::get(C, B)); @@ -632,27 +660,29 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C, for (const auto I : *this) B.addAttribute(I); - return get(C, B); + return get(C, B); } AttributeSet AttributeSet::removeAttribute(LLVMContext &C, - Attribute::AttrKind Kind) const { - if (!hasAttribute(Kind)) return *this; + Attribute::AttrKind Kind) const { + if (!hasAttribute(Kind)) + return *this; AttrBuilder B(*this); B.removeAttribute(Kind); return get(C, B); } AttributeSet AttributeSet::removeAttribute(LLVMContext &C, - StringRef Kind) const { - if (!hasAttribute(Kind)) return *this; + StringRef Kind) const { + if (!hasAttribute(Kind)) + return *this; AttrBuilder B(*this); B.removeAttribute(Kind); return get(C, B); } AttributeSet AttributeSet::removeAttributes(LLVMContext &C, - const AttrBuilder &Attrs) const { + const AttrBuilder &Attrs) const { AttrBuilder B(*this); B.remove(Attrs); return get(C, B); @@ -718,8 +748,8 @@ AttributeSet::iterator AttributeSet::end() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void AttributeSet::dump() const { dbgs() << "AS =\n"; - dbgs() << " { "; - dbgs() << getAsString(true) << " }\n"; + dbgs() << " { "; + dbgs() << getAsString(true) << " }\n"; } #endif @@ -732,8 +762,7 @@ AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs) // There's memory after the node where we can store the entries in. 
llvm::copy(Attrs, getTrailingObjects<Attribute>()); - static_assert(Attribute::EndAttrKinds <= - sizeof(AvailableAttrs) * CHAR_BIT, + static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, "Too many attributes"); for (const auto I : *this) { @@ -761,7 +790,7 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C, void *InsertPoint; AttributeSetNode *PA = - pImpl->AttrsSetNodes.FindNodeOrInsertPos(ID, InsertPoint); + pImpl->AttrsSetNodes.FindNodeOrInsertPos(ID, InsertPoint); // If we didn't find any existing attributes of the same shape then create a // new one and insert it. @@ -988,7 +1017,8 @@ AttributeList::get(LLVMContext &C, [](const std::pair<unsigned, Attribute> &LHS, const std::pair<unsigned, Attribute> &RHS) { return LHS.first < RHS.first; - }) && "Misordered Attributes list!"); + }) && + "Misordered Attributes list!"); assert(llvm::none_of(Attrs, [](const std::pair<unsigned, Attribute> &Pair) { return Pair.second.hasAttribute(Attribute::None); @@ -999,7 +1029,8 @@ AttributeList::get(LLVMContext &C, // list. 
SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrPairVec; for (ArrayRef<std::pair<unsigned, Attribute>>::iterator I = Attrs.begin(), - E = Attrs.end(); I != E; ) { + E = Attrs.end(); + I != E;) { unsigned Index = I->first; SmallVector<Attribute, 4> AttrVec; while (I != E && I->first == Index) { @@ -1140,7 +1171,8 @@ AttributeList AttributeList::get(LLVMContext &C, AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const { - if (hasAttribute(Index, Kind)) return *this; + if (hasAttribute(Index, Kind)) + return *this; AttrBuilder B; B.addAttribute(Kind); return addAttributes(C, Index, B); @@ -1212,7 +1244,8 @@ AttributeList AttributeList::addParamAttribute(LLVMContext &C, AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const { - if (!hasAttribute(Index, Kind)) return *this; + if (!hasAttribute(Index, Kind)) + return *this; Index = attrIdxToArrayIdx(Index); SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end()); @@ -1225,7 +1258,8 @@ AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, StringRef Kind) const { - if (!hasAttribute(Index, Kind)) return *this; + if (!hasAttribute(Index, Kind)) + return *this; Index = attrIdxToArrayIdx(Index); SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end()); @@ -1335,7 +1369,8 @@ bool AttributeList::hasParamAttribute(unsigned ArgNo, bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr, unsigned *Index) const { - if (!pImpl) return false; + if (!pImpl) + return false; for (unsigned I = index_begin(), E = index_end(); I != E; ++I) { if (hasAttribute(I, Attr)) { @@ -1366,10 +1401,9 @@ unsigned AttributeList::getParamAlignment(unsigned ArgNo) const { } Type *AttributeList::getParamByValType(unsigned Index) const { - return getAttributes(Index+FirstArgIndex).getByValType(); + return 
getAttributes(Index + FirstArgIndex).getByValType(); } - unsigned AttributeList::getStackAlignment(unsigned Index) const { return getAttributes(Index).getStackAlignment(); } @@ -1526,7 +1560,8 @@ std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const { } AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) { - if (Align == 0) return *this; + if (Align == 0) + return *this; assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); assert(Align <= 0x40000000 && "Alignment too large."); @@ -1538,7 +1573,8 @@ AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) { AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) { // Default alignment, allow the target to define how to align it. - if (Align == 0) return *this; + if (Align == 0) + return *this; assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); assert(Align <= 0x100 && "Alignment too large."); @@ -1549,7 +1585,8 @@ AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) { } AttrBuilder &AttrBuilder::addDereferenceableAttr(uint64_t Bytes) { - if (Bytes == 0) return *this; + if (Bytes == 0) + return *this; Attrs[Attribute::Dereferenceable] = true; DerefBytes = Bytes; @@ -1680,16 +1717,14 @@ bool AttrBuilder::hasAttributes(AttributeList AL, uint64_t Index) const { return false; } -bool AttrBuilder::hasAlignmentAttr() const { - return Alignment != 0; -} +bool AttrBuilder::hasAlignmentAttr() const { return Alignment != 0; } bool AttrBuilder::operator==(const AttrBuilder &B) { if (Attrs != B.Attrs) return false; - for (td_const_iterator I = TargetDepAttrs.begin(), - E = TargetDepAttrs.end(); I != E; ++I) + for (td_const_iterator I = TargetDepAttrs.begin(), E = TargetDepAttrs.end(); + I != E; ++I) if (B.TargetDepAttrs.find(I->first) == B.TargetDepAttrs.end()) return false; @@ -1707,27 +1742,26 @@ AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) { if (!Ty->isIntegerTy()) // Attribute that only apply to integers. 
- Incompatible.addAttribute(Attribute::SExt) - .addAttribute(Attribute::ZExt); + Incompatible.addAttribute(Attribute::SExt).addAttribute(Attribute::ZExt); if (!Ty->isPointerTy()) // Attribute that only apply to pointers. Incompatible.addAttribute(Attribute::ByVal) - .addAttribute(Attribute::Nest) - .addAttribute(Attribute::NoAlias) - .addAttribute(Attribute::NoCapture) - .addAttribute(Attribute::NonNull) - .addDereferenceableAttr(1) // the int here is ignored - .addDereferenceableOrNullAttr(1) // the int here is ignored - .addAttribute(Attribute::ReadNone) - .addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::StructRet) - .addAttribute(Attribute::InAlloca); + .addAttribute(Attribute::Nest) + .addAttribute(Attribute::NoAlias) + .addAttribute(Attribute::NoCapture) + .addAttribute(Attribute::NonNull) + .addDereferenceableAttr(1) // the int here is ignored + .addDereferenceableOrNullAttr(1) // the int here is ignored + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::StructRet) + .addAttribute(Attribute::InAlloca); return Incompatible; } -template<typename AttrClass> +template <typename AttrClass> static bool isEqual(const Function &Caller, const Function &Callee) { return Caller.getFnAttribute(AttrClass::getKind()) == Callee.getFnAttribute(AttrClass::getKind()); @@ -1738,7 +1772,7 @@ static bool isEqual(const Function &Caller, const Function &Callee) { /// /// This function sets the caller's attribute to false if the callee's attribute /// is false. -template<typename AttrClass> +template <typename AttrClass> static void setAND(Function &Caller, const Function &Callee) { if (AttrClass::isSet(Caller, AttrClass::getKind()) && !AttrClass::isSet(Callee, AttrClass::getKind())) @@ -1750,7 +1784,7 @@ static void setAND(Function &Caller, const Function &Callee) { /// /// This function sets the caller's attribute to true if the callee's attribute /// is true. 
-template<typename AttrClass> +template <typename AttrClass> static void setOR(Function &Caller, const Function &Callee) { if (!AttrClass::isSet(Caller, AttrClass::getKind()) && AttrClass::isSet(Callee, AttrClass::getKind())) @@ -1793,18 +1827,18 @@ static void adjustCallerStackProbes(Function &Caller, const Function &Callee) { /// If the inlined function defines the size of guard region /// on the stack, then ensure that the calling function defines a guard region /// that is no larger. -static void -adjustCallerStackProbeSize(Function &Caller, const Function &Callee) { +static void adjustCallerStackProbeSize(Function &Caller, + const Function &Callee) { if (Callee.hasFnAttribute("stack-probe-size")) { uint64_t CalleeStackProbeSize; Callee.getFnAttribute("stack-probe-size") - .getValueAsString() - .getAsInteger(0, CalleeStackProbeSize); + .getValueAsString() + .getAsInteger(0, CalleeStackProbeSize); if (Caller.hasFnAttribute("stack-probe-size")) { uint64_t CallerStackProbeSize; Caller.getFnAttribute("stack-probe-size") - .getValueAsString() - .getAsInteger(0, CallerStackProbeSize); + .getValueAsString() + .getAsInteger(0, CallerStackProbeSize); if (CallerStackProbeSize > CalleeStackProbeSize) { Caller.addFnAttr(Callee.getFnAttribute("stack-probe-size")); } @@ -1823,18 +1857,18 @@ adjustCallerStackProbeSize(Function &Caller, const Function &Callee) { /// to merge the attribute this way. Heuristics that would use /// min-legal-vector-width to determine inline compatibility would need to be /// handled as part of inline cost analysis. 
-static void -adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) { +static void adjustMinLegalVectorWidth(Function &Caller, + const Function &Callee) { if (Caller.hasFnAttribute("min-legal-vector-width")) { if (Callee.hasFnAttribute("min-legal-vector-width")) { uint64_t CallerVectorWidth; Caller.getFnAttribute("min-legal-vector-width") - .getValueAsString() - .getAsInteger(0, CallerVectorWidth); + .getValueAsString() + .getAsInteger(0, CallerVectorWidth); uint64_t CalleeVectorWidth; Callee.getFnAttribute("min-legal-vector-width") - .getValueAsString() - .getAsInteger(0, CalleeVectorWidth); + .getValueAsString() + .getAsInteger(0, CalleeVectorWidth); if (CallerVectorWidth < CalleeVectorWidth) Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width")); } else { @@ -1847,8 +1881,8 @@ adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) { /// If the inlined function has "null-pointer-is-valid=true" attribute, /// set this attribute in the caller post inlining. -static void -adjustNullPointerValidAttr(Function &Caller, const Function &Callee) { +static void adjustNullPointerValidAttr(Function &Caller, + const Function &Callee) { if (Callee.nullPointerIsDefined() && !Caller.nullPointerIsDefined()) { Caller.addFnAttr(Callee.getFnAttribute("null-pointer-is-valid")); } diff --git a/hpvm/projects/llvm-cbe/build/CMakeCache.txt b/hpvm/projects/llvm-cbe/build/CMakeCache.txt deleted file mode 100644 index 5d9ac640421f729cf532c6e4406548fe77085f49..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeCache.txt +++ /dev/null @@ -1,314 +0,0 @@ -# This is the CMakeCache file. -# For build in directory: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build -# It was generated by CMake: /usr/bin/cmake -# You can edit this file to change values found and used by cmake. -# If you do not want to change any of the values, simply exit the editor. 
-# If you do want to change a value, simply edit, save, and exit the editor. -# The syntax for the file is as follows: -# KEY:TYPE=VALUE -# KEY is the name of a variable in the cache. -# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!. -# VALUE is the current value for the KEY. - -######################## -# EXTERNAL cache entries -######################## - -//Path to a program. -CMAKE_AR:FILEPATH=/usr/bin/ar - -//For backwards compatibility, what version of CMake commands and -// syntax should this version of CMake try to support. -CMAKE_BACKWARDS_COMPATIBILITY:STRING=2.4 - -//Choose the type of build, options are: None(CMAKE_CXX_FLAGS or -// CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel. -CMAKE_BUILD_TYPE:STRING= - -//Enable/Disable color output during build. -CMAKE_COLOR_MAKEFILE:BOOL=ON - -//CXX compiler -CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++ - -//Flags used by the compiler during all build types. -CMAKE_CXX_FLAGS:STRING= - -//Flags used by the compiler during debug builds. -CMAKE_CXX_FLAGS_DEBUG:STRING=-g - -//Flags used by the compiler during release builds for minimum -// size. -CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG - -//Flags used by the compiler during release builds. -CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG - -//Flags used by the compiler during release builds with debug info. -CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG - -//C compiler -CMAKE_C_COMPILER:FILEPATH=/usr/bin/cc - -//Flags used by the compiler during all build types. -CMAKE_C_FLAGS:STRING= - -//Flags used by the compiler during debug builds. -CMAKE_C_FLAGS_DEBUG:STRING=-g - -//Flags used by the compiler during release builds for minimum -// size. -CMAKE_C_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG - -//Flags used by the compiler during release builds. -CMAKE_C_FLAGS_RELEASE:STRING=-O3 -DNDEBUG - -//Flags used by the compiler during release builds with debug info. 
-CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG - -//Flags used by the linker. -CMAKE_EXE_LINKER_FLAGS:STRING= - -//Flags used by the linker during debug builds. -CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING= - -//Flags used by the linker during release minsize builds. -CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING= - -//Flags used by the linker during release builds. -CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING= - -//Flags used by the linker during Release with Debug Info builds. -CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING= - -//Enable/Disable output of compile commands during generation. -CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF - -//Install path prefix, prepended onto install directories. -CMAKE_INSTALL_PREFIX:PATH=/usr/local - -//Path to a program. -CMAKE_LINKER:FILEPATH=/usr/bin/ld - -//Path to a program. -CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/make - -//Flags used by the linker during the creation of modules. -CMAKE_MODULE_LINKER_FLAGS:STRING= - -//Flags used by the linker during debug builds. -CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING= - -//Flags used by the linker during release minsize builds. -CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING= - -//Flags used by the linker during release builds. -CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING= - -//Flags used by the linker during Release with Debug Info builds. -CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING= - -//Path to a program. -CMAKE_NM:FILEPATH=/usr/bin/nm - -//Path to a program. -CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy - -//Path to a program. -CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump - -//Value Computed by CMake -CMAKE_PROJECT_NAME:STATIC=Project - -//Path to a program. -CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib - -//Flags used by the linker during the creation of dll's. -CMAKE_SHARED_LINKER_FLAGS:STRING= - -//Flags used by the linker during debug builds. -CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING= - -//Flags used by the linker during release minsize builds. 
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING= - -//Flags used by the linker during release builds. -CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING= - -//Flags used by the linker during Release with Debug Info builds. -CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING= - -//If set, runtime paths are not added when installing shared libraries, -// but are added when building. -CMAKE_SKIP_INSTALL_RPATH:BOOL=NO - -//If set, runtime paths are not added when using shared libraries. -CMAKE_SKIP_RPATH:BOOL=NO - -//Flags used by the linker during the creation of static libraries. -CMAKE_STATIC_LINKER_FLAGS:STRING= - -//Flags used by the linker during debug builds. -CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING= - -//Flags used by the linker during release minsize builds. -CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING= - -//Flags used by the linker during release builds. -CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING= - -//Flags used by the linker during Release with Debug Info builds. -CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING= - -//Path to a program. -CMAKE_STRIP:FILEPATH=/usr/bin/strip - -//If this value is on, makefiles will be generated without the -// .SILENT directive, and all commands will be echoed to the console -// during the make. This is useful for debugging only. With Visual -// Studio IDE projects all commands are done without /nologo. -CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE - -//Single output directory for building all executables. -EXECUTABLE_OUTPUT_PATH:PATH= - -//Single output directory for building all libraries. 
-LIBRARY_OUTPUT_PATH:PATH= - -//Value Computed by CMake -Project_BINARY_DIR:STATIC=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build - -//Value Computed by CMake -Project_SOURCE_DIR:STATIC=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe - - -######################## -# INTERNAL cache entries -######################## - -//ADVANCED property for variable: CMAKE_AR -CMAKE_AR-ADVANCED:INTERNAL=1 -//This is the directory where this CMakeCache.txt was created -CMAKE_CACHEFILE_DIR:INTERNAL=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build -//Major version of cmake used to create the current loaded cache -CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3 -//Minor version of cmake used to create the current loaded cache -CMAKE_CACHE_MINOR_VERSION:INTERNAL=5 -//Patch version of cmake used to create the current loaded cache -CMAKE_CACHE_PATCH_VERSION:INTERNAL=1 -//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE -CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1 -//Path to CMake executable. -CMAKE_COMMAND:INTERNAL=/usr/bin/cmake -//Path to cpack program executable. -CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack -//Path to ctest program executable. 
-CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest -//ADVANCED property for variable: CMAKE_CXX_COMPILER -CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_CXX_FLAGS -CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG -CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL -CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE -CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO -CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_C_COMPILER -CMAKE_C_COMPILER-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_C_FLAGS -CMAKE_C_FLAGS-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG -CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL -CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE -CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO -CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 -//Executable file format -CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF -//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS -CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG -CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL -CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE -CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO -CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS 
-CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1 -//Name of external makefile project generator. -CMAKE_EXTRA_GENERATOR:INTERNAL= -//Name of generator. -CMAKE_GENERATOR:INTERNAL=Unix Makefiles -//Name of generator platform. -CMAKE_GENERATOR_PLATFORM:INTERNAL= -//Name of generator toolset. -CMAKE_GENERATOR_TOOLSET:INTERNAL= -//Source directory with the top level CMakeLists.txt file for this -// project -CMAKE_HOME_DIRECTORY:INTERNAL=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe -//Install .so files without execute permission. -CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1 -//ADVANCED property for variable: CMAKE_LINKER -CMAKE_LINKER-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_MAKE_PROGRAM -CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS -CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG -CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL -CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE -CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO -CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_NM -CMAKE_NM-ADVANCED:INTERNAL=1 -//number of local generators -CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=4 -//ADVANCED property for variable: CMAKE_OBJCOPY -CMAKE_OBJCOPY-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_OBJDUMP -CMAKE_OBJDUMP-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_RANLIB -CMAKE_RANLIB-ADVANCED:INTERNAL=1 -//Path to CMake installation. 
-CMAKE_ROOT:INTERNAL=/usr/share/cmake-3.5 -//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS -CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG -CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL -CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE -CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO -CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH -CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_SKIP_RPATH -CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS -CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG -CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL -CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE -CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO -CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 -//ADVANCED property for variable: CMAKE_STRIP -CMAKE_STRIP-ADVANCED:INTERNAL=1 -//uname command -CMAKE_UNAME:INTERNAL=/bin/uname -//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE -CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1 - diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCCompiler.cmake b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCCompiler.cmake deleted file mode 100644 index f40522e627a66ddca0a1b7c75b83836d5e12e77a..0000000000000000000000000000000000000000 --- 
a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCCompiler.cmake +++ /dev/null @@ -1,67 +0,0 @@ -set(CMAKE_C_COMPILER "/usr/bin/cc") -set(CMAKE_C_COMPILER_ARG1 "") -set(CMAKE_C_COMPILER_ID "GNU") -set(CMAKE_C_COMPILER_VERSION "5.4.0") -set(CMAKE_C_COMPILER_WRAPPER "") -set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "11") -set(CMAKE_C_COMPILE_FEATURES "c_function_prototypes;c_restrict;c_variadic_macros;c_static_assert") -set(CMAKE_C90_COMPILE_FEATURES "c_function_prototypes") -set(CMAKE_C99_COMPILE_FEATURES "c_restrict;c_variadic_macros") -set(CMAKE_C11_COMPILE_FEATURES "c_static_assert") - -set(CMAKE_C_PLATFORM_ID "Linux") -set(CMAKE_C_SIMULATE_ID "") -set(CMAKE_C_SIMULATE_VERSION "") - -set(CMAKE_AR "/usr/bin/ar") -set(CMAKE_RANLIB "/usr/bin/ranlib") -set(CMAKE_LINKER "/usr/bin/ld") -set(CMAKE_COMPILER_IS_GNUCC 1) -set(CMAKE_C_COMPILER_LOADED 1) -set(CMAKE_C_COMPILER_WORKS TRUE) -set(CMAKE_C_ABI_COMPILED TRUE) -set(CMAKE_COMPILER_IS_MINGW ) -set(CMAKE_COMPILER_IS_CYGWIN ) -if(CMAKE_COMPILER_IS_CYGWIN) - set(CYGWIN 1) - set(UNIX 1) -endif() - -set(CMAKE_C_COMPILER_ENV_VAR "CC") - -if(CMAKE_COMPILER_IS_MINGW) - set(MINGW 1) -endif() -set(CMAKE_C_COMPILER_ID_RUN 1) -set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m) -set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC) -set(CMAKE_C_LINKER_PREFERENCE 10) - -# Save compiler ABI information. 
-set(CMAKE_C_SIZEOF_DATA_PTR "8") -set(CMAKE_C_COMPILER_ABI "ELF") -set(CMAKE_C_LIBRARY_ARCHITECTURE "x86_64-linux-gnu") - -if(CMAKE_C_SIZEOF_DATA_PTR) - set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}") -endif() - -if(CMAKE_C_COMPILER_ABI) - set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}") -endif() - -if(CMAKE_C_LIBRARY_ARCHITECTURE) - set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu") -endif() - -set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "") -if(CMAKE_C_CL_SHOWINCLUDES_PREFIX) - set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}") -endif() - - - - -set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "c") -set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib") -set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "") diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCXXCompiler.cmake b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCXXCompiler.cmake deleted file mode 100644 index 013ee9298fb861e7d0350d49a1fc08c0274b5e59..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCXXCompiler.cmake +++ /dev/null @@ -1,68 +0,0 @@ -set(CMAKE_CXX_COMPILER "/usr/bin/c++") -set(CMAKE_CXX_COMPILER_ARG1 "") -set(CMAKE_CXX_COMPILER_ID "GNU") -set(CMAKE_CXX_COMPILER_VERSION "5.4.0") -set(CMAKE_CXX_COMPILER_WRAPPER "") -set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "98") -set(CMAKE_CXX_COMPILE_FEATURES 
"cxx_template_template_parameters;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates") -set(CMAKE_CXX98_COMPILE_FEATURES "cxx_template_template_parameters") -set(CMAKE_CXX11_COMPILE_FEATURES 
"cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates") -set(CMAKE_CXX14_COMPILE_FEATURES "cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates") - -set(CMAKE_CXX_PLATFORM_ID "Linux") -set(CMAKE_CXX_SIMULATE_ID "") -set(CMAKE_CXX_SIMULATE_VERSION "") - -set(CMAKE_AR "/usr/bin/ar") -set(CMAKE_RANLIB "/usr/bin/ranlib") -set(CMAKE_LINKER "/usr/bin/ld") -set(CMAKE_COMPILER_IS_GNUCXX 1) -set(CMAKE_CXX_COMPILER_LOADED 1) -set(CMAKE_CXX_COMPILER_WORKS TRUE) -set(CMAKE_CXX_ABI_COMPILED TRUE) -set(CMAKE_COMPILER_IS_MINGW ) -set(CMAKE_COMPILER_IS_CYGWIN ) -if(CMAKE_COMPILER_IS_CYGWIN) - set(CYGWIN 1) - set(UNIX 1) -endif() - -set(CMAKE_CXX_COMPILER_ENV_VAR "CXX") - -if(CMAKE_COMPILER_IS_MINGW) - set(MINGW 1) -endif() -set(CMAKE_CXX_COMPILER_ID_RUN 1) -set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC) -set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS 
C;M;c++;cc;cpp;cxx;mm;CPP) -set(CMAKE_CXX_LINKER_PREFERENCE 30) -set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1) - -# Save compiler ABI information. -set(CMAKE_CXX_SIZEOF_DATA_PTR "8") -set(CMAKE_CXX_COMPILER_ABI "ELF") -set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu") - -if(CMAKE_CXX_SIZEOF_DATA_PTR) - set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}") -endif() - -if(CMAKE_CXX_COMPILER_ABI) - set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}") -endif() - -if(CMAKE_CXX_LIBRARY_ARCHITECTURE) - set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu") -endif() - -set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "") -if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX) - set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}") -endif() - - - - -set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;c") -set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib") -set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "") diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_C.bin b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_C.bin deleted file mode 100755 index 007976746bbc08577dee275193a151481f73ad7d..0000000000000000000000000000000000000000 Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_C.bin and /dev/null differ diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_CXX.bin b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_CXX.bin deleted file mode 100755 index 9717f93a704a711c1635031e2c1da2d3efcb684d..0000000000000000000000000000000000000000 Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_CXX.bin and /dev/null differ diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeSystem.cmake b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeSystem.cmake deleted file mode 100644 index 
1927fbd348850efae35e1e56d7276bc0413aecb2..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeSystem.cmake +++ /dev/null @@ -1,15 +0,0 @@ -set(CMAKE_HOST_SYSTEM "Linux-4.15.0-66-generic") -set(CMAKE_HOST_SYSTEM_NAME "Linux") -set(CMAKE_HOST_SYSTEM_VERSION "4.15.0-66-generic") -set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64") - - - -set(CMAKE_SYSTEM "Linux-4.15.0-66-generic") -set(CMAKE_SYSTEM_NAME "Linux") -set(CMAKE_SYSTEM_VERSION "4.15.0-66-generic") -set(CMAKE_SYSTEM_PROCESSOR "x86_64") - -set(CMAKE_CROSSCOMPILING "FALSE") - -set(CMAKE_SYSTEM_LOADED 1) diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/CMakeCCompilerId.c b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/CMakeCCompilerId.c deleted file mode 100644 index 570a15e994e4f10ca4a05b4451ea350fb942337f..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/CMakeCCompilerId.c +++ /dev/null @@ -1,544 +0,0 @@ -#ifdef __cplusplus -# error "A C++ compiler has been selected for C." 
-#endif - -#if defined(__18CXX) -# define ID_VOID_MAIN -#endif - - -/* Version number components: V=Version, R=Revision, P=Patch - Version date components: YYYY=Year, MM=Month, DD=Day */ - -#if defined(__INTEL_COMPILER) || defined(__ICC) -# define COMPILER_ID "Intel" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif - /* __INTEL_COMPILER = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100) -# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10) -# if defined(__INTEL_COMPILER_UPDATE) -# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE) -# else -# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10) -# endif -# if defined(__INTEL_COMPILER_BUILD_DATE) - /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */ -# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE) -# endif -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif - -#elif defined(__PATHCC__) -# define COMPILER_ID "PathScale" -# define COMPILER_VERSION_MAJOR DEC(__PATHCC__) -# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__) -# if defined(__PATHCC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__) -# endif - -#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__) -# define COMPILER_ID "Embarcadero" -# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF) -# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF) -# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF) - -#elif defined(__BORLANDC__) -# define COMPILER_ID "Borland" - /* __BORLANDC__ = 0xVRR */ -# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8) -# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF) - -#elif defined(__WATCOMC__) && __WATCOMC__ < 1200 -# define COMPILER_ID "Watcom" - /* __WATCOMC__ = VVRR */ -# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100) -# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 
10) % 10) -# if (__WATCOMC__ % 10) > 0 -# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) -# endif - -#elif defined(__WATCOMC__) -# define COMPILER_ID "OpenWatcom" - /* __WATCOMC__ = VVRP + 1100 */ -# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100) -# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) -# if (__WATCOMC__ % 10) > 0 -# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) -# endif - -#elif defined(__SUNPRO_C) -# define COMPILER_ID "SunPro" -# if __SUNPRO_C >= 0x5100 - /* __SUNPRO_C = 0xVRRP */ -# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12) -# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF) -# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF) -# else - /* __SUNPRO_CC = 0xVRP */ -# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8) -# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF) -# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF) -# endif - -#elif defined(__HP_cc) -# define COMPILER_ID "HP" - /* __HP_cc = VVRRPP */ -# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000) -# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100) -# define COMPILER_VERSION_PATCH DEC(__HP_cc % 100) - -#elif defined(__DECC) -# define COMPILER_ID "Compaq" - /* __DECC_VER = VVRRTPPPP */ -# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000) -# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000 % 100) -# define COMPILER_VERSION_PATCH DEC(__DECC_VER % 10000) - -#elif defined(__IBMC__) && defined(__COMPILER_VER__) -# define COMPILER_ID "zOS" - /* __IBMC__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10) - -#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800 -# define COMPILER_ID "XL" - /* __IBMC__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10) - -#elif defined(__IBMC__) && 
!defined(__COMPILER_VER__) && __IBMC__ < 800 -# define COMPILER_ID "VisualAge" - /* __IBMC__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10) - -#elif defined(__PGI) -# define COMPILER_ID "PGI" -# define COMPILER_VERSION_MAJOR DEC(__PGIC__) -# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__) -# if defined(__PGIC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__) -# endif - -#elif defined(_CRAYC) -# define COMPILER_ID "Cray" -# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR) -# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR) - -#elif defined(__TI_COMPILER_VERSION__) -# define COMPILER_ID "TI" - /* __TI_COMPILER_VERSION__ = VVVRRRPPP */ -# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000) -# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000) -# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000) - -#elif defined(__FUJITSU) || defined(__FCC_VERSION) || defined(__fcc_version) -# define COMPILER_ID "Fujitsu" - -#elif defined(__TINYC__) -# define COMPILER_ID "TinyCC" - -#elif defined(__SCO_VERSION__) -# define COMPILER_ID "SCO" - -#elif defined(__clang__) && defined(__apple_build_version__) -# define COMPILER_ID "AppleClang" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif -# define COMPILER_VERSION_MAJOR DEC(__clang_major__) -# define COMPILER_VERSION_MINOR DEC(__clang_minor__) -# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif -# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__) - -#elif defined(__clang__) -# define COMPILER_ID "Clang" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif -# define COMPILER_VERSION_MAJOR DEC(__clang_major__) -# define COMPILER_VERSION_MINOR 
DEC(__clang_minor__) -# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif - -#elif defined(__GNUC__) -# define COMPILER_ID "GNU" -# define COMPILER_VERSION_MAJOR DEC(__GNUC__) -# if defined(__GNUC_MINOR__) -# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__) -# endif -# if defined(__GNUC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) -# endif - -#elif defined(_MSC_VER) -# define COMPILER_ID "MSVC" - /* _MSC_VER = VVRR */ -# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100) -# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100) -# if defined(_MSC_FULL_VER) -# if _MSC_VER >= 1400 - /* _MSC_FULL_VER = VVRRPPPPP */ -# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000) -# else - /* _MSC_FULL_VER = VVRRPPPP */ -# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000) -# endif -# endif -# if defined(_MSC_BUILD) -# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD) -# endif - -#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__) -# define COMPILER_ID "ADSP" -#if defined(__VISUALDSPVERSION__) - /* __VISUALDSPVERSION__ = 0xVVRRPP00 */ -# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24) -# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF) -# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8 & 0xFF) -#endif - -#elif defined(__IAR_SYSTEMS_ICC__ ) || defined(__IAR_SYSTEMS_ICC) -# define COMPILER_ID "IAR" - -#elif defined(__ARMCC_VERSION) -# define COMPILER_ID "ARMCC" -#if __ARMCC_VERSION >= 1000000 - /* __ARMCC_VERSION = VRRPPPP */ - # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000) - # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100) - # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) -#else - /* __ARMCC_VERSION = VRPPPP */ - # define 
COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000) - # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10) - # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) -#endif - - -#elif defined(SDCC) -# define COMPILER_ID "SDCC" - /* SDCC = VRP */ -# define COMPILER_VERSION_MAJOR DEC(SDCC/100) -# define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10) -# define COMPILER_VERSION_PATCH DEC(SDCC % 10) - -#elif defined(_SGI_COMPILER_VERSION) || defined(_COMPILER_VERSION) -# define COMPILER_ID "MIPSpro" -# if defined(_SGI_COMPILER_VERSION) - /* _SGI_COMPILER_VERSION = VRP */ -# define COMPILER_VERSION_MAJOR DEC(_SGI_COMPILER_VERSION/100) -# define COMPILER_VERSION_MINOR DEC(_SGI_COMPILER_VERSION/10 % 10) -# define COMPILER_VERSION_PATCH DEC(_SGI_COMPILER_VERSION % 10) -# else - /* _COMPILER_VERSION = VRP */ -# define COMPILER_VERSION_MAJOR DEC(_COMPILER_VERSION/100) -# define COMPILER_VERSION_MINOR DEC(_COMPILER_VERSION/10 % 10) -# define COMPILER_VERSION_PATCH DEC(_COMPILER_VERSION % 10) -# endif - - -/* These compilers are either not known or too old to define an - identification macro. Try to identify the platform and guess that - it is the native compiler. */ -#elif defined(__sgi) -# define COMPILER_ID "MIPSpro" - -#elif defined(__hpux) || defined(__hpua) -# define COMPILER_ID "HP" - -#else /* unknown compiler */ -# define COMPILER_ID "" -#endif - -/* Construct the string literal in pieces to prevent the source from - getting matched. Store it in a pointer rather than an array - because some compilers will just produce instructions to fill the - array rather than assigning a pointer to a static array. 
*/ -char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]"; -#ifdef SIMULATE_ID -char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]"; -#endif - -#ifdef __QNXNTO__ -char const* qnxnto = "INFO" ":" "qnxnto[]"; -#endif - -#if defined(__CRAYXE) || defined(__CRAYXC) -char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]"; -#endif - -#define STRINGIFY_HELPER(X) #X -#define STRINGIFY(X) STRINGIFY_HELPER(X) - -/* Identify known platforms by name. */ -#if defined(__linux) || defined(__linux__) || defined(linux) -# define PLATFORM_ID "Linux" - -#elif defined(__CYGWIN__) -# define PLATFORM_ID "Cygwin" - -#elif defined(__MINGW32__) -# define PLATFORM_ID "MinGW" - -#elif defined(__APPLE__) -# define PLATFORM_ID "Darwin" - -#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32) -# define PLATFORM_ID "Windows" - -#elif defined(__FreeBSD__) || defined(__FreeBSD) -# define PLATFORM_ID "FreeBSD" - -#elif defined(__NetBSD__) || defined(__NetBSD) -# define PLATFORM_ID "NetBSD" - -#elif defined(__OpenBSD__) || defined(__OPENBSD) -# define PLATFORM_ID "OpenBSD" - -#elif defined(__sun) || defined(sun) -# define PLATFORM_ID "SunOS" - -#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__) -# define PLATFORM_ID "AIX" - -#elif defined(__sgi) || defined(__sgi__) || defined(_SGI) -# define PLATFORM_ID "IRIX" - -#elif defined(__hpux) || defined(__hpux__) -# define PLATFORM_ID "HP-UX" - -#elif defined(__HAIKU__) -# define PLATFORM_ID "Haiku" - -#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS) -# define PLATFORM_ID "BeOS" - -#elif defined(__QNX__) || defined(__QNXNTO__) -# define PLATFORM_ID "QNX" - -#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__) -# define PLATFORM_ID "Tru64" - -#elif defined(__riscos) || defined(__riscos__) -# define PLATFORM_ID "RISCos" - -#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__) -# define PLATFORM_ID "SINIX" - -#elif defined(__UNIX_SV__) 
-# define PLATFORM_ID "UNIX_SV" - -#elif defined(__bsdos__) -# define PLATFORM_ID "BSDOS" - -#elif defined(_MPRAS) || defined(MPRAS) -# define PLATFORM_ID "MP-RAS" - -#elif defined(__osf) || defined(__osf__) -# define PLATFORM_ID "OSF1" - -#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv) -# define PLATFORM_ID "SCO_SV" - -#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX) -# define PLATFORM_ID "ULTRIX" - -#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX) -# define PLATFORM_ID "Xenix" - -#elif defined(__WATCOMC__) -# if defined(__LINUX__) -# define PLATFORM_ID "Linux" - -# elif defined(__DOS__) -# define PLATFORM_ID "DOS" - -# elif defined(__OS2__) -# define PLATFORM_ID "OS2" - -# elif defined(__WINDOWS__) -# define PLATFORM_ID "Windows3x" - -# else /* unknown platform */ -# define PLATFORM_ID "" -# endif - -#else /* unknown platform */ -# define PLATFORM_ID "" - -#endif - -/* For windows compilers MSVC and Intel we can determine - the architecture of the compiler being used. 
This is because - the compilers do not have flags that can change the architecture, - but rather depend on which compiler is being used -*/ -#if defined(_WIN32) && defined(_MSC_VER) -# if defined(_M_IA64) -# define ARCHITECTURE_ID "IA64" - -# elif defined(_M_X64) || defined(_M_AMD64) -# define ARCHITECTURE_ID "x64" - -# elif defined(_M_IX86) -# define ARCHITECTURE_ID "X86" - -# elif defined(_M_ARM) -# if _M_ARM == 4 -# define ARCHITECTURE_ID "ARMV4I" -# elif _M_ARM == 5 -# define ARCHITECTURE_ID "ARMV5I" -# else -# define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM) -# endif - -# elif defined(_M_MIPS) -# define ARCHITECTURE_ID "MIPS" - -# elif defined(_M_SH) -# define ARCHITECTURE_ID "SHx" - -# else /* unknown architecture */ -# define ARCHITECTURE_ID "" -# endif - -#elif defined(__WATCOMC__) -# if defined(_M_I86) -# define ARCHITECTURE_ID "I86" - -# elif defined(_M_IX86) -# define ARCHITECTURE_ID "X86" - -# else /* unknown architecture */ -# define ARCHITECTURE_ID "" -# endif - -#else -# define ARCHITECTURE_ID "" -#endif - -/* Convert integer to decimal digit literals. */ -#define DEC(n) \ - ('0' + (((n) / 10000000)%10)), \ - ('0' + (((n) / 1000000)%10)), \ - ('0' + (((n) / 100000)%10)), \ - ('0' + (((n) / 10000)%10)), \ - ('0' + (((n) / 1000)%10)), \ - ('0' + (((n) / 100)%10)), \ - ('0' + (((n) / 10)%10)), \ - ('0' + ((n) % 10)) - -/* Convert integer to hex digit literals. */ -#define HEX(n) \ - ('0' + ((n)>>28 & 0xF)), \ - ('0' + ((n)>>24 & 0xF)), \ - ('0' + ((n)>>20 & 0xF)), \ - ('0' + ((n)>>16 & 0xF)), \ - ('0' + ((n)>>12 & 0xF)), \ - ('0' + ((n)>>8 & 0xF)), \ - ('0' + ((n)>>4 & 0xF)), \ - ('0' + ((n) & 0xF)) - -/* Construct a string literal encoding the version number components. 
*/ -#ifdef COMPILER_VERSION_MAJOR -char const info_version[] = { - 'I', 'N', 'F', 'O', ':', - 'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[', - COMPILER_VERSION_MAJOR, -# ifdef COMPILER_VERSION_MINOR - '.', COMPILER_VERSION_MINOR, -# ifdef COMPILER_VERSION_PATCH - '.', COMPILER_VERSION_PATCH, -# ifdef COMPILER_VERSION_TWEAK - '.', COMPILER_VERSION_TWEAK, -# endif -# endif -# endif - ']','\0'}; -#endif - -/* Construct a string literal encoding the version number components. */ -#ifdef SIMULATE_VERSION_MAJOR -char const info_simulate_version[] = { - 'I', 'N', 'F', 'O', ':', - 's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[', - SIMULATE_VERSION_MAJOR, -# ifdef SIMULATE_VERSION_MINOR - '.', SIMULATE_VERSION_MINOR, -# ifdef SIMULATE_VERSION_PATCH - '.', SIMULATE_VERSION_PATCH, -# ifdef SIMULATE_VERSION_TWEAK - '.', SIMULATE_VERSION_TWEAK, -# endif -# endif -# endif - ']','\0'}; -#endif - -/* Construct the string literal in pieces to prevent the source from - getting matched. Store it in a pointer rather than an array - because some compilers will just produce instructions to fill the - array rather than assigning a pointer to a static array. 
*/ -char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]"; -char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]"; - - - - -const char* info_language_dialect_default = "INFO" ":" "dialect_default[" -#if !defined(__STDC_VERSION__) - "90" -#elif __STDC_VERSION__ >= 201000L - "11" -#elif __STDC_VERSION__ >= 199901L - "99" -#else -#endif -"]"; - -/*--------------------------------------------------------------------------*/ - -#ifdef ID_VOID_MAIN -void main() {} -#else -int main(int argc, char* argv[]) -{ - int require = 0; - require += info_compiler[argc]; - require += info_platform[argc]; - require += info_arch[argc]; -#ifdef COMPILER_VERSION_MAJOR - require += info_version[argc]; -#endif -#ifdef SIMULATE_ID - require += info_simulate[argc]; -#endif -#ifdef SIMULATE_VERSION_MAJOR - require += info_simulate_version[argc]; -#endif -#if defined(__CRAYXE) || defined(__CRAYXC) - require += info_cray[argc]; -#endif - require += info_language_dialect_default[argc]; - (void)argv; - return require; -} -#endif diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out deleted file mode 100755 index 3e779cddc6621457f3aa2ca25bab3125c9c419b2..0000000000000000000000000000000000000000 Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out and /dev/null differ diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/CMakeCXXCompilerId.cpp b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/CMakeCXXCompilerId.cpp deleted file mode 100644 index e6d853637c6f7637dd8672b59612a9263a4d0244..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/CMakeCXXCompilerId.cpp +++ /dev/null @@ -1,533 +0,0 @@ -/* This source file must have a .cpp extension so that all C++ compilers - recognize the extension without flags. Borland does not know .cxx for - example. 
*/ -#ifndef __cplusplus -# error "A C compiler has been selected for C++." -#endif - - -/* Version number components: V=Version, R=Revision, P=Patch - Version date components: YYYY=Year, MM=Month, DD=Day */ - -#if defined(__COMO__) -# define COMPILER_ID "Comeau" - /* __COMO_VERSION__ = VRR */ -# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100) -# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100) - -#elif defined(__INTEL_COMPILER) || defined(__ICC) -# define COMPILER_ID "Intel" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif - /* __INTEL_COMPILER = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100) -# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10) -# if defined(__INTEL_COMPILER_UPDATE) -# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE) -# else -# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10) -# endif -# if defined(__INTEL_COMPILER_BUILD_DATE) - /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */ -# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE) -# endif -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif - -#elif defined(__PATHCC__) -# define COMPILER_ID "PathScale" -# define COMPILER_VERSION_MAJOR DEC(__PATHCC__) -# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__) -# if defined(__PATHCC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__) -# endif - -#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__) -# define COMPILER_ID "Embarcadero" -# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF) -# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF) -# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF) - -#elif defined(__BORLANDC__) -# define COMPILER_ID "Borland" - /* __BORLANDC__ = 0xVRR */ -# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8) -# define COMPILER_VERSION_MINOR 
HEX(__BORLANDC__ & 0xFF) - -#elif defined(__WATCOMC__) && __WATCOMC__ < 1200 -# define COMPILER_ID "Watcom" - /* __WATCOMC__ = VVRR */ -# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100) -# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) -# if (__WATCOMC__ % 10) > 0 -# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) -# endif - -#elif defined(__WATCOMC__) -# define COMPILER_ID "OpenWatcom" - /* __WATCOMC__ = VVRP + 1100 */ -# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100) -# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) -# if (__WATCOMC__ % 10) > 0 -# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) -# endif - -#elif defined(__SUNPRO_CC) -# define COMPILER_ID "SunPro" -# if __SUNPRO_CC >= 0x5100 - /* __SUNPRO_CC = 0xVRRP */ -# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12) -# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF) -# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) -# else - /* __SUNPRO_CC = 0xVRP */ -# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8) -# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF) -# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) -# endif - -#elif defined(__HP_aCC) -# define COMPILER_ID "HP" - /* __HP_aCC = VVRRPP */ -# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000) -# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100) -# define COMPILER_VERSION_PATCH DEC(__HP_aCC % 100) - -#elif defined(__DECCXX) -# define COMPILER_ID "Compaq" - /* __DECCXX_VER = VVRRTPPPP */ -# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000) -# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000 % 100) -# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER % 10000) - -#elif defined(__IBMCPP__) && defined(__COMPILER_VER__) -# define COMPILER_ID "zOS" - /* __IBMCPP__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) - -#elif defined(__IBMCPP__) && 
!defined(__COMPILER_VER__) && __IBMCPP__ >= 800 -# define COMPILER_ID "XL" - /* __IBMCPP__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) - -#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800 -# define COMPILER_ID "VisualAge" - /* __IBMCPP__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) - -#elif defined(__PGI) -# define COMPILER_ID "PGI" -# define COMPILER_VERSION_MAJOR DEC(__PGIC__) -# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__) -# if defined(__PGIC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__) -# endif - -#elif defined(_CRAYC) -# define COMPILER_ID "Cray" -# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR) -# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR) - -#elif defined(__TI_COMPILER_VERSION__) -# define COMPILER_ID "TI" - /* __TI_COMPILER_VERSION__ = VVVRRRPPP */ -# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000) -# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000) -# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000) - -#elif defined(__FUJITSU) || defined(__FCC_VERSION) || defined(__fcc_version) -# define COMPILER_ID "Fujitsu" - -#elif defined(__SCO_VERSION__) -# define COMPILER_ID "SCO" - -#elif defined(__clang__) && defined(__apple_build_version__) -# define COMPILER_ID "AppleClang" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif -# define COMPILER_VERSION_MAJOR DEC(__clang_major__) -# define COMPILER_VERSION_MINOR DEC(__clang_minor__) -# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif -# define 
COMPILER_VERSION_TWEAK DEC(__apple_build_version__) - -#elif defined(__clang__) -# define COMPILER_ID "Clang" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif -# define COMPILER_VERSION_MAJOR DEC(__clang_major__) -# define COMPILER_VERSION_MINOR DEC(__clang_minor__) -# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif - -#elif defined(__GNUC__) -# define COMPILER_ID "GNU" -# define COMPILER_VERSION_MAJOR DEC(__GNUC__) -# if defined(__GNUC_MINOR__) -# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__) -# endif -# if defined(__GNUC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) -# endif - -#elif defined(_MSC_VER) -# define COMPILER_ID "MSVC" - /* _MSC_VER = VVRR */ -# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100) -# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100) -# if defined(_MSC_FULL_VER) -# if _MSC_VER >= 1400 - /* _MSC_FULL_VER = VVRRPPPPP */ -# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000) -# else - /* _MSC_FULL_VER = VVRRPPPP */ -# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000) -# endif -# endif -# if defined(_MSC_BUILD) -# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD) -# endif - -#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__) -# define COMPILER_ID "ADSP" -#if defined(__VISUALDSPVERSION__) - /* __VISUALDSPVERSION__ = 0xVVRRPP00 */ -# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24) -# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF) -# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8 & 0xFF) -#endif - -#elif defined(__IAR_SYSTEMS_ICC__ ) || defined(__IAR_SYSTEMS_ICC) -# define COMPILER_ID "IAR" - -#elif defined(__ARMCC_VERSION) -# define COMPILER_ID "ARMCC" -#if __ARMCC_VERSION >= 1000000 - /* __ARMCC_VERSION = VRRPPPP */ - # 
define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000) - # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100) - # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) -#else - /* __ARMCC_VERSION = VRPPPP */ - # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000) - # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10) - # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) -#endif - - -#elif defined(_SGI_COMPILER_VERSION) || defined(_COMPILER_VERSION) -# define COMPILER_ID "MIPSpro" -# if defined(_SGI_COMPILER_VERSION) - /* _SGI_COMPILER_VERSION = VRP */ -# define COMPILER_VERSION_MAJOR DEC(_SGI_COMPILER_VERSION/100) -# define COMPILER_VERSION_MINOR DEC(_SGI_COMPILER_VERSION/10 % 10) -# define COMPILER_VERSION_PATCH DEC(_SGI_COMPILER_VERSION % 10) -# else - /* _COMPILER_VERSION = VRP */ -# define COMPILER_VERSION_MAJOR DEC(_COMPILER_VERSION/100) -# define COMPILER_VERSION_MINOR DEC(_COMPILER_VERSION/10 % 10) -# define COMPILER_VERSION_PATCH DEC(_COMPILER_VERSION % 10) -# endif - - -/* These compilers are either not known or too old to define an - identification macro. Try to identify the platform and guess that - it is the native compiler. */ -#elif defined(__sgi) -# define COMPILER_ID "MIPSpro" - -#elif defined(__hpux) || defined(__hpua) -# define COMPILER_ID "HP" - -#else /* unknown compiler */ -# define COMPILER_ID "" -#endif - -/* Construct the string literal in pieces to prevent the source from - getting matched. Store it in a pointer rather than an array - because some compilers will just produce instructions to fill the - array rather than assigning a pointer to a static array. 
*/ -char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]"; -#ifdef SIMULATE_ID -char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]"; -#endif - -#ifdef __QNXNTO__ -char const* qnxnto = "INFO" ":" "qnxnto[]"; -#endif - -#if defined(__CRAYXE) || defined(__CRAYXC) -char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]"; -#endif - -#define STRINGIFY_HELPER(X) #X -#define STRINGIFY(X) STRINGIFY_HELPER(X) - -/* Identify known platforms by name. */ -#if defined(__linux) || defined(__linux__) || defined(linux) -# define PLATFORM_ID "Linux" - -#elif defined(__CYGWIN__) -# define PLATFORM_ID "Cygwin" - -#elif defined(__MINGW32__) -# define PLATFORM_ID "MinGW" - -#elif defined(__APPLE__) -# define PLATFORM_ID "Darwin" - -#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32) -# define PLATFORM_ID "Windows" - -#elif defined(__FreeBSD__) || defined(__FreeBSD) -# define PLATFORM_ID "FreeBSD" - -#elif defined(__NetBSD__) || defined(__NetBSD) -# define PLATFORM_ID "NetBSD" - -#elif defined(__OpenBSD__) || defined(__OPENBSD) -# define PLATFORM_ID "OpenBSD" - -#elif defined(__sun) || defined(sun) -# define PLATFORM_ID "SunOS" - -#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__) -# define PLATFORM_ID "AIX" - -#elif defined(__sgi) || defined(__sgi__) || defined(_SGI) -# define PLATFORM_ID "IRIX" - -#elif defined(__hpux) || defined(__hpux__) -# define PLATFORM_ID "HP-UX" - -#elif defined(__HAIKU__) -# define PLATFORM_ID "Haiku" - -#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS) -# define PLATFORM_ID "BeOS" - -#elif defined(__QNX__) || defined(__QNXNTO__) -# define PLATFORM_ID "QNX" - -#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__) -# define PLATFORM_ID "Tru64" - -#elif defined(__riscos) || defined(__riscos__) -# define PLATFORM_ID "RISCos" - -#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__) -# define PLATFORM_ID "SINIX" - -#elif defined(__UNIX_SV__) 
-# define PLATFORM_ID "UNIX_SV" - -#elif defined(__bsdos__) -# define PLATFORM_ID "BSDOS" - -#elif defined(_MPRAS) || defined(MPRAS) -# define PLATFORM_ID "MP-RAS" - -#elif defined(__osf) || defined(__osf__) -# define PLATFORM_ID "OSF1" - -#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv) -# define PLATFORM_ID "SCO_SV" - -#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX) -# define PLATFORM_ID "ULTRIX" - -#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX) -# define PLATFORM_ID "Xenix" - -#elif defined(__WATCOMC__) -# if defined(__LINUX__) -# define PLATFORM_ID "Linux" - -# elif defined(__DOS__) -# define PLATFORM_ID "DOS" - -# elif defined(__OS2__) -# define PLATFORM_ID "OS2" - -# elif defined(__WINDOWS__) -# define PLATFORM_ID "Windows3x" - -# else /* unknown platform */ -# define PLATFORM_ID "" -# endif - -#else /* unknown platform */ -# define PLATFORM_ID "" - -#endif - -/* For windows compilers MSVC and Intel we can determine - the architecture of the compiler being used. 
This is because - the compilers do not have flags that can change the architecture, - but rather depend on which compiler is being used -*/ -#if defined(_WIN32) && defined(_MSC_VER) -# if defined(_M_IA64) -# define ARCHITECTURE_ID "IA64" - -# elif defined(_M_X64) || defined(_M_AMD64) -# define ARCHITECTURE_ID "x64" - -# elif defined(_M_IX86) -# define ARCHITECTURE_ID "X86" - -# elif defined(_M_ARM) -# if _M_ARM == 4 -# define ARCHITECTURE_ID "ARMV4I" -# elif _M_ARM == 5 -# define ARCHITECTURE_ID "ARMV5I" -# else -# define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM) -# endif - -# elif defined(_M_MIPS) -# define ARCHITECTURE_ID "MIPS" - -# elif defined(_M_SH) -# define ARCHITECTURE_ID "SHx" - -# else /* unknown architecture */ -# define ARCHITECTURE_ID "" -# endif - -#elif defined(__WATCOMC__) -# if defined(_M_I86) -# define ARCHITECTURE_ID "I86" - -# elif defined(_M_IX86) -# define ARCHITECTURE_ID "X86" - -# else /* unknown architecture */ -# define ARCHITECTURE_ID "" -# endif - -#else -# define ARCHITECTURE_ID "" -#endif - -/* Convert integer to decimal digit literals. */ -#define DEC(n) \ - ('0' + (((n) / 10000000)%10)), \ - ('0' + (((n) / 1000000)%10)), \ - ('0' + (((n) / 100000)%10)), \ - ('0' + (((n) / 10000)%10)), \ - ('0' + (((n) / 1000)%10)), \ - ('0' + (((n) / 100)%10)), \ - ('0' + (((n) / 10)%10)), \ - ('0' + ((n) % 10)) - -/* Convert integer to hex digit literals. */ -#define HEX(n) \ - ('0' + ((n)>>28 & 0xF)), \ - ('0' + ((n)>>24 & 0xF)), \ - ('0' + ((n)>>20 & 0xF)), \ - ('0' + ((n)>>16 & 0xF)), \ - ('0' + ((n)>>12 & 0xF)), \ - ('0' + ((n)>>8 & 0xF)), \ - ('0' + ((n)>>4 & 0xF)), \ - ('0' + ((n) & 0xF)) - -/* Construct a string literal encoding the version number components. 
*/ -#ifdef COMPILER_VERSION_MAJOR -char const info_version[] = { - 'I', 'N', 'F', 'O', ':', - 'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[', - COMPILER_VERSION_MAJOR, -# ifdef COMPILER_VERSION_MINOR - '.', COMPILER_VERSION_MINOR, -# ifdef COMPILER_VERSION_PATCH - '.', COMPILER_VERSION_PATCH, -# ifdef COMPILER_VERSION_TWEAK - '.', COMPILER_VERSION_TWEAK, -# endif -# endif -# endif - ']','\0'}; -#endif - -/* Construct a string literal encoding the version number components. */ -#ifdef SIMULATE_VERSION_MAJOR -char const info_simulate_version[] = { - 'I', 'N', 'F', 'O', ':', - 's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[', - SIMULATE_VERSION_MAJOR, -# ifdef SIMULATE_VERSION_MINOR - '.', SIMULATE_VERSION_MINOR, -# ifdef SIMULATE_VERSION_PATCH - '.', SIMULATE_VERSION_PATCH, -# ifdef SIMULATE_VERSION_TWEAK - '.', SIMULATE_VERSION_TWEAK, -# endif -# endif -# endif - ']','\0'}; -#endif - -/* Construct the string literal in pieces to prevent the source from - getting matched. Store it in a pointer rather than an array - because some compilers will just produce instructions to fill the - array rather than assigning a pointer to a static array. 
*/ -char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]"; -char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]"; - - - - -const char* info_language_dialect_default = "INFO" ":" "dialect_default[" -#if __cplusplus >= 201402L - "14" -#elif __cplusplus >= 201103L - "11" -#else - "98" -#endif -"]"; - -/*--------------------------------------------------------------------------*/ - -int main(int argc, char* argv[]) -{ - int require = 0; - require += info_compiler[argc]; - require += info_platform[argc]; -#ifdef COMPILER_VERSION_MAJOR - require += info_version[argc]; -#endif -#ifdef SIMULATE_ID - require += info_simulate[argc]; -#endif -#ifdef SIMULATE_VERSION_MAJOR - require += info_simulate_version[argc]; -#endif -#if defined(__CRAYXE) || defined(__CRAYXC) - require += info_cray[argc]; -#endif - require += info_language_dialect_default[argc]; - (void)argv; - return require; -} diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out deleted file mode 100755 index 8f3f2abf609f4888e5f8ff9281af2657a5e2094d..0000000000000000000000000000000000000000 Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out and /dev/null differ diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/CMakeOutput.log b/hpvm/projects/llvm-cbe/build/CMakeFiles/CMakeOutput.log deleted file mode 100644 index 34bcb9577d9dad1ca8cb6c1f22529e1cb191cbe6..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/CMakeOutput.log +++ /dev/null @@ -1,554 +0,0 @@ -The system is: Linux - 4.15.0-66-generic - x86_64 -Compiling the C compiler identification source file "CMakeCCompilerId.c" succeeded. 
-Compiler: /usr/bin/cc -Build flags: -Id flags: - -The output was: -0 - - -Compilation of the C compiler identification source "CMakeCCompilerId.c" produced "a.out" - -The C compiler identification is GNU, found in "/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out" - -Compiling the CXX compiler identification source file "CMakeCXXCompilerId.cpp" succeeded. -Compiler: /usr/bin/c++ -Build flags: -Id flags: - -The output was: -0 - - -Compilation of the CXX compiler identification source "CMakeCXXCompilerId.cpp" produced "a.out" - -The CXX compiler identification is GNU, found in "/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out" - -Determining if the C compiler works passed with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_a1887/fast" -/usr/bin/make -f CMakeFiles/cmTC_a1887.dir/build.make CMakeFiles/cmTC_a1887.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building C object CMakeFiles/cmTC_a1887.dir/testCCompiler.c.o -/usr/bin/cc -o CMakeFiles/cmTC_a1887.dir/testCCompiler.c.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp/testCCompiler.c -Linking C executable cmTC_a1887 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_a1887.dir/link.txt --verbose=1 -/usr/bin/cc CMakeFiles/cmTC_a1887.dir/testCCompiler.c.o -o cmTC_a1887 -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - -Detecting C compiler ABI info compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_8b6ac/fast" -/usr/bin/make -f CMakeFiles/cmTC_8b6ac.dir/build.make 
CMakeFiles/cmTC_8b6ac.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building C object CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -/usr/bin/cc -o CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -c /usr/share/cmake-3.5/Modules/CMakeCCompilerABI.c -Linking C executable cmTC_8b6ac -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_8b6ac.dir/link.txt --verbose=1 -/usr/bin/cc -v CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -o cmTC_8b6ac -rdynamic -Using built-in specs. -COLLECT_GCC=/usr/bin/cc -COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -Target: x86_64-linux-gnu -Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu -Thread model: posix -gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) 
-COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/ -LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/ -COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_8b6ac' '-rdynamic' '-mtune=generic' '-march=x86-64' - /usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/cckmhPLv.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_8b6ac /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. 
CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - -Parsed C implicit link information from above output: - link line regex: [^( *|.*[/\])(ld|([^/\]+-)?ld|collect2)[^/\]*( |$)] - ignore line: [Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp] - ignore line: [] - ignore line: [Run Build Command:"/usr/bin/make" "cmTC_8b6ac/fast"] - ignore line: [/usr/bin/make -f CMakeFiles/cmTC_8b6ac.dir/build.make CMakeFiles/cmTC_8b6ac.dir/build] - ignore line: [make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'] - ignore line: [Building C object CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o] - ignore line: [/usr/bin/cc -o CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -c /usr/share/cmake-3.5/Modules/CMakeCCompilerABI.c] - ignore line: [Linking C executable cmTC_8b6ac] - ignore line: [/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_8b6ac.dir/link.txt --verbose=1] - ignore line: [/usr/bin/cc -v CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -o cmTC_8b6ac -rdynamic ] - ignore line: [Using built-in specs.] 
- ignore line: [COLLECT_GCC=/usr/bin/cc] - ignore line: [COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper] - ignore line: [Target: x86_64-linux-gnu] - ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu] - ignore line: [Thread model: posix] - ignore line: [gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) ] - ignore line: [COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/] - ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/] 
- ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_8b6ac' '-rdynamic' '-mtune=generic' '-march=x86-64'] - link line: [ /usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/cckmhPLv.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_8b6ac /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. 
CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o] - arg [/usr/lib/gcc/x86_64-linux-gnu/5/collect2] ==> ignore - arg [-plugin] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so] ==> ignore - arg [-plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper] ==> ignore - arg [-plugin-opt=-fresolution=/tmp/cckmhPLv.res] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore - arg [-plugin-opt=-pass-through=-lc] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore - arg [--sysroot=/] ==> ignore - arg [--build-id] ==> ignore - arg [--eh-frame-hdr] ==> ignore - arg [-m] ==> ignore - arg [elf_x86_64] ==> ignore - arg [--hash-style=gnu] ==> ignore - arg [--as-needed] ==> ignore - arg [-export-dynamic] ==> ignore - arg [-dynamic-linker] ==> ignore - arg [/lib64/ld-linux-x86-64.so.2] ==> ignore - arg [-zrelro] ==> ignore - arg [-o] ==> ignore - arg [cmTC_8b6ac] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o] ==> ignore - arg [-L/usr/lib/gcc/x86_64-linux-gnu/5] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5] - arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] - arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] - arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu] - arg [-L/lib/../lib] ==> dir [/lib/../lib] - arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu] - arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib] - 
arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..] - arg [CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o] ==> ignore - arg [-lgcc] ==> lib [gcc] - arg [--as-needed] ==> ignore - arg [-lgcc_s] ==> lib [gcc_s] - arg [--no-as-needed] ==> ignore - arg [-lc] ==> lib [c] - arg [-lgcc] ==> lib [gcc] - arg [--as-needed] ==> ignore - arg [-lgcc_s] ==> lib [gcc_s] - arg [--no-as-needed] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtend.o] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o] ==> ignore - remove lib [gcc] - remove lib [gcc_s] - remove lib [gcc] - remove lib [gcc_s] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5] ==> [/usr/lib/gcc/x86_64-linux-gnu/5] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> [/usr/lib] - collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu] - collapse library dir [/lib/../lib] ==> [/lib] - collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu] - collapse library dir [/usr/lib/../lib] ==> [/usr/lib] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..] 
==> [/usr/lib] - implicit libs: [c] - implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib] - implicit fwks: [] - - - - -Detecting C [-std=c11] compiler features compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_58d37/fast" -/usr/bin/make -f CMakeFiles/cmTC_58d37.dir/build.make CMakeFiles/cmTC_58d37.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building C object CMakeFiles/cmTC_58d37.dir/feature_tests.c.o -/usr/bin/cc -std=c11 -o CMakeFiles/cmTC_58d37.dir/feature_tests.c.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c -Linking C executable cmTC_58d37 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_58d37.dir/link.txt --verbose=1 -/usr/bin/cc CMakeFiles/cmTC_58d37.dir/feature_tests.c.o -o cmTC_58d37 -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - - Feature record: C_FEATURE:1c_function_prototypes - Feature record: C_FEATURE:1c_restrict - Feature record: C_FEATURE:1c_static_assert - Feature record: C_FEATURE:1c_variadic_macros - - -Detecting C [-std=c99] compiler features compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_47bc6/fast" -/usr/bin/make -f CMakeFiles/cmTC_47bc6.dir/build.make CMakeFiles/cmTC_47bc6.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building C object CMakeFiles/cmTC_47bc6.dir/feature_tests.c.o -/usr/bin/cc -std=c99 -o CMakeFiles/cmTC_47bc6.dir/feature_tests.c.o -c 
/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c -Linking C executable cmTC_47bc6 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_47bc6.dir/link.txt --verbose=1 -/usr/bin/cc CMakeFiles/cmTC_47bc6.dir/feature_tests.c.o -o cmTC_47bc6 -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - - Feature record: C_FEATURE:1c_function_prototypes - Feature record: C_FEATURE:1c_restrict - Feature record: C_FEATURE:0c_static_assert - Feature record: C_FEATURE:1c_variadic_macros - - -Detecting C [-std=c90] compiler features compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_bafe8/fast" -/usr/bin/make -f CMakeFiles/cmTC_bafe8.dir/build.make CMakeFiles/cmTC_bafe8.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building C object CMakeFiles/cmTC_bafe8.dir/feature_tests.c.o -/usr/bin/cc -std=c90 -o CMakeFiles/cmTC_bafe8.dir/feature_tests.c.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c -Linking C executable cmTC_bafe8 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_bafe8.dir/link.txt --verbose=1 -/usr/bin/cc CMakeFiles/cmTC_bafe8.dir/feature_tests.c.o -o cmTC_bafe8 -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - - Feature record: C_FEATURE:1c_function_prototypes - Feature record: C_FEATURE:0c_restrict - Feature record: C_FEATURE:0c_static_assert - Feature record: C_FEATURE:0c_variadic_macros -Determining if the CXX compiler works passed with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_a4db2/fast" 
-/usr/bin/make -f CMakeFiles/cmTC_a4db2.dir/build.make CMakeFiles/cmTC_a4db2.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building CXX object CMakeFiles/cmTC_a4db2.dir/testCXXCompiler.cxx.o -/usr/bin/c++ -o CMakeFiles/cmTC_a4db2.dir/testCXXCompiler.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp/testCXXCompiler.cxx -Linking CXX executable cmTC_a4db2 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_a4db2.dir/link.txt --verbose=1 -/usr/bin/c++ CMakeFiles/cmTC_a4db2.dir/testCXXCompiler.cxx.o -o cmTC_a4db2 -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - -Detecting CXX compiler ABI info compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_9fb11/fast" -/usr/bin/make -f CMakeFiles/cmTC_9fb11.dir/build.make CMakeFiles/cmTC_9fb11.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building CXX object CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -/usr/bin/c++ -o CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -c /usr/share/cmake-3.5/Modules/CMakeCXXCompilerABI.cpp -Linking CXX executable cmTC_9fb11 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_9fb11.dir/link.txt --verbose=1 -/usr/bin/c++ -v CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -o cmTC_9fb11 -rdynamic -Using built-in specs. 
-COLLECT_GCC=/usr/bin/c++ -COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -Target: x86_64-linux-gnu -Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu -Thread model: posix -gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) -COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/ -LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/ -COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_9fb11' '-rdynamic' '-shared-libgcc' '-mtune=generic' '-march=x86-64' - 
/usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/ccHuuX3E.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_9fb11 /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - -Parsed CXX implicit link information from above output: - link line regex: [^( *|.*[/\])(ld|([^/\]+-)?ld|collect2)[^/\]*( |$)] - ignore line: [Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp] - ignore line: [] - ignore line: [Run Build Command:"/usr/bin/make" "cmTC_9fb11/fast"] - ignore line: [/usr/bin/make -f CMakeFiles/cmTC_9fb11.dir/build.make CMakeFiles/cmTC_9fb11.dir/build] - ignore line: [make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'] - ignore line: [Building CXX object CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o] - ignore line: [/usr/bin/c++ -o 
CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -c /usr/share/cmake-3.5/Modules/CMakeCXXCompilerABI.cpp] - ignore line: [Linking CXX executable cmTC_9fb11] - ignore line: [/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_9fb11.dir/link.txt --verbose=1] - ignore line: [/usr/bin/c++ -v CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -o cmTC_9fb11 -rdynamic ] - ignore line: [Using built-in specs.] - ignore line: [COLLECT_GCC=/usr/bin/c++] - ignore line: [COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper] - ignore line: [Target: x86_64-linux-gnu] - ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu] - ignore line: [Thread model: posix] - ignore line: [gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) ] - ignore line: 
[COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/] - ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/] - ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_9fb11' '-rdynamic' '-shared-libgcc' '-mtune=generic' '-march=x86-64'] - link line: [ /usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/ccHuuX3E.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_9fb11 /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. 
CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o] - arg [/usr/lib/gcc/x86_64-linux-gnu/5/collect2] ==> ignore - arg [-plugin] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so] ==> ignore - arg [-plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper] ==> ignore - arg [-plugin-opt=-fresolution=/tmp/ccHuuX3E.res] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc] ==> ignore - arg [-plugin-opt=-pass-through=-lc] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore - arg [-plugin-opt=-pass-through=-lgcc] ==> ignore - arg [--sysroot=/] ==> ignore - arg [--build-id] ==> ignore - arg [--eh-frame-hdr] ==> ignore - arg [-m] ==> ignore - arg [elf_x86_64] ==> ignore - arg [--hash-style=gnu] ==> ignore - arg [--as-needed] ==> ignore - arg [-export-dynamic] ==> ignore - arg [-dynamic-linker] ==> ignore - arg [/lib64/ld-linux-x86-64.so.2] ==> ignore - arg [-zrelro] ==> ignore - arg [-o] ==> ignore - arg [cmTC_9fb11] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o] ==> ignore - arg [-L/usr/lib/gcc/x86_64-linux-gnu/5] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5] - arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] - arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] - arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu] - arg [-L/lib/../lib] ==> dir [/lib/../lib] - arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu] - arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib] - arg 
[-L/usr/lib/gcc/x86_64-linux-gnu/5/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..] - arg [CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o] ==> ignore - arg [-lstdc++] ==> lib [stdc++] - arg [-lm] ==> lib [m] - arg [-lgcc_s] ==> lib [gcc_s] - arg [-lgcc] ==> lib [gcc] - arg [-lc] ==> lib [c] - arg [-lgcc_s] ==> lib [gcc_s] - arg [-lgcc] ==> lib [gcc] - arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtend.o] ==> ignore - arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o] ==> ignore - remove lib [gcc_s] - remove lib [gcc] - remove lib [gcc_s] - remove lib [gcc] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5] ==> [/usr/lib/gcc/x86_64-linux-gnu/5] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> [/usr/lib] - collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu] - collapse library dir [/lib/../lib] ==> [/lib] - collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu] - collapse library dir [/usr/lib/../lib] ==> [/usr/lib] - collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..] 
==> [/usr/lib] - implicit libs: [stdc++;m;c] - implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib] - implicit fwks: [] - - - - -Detecting CXX [-std=c++14] compiler features compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_72948/fast" -/usr/bin/make -f CMakeFiles/cmTC_72948.dir/build.make CMakeFiles/cmTC_72948.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building CXX object CMakeFiles/cmTC_72948.dir/feature_tests.cxx.o -/usr/bin/c++ -std=c++14 -o CMakeFiles/cmTC_72948.dir/feature_tests.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx -Linking CXX executable cmTC_72948 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_72948.dir/link.txt --verbose=1 -/usr/bin/c++ CMakeFiles/cmTC_72948.dir/feature_tests.cxx.o -o cmTC_72948 -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - - Feature record: CXX_FEATURE:1cxx_aggregate_default_initializers - Feature record: CXX_FEATURE:1cxx_alias_templates - Feature record: CXX_FEATURE:1cxx_alignas - Feature record: CXX_FEATURE:1cxx_alignof - Feature record: CXX_FEATURE:1cxx_attributes - Feature record: CXX_FEATURE:1cxx_attribute_deprecated - Feature record: CXX_FEATURE:1cxx_auto_type - Feature record: CXX_FEATURE:1cxx_binary_literals - Feature record: CXX_FEATURE:1cxx_constexpr - Feature record: CXX_FEATURE:1cxx_contextual_conversions - Feature record: CXX_FEATURE:1cxx_decltype - Feature record: CXX_FEATURE:1cxx_decltype_auto - Feature record: CXX_FEATURE:1cxx_decltype_incomplete_return_types - Feature record: CXX_FEATURE:1cxx_default_function_template_args - Feature record: CXX_FEATURE:1cxx_defaulted_functions - Feature record: 
CXX_FEATURE:1cxx_defaulted_move_initializers - Feature record: CXX_FEATURE:1cxx_delegating_constructors - Feature record: CXX_FEATURE:1cxx_deleted_functions - Feature record: CXX_FEATURE:1cxx_digit_separators - Feature record: CXX_FEATURE:1cxx_enum_forward_declarations - Feature record: CXX_FEATURE:1cxx_explicit_conversions - Feature record: CXX_FEATURE:1cxx_extended_friend_declarations - Feature record: CXX_FEATURE:1cxx_extern_templates - Feature record: CXX_FEATURE:1cxx_final - Feature record: CXX_FEATURE:1cxx_func_identifier - Feature record: CXX_FEATURE:1cxx_generalized_initializers - Feature record: CXX_FEATURE:1cxx_generic_lambdas - Feature record: CXX_FEATURE:1cxx_inheriting_constructors - Feature record: CXX_FEATURE:1cxx_inline_namespaces - Feature record: CXX_FEATURE:1cxx_lambdas - Feature record: CXX_FEATURE:1cxx_lambda_init_captures - Feature record: CXX_FEATURE:1cxx_local_type_template_args - Feature record: CXX_FEATURE:1cxx_long_long_type - Feature record: CXX_FEATURE:1cxx_noexcept - Feature record: CXX_FEATURE:1cxx_nonstatic_member_init - Feature record: CXX_FEATURE:1cxx_nullptr - Feature record: CXX_FEATURE:1cxx_override - Feature record: CXX_FEATURE:1cxx_range_for - Feature record: CXX_FEATURE:1cxx_raw_string_literals - Feature record: CXX_FEATURE:1cxx_reference_qualified_functions - Feature record: CXX_FEATURE:1cxx_relaxed_constexpr - Feature record: CXX_FEATURE:1cxx_return_type_deduction - Feature record: CXX_FEATURE:1cxx_right_angle_brackets - Feature record: CXX_FEATURE:1cxx_rvalue_references - Feature record: CXX_FEATURE:1cxx_sizeof_member - Feature record: CXX_FEATURE:1cxx_static_assert - Feature record: CXX_FEATURE:1cxx_strong_enums - Feature record: CXX_FEATURE:1cxx_template_template_parameters - Feature record: CXX_FEATURE:1cxx_thread_local - Feature record: CXX_FEATURE:1cxx_trailing_return_types - Feature record: CXX_FEATURE:1cxx_unicode_literals - Feature record: CXX_FEATURE:1cxx_uniform_initialization - Feature record: 
CXX_FEATURE:1cxx_unrestricted_unions - Feature record: CXX_FEATURE:1cxx_user_literals - Feature record: CXX_FEATURE:1cxx_variable_templates - Feature record: CXX_FEATURE:1cxx_variadic_macros - Feature record: CXX_FEATURE:1cxx_variadic_templates - - -Detecting CXX [-std=c++11] compiler features compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_4b9af/fast" -/usr/bin/make -f CMakeFiles/cmTC_4b9af.dir/build.make CMakeFiles/cmTC_4b9af.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building CXX object CMakeFiles/cmTC_4b9af.dir/feature_tests.cxx.o -/usr/bin/c++ -std=c++11 -o CMakeFiles/cmTC_4b9af.dir/feature_tests.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx -Linking CXX executable cmTC_4b9af -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_4b9af.dir/link.txt --verbose=1 -/usr/bin/c++ CMakeFiles/cmTC_4b9af.dir/feature_tests.cxx.o -o cmTC_4b9af -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - - Feature record: CXX_FEATURE:0cxx_aggregate_default_initializers - Feature record: CXX_FEATURE:1cxx_alias_templates - Feature record: CXX_FEATURE:1cxx_alignas - Feature record: CXX_FEATURE:1cxx_alignof - Feature record: CXX_FEATURE:1cxx_attributes - Feature record: CXX_FEATURE:0cxx_attribute_deprecated - Feature record: CXX_FEATURE:1cxx_auto_type - Feature record: CXX_FEATURE:0cxx_binary_literals - Feature record: CXX_FEATURE:1cxx_constexpr - Feature record: CXX_FEATURE:0cxx_contextual_conversions - Feature record: CXX_FEATURE:1cxx_decltype - Feature record: CXX_FEATURE:0cxx_decltype_auto - Feature record: CXX_FEATURE:1cxx_decltype_incomplete_return_types - Feature record: CXX_FEATURE:1cxx_default_function_template_args - Feature record: 
CXX_FEATURE:1cxx_defaulted_functions - Feature record: CXX_FEATURE:1cxx_defaulted_move_initializers - Feature record: CXX_FEATURE:1cxx_delegating_constructors - Feature record: CXX_FEATURE:1cxx_deleted_functions - Feature record: CXX_FEATURE:0cxx_digit_separators - Feature record: CXX_FEATURE:1cxx_enum_forward_declarations - Feature record: CXX_FEATURE:1cxx_explicit_conversions - Feature record: CXX_FEATURE:1cxx_extended_friend_declarations - Feature record: CXX_FEATURE:1cxx_extern_templates - Feature record: CXX_FEATURE:1cxx_final - Feature record: CXX_FEATURE:1cxx_func_identifier - Feature record: CXX_FEATURE:1cxx_generalized_initializers - Feature record: CXX_FEATURE:0cxx_generic_lambdas - Feature record: CXX_FEATURE:1cxx_inheriting_constructors - Feature record: CXX_FEATURE:1cxx_inline_namespaces - Feature record: CXX_FEATURE:1cxx_lambdas - Feature record: CXX_FEATURE:0cxx_lambda_init_captures - Feature record: CXX_FEATURE:1cxx_local_type_template_args - Feature record: CXX_FEATURE:1cxx_long_long_type - Feature record: CXX_FEATURE:1cxx_noexcept - Feature record: CXX_FEATURE:1cxx_nonstatic_member_init - Feature record: CXX_FEATURE:1cxx_nullptr - Feature record: CXX_FEATURE:1cxx_override - Feature record: CXX_FEATURE:1cxx_range_for - Feature record: CXX_FEATURE:1cxx_raw_string_literals - Feature record: CXX_FEATURE:1cxx_reference_qualified_functions - Feature record: CXX_FEATURE:0cxx_relaxed_constexpr - Feature record: CXX_FEATURE:0cxx_return_type_deduction - Feature record: CXX_FEATURE:1cxx_right_angle_brackets - Feature record: CXX_FEATURE:1cxx_rvalue_references - Feature record: CXX_FEATURE:1cxx_sizeof_member - Feature record: CXX_FEATURE:1cxx_static_assert - Feature record: CXX_FEATURE:1cxx_strong_enums - Feature record: CXX_FEATURE:1cxx_template_template_parameters - Feature record: CXX_FEATURE:1cxx_thread_local - Feature record: CXX_FEATURE:1cxx_trailing_return_types - Feature record: CXX_FEATURE:1cxx_unicode_literals - Feature record: 
CXX_FEATURE:1cxx_uniform_initialization - Feature record: CXX_FEATURE:1cxx_unrestricted_unions - Feature record: CXX_FEATURE:1cxx_user_literals - Feature record: CXX_FEATURE:0cxx_variable_templates - Feature record: CXX_FEATURE:1cxx_variadic_macros - Feature record: CXX_FEATURE:1cxx_variadic_templates - - -Detecting CXX [-std=c++98] compiler features compiled with the following output: -Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp - -Run Build Command:"/usr/bin/make" "cmTC_1ceb0/fast" -/usr/bin/make -f CMakeFiles/cmTC_1ceb0.dir/build.make CMakeFiles/cmTC_1ceb0.dir/build -make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' -Building CXX object CMakeFiles/cmTC_1ceb0.dir/feature_tests.cxx.o -/usr/bin/c++ -std=c++98 -o CMakeFiles/cmTC_1ceb0.dir/feature_tests.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx -Linking CXX executable cmTC_1ceb0 -/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_1ceb0.dir/link.txt --verbose=1 -/usr/bin/c++ CMakeFiles/cmTC_1ceb0.dir/feature_tests.cxx.o -o cmTC_1ceb0 -rdynamic -make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp' - - - Feature record: CXX_FEATURE:0cxx_aggregate_default_initializers - Feature record: CXX_FEATURE:0cxx_alias_templates - Feature record: CXX_FEATURE:0cxx_alignas - Feature record: CXX_FEATURE:0cxx_alignof - Feature record: CXX_FEATURE:0cxx_attributes - Feature record: CXX_FEATURE:0cxx_attribute_deprecated - Feature record: CXX_FEATURE:0cxx_auto_type - Feature record: CXX_FEATURE:0cxx_binary_literals - Feature record: CXX_FEATURE:0cxx_constexpr - Feature record: CXX_FEATURE:0cxx_contextual_conversions - Feature record: CXX_FEATURE:0cxx_decltype - Feature record: CXX_FEATURE:0cxx_decltype_auto - Feature record: CXX_FEATURE:0cxx_decltype_incomplete_return_types - Feature record: 
CXX_FEATURE:0cxx_default_function_template_args - Feature record: CXX_FEATURE:0cxx_defaulted_functions - Feature record: CXX_FEATURE:0cxx_defaulted_move_initializers - Feature record: CXX_FEATURE:0cxx_delegating_constructors - Feature record: CXX_FEATURE:0cxx_deleted_functions - Feature record: CXX_FEATURE:0cxx_digit_separators - Feature record: CXX_FEATURE:0cxx_enum_forward_declarations - Feature record: CXX_FEATURE:0cxx_explicit_conversions - Feature record: CXX_FEATURE:0cxx_extended_friend_declarations - Feature record: CXX_FEATURE:0cxx_extern_templates - Feature record: CXX_FEATURE:0cxx_final - Feature record: CXX_FEATURE:0cxx_func_identifier - Feature record: CXX_FEATURE:0cxx_generalized_initializers - Feature record: CXX_FEATURE:0cxx_generic_lambdas - Feature record: CXX_FEATURE:0cxx_inheriting_constructors - Feature record: CXX_FEATURE:0cxx_inline_namespaces - Feature record: CXX_FEATURE:0cxx_lambdas - Feature record: CXX_FEATURE:0cxx_lambda_init_captures - Feature record: CXX_FEATURE:0cxx_local_type_template_args - Feature record: CXX_FEATURE:0cxx_long_long_type - Feature record: CXX_FEATURE:0cxx_noexcept - Feature record: CXX_FEATURE:0cxx_nonstatic_member_init - Feature record: CXX_FEATURE:0cxx_nullptr - Feature record: CXX_FEATURE:0cxx_override - Feature record: CXX_FEATURE:0cxx_range_for - Feature record: CXX_FEATURE:0cxx_raw_string_literals - Feature record: CXX_FEATURE:0cxx_reference_qualified_functions - Feature record: CXX_FEATURE:0cxx_relaxed_constexpr - Feature record: CXX_FEATURE:0cxx_return_type_deduction - Feature record: CXX_FEATURE:0cxx_right_angle_brackets - Feature record: CXX_FEATURE:0cxx_rvalue_references - Feature record: CXX_FEATURE:0cxx_sizeof_member - Feature record: CXX_FEATURE:0cxx_static_assert - Feature record: CXX_FEATURE:0cxx_strong_enums - Feature record: CXX_FEATURE:1cxx_template_template_parameters - Feature record: CXX_FEATURE:0cxx_thread_local - Feature record: CXX_FEATURE:0cxx_trailing_return_types - Feature record: 
CXX_FEATURE:0cxx_unicode_literals - Feature record: CXX_FEATURE:0cxx_uniform_initialization - Feature record: CXX_FEATURE:0cxx_unrestricted_unions - Feature record: CXX_FEATURE:0cxx_user_literals - Feature record: CXX_FEATURE:0cxx_variable_templates - Feature record: CXX_FEATURE:0cxx_variadic_macros - Feature record: CXX_FEATURE:0cxx_variadic_templates diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/cmake.check_cache b/hpvm/projects/llvm-cbe/build/CMakeFiles/cmake.check_cache deleted file mode 100644 index 3dccd731726d7faa8b29d8d7dba3b981a53ca497..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/cmake.check_cache +++ /dev/null @@ -1 +0,0 @@ -# This file is generated by cmake for dependency checking of the CMakeCache.txt file diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.bin b/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.bin deleted file mode 100755 index 1b62f454d8a2b71fdf6dd528f88f1c018560a607..0000000000000000000000000000000000000000 Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.bin and /dev/null differ diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c b/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c deleted file mode 100644 index 6590dded2342f3eebd9b81505327e84a488580e6..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c +++ /dev/null @@ -1,34 +0,0 @@ - - const char features[] = {"\n" -"C_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 -"1" -#else -"0" -#endif -"c_function_prototypes\n" -"C_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -"1" -#else -"0" -#endif -"c_restrict\n" -"C_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201000L -"1" -#else -"0" -#endif -"c_static_assert\n" -"C_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && 
defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -"1" -#else -"0" -#endif -"c_variadic_macros\n" - -}; - -int main(int argc, char** argv) { (void)argv; return features[argc]; } diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx b/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx deleted file mode 100644 index b93418c6ed69feaf1b5c2feb9592bbdb5a5f042c..0000000000000000000000000000000000000000 --- a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx +++ /dev/null @@ -1,405 +0,0 @@ - - const char features[] = {"\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L -"1" -#else -"0" -#endif -"cxx_aggregate_default_initializers\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_alias_templates\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_alignas\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_alignof\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_attributes\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_attribute_deprecated\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_auto_type\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_binary_literals\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_constexpr\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && 
__cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_contextual_conversions\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_decltype\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_decltype_auto\n" -"CXX_FEATURE:" -#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 40801) && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_decltype_incomplete_return_types\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_default_function_template_args\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_defaulted_functions\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_defaulted_move_initializers\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_delegating_constructors\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_deleted_functions\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_digit_separators\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_enum_forward_declarations\n" 
-"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_explicit_conversions\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_extended_friend_declarations\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_extern_templates\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_final\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_func_identifier\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_generalized_initializers\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_generic_lambdas\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_inheriting_constructors\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_inline_namespaces\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_lambdas\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_lambda_init_captures\n" -"CXX_FEATURE:" -#if 
(__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_local_type_template_args\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_long_long_type\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_noexcept\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_nonstatic_member_init\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_nullptr\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_override\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_range_for\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_raw_string_literals\n" -"CXX_FEATURE:" -#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 40801) && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_reference_qualified_functions\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L -"1" -#else -"0" -#endif -"cxx_relaxed_constexpr\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L -"1" -#else -"0" -#endif -"cxx_return_type_deduction\n" -"CXX_FEATURE:" 
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_right_angle_brackets\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_rvalue_references\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_sizeof_member\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_static_assert\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_strong_enums\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && __cplusplus -"1" -#else -"0" -#endif -"cxx_template_template_parameters\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_thread_local\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_trailing_return_types\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_unicode_literals\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_uniform_initialization\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + 
__GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_unrestricted_unions\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L -"1" -#else -"0" -#endif -"cxx_user_literals\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L -"1" -#else -"0" -#endif -"cxx_variable_templates\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_variadic_macros\n" -"CXX_FEATURE:" -#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)) -"1" -#else -"0" -#endif -"cxx_variadic_templates\n" - -}; - -int main(int argc, char** argv) { (void)argv; return features[argc]; } diff --git a/hpvm/projects/llvm-cbe/include/sample.h b/hpvm/projects/llvm-cbe/include/sample.h index b3ce9ce2928297fb61db666d865de81542743da3..1d2545fb05c7bc392cd19575aed4f48062366631 100644 --- a/hpvm/projects/llvm-cbe/include/sample.h +++ b/hpvm/projects/llvm-cbe/include/sample.h @@ -4,5 +4,4 @@ * This is a sample header file that is global to the entire project. * It is located here so that everyone will find it. 
*/ -extern int compute_sample (int a); - +extern int compute_sample(int a); diff --git a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp index cb4d43311ab6754adf270769e56b3dd210a90163..a5fddb967dbd96befef3af7d01fe6c42fd16462c 100644 --- a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp +++ b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp @@ -14,14 +14,14 @@ #include "CBackend.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Config/config.h" #include "llvm/IR/InstIterator.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/Host.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/Config/config.h" #include "llvm/Transforms/Utils.h" #include <algorithm> @@ -29,14 +29,13 @@ #include <iostream> - //#include "PHINodePass.h" -//Jackson Korba 9/29/14 +// Jackson Korba 9/29/14 #ifndef DEBUG_TYPE #define DEBUG_TYPE "" #endif -//End Modification +// End Modification // Some ms header decided to define setjmp as _setjmp, undo this for this file // since we don't need it @@ -52,7 +51,8 @@ extern "C" void LLVMInitializeCBackendTarget() { char CWriter::ID = 0; -// extra (invalid) Ops tags for tracking unary ops as a special case of the available binary ops +// extra (invalid) Ops tags for tracking unary ops as a special case of the +// available binary ops enum UnaryOps { BinaryNeg = Instruction::OtherOpsEnd + 1, BinaryNot, @@ -61,19 +61,16 @@ enum UnaryOps { static bool isEmptyType(Type *Ty) { if (StructType *STy = dyn_cast<StructType>(Ty)) return STy->getNumElements() == 0 || - std::all_of(STy->element_begin(), STy->element_end(), [](Type *T){ return isEmptyType(T); }); + std::all_of(STy->element_begin(), STy->element_end(), + [](Type *T) { return isEmptyType(T); }); if (VectorType 
*VTy = dyn_cast<VectorType>(Ty)) - return VTy->getNumElements() == 0 || - isEmptyType(VTy->getElementType()); + return VTy->getNumElements() == 0 || isEmptyType(VTy->getElementType()); if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) - return ATy->getNumElements() == 0 || - isEmptyType(ATy->getElementType()); + return ATy->getNumElements() == 0 || isEmptyType(ATy->getElementType()); return Ty->isVoidTy(); } -bool CWriter::isEmptyType(Type *Ty) const { - return ::isEmptyType(Ty); -} +bool CWriter::isEmptyType(Type *Ty) const { return ::isEmptyType(Ty); } /// isAddressExposed - Return true if the specified value's name needs to /// have its address taken in order to get a C value of the correct type. @@ -98,19 +95,19 @@ bool CWriter::isInlinableInst(Instruction &I) const { if (isa<GetElementPtrInst>(I)) { for (User *U : I.users()) { if (!(isa<LoadInst>(U) || isa<StoreInst>(U))) { - //DEBUG(errs() << "GEP user not a Load/Store!\n"); + // DEBUG(errs() << "GEP user not a Load/Store!\n"); return false; } } - //DEBUG(errs() << "All users of GEP are loads/stores, mark it inlinable!\n"); + // DEBUG(errs() << "All users of GEP are loads/stores, mark it + // inlinable!\n"); return true; } // Must be an expression, must be used exactly once. If it is dead, we // emit it inline where it would go. - if (isEmptyType(I.getType()) || !I.hasOneUse() || - I.isTerminator() || isa<CallInst>(I) || isa<PHINode>(I) || - isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) || - isa<InsertValueInst>(I)) + if (isEmptyType(I.getType()) || !I.hasOneUse() || I.isTerminator() || + isa<CallInst>(I) || isa<PHINode>(I) || isa<LoadInst>(I) || + isa<VAArgInst>(I) || isa<InsertElementInst>(I) || isa<InsertValueInst>(I)) // Don't inline a load across a store or other bad things! return false; @@ -130,19 +127,20 @@ bool CWriter::isInlinableInst(Instruction &I) const { // generate significantly better code than to emit alloca calls directly. 
// AllocaInst *CWriter::isDirectAlloca(Value *V) const { - //DEBUG(errs() << "Checking if " << *V << " is a direct alloca!\n"); + // DEBUG(errs() << "Checking if " << *V << " is a direct alloca!\n"); AllocaInst *AI = dyn_cast<AllocaInst>(V); - if (!AI) return 0; + if (!AI) + return 0; // Modification to inline fixed size array alloca! if (AI->isArrayAllocation()) - return AI; // FIXME: we can also inline fixed size array allocas! + return AI; // FIXME: we can also inline fixed size array allocas! if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock()) return 0; return AI; } // isInlineAsm - Check if the instruction is a call to an inline asm chunk. -bool CWriter::isInlineAsm(Instruction& I) const { +bool CWriter::isInlineAsm(Instruction &I) const { if (CallInst *CI = dyn_cast<CallInst>(&I)) return isa<InlineAsm>(CI->getCalledValue()); return false; @@ -160,19 +158,20 @@ bool CWriter::runOnFunction(Function &F) { PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); // Adding Scalar Evolution Pass for loop induction variable SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - //Adding Dominator Tree Pass + // Adding Dominator Tree Pass DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); // Adding Assumption Cache AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); // Adding IVUsers Pass for loop recongnition // IU = &getAnalysis<IVUsersWrapperPass>().getIU(); - BasicBlock* entry = &(F.getEntryBlock()); - for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry); BI!=BE; ++BI) { + BasicBlock *entry = &(F.getEntryBlock()); + for (df_iterator<BasicBlock *> BI = df_begin(entry), BE = df_end(entry); + BI != BE; ++BI) { BasicBlock *BB = *BI; if (Loop *L = LI->getLoopFor(&*BB)) { - if(simplifyLoop(L, DT, LI, SE, AC, nullptr, /*true*/false)) { - //DEBUG(errs() << "Simplified loop!\n" << *L << "\n"); + if (simplifyLoop(L, DT, LI, SE, AC, nullptr, /*true*/ false)) { + // DEBUG(errs() << "Simplified 
loop!\n" << *L << "\n"); } } } @@ -197,15 +196,15 @@ static std::string CBEMangle(const std::string &S) { Result += S[i]; } else { Result += '_'; - Result += 'A'+(S[i]&15); - Result += 'A'+((S[i]>>4)&15); + Result += 'A' + (S[i] & 15); + Result += 'A' + ((S[i] >> 4) & 15); Result += '_'; } return Result; } -raw_ostream & -CWriter::printTypeString(raw_ostream &Out, Type *Ty, bool isSigned) { +raw_ostream &CWriter::printTypeString(raw_ostream &Out, Type *Ty, + bool isSigned) { if (StructType *ST = dyn_cast<StructType>(Ty)) { assert(!isEmptyType(ST)); TypedefDeclTypes.insert(Ty); @@ -225,46 +224,51 @@ CWriter::printTypeString(raw_ostream &Out, Type *Ty, bool isSigned) { } switch (Ty->getTypeID()) { - case Type::VoidTyID: return Out << "void"; - case Type::IntegerTyID: { - unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); - if (NumBits == 1) - return Out << "bool"; - else { - assert(NumBits <= 128 && "Bit widths > 128 not implemented yet"); - return Out << (isSigned?"i":"u") << NumBits; - } - } - case Type::FloatTyID: return Out << "f32"; - case Type::DoubleTyID: return Out << "f64"; - case Type::X86_FP80TyID: return Out << "f80"; - case Type::PPC_FP128TyID: - case Type::FP128TyID: return Out << "f128"; - - case Type::X86_MMXTyID: - return Out << (isSigned ? 
"i32y2" : "u32y2"); - - case Type::VectorTyID: { - TypedefDeclTypes.insert(Ty); - VectorType *VTy = cast<VectorType>(Ty); - assert(VTy->getNumElements() != 0); - printTypeString(Out, VTy->getElementType(), isSigned); - return Out << "x" << VTy->getNumElements(); - } - - case Type::ArrayTyID: { - TypedefDeclTypes.insert(Ty); - ArrayType *ATy = cast<ArrayType>(Ty); - assert(ATy->getNumElements() != 0); - printTypeString(Out, ATy->getElementType(), isSigned); - return Out << "a" << ATy->getNumElements(); - } + case Type::VoidTyID: + return Out << "void"; + case Type::IntegerTyID: { + unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); + if (NumBits == 1) + return Out << "bool"; + else { + assert(NumBits <= 128 && "Bit widths > 128 not implemented yet"); + return Out << (isSigned ? "i" : "u") << NumBits; + } + } + case Type::FloatTyID: + return Out << "f32"; + case Type::DoubleTyID: + return Out << "f64"; + case Type::X86_FP80TyID: + return Out << "f80"; + case Type::PPC_FP128TyID: + case Type::FP128TyID: + return Out << "f128"; - default: + case Type::X86_MMXTyID: + return Out << (isSigned ? 
"i32y2" : "u32y2"); + + case Type::VectorTyID: { + TypedefDeclTypes.insert(Ty); + VectorType *VTy = cast<VectorType>(Ty); + assert(VTy->getNumElements() != 0); + printTypeString(Out, VTy->getElementType(), isSigned); + return Out << "x" << VTy->getNumElements(); + } + + case Type::ArrayTyID: { + TypedefDeclTypes.insert(Ty); + ArrayType *ATy = cast<ArrayType>(Ty); + assert(ATy->getNumElements() != 0); + printTypeString(Out, ATy->getElementType(), isSigned); + return Out << "a" << ATy->getNumElements(); + } + + default: #ifndef NDEBUG - errs() << "Unknown primitive type: " << *Ty << "\n"; + errs() << "Unknown primitive type: " << *Ty << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(0); } } @@ -279,8 +283,9 @@ std::string CWriter::getStructName(StructType *ST) { return "struct l_unnamed_" + utostr(id); } -std::string CWriter::getFunctionName(FunctionType *FT, - std::pair<AttributeList, CallingConv::ID> PAL) { +std::string +CWriter::getFunctionName(FunctionType *FT, + std::pair<AttributeList, CallingConv::ID> PAL) { unsigned &id = UnnamedFunctionIDs[std::make_pair(FT, PAL)]; if (id == 0) id = ++NextFunctionNumber; @@ -294,7 +299,8 @@ std::string CWriter::getArrayName(ArrayType *AT) { // value semantics (avoiding the array "decay"). 
assert(!isEmptyType(AT)); printTypeName(ArrayInnards, AT->getElementType(), false); - return "struct l_array_" + utostr(AT->getNumElements()) + '_' + CBEMangle(ArrayInnards.str()); + return "struct l_array_" + utostr(AT->getNumElements()) + '_' + + CBEMangle(ArrayInnards.str()); } std::string CWriter::getVectorName(VectorType *VT, bool Aligned) { @@ -305,95 +311,125 @@ std::string CWriter::getVectorName(VectorType *VT, bool Aligned) { // if (Aligned) // Out << "__MSALIGN__(" << TD->getABITypeAlignment(VT) << ") "; printTypeName(VectorInnards, VT->getElementType(), false); - return "struct l_vector_" + utostr(VT->getNumElements()) + '_' + CBEMangle(VectorInnards.str()); + return "struct l_vector_" + utostr(VT->getNumElements()) + '_' + + CBEMangle(VectorInnards.str()); } - static const std::string getCmpPredicateName(CmpInst::Predicate P) { switch (P) { - case FCmpInst::FCMP_FALSE: return "0"; - case FCmpInst::FCMP_OEQ: return "oeq"; - case FCmpInst::FCMP_OGT: return "ogt"; - case FCmpInst::FCMP_OGE: return "oge"; - case FCmpInst::FCMP_OLT: return "olt"; - case FCmpInst::FCMP_OLE: return "ole"; - case FCmpInst::FCMP_ONE: return "one"; - case FCmpInst::FCMP_ORD: return "ord"; - case FCmpInst::FCMP_UNO: return "uno"; - case FCmpInst::FCMP_UEQ: return "ueq"; - case FCmpInst::FCMP_UGT: return "ugt"; - case FCmpInst::FCMP_UGE: return "uge"; - case FCmpInst::FCMP_ULT: return "ult"; - case FCmpInst::FCMP_ULE: return "ule"; - case FCmpInst::FCMP_UNE: return "une"; - case FCmpInst::FCMP_TRUE: return "1"; - case ICmpInst::ICMP_EQ: return "eq"; - case ICmpInst::ICMP_NE: return "ne"; - case ICmpInst::ICMP_ULE: return "ule"; - case ICmpInst::ICMP_SLE: return "sle"; - case ICmpInst::ICMP_UGE: return "uge"; - case ICmpInst::ICMP_SGE: return "sge"; - case ICmpInst::ICMP_ULT: return "ult"; - case ICmpInst::ICMP_SLT: return "slt"; - case ICmpInst::ICMP_UGT: return "ugt"; - case ICmpInst::ICMP_SGT: return "sgt"; - default: + case FCmpInst::FCMP_FALSE: + return "0"; + case 
FCmpInst::FCMP_OEQ: + return "oeq"; + case FCmpInst::FCMP_OGT: + return "ogt"; + case FCmpInst::FCMP_OGE: + return "oge"; + case FCmpInst::FCMP_OLT: + return "olt"; + case FCmpInst::FCMP_OLE: + return "ole"; + case FCmpInst::FCMP_ONE: + return "one"; + case FCmpInst::FCMP_ORD: + return "ord"; + case FCmpInst::FCMP_UNO: + return "uno"; + case FCmpInst::FCMP_UEQ: + return "ueq"; + case FCmpInst::FCMP_UGT: + return "ugt"; + case FCmpInst::FCMP_UGE: + return "uge"; + case FCmpInst::FCMP_ULT: + return "ult"; + case FCmpInst::FCMP_ULE: + return "ule"; + case FCmpInst::FCMP_UNE: + return "une"; + case FCmpInst::FCMP_TRUE: + return "1"; + case ICmpInst::ICMP_EQ: + return "eq"; + case ICmpInst::ICMP_NE: + return "ne"; + case ICmpInst::ICMP_ULE: + return "ule"; + case ICmpInst::ICMP_SLE: + return "sle"; + case ICmpInst::ICMP_UGE: + return "uge"; + case ICmpInst::ICMP_SGE: + return "sge"; + case ICmpInst::ICMP_ULT: + return "ult"; + case ICmpInst::ICMP_SLT: + return "slt"; + case ICmpInst::ICMP_UGT: + return "ugt"; + case ICmpInst::ICMP_SGT: + return "sgt"; + default: #ifndef NDEBUG - errs() << "Invalid icmp predicate!" << P; + errs() << "Invalid icmp predicate!" 
<< P; #endif - llvm_unreachable(0); + llvm_unreachable(0); } } - -raw_ostream & -CWriter::printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned) { +raw_ostream &CWriter::printSimpleType(raw_ostream &Out, Type *Ty, + bool isSigned) { assert((Ty->isSingleValueType() || Ty->isVoidTy()) && - "Invalid type for printSimpleType"); + "Invalid type for printSimpleType"); switch (Ty->getTypeID()) { - case Type::VoidTyID: return Out << "void"; - case Type::IntegerTyID: { - unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); - if (NumBits == 1) - return Out << "bool"; - else if (NumBits <= 8) - return Out << (isSigned?"char":"uchar"); - else if (NumBits <= 16) - return Out << (isSigned?"short":"ushort"); - else if (NumBits <= 32) - return Out << (isSigned?"int":"uint"); // !!FIX ME - else if (NumBits <= 64) - return Out << (isSigned?"long":"ulong"); - else { - assert(NumBits <= 128 && "Bit widths > 128 not implemented yet"); - return Out << (isSigned?"int128_t":"uint128_t"); - } - } - case Type::FloatTyID: return Out << "float"; - case Type::DoubleTyID: return Out << "double"; - // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is - // present matches host 'long double'. - case Type::X86_FP80TyID: - case Type::PPC_FP128TyID: - case Type::FP128TyID: return Out << "long double"; - - case Type::X86_MMXTyID: - return Out << (isSigned?"int":"uint") << " __attribute__((vector_size(8)))"; - - default: + case Type::VoidTyID: + return Out << "void"; + case Type::IntegerTyID: { + unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); + if (NumBits == 1) + return Out << "bool"; + else if (NumBits <= 8) + return Out << (isSigned ? "char" : "uchar"); + else if (NumBits <= 16) + return Out << (isSigned ? "short" : "ushort"); + else if (NumBits <= 32) + return Out << (isSigned ? "int" : "uint"); // !!FIX ME + else if (NumBits <= 64) + return Out << (isSigned ? 
"long" : "ulong"); + else { + assert(NumBits <= 128 && "Bit widths > 128 not implemented yet"); + return Out << (isSigned ? "int128_t" : "uint128_t"); + } + } + case Type::FloatTyID: + return Out << "float"; + case Type::DoubleTyID: + return Out << "double"; + // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is + // present matches host 'long double'. + case Type::X86_FP80TyID: + case Type::PPC_FP128TyID: + case Type::FP128TyID: + return Out << "long double"; + + case Type::X86_MMXTyID: + return Out << (isSigned ? "int" : "uint") + << " __attribute__((vector_size(8)))"; + + default: #ifndef NDEBUG - errs() << "Unknown primitive type: " << *Ty << "\n"; + errs() << "Unknown primitive type: " << *Ty << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(0); } } // Pass the Type* and the variable name and this prints out the variable // declaration. // -raw_ostream &CWriter::printTypeName(raw_ostream &Out, Type *Ty, - bool isSigned, - std::pair<AttributeList, CallingConv::ID> PAL) { +raw_ostream & +CWriter::printTypeName(raw_ostream &Out, Type *Ty, bool isSigned, + std::pair<AttributeList, CallingConv::ID> PAL) { if (Ty->isSingleValueType() || Ty->isVoidTy()) { if (!Ty->isPointerTy() && !Ty->isVectorTy()) @@ -404,39 +440,40 @@ raw_ostream &CWriter::printTypeName(raw_ostream &Out, Type *Ty, return Out << "void"; switch (Ty->getTypeID()) { - case Type::FunctionTyID: { - FunctionType *FTy = cast<FunctionType>(Ty); - return Out << getFunctionName(FTy, PAL); - } - case Type::StructTyID: { - TypedefDeclTypes.insert(Ty); - return Out << getStructName(cast<StructType>(Ty)); - } - - case Type::PointerTyID: { - Type *ElTy = Ty->getPointerElementType(); - return printTypeName(Out, ElTy, false) << '*'; - } - - case Type::ArrayTyID: { - TypedefDeclTypes.insert(Ty); - return Out << getArrayName(cast<ArrayType>(Ty)); - } - - case Type::VectorTyID: { - TypedefDeclTypes.insert(Ty); - return Out << getVectorName(cast<VectorType>(Ty), true); - } + case 
Type::FunctionTyID: { + FunctionType *FTy = cast<FunctionType>(Ty); + return Out << getFunctionName(FTy, PAL); + } + case Type::StructTyID: { + TypedefDeclTypes.insert(Ty); + return Out << getStructName(cast<StructType>(Ty)); + } - default: + case Type::PointerTyID: { + Type *ElTy = Ty->getPointerElementType(); + return printTypeName(Out, ElTy, false) << '*'; + } + + case Type::ArrayTyID: { + TypedefDeclTypes.insert(Ty); + return Out << getArrayName(cast<ArrayType>(Ty)); + } + + case Type::VectorTyID: { + TypedefDeclTypes.insert(Ty); + return Out << getVectorName(cast<VectorType>(Ty), true); + } + + default: #ifndef NDEBUG - errs() << "Unexpected type: " << *Ty << "\n"; + errs() << "Unexpected type: " << *Ty << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(0); } } -raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool isSigned) { +raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty, + bool isSigned) { if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { // MSVC doesn't handle __declspec(align) on parameters, // but we specify it for Vector (hoping the compiler will vectorize it) @@ -447,13 +484,15 @@ raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool is return printTypeName(Out, Ty, isSigned); } -raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out, StructType *STy) { +raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out, + StructType *STy) { if (STy->isPacked()) Out << "#ifdef _MSC_VER\n#pragma pack(push, 1)\n#endif\n"; Out << getStructName(STy) << " {\n"; unsigned Idx = 0; for (StructType::element_iterator I = STy->element_begin(), - E = STy->element_end(); I != E; ++I, Idx++) { + E = STy->element_end(); + I != E; ++I, Idx++) { Out << " "; bool empty = isEmptyType(*I); if (empty) @@ -473,21 +512,23 @@ raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out, StructType *STy) return Out; } -raw_ostream &CWriter::printFunctionDeclaration(raw_ostream &Out, 
FunctionType *Ty, - std::pair<AttributeList, CallingConv::ID> PAL){ +raw_ostream &CWriter::printFunctionDeclaration( + raw_ostream &Out, FunctionType *Ty, + std::pair<AttributeList, CallingConv::ID> PAL) { Out << "typedef "; printFunctionProto(Out, Ty, PAL, getFunctionName(Ty, PAL), NULL, false); return Out << ";\n"; } -raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, - std::pair<AttributeList, CallingConv::ID> Attrs, - const std::string &Name, - Function::arg_iterator ArgList, - bool isKernel) { +raw_ostream & +CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, + std::pair<AttributeList, CallingConv::ID> Attrs, + const std::string &Name, + Function::arg_iterator ArgList, bool isKernel) { - // NOTE: AttributeSet is replaced by 'AttributeList' at function level in LLVM-9 + // NOTE: AttributeSet is replaced by 'AttributeList' at function level in + // LLVM-9 AttributeList &PAL = Attrs.first; if (PAL.hasAttribute(AttributeList::FunctionIndex, Attribute::NoReturn)) @@ -498,7 +539,7 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, // Should this function actually return a struct by-value? bool isStructReturn = PAL.hasAttribute(1, Attribute::StructRet) || - PAL.hasAttribute(2, Attribute::StructRet); + PAL.hasAttribute(2, Attribute::StructRet); // Get the return type for the function. 
Type *RetTy; if (!isStructReturn) @@ -508,24 +549,25 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, RetTy = cast<PointerType>(FTy->getParamType(0))->getElementType(); } printTypeName(Out, RetTy, - /*isSigned=*/PAL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)); + /*isSigned=*/ + PAL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)); Out << "/* Processing Function: " << Name << ": " << Attrs.second << "*/\n"; switch (Attrs.second) { - case CallingConv::C: - break; - case CallingConv::X86_StdCall: - Out << " __stdcall"; - break; - case CallingConv::X86_FastCall: - Out << " __fastcall"; - break; - case CallingConv::X86_ThisCall: - Out << " __thiscall"; - break; - default: - // assert(0 && "Encountered Unhandled Calling Convention"); - break; + case CallingConv::C: + break; + case CallingConv::X86_StdCall: + Out << " __stdcall"; + break; + case CallingConv::X86_FastCall: + Out << " __fastcall"; + break; + case CallingConv::X86_ThisCall: + Out << " __thiscall"; + break; + default: + // assert(0 && "Encountered Unhandled Calling Convention"); + break; } Out << ' ' << Name << '('; @@ -533,7 +575,8 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, bool PrintedArg = false; FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); - //Function::arg_iterator ArgName = ArgList ? ArgList->begin() : Function::arg_iterator(); + // Function::arg_iterator ArgName = ArgList ? ArgList->begin() : + // Function::arg_iterator(); // NOTE: ArgumentLists not supported in LLVM-9 Function::arg_iterator ArgName = ArgList ? ArgList : Function::arg_iterator(); @@ -544,8 +587,10 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, assert(I != E && "Invalid struct return function!"); ++I; ++Idx; - // CHECK: very confused as to how next loop starts from first Function Param? 
- if (ArgList) ++ArgName; + // CHECK: very confused as to how next loop starts from first Function + // Param? + if (ArgList) + ++ArgName; } for (; I != E; ++I) { @@ -559,27 +604,27 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, if (PointerType *PTy = dyn_cast<PointerType>(ArgTy)) { unsigned AddrSpace = PTy->getAddressSpace(); - //DEBUG(errs() << "AddrSpace for " << Idx << " = " << AddrSpace << "\n"); - switch(AddrSpace) { - case GLOBAL_ADDRSPACE: - Out << "__global "; - break; - case SHARED_ADDRSPACE: - Out << "__local "; - break; - case CONSTANT_ADDRSPACE: - Out << "__constant "; - break; - case PRIVATE_ADDRSPACE: - Out << "__private "; - break; - default: - break; + // DEBUG(errs() << "AddrSpace for " << Idx << " = " << AddrSpace << "\n"); + switch (AddrSpace) { + case GLOBAL_ADDRSPACE: + Out << "__global "; + break; + case SHARED_ADDRSPACE: + Out << "__local "; + break; + case CONSTANT_ADDRSPACE: + Out << "__constant "; + break; + case PRIVATE_ADDRSPACE: + Out << "__private "; + break; + default: + break; } } printTypeNameUnaligned(Out, ArgTy, - /*isSigned=*/PAL.hasAttribute(Idx, Attribute::SExt)); + /*isSigned=*/PAL.hasAttribute(Idx, Attribute::SExt)); PrintedArg = true; bool noalias = false; if (PAL.hasAttribute(Idx, Attribute::NoAlias)) { @@ -588,15 +633,16 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy, ++Idx; if (ArgList) { - Out << ' ' << (noalias ? " restrict " : "") << GetValueName(&*ArgName); + Out << ' ' << (noalias ? 
" restrict " : "") << GetValueName(&*ArgName); ++ArgName; } } if (FTy->isVarArg()) { if (!PrintedArg) { - Out << "int"; //dummy argument for empty vaarg functs - if (ArgList) Out << " vararg_dummy_arg"; + Out << "int"; // dummy argument for empty vaarg functs + if (ArgList) + Out << " vararg_dummy_arg"; } Out << ", ..."; } else if (!PrintedArg) { @@ -616,16 +662,20 @@ raw_ostream &CWriter::printArrayDeclaration(raw_ostream &Out, ArrayType *ATy) { return Out; } -raw_ostream &CWriter::printVectorDeclaration(raw_ostream &Out, VectorType *VTy) { +raw_ostream &CWriter::printVectorDeclaration(raw_ostream &Out, + VectorType *VTy) { assert(!isEmptyType(VTy)); // Vectors are printed like arrays Out << getVectorName(VTy, false) << " {\n "; printTypeName(Out, VTy->getElementType()); - Out << " vector[" << utostr(VTy->getNumElements()) << "];\n} __attribute__((aligned(" << TD->getABITypeAlignment(VTy) << ")));\n"; + Out << " vector[" << utostr(VTy->getNumElements()) + << "];\n} __attribute__((aligned(" << TD->getABITypeAlignment(VTy) + << ")));\n"; return Out; } -void CWriter::printConstantArray(ConstantArray *CPA, enum OperandContext Context) { +void CWriter::printConstantArray(ConstantArray *CPA, + enum OperandContext Context) { printConstant(cast<Constant>(CPA->getOperand(0)), Context); for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) { Out << ", "; @@ -633,7 +683,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, enum OperandContext Context } } -void CWriter::printConstantVector(ConstantVector *CP, enum OperandContext Context) { +void CWriter::printConstantVector(ConstantVector *CP, + enum OperandContext Context) { printConstant(cast<Constant>(CP->getOperand(0)), Context); for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) { Out << ", "; @@ -641,7 +692,8 @@ void CWriter::printConstantVector(ConstantVector *CP, enum OperandContext Contex } } -void CWriter::printConstantDataSequential(ConstantDataSequential *CDS, enum OperandContext Context) { 
+void CWriter::printConstantDataSequential(ConstantDataSequential *CDS, + enum OperandContext Context) { printConstant(CDS->getElementAsConstant(0), Context); for (unsigned i = 1, e = CDS->getNumElements(); i != e; ++i) { Out << ", "; @@ -653,8 +705,10 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) { // As a special case, print the array as a string if it is an array of // ubytes or an array of sbytes with positive values. ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(C); - if (!CDS || !CDS->isCString()) return false; - if (Context != ContextStatic) return false; // TODO + if (!CDS || !CDS->isCString()) + return false; + if (Context != ContextStatic) + return false; // TODO Out << "{ \""; // Keep track of whether the last number was a hexadecimal escape. @@ -681,19 +735,34 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) { } else { LastWasHex = false; switch (C) { - case '\n': Out << "\\n"; break; - case '\t': Out << "\\t"; break; - case '\r': Out << "\\r"; break; - case '\v': Out << "\\v"; break; - case '\a': Out << "\\a"; break; - case '\"': Out << "\\\""; break; - case '\'': Out << "\\\'"; break; - default: - Out << "\\x"; - Out << (char)(( C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A')); - Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A')); - LastWasHex = true; - break; + case '\n': + Out << "\\n"; + break; + case '\t': + Out << "\\t"; + break; + case '\r': + Out << "\\r"; + break; + case '\v': + Out << "\\v"; + break; + case '\a': + Out << "\\a"; + break; + case '\"': + Out << "\\\""; + break; + case '\'': + Out << "\\\'"; + break; + default: + Out << "\\x"; + Out << (char)((C / 16 < 10) ? (C / 16 + '0') : (C / 16 - 10 + 'A')); + Out << (char)(((C & 15) < 10) ? 
((C & 15) + '0') + : ((C & 15) - 10 + 'A')); + LastWasHex = true; + break; } } } @@ -701,7 +770,6 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) { return true; } - // isFPCSafeToPrint - Returns true if we may assume that CFP may be written out // textually as a double (rather than as a reference to a stack-allocated // variable). We decide this by converting CFP to a string and back into a @@ -712,7 +780,7 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) { // // TODO copied from CppBackend, new code should use raw_ostream -static inline std::string ftostr(const APFloat& V) { +static inline std::string ftostr(const APFloat &V) { std::string Buf; if (&V.getSemantics() == &APFloat::IEEEdouble()) { raw_string_ostream(Buf) << V.convertToDouble(); @@ -730,14 +798,13 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) { if (CFP->getType() != Type::getFloatTy(CFP->getContext()) && CFP->getType() != Type::getDoubleTy(CFP->getContext())) return false; - APFloat APF = APFloat(CFP->getValueAPF()); // copy + APFloat APF = APFloat(CFP->getValueAPF()); // copy if (CFP->getType() == Type::getFloatTy(CFP->getContext())) APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &ignored); #if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A char Buffer[100]; sprintf(Buffer, "%a", APF.convertToDouble()); - if (!strncmp(Buffer, "0x", 2) || - !strncmp(Buffer, "-0x", 3) || + if (!strncmp(Buffer, "0x", 2) || !strncmp(Buffer, "-0x", 3) || !strncmp(Buffer, "+0x", 3)) return APF.bitwiseIsEqual(APFloat(atof(Buffer))); return false; @@ -764,211 +831,249 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) { void CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) { // Print the destination type cast switch (opc) { - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::IntToPtr: - case Instruction::Trunc: - case Instruction::BitCast: - case Instruction::FPExt: - case Instruction::FPTrunc: // For these the 
DstTy sign doesn't matter - Out << '('; - printTypeName(Out, DstTy); - Out << ')'; - break; - case Instruction::ZExt: - case Instruction::PtrToInt: - case Instruction::FPToUI: // For these, make sure we get an unsigned dest - Out << '('; - printSimpleType(Out, DstTy, false); - Out << ')'; - break; - case Instruction::SExt: - case Instruction::FPToSI: // For these, make sure we get a signed dest - Out << '('; - printSimpleType(Out, DstTy, true); - Out << ')'; - break; - default: - llvm_unreachable("Invalid cast opcode"); + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::IntToPtr: + case Instruction::Trunc: + case Instruction::BitCast: + case Instruction::FPExt: + case Instruction::FPTrunc: // For these the DstTy sign doesn't matter + Out << '('; + printTypeName(Out, DstTy); + Out << ')'; + break; + case Instruction::ZExt: + case Instruction::PtrToInt: + case Instruction::FPToUI: // For these, make sure we get an unsigned dest + Out << '('; + printSimpleType(Out, DstTy, false); + Out << ')'; + break; + case Instruction::SExt: + case Instruction::FPToSI: // For these, make sure we get a signed dest + Out << '('; + printSimpleType(Out, DstTy, true); + Out << ')'; + break; + default: + llvm_unreachable("Invalid cast opcode"); } // Print the source type cast switch (opc) { - case Instruction::UIToFP: - case Instruction::ZExt: - Out << '('; - printSimpleType(Out, SrcTy, false); - Out << ')'; - break; - case Instruction::SIToFP: - case Instruction::SExt: - Out << '('; - printSimpleType(Out, SrcTy, true); - Out << ')'; - break; - case Instruction::IntToPtr: - case Instruction::PtrToInt: - // Avoid "cast to pointer from integer of different size" warnings - Out << "(uintptr_t)"; - break; - case Instruction::Trunc: - case Instruction::BitCast: - case Instruction::FPExt: - case Instruction::FPTrunc: - case Instruction::FPToSI: - case Instruction::FPToUI: - break; // These don't need a source cast. 
- default: - llvm_unreachable("Invalid cast opcode"); + case Instruction::UIToFP: + case Instruction::ZExt: + Out << '('; + printSimpleType(Out, SrcTy, false); + Out << ')'; + break; + case Instruction::SIToFP: + case Instruction::SExt: + Out << '('; + printSimpleType(Out, SrcTy, true); + Out << ')'; + break; + case Instruction::IntToPtr: + case Instruction::PtrToInt: + // Avoid "cast to pointer from integer of different size" warnings + Out << "(uintptr_t)"; + break; + case Instruction::Trunc: + case Instruction::BitCast: + case Instruction::FPExt: + case Instruction::FPTrunc: + case Instruction::FPToSI: + case Instruction::FPToUI: + break; // These don't need a source cast. + default: + llvm_unreachable("Invalid cast opcode"); } } // printConstant - The LLVM Constant to C Constant converter. void CWriter::printConstant(Constant *CPV, enum OperandContext Context) { if (ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) { - assert(CE->getType()->isIntegerTy() || CE->getType()->isFloatingPointTy() || CE->getType()->isPointerTy()); // TODO: VectorType are valid here, but not supported + assert(CE->getType()->isIntegerTy() || CE->getType()->isFloatingPointTy() || + CE->getType()->isPointerTy()); // TODO: VectorType are valid here, + // but not supported GetElementPtrInst *GEPI; switch (CE->getOpcode()) { - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - Out << "("; - printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType()); - if (CE->getOpcode() == Instruction::SExt && - CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) { - // Make sure we really sext from bool here by subtracting from 0 - Out << "0-"; - } - printConstant(CE->getOperand(0), ContextCasted); 
- if (CE->getType() == Type::getInt1Ty(CPV->getContext()) && - (CE->getOpcode() == Instruction::Trunc || - CE->getOpcode() == Instruction::FPToUI || - CE->getOpcode() == Instruction::FPToSI || - CE->getOpcode() == Instruction::PtrToInt)) { - // Make sure we really truncate to bool here by anding with 1 - Out << "&1u"; - } - Out << ')'; - return; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + Out << "("; + printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType()); + if (CE->getOpcode() == Instruction::SExt && + CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) { + // Make sure we really sext from bool here by subtracting from 0 + Out << "0-"; + } + printConstant(CE->getOperand(0), ContextCasted); + if (CE->getType() == Type::getInt1Ty(CPV->getContext()) && + (CE->getOpcode() == Instruction::Trunc || + CE->getOpcode() == Instruction::FPToUI || + CE->getOpcode() == Instruction::FPToSI || + CE->getOpcode() == Instruction::PtrToInt)) { + // Make sure we really truncate to bool here by anding with 1 + Out << "&1u"; + } + Out << ')'; + return; - case Instruction::GetElementPtr: - Out << "("; - //DEBUG(errs() << "\n----------\nCE: " << *CE << "\n"); - GEPI = dyn_cast<GetElementPtrInst>(CE->getAsInstruction()); - //DEBUG(errs() << "GEPI: " << *GEPI << "\n"); - printGEPExpression(CE->getOperand(0), gep_type_begin(CPV), gep_type_end(CPV), CE->getOperand(0)->getType()->isArrayTy(), GEPI); - delete(GEPI); - //DEBUG(errs() << "Deleted GEPI!\n"); - Out << ")"; - return; - case Instruction::Select: - Out << '('; - printConstant(CE->getOperand(0), ContextCasted); - Out << '?'; - printConstant(CE->getOperand(1), ContextNormal); - Out << ':'; - 
printConstant(CE->getOperand(2), ContextNormal); - Out << ')'; - return; + case Instruction::GetElementPtr: + Out << "("; + // DEBUG(errs() << "\n----------\nCE: " << *CE << "\n"); + GEPI = dyn_cast<GetElementPtrInst>(CE->getAsInstruction()); + // DEBUG(errs() << "GEPI: " << *GEPI << "\n"); + printGEPExpression(CE->getOperand(0), gep_type_begin(CPV), + gep_type_end(CPV), + CE->getOperand(0)->getType()->isArrayTy(), GEPI); + delete (GEPI); + // DEBUG(errs() << "Deleted GEPI!\n"); + Out << ")"; + return; + case Instruction::Select: + Out << '('; + printConstant(CE->getOperand(0), ContextCasted); + Out << '?'; + printConstant(CE->getOperand(1), ContextNormal); + Out << ':'; + printConstant(CE->getOperand(2), ContextNormal); + Out << ')'; + return; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: { + Out << '('; + bool NeedsClosingParens = printConstExprCast(CE); + printConstantWithCast(CE->getOperand(0), CE->getOpcode()); + switch (CE->getOpcode()) { case Instruction::Add: case Instruction::FAdd: + Out << " + "; + break; case Instruction::Sub: case Instruction::FSub: + Out << " - "; + break; case Instruction::Mul: case Instruction::FMul: - case Instruction::SDiv: - case Instruction::UDiv: - case Instruction::FDiv: + Out << " * "; + break; case Instruction::URem: case Instruction::SRem: case Instruction::FRem: + Out << " % "; + break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + Out << " / "; + break; case Instruction::And: + Out << " & "; + break; case Instruction::Or: + Out << " | "; + break; case 
Instruction::Xor: - case Instruction::ICmp: + Out << " ^ "; + break; case Instruction::Shl: + Out << " << "; + break; case Instruction::LShr: case Instruction::AShr: - { - Out << '('; - bool NeedsClosingParens = printConstExprCast(CE); - printConstantWithCast(CE->getOperand(0), CE->getOpcode()); - switch (CE->getOpcode()) { - case Instruction::Add: - case Instruction::FAdd: Out << " + "; break; - case Instruction::Sub: - case Instruction::FSub: Out << " - "; break; - case Instruction::Mul: - case Instruction::FMul: Out << " * "; break; - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: Out << " % "; break; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: Out << " / "; break; - case Instruction::And: Out << " & "; break; - case Instruction::Or: Out << " | "; break; - case Instruction::Xor: Out << " ^ "; break; - case Instruction::Shl: Out << " << "; break; - case Instruction::LShr: - case Instruction::AShr: Out << " >> "; break; - case Instruction::ICmp: - switch (CE->getPredicate()) { - case ICmpInst::ICMP_EQ: Out << " == "; break; - case ICmpInst::ICMP_NE: Out << " != "; break; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_ULT: Out << " < "; break; - case ICmpInst::ICMP_SLE: - case ICmpInst::ICMP_ULE: Out << " <= "; break; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_UGT: Out << " > "; break; - case ICmpInst::ICMP_SGE: - case ICmpInst::ICMP_UGE: Out << " >= "; break; - default: llvm_unreachable("Illegal ICmp predicate"); - } - break; - default: llvm_unreachable("Illegal opcode here!"); - } - printConstantWithCast(CE->getOperand(1), CE->getOpcode()); - if (NeedsClosingParens) - Out << "))"; - Out << ')'; - return; + Out << " >> "; + break; + case Instruction::ICmp: + switch (CE->getPredicate()) { + case ICmpInst::ICMP_EQ: + Out << " == "; + break; + case ICmpInst::ICMP_NE: + Out << " != "; + break; + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_ULT: + Out << " < "; + break; + case ICmpInst::ICMP_SLE: 
+ case ICmpInst::ICMP_ULE: + Out << " <= "; + break; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_UGT: + Out << " > "; + break; + case ICmpInst::ICMP_SGE: + case ICmpInst::ICMP_UGE: + Out << " >= "; + break; + default: + llvm_unreachable("Illegal ICmp predicate"); } - case Instruction::FCmp: { - Out << '('; - bool NeedsClosingParens = printConstExprCast(CE); - if (CE->getPredicate() == FCmpInst::FCMP_FALSE) - Out << "0"; - else if (CE->getPredicate() == FCmpInst::FCMP_TRUE) - Out << "1"; - else { - Out << "llvm_fcmp_" << getCmpPredicateName((CmpInst::Predicate)CE->getPredicate()) << "("; - printConstant(CE->getOperand(0), ContextCasted); - Out << ", "; - printConstant(CE->getOperand(1), ContextCasted); - Out << ")"; - } - if (NeedsClosingParens) - Out << "))"; - Out << ')'; - return; - } + break; default: + llvm_unreachable("Illegal opcode here!"); + } + printConstantWithCast(CE->getOperand(1), CE->getOpcode()); + if (NeedsClosingParens) + Out << "))"; + Out << ')'; + return; + } + case Instruction::FCmp: { + Out << '('; + bool NeedsClosingParens = printConstExprCast(CE); + if (CE->getPredicate() == FCmpInst::FCMP_FALSE) + Out << "0"; + else if (CE->getPredicate() == FCmpInst::FCMP_TRUE) + Out << "1"; + else { + Out << "llvm_fcmp_" + << getCmpPredicateName((CmpInst::Predicate)CE->getPredicate()) + << "("; + printConstant(CE->getOperand(0), ContextCasted); + Out << ", "; + printConstant(CE->getOperand(1), ContextCasted); + Out << ")"; + } + if (NeedsClosingParens) + Out << "))"; + Out << ')'; + return; + } + default: #ifndef NDEBUG - errs() << "CWriter Error: Unhandled constant expression: " - << *CE << "\n"; + errs() << "CWriter Error: Unhandled constant expression: " << *CE << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(0); } } else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType()) { if (CPV->getType()->isVectorTy()) { @@ -985,7 +1090,8 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) { Constant *Zero = 
Constant::getNullValue(VT->getElementType()); unsigned NumElts = VT->getNumElements(); for (unsigned i = 0; i != NumElts; ++i) { - if (i) Out << ", "; + if (i) + Out << ", "; printConstant(Zero, ContextCasted); } Out << ")"; @@ -999,9 +1105,10 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) { } if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { - Type* Ty = CI->getType(); + Type *Ty = CI->getType(); unsigned ActiveBits = CI->getValue().getMinSignedBits(); - // DEBUG(errs() << "Here: " << *CI << ", " << *Ty << ", " << ActiveBits << "\n"); + // DEBUG(errs() << "Here: " << *CI << ", " << *Ty << ", " << ActiveBits + // << "\n"); Out << CI->getSExtValue(); // if (Ty == Type::getInt1Ty(CPV->getContext())) { // Out << (CI->getZExtValue() ? '1' : '0'); @@ -1014,7 +1121,8 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) { // Out << CI->getSExtValue(); // most likely a shorter representation //// if (ActiveBits >= 32) //// Out << ")"; - // } else if (Ty->getPrimitiveSizeInBits() < 32 && Context == ContextNormal) { + // } else if (Ty->getPrimitiveSizeInBits() < 32 && Context == + // ContextNormal) { // Out << "(("; // printSimpleType(Out, Ty, false) << ')'; // if (CI->isMinValue(true)) @@ -1031,248 +1139,266 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) { //// const APInt &V = CI->getValue(); //// const APInt &Vlo = V.getLoBits(64); //// const APInt &Vhi = V.getHiBits(64); - //// Out << (Context == ContextStatic ? "UINT128_C" : "llvm_ctor_u128"); - //// Out << "(UINT64_C(" << Vhi.getZExtValue() << "), UINT64_C(" << Vlo.getZExtValue() << "))"; + //// Out << (Context == ContextStatic ? 
"UINT128_C" : + ///"llvm_ctor_u128"); / Out << "(UINT64_C(" << Vhi.getZExtValue() << + ///"), UINT64_C(" << Vlo.getZExtValue() << "))"; // } return; } switch (CPV->getType()->getTypeID()) { - case Type::FloatTyID: - case Type::DoubleTyID: - case Type::X86_FP80TyID: - case Type::PPC_FP128TyID: - case Type::FP128TyID: { - ConstantFP *FPC = cast<ConstantFP>(CPV); - std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC); - if (I != FPConstantMap.end()) { - // Because of FP precision problems we must load from a stack allocated - // value that holds the value in hex. - Out << "(*(" << (FPC->getType() == Type::getFloatTy(CPV->getContext()) ? - "float" : - FPC->getType() == Type::getDoubleTy(CPV->getContext()) ? - "double" : - "long double") - << "*)&FPConstant" << I->second << ')'; - } else { - double V; - if (FPC->getType() == Type::getFloatTy(CPV->getContext())) - V = FPC->getValueAPF().convertToFloat(); - else if (FPC->getType() == Type::getDoubleTy(CPV->getContext())) - V = FPC->getValueAPF().convertToDouble(); - else { - // Long double. Convert the number to double, discarding precision. - // This is not awesome, but it at least makes the CBE output somewhat - // useful. - APFloat Tmp = FPC->getValueAPF(); - bool LosesInfo; - Tmp.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo); - V = Tmp.convertToDouble(); - } - - if (std::isnan(V)) { - // The value is NaN - - // FIXME the actual NaN bits should be emitted. - // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN, - // it's 0x7ff4. 
- const unsigned long QuietNaN = 0x7ff8UL; - //const unsigned long SignalNaN = 0x7ff4UL; - - // We need to grab the first part of the FP # - char Buffer[100]; - - uint64_t ll = DoubleToBits(V); - sprintf(Buffer, "0x%llx", static_cast<long long>(ll)); - - std::string Num(&Buffer[0], &Buffer[6]); - unsigned long Val = strtoul(Num.c_str(), 0, 16); - - if (FPC->getType() == Type::getFloatTy(FPC->getContext())) - Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\"" - << Buffer << "\") /*nan*/ "; - else - Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\"" - << Buffer << "\") /*nan*/ "; - } else if (std::isinf(V)) { - // The value is Inf - if (V < 0) Out << '-'; - Out << "LLVM_INF" << - (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" : "") - << " /*inf*/ "; - } else { - std::string Num; + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::X86_FP80TyID: + case Type::PPC_FP128TyID: + case Type::FP128TyID: { + ConstantFP *FPC = cast<ConstantFP>(CPV); + std::map<const ConstantFP *, unsigned>::iterator I = + FPConstantMap.find(FPC); + if (I != FPConstantMap.end()) { + // Because of FP precision problems we must load from a stack allocated + // value that holds the value in hex. + Out << "(*(" + << (FPC->getType() == Type::getFloatTy(CPV->getContext()) + ? "float" + : FPC->getType() == Type::getDoubleTy(CPV->getContext()) + ? "double" + : "long double") + << "*)&FPConstant" << I->second << ')'; + } else { + double V; + if (FPC->getType() == Type::getFloatTy(CPV->getContext())) + V = FPC->getValueAPF().convertToFloat(); + else if (FPC->getType() == Type::getDoubleTy(CPV->getContext())) + V = FPC->getValueAPF().convertToDouble(); + else { + // Long double. Convert the number to double, discarding precision. + // This is not awesome, but it at least makes the CBE output somewhat + // useful. 
+ APFloat Tmp = FPC->getValueAPF(); + bool LosesInfo; + Tmp.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo); + V = Tmp.convertToDouble(); + } + + if (std::isnan(V)) { + // The value is NaN + + // FIXME the actual NaN bits should be emitted. + // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN, + // it's 0x7ff4. + const unsigned long QuietNaN = 0x7ff8UL; + // const unsigned long SignalNaN = 0x7ff4UL; + + // We need to grab the first part of the FP # + char Buffer[100]; + + uint64_t ll = DoubleToBits(V); + sprintf(Buffer, "0x%llx", static_cast<long long>(ll)); + + std::string Num(&Buffer[0], &Buffer[6]); + unsigned long Val = strtoul(Num.c_str(), 0, 16); + + if (FPC->getType() == Type::getFloatTy(FPC->getContext())) + Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\"" << Buffer + << "\") /*nan*/ "; + else + Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\"" << Buffer + << "\") /*nan*/ "; + } else if (std::isinf(V)) { + // The value is Inf + if (V < 0) + Out << '-'; + Out << "LLVM_INF" + << (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" + : "") + << " /*inf*/ "; + } else { + std::string Num; #if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A - // Print out the constant as a floating point number. - char Buffer[100]; - sprintf(Buffer, "%a", V); - Num = Buffer; + // Print out the constant as a floating point number. + char Buffer[100]; + sprintf(Buffer, "%a", V); + Num = Buffer; #else - Num = ftostr(FPC->getValueAPF()); -#endif - Out << Num; - } - } - break; - } - - case Type::ArrayTyID: { - if (printConstantString(CPV, Context)) break; - ArrayType *AT = cast<ArrayType>(CPV->getType()); - assert(AT->getNumElements() != 0 && !isEmptyType(AT)); - if (Context != ContextStatic) { - CtorDeclTypes.insert(AT); - Out << "llvm_ctor_"; - printTypeString(Out, AT, false); - Out << "("; - Context = ContextCasted; - } else { - Out << "{ { "; // Arrays are wrapped in struct types. 
- } - if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) { - printConstantArray(CA, Context); - } else if (ConstantDataSequential *CDS = - dyn_cast<ConstantDataSequential>(CPV)) { - printConstantDataSequential(CDS, Context); - } else { - assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)); - Constant *CZ = Constant::getNullValue(AT->getElementType()); - printConstant(CZ, Context); - for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) { - Out << ", "; - printConstant(CZ, Context); - } - } - Out << (Context == ContextStatic ? " } }" : ")"); // Arrays are wrapped in struct types. - break; - } - - case Type::VectorTyID: { - VectorType *VT = cast<VectorType>(CPV->getType()); - assert(VT->getNumElements() != 0 && !isEmptyType(VT)); - if (Context != ContextStatic) { - CtorDeclTypes.insert(VT); - Out << "llvm_ctor_"; - printTypeString(Out, VT, false); - Out << "("; - Context = ContextCasted; - } else { - Out << "{ "; - } - if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) { - printConstantVector(CV, Context); - } else if (ConstantDataSequential *CDS = - dyn_cast<ConstantDataSequential>(CPV)) { - printConstantDataSequential(CDS, Context); - } else { - assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)); - Constant *CZ = Constant::getNullValue(VT->getElementType()); - printConstant(CZ, Context); - for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) { - Out << ", "; - printConstant(CZ, Context); - } - } - Out << (Context == ContextStatic ? 
" }" : ")"); - break; - } - - case Type::StructTyID: { - StructType *ST = cast<StructType>(CPV->getType()); - assert(!isEmptyType(ST)); - if (Context != ContextStatic) { - CtorDeclTypes.insert(ST); - Out << "llvm_ctor_"; - printTypeString(Out, ST, false); - Out << "("; - Context = ContextCasted; - } else { - Out << "{ "; - } - - if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) { - bool printed = false; - for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Type *ElTy = ST->getElementType(i); - if (isEmptyType(ElTy)) continue; - if (printed) Out << ", "; - printConstant(Constant::getNullValue(ElTy), Context); - printed = true; - } - assert(printed); - } else { - bool printed = false; - for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) { - Constant *C = cast<Constant>(CPV->getOperand(i)); - if (isEmptyType(C->getType())) continue; - if (printed) Out << ", "; - printConstant(C, Context); - printed = true; - } - assert(printed); - } - Out << (Context == ContextStatic ? " }" : ")"); - break; - } - - case Type::PointerTyID: - if (isa<ConstantPointerNull>(CPV)) { - Out << "(("; - printTypeName(Out, CPV->getType()); // sign doesn't matter - Out << ")/*NULL*/0)"; - break; - } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) { - writeOperand(GV); - break; - } - // FALL THROUGH - default: -#ifndef NDEBUG - errs() << "Unknown constant type: " << *CPV << "\n"; + Num = ftostr(FPC->getValueAPF()); #endif - llvm_unreachable(0); + Out << Num; + } + } + break; } -} -// Some constant expressions need to be casted back to the original types -// because their operands were casted to the expected type. This function takes -// care of detecting that case and printing the cast for the ConstantExpr. 
-bool CWriter::printConstExprCast(ConstantExpr* CE) { - bool NeedsExplicitCast = false; - Type *Ty = CE->getOperand(0)->getType(); - bool TypeIsSigned = false; - switch (CE->getOpcode()) { - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - // We need to cast integer arithmetic so that it is always performed - // as unsigned, to avoid undefined behavior on overflow. - case Instruction::LShr: - case Instruction::URem: - case Instruction::UDiv: NeedsExplicitCast = true; break; - case Instruction::AShr: - case Instruction::SRem: - case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break; - case Instruction::SExt: - Ty = CE->getType(); - NeedsExplicitCast = true; - TypeIsSigned = true; - break; - case Instruction::ZExt: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - Ty = CE->getType(); - NeedsExplicitCast = true; - break; - default: break; + case Type::ArrayTyID: { + if (printConstantString(CPV, Context)) + break; + ArrayType *AT = cast<ArrayType>(CPV->getType()); + assert(AT->getNumElements() != 0 && !isEmptyType(AT)); + if (Context != ContextStatic) { + CtorDeclTypes.insert(AT); + Out << "llvm_ctor_"; + printTypeString(Out, AT, false); + Out << "("; + Context = ContextCasted; + } else { + Out << "{ { "; // Arrays are wrapped in struct types. 
+ } + if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) { + printConstantArray(CA, Context); + } else if (ConstantDataSequential *CDS = + dyn_cast<ConstantDataSequential>(CPV)) { + printConstantDataSequential(CDS, Context); + } else { + assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)); + Constant *CZ = Constant::getNullValue(AT->getElementType()); + printConstant(CZ, Context); + for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(CZ, Context); + } + } + Out << (Context == ContextStatic + ? " } }" + : ")"); // Arrays are wrapped in struct types. + break; + } + + case Type::VectorTyID: { + VectorType *VT = cast<VectorType>(CPV->getType()); + assert(VT->getNumElements() != 0 && !isEmptyType(VT)); + if (Context != ContextStatic) { + CtorDeclTypes.insert(VT); + Out << "llvm_ctor_"; + printTypeString(Out, VT, false); + Out << "("; + Context = ContextCasted; + } else { + Out << "{ "; + } + if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) { + printConstantVector(CV, Context); + } else if (ConstantDataSequential *CDS = + dyn_cast<ConstantDataSequential>(CPV)) { + printConstantDataSequential(CDS, Context); + } else { + assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)); + Constant *CZ = Constant::getNullValue(VT->getElementType()); + printConstant(CZ, Context); + for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(CZ, Context); + } + } + Out << (Context == ContextStatic ? 
" }" : ")"); + break; + } + + case Type::StructTyID: { + StructType *ST = cast<StructType>(CPV->getType()); + assert(!isEmptyType(ST)); + if (Context != ContextStatic) { + CtorDeclTypes.insert(ST); + Out << "llvm_ctor_"; + printTypeString(Out, ST, false); + Out << "("; + Context = ContextCasted; + } else { + Out << "{ "; + } + + if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) { + bool printed = false; + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Type *ElTy = ST->getElementType(i); + if (isEmptyType(ElTy)) + continue; + if (printed) + Out << ", "; + printConstant(Constant::getNullValue(ElTy), Context); + printed = true; + } + assert(printed); + } else { + bool printed = false; + for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) { + Constant *C = cast<Constant>(CPV->getOperand(i)); + if (isEmptyType(C->getType())) + continue; + if (printed) + Out << ", "; + printConstant(C, Context); + printed = true; + } + assert(printed); + } + Out << (Context == ContextStatic ? " }" : ")"); + break; + } + + case Type::PointerTyID: + if (isa<ConstantPointerNull>(CPV)) { + Out << "(("; + printTypeName(Out, CPV->getType()); // sign doesn't matter + Out << ")/*NULL*/0)"; + break; + } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) { + writeOperand(GV); + break; + } + // FALL THROUGH + default: +#ifndef NDEBUG + errs() << "Unknown constant type: " << *CPV << "\n"; +#endif + llvm_unreachable(0); + } +} + +// Some constant expressions need to be casted back to the original types +// because their operands were casted to the expected type. This function takes +// care of detecting that case and printing the cast for the ConstantExpr. 
+bool CWriter::printConstExprCast(ConstantExpr *CE) { + bool NeedsExplicitCast = false; + Type *Ty = CE->getOperand(0)->getType(); + bool TypeIsSigned = false; + switch (CE->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // We need to cast integer arithmetic so that it is always performed + // as unsigned, to avoid undefined behavior on overflow. + case Instruction::LShr: + case Instruction::URem: + case Instruction::UDiv: + NeedsExplicitCast = true; + break; + case Instruction::AShr: + case Instruction::SRem: + case Instruction::SDiv: + NeedsExplicitCast = true; + TypeIsSigned = true; + break; + case Instruction::SExt: + Ty = CE->getType(); + NeedsExplicitCast = true; + TypeIsSigned = true; + break; + case Instruction::ZExt: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + Ty = CE->getType(); + NeedsExplicitCast = true; + break; + default: + break; } if (NeedsExplicitCast) { Out << "(("; @@ -1285,11 +1411,13 @@ bool CWriter::printConstExprCast(ConstantExpr* CE) { // Print a constant assuming that it is the operand for a given Opcode. The // opcodes that care about sign need to cast their operands to the expected // type before the operation proceeds. This function does the casting. -void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) { +void CWriter::printConstantWithCast(Constant *CPV, unsigned Opcode) { // Extract the operand's type, we'll need it. 
- Type* OpTy = CPV->getType(); - assert(OpTy->isIntegerTy() || OpTy->isFloatingPointTy()); // TODO: VectorType are valid here, but not supported + Type *OpTy = CPV->getType(); + assert(OpTy->isIntegerTy() || + OpTy->isFloatingPointTy()); // TODO: VectorType are valid here, but not + // supported // Indicate whether to do the cast or not. bool shouldCast; @@ -1309,7 +1437,7 @@ void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) { } std::string CWriter::GetValueName(Value *Operand) { - //DEBUG(errs() << "In getvaluename: " << *Operand << "\n"); + // DEBUG(errs() << "In getvaluename: " << *Operand << "\n"); // Resolve potential alias. if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Operand)) { @@ -1332,8 +1460,7 @@ std::string CWriter::GetValueName(Value *Operand) { std::string VarName; VarName.reserve(Name.capacity()); - for (std::string::iterator I = Name.begin(), E = Name.end(); - I != E; ++I) { + for (std::string::iterator I = Name.begin(), E = Name.end(); I != E; ++I) { unsigned char ch = *I; if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || @@ -1357,7 +1484,7 @@ void CWriter::writeInstComputationInline(Instruction &I) { unsigned mask = 0; Type *Ty = I.getType(); if (Ty->isIntegerTy()) { - IntegerType *ITy = static_cast<IntegerType*>(Ty); + IntegerType *ITy = static_cast<IntegerType *>(Ty); if (!ITy->isPowerOf2ByteWidth()) mask = ITy->getBitMask(); } @@ -1375,20 +1502,20 @@ void CWriter::writeInstComputationInline(Instruction &I) { Out << ")&" << mask << ")"; } - -void CWriter::writeOperandInternal(Value *Operand, enum OperandContext Context) { - //DEBUG(errs() << "In write operand internal: " << *Operand << "\n"); +void CWriter::writeOperandInternal(Value *Operand, + enum OperandContext Context) { + // DEBUG(errs() << "In write operand internal: " << *Operand << "\n"); if (Instruction *I = dyn_cast<Instruction>(Operand)) // Should we inline this instruction to build a tree? 
if (isInlinableInst(*I) && !isDirectAlloca(I)) { - //DEBUG(errs() << "isInlinableInst & NOT isDirectAlloca\n" << "\n"); + // DEBUG(errs() << "isInlinableInst & NOT isDirectAlloca\n" << "\n"); Out << '('; writeInstComputationInline(*I); Out << ')'; return; } - Constant* CPV = dyn_cast<Constant>(Operand); + Constant *CPV = dyn_cast<Constant>(Operand); if (CPV && !isa<GlobalValue>(CPV)) printConstant(CPV, Context); @@ -1396,12 +1523,14 @@ void CWriter::writeOperandInternal(Value *Operand, enum OperandContext Context) Out << GetValueName(Operand); } -void CWriter::writeOperand(Value *Operand, enum OperandContext Context, bool arrayAccess) { - //DEBUG(errs() << "In write operand: " << *Operand << "; ArrayAccess = " << arrayAccess << "\n"); +void CWriter::writeOperand(Value *Operand, enum OperandContext Context, + bool arrayAccess) { + // DEBUG(errs() << "In write operand: " << *Operand << "; ArrayAccess = " << + // arrayAccess << "\n"); bool isAddressImplicit = isAddressExposed(Operand); if (isAddressImplicit && !arrayAccess) { DEBUG(errs() << "isAddressImplicit & NOT arrayAccess!\n"); - Out << "(&"; // Global variables are referenced as their addresses by llvm + Out << "(&"; // Global variables are referenced as their addresses by llvm } writeOperandInternal(Operand, Context); @@ -1430,26 +1559,27 @@ void CWriter::writeOperandDeref(Value *Operand) { bool CWriter::writeInstructionCast(Instruction &I) { Type *Ty = I.getOperand(0)->getType(); switch (I.getOpcode()) { - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - // We need to cast integer arithmetic so that it is always performed - // as unsigned, to avoid undefined behavior on overflow. 
- case Instruction::LShr: - case Instruction::URem: - case Instruction::UDiv: - Out << "(("; - printSimpleType(Out, Ty, false); - Out << ")("; - return true; - case Instruction::AShr: - case Instruction::SRem: - case Instruction::SDiv: - Out << "(("; - printSimpleType(Out, Ty, true); - Out << ")("; - return true; - default: break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // We need to cast integer arithmetic so that it is always performed + // as unsigned, to avoid undefined behavior on overflow. + case Instruction::LShr: + case Instruction::URem: + case Instruction::UDiv: + Out << "(("; + printSimpleType(Out, Ty, false); + Out << ")("; + return true; + case Instruction::AShr: + case Instruction::SRem: + case Instruction::SDiv: + Out << "(("; + printSimpleType(Out, Ty, true); + Out << ")("; + return true; + default: + break; } return false; } @@ -1457,7 +1587,8 @@ bool CWriter::writeInstructionCast(Instruction &I) { // Write the operand with a cast to another type based on the Opcode being used. // This will be used in cases where an instruction has specific type // requirements (usually signedness) for its operands. -void CWriter::opcodeNeedsCast(unsigned Opcode, +void CWriter::opcodeNeedsCast( + unsigned Opcode, // Indicate whether to do the cast or not. bool &shouldCast, // Indicate whether the cast should be to a signed type or not. @@ -1467,33 +1598,33 @@ void CWriter::opcodeNeedsCast(unsigned Opcode, // the new type to which the operand should be casted by setting the value // of OpTy. If we change OpTy, also set shouldCast to true. switch (Opcode) { - default: - // for most instructions, it doesn't matter - shouldCast = false; - castIsSigned = false; - break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - // We need to cast integer arithmetic so that it is always performed - // as unsigned, to avoid undefined behavior on overflow. 
- case Instruction::LShr: - case Instruction::UDiv: - case Instruction::URem: // Cast to unsigned first - shouldCast = true; - castIsSigned = false; - break; - case Instruction::GetElementPtr: - case Instruction::AShr: - case Instruction::SDiv: - case Instruction::SRem: // Cast to signed first - shouldCast = true; - castIsSigned = true; - break; + default: + // for most instructions, it doesn't matter + shouldCast = false; + castIsSigned = false; + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // We need to cast integer arithmetic so that it is always performed + // as unsigned, to avoid undefined behavior on overflow. + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::URem: // Cast to unsigned first + shouldCast = true; + castIsSigned = false; + break; + case Instruction::GetElementPtr: + case Instruction::AShr: + case Instruction::SDiv: + case Instruction::SRem: // Cast to signed first + shouldCast = true; + castIsSigned = true; + break; } } -void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) { +void CWriter::writeOperandWithCast(Value *Operand, unsigned Opcode) { // DEBUG(errs() << "Here: " << *Operand << "\n"); // Write out the casted operand if we should, otherwise just write the // operand. @@ -1511,12 +1642,12 @@ void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) { // writeOperand(Operand, ContextCasted); // Out << ")"; // } else - writeOperand(Operand, ContextNormal/*ContextCasted*/); + writeOperand(Operand, ContextNormal /*ContextCasted*/); } // Write the operand with a cast to another type based on the icmp predicate // being used. -void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) { +void CWriter::writeOperandWithCast(Value *Operand, ICmpInst &Cmp) { // This has to do a cast to ensure the operand has the right signedness. 
// Also, if the operand is a pointer, we make sure to cast to an integer when // doing the comparison both for signedness and so that the C compiler doesn't @@ -1535,7 +1666,7 @@ void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) { bool castIsSigned = Cmp.isSigned(); // If the operand was a pointer, convert to a large integer type. - Type* OpTy = Operand->getType(); + Type *OpTy = Operand->getType(); if (OpTy->isPointerTy()) OpTy = TD->getIntPtrType(Operand->getContext()); @@ -1549,61 +1680,64 @@ void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) { // generateCompilerSpecificCode - This is where we add conditional compilation // directives to cater to specific compilers as need be. // -static void generateCompilerSpecificCode(raw_ostream& Out, - const DataLayout *TD) { +static void generateCompilerSpecificCode(raw_ostream &Out, + const DataLayout *TD) { // Alloca is hard to get, and we don't want to include stdlib.h here. Out << "/* get a declaration for alloca */\n" - << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n" - << "#define alloca(x) __builtin_alloca((x))\n" - << "#define _alloca(x) __builtin_alloca((x))\n" - << "#elif defined(__APPLE__)\n" - << "extern void *__builtin_alloca(unsigned long);\n" - << "#define alloca(x) __builtin_alloca(x)\n" - << "#define longjmp _longjmp\n" - << "#define setjmp _setjmp\n" - << "#elif defined(__sun__)\n" - << "#if defined(__sparcv9)\n" - << "extern void *__builtin_alloca(unsigned long);\n" - << "#else\n" - << "extern void *__builtin_alloca(unsigned int);\n" - << "#endif\n" - << "#define alloca(x) __builtin_alloca(x)\n" - << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n" - << "#define alloca(x) __builtin_alloca(x)\n" - << "#elif defined(_MSC_VER)\n" - << "#define alloca(x) _alloca(x)\n" - << "#else\n" - << "#include <alloca.h>\n" - << "#endif\n\n"; + << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n" + << 
"#define alloca(x) __builtin_alloca((x))\n" + << "#define _alloca(x) __builtin_alloca((x))\n" + << "#elif defined(__APPLE__)\n" + << "extern void *__builtin_alloca(unsigned long);\n" + << "#define alloca(x) __builtin_alloca(x)\n" + << "#define longjmp _longjmp\n" + << "#define setjmp _setjmp\n" + << "#elif defined(__sun__)\n" + << "#if defined(__sparcv9)\n" + << "extern void *__builtin_alloca(unsigned long);\n" + << "#else\n" + << "extern void *__builtin_alloca(unsigned int);\n" + << "#endif\n" + << "#define alloca(x) __builtin_alloca(x)\n" + << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || " + "defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n" + << "#define alloca(x) __builtin_alloca(x)\n" + << "#elif defined(_MSC_VER)\n" + << "#define alloca(x) _alloca(x)\n" + << "#else\n" + << "#include <alloca.h>\n" + << "#endif\n\n"; // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))". Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" - << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n" - << "#elif defined(__GNUC__)\n" - << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n" - << "#else\n" - << "#define __EXTERNAL_WEAK__\n" - << "#endif\n\n"; + << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n" + << "#elif defined(__GNUC__)\n" + << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n" + << "#else\n" + << "#define __EXTERNAL_WEAK__\n" + << "#endif\n\n"; // For now, turn off the weak linkage attribute on Mac OS X. (See above.) 
Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" - << "#define __ATTRIBUTE_WEAK__\n" - << "#elif defined(__GNUC__)\n" - << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n" - << "#else\n" - << "#define __ATTRIBUTE_WEAK__\n" - << "#endif\n\n"; + << "#define __ATTRIBUTE_WEAK__\n" + << "#elif defined(__GNUC__)\n" + << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n" + << "#else\n" + << "#define __ATTRIBUTE_WEAK__\n" + << "#endif\n\n"; // Add hidden visibility support. FIXME: APPLE_CC? Out << "#if defined(__GNUC__)\n" - << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n" - << "#endif\n\n"; + << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n" + << "#endif\n\n"; // Define unaligned-load helper macro Out << "#ifdef _MSC_VER\n"; - Out << "#define __UNALIGNED_LOAD__(type, align, op) *((type __unaligned*)op)\n"; + Out << "#define __UNALIGNED_LOAD__(type, align, op) *((type " + "__unaligned*)op)\n"; Out << "#else\n"; - Out << "#define __UNALIGNED_LOAD__(type, align, op) ((struct { type data __attribute__((packed, aligned(align))); }*)op)->data\n"; + Out << "#define __UNALIGNED_LOAD__(type, align, op) ((struct { type data " + "__attribute__((packed, aligned(align))); }*)op)->data\n"; Out << "#endif\n\n"; // Define unaligned-load helper macro @@ -1654,110 +1788,144 @@ static void generateCompilerSpecificCode(raw_ostream& Out, // // Similar to __builtin_inf, except the return type is float. 
Out << "#ifdef __GNUC__\n" - << "#define LLVM_NAN(NanStr) __builtin_nan(NanStr) /* Double */\n" - << "#define LLVM_NANF(NanStr) __builtin_nanf(NanStr) /* Float */\n" - //<< "#define LLVM_NANS(NanStr) __builtin_nans(NanStr) /* Double */\n" - //<< "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n" - << "#define LLVM_INF __builtin_inf() /* Double */\n" - << "#define LLVM_INFF __builtin_inff() /* Float */\n" - << "#define LLVM_PREFETCH(addr,rw,locality) " - "__builtin_prefetch(addr,rw,locality)\n" - << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n" - << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n" - << "#else\n" - << "#define LLVM_NAN(NanStr) ((double)NAN) /* Double */\n" - << "#define LLVM_NANF(NanStr) ((float)NAN)) /* Float */\n" - //<< "#define LLVM_NANS(NanStr) ((double)NAN) /* Double */\n" - //<< "#define LLVM_NANSF(NanStr) ((single)NAN) /* Float */\n" - << "#define LLVM_INF ((double)INFINITY) /* Double */\n" - << "#define LLVM_INFF ((float)INFINITY) /* Float */\n" - << "#define LLVM_PREFETCH(addr,rw,locality) /* PREFETCH */\n" - << "#define __ATTRIBUTE_CTOR__ \"__attribute__((constructor)) not supported on this compiler\"\n" - << "#define __ATTRIBUTE_DTOR__ \"__attribute__((destructor)) not supported on this compiler\"\n" - << "#endif\n\n"; - - Out << "#if !defined(__GNUC__) || __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ \n" - << "#define __builtin_stack_save() 0 /* not implemented */\n" - << "#define __builtin_stack_restore(X) /* noop */\n" - << "#endif\n\n"; + << "#define LLVM_NAN(NanStr) __builtin_nan(NanStr) /* Double */\n" + << "#define LLVM_NANF(NanStr) __builtin_nanf(NanStr) /* Float */\n" + //<< "#define LLVM_NANS(NanStr) __builtin_nans(NanStr) /* Double */\n" + //<< "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n" + << "#define LLVM_INF __builtin_inf() /* Double */\n" + << "#define LLVM_INFF __builtin_inff() /* Float */\n" + << "#define LLVM_PREFETCH(addr,rw,locality) " + 
"__builtin_prefetch(addr,rw,locality)\n" + << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n" + << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n" + << "#else\n" + << "#define LLVM_NAN(NanStr) ((double)NAN) /* Double */\n" + << "#define LLVM_NANF(NanStr) ((float)NAN)) /* Float */\n" + //<< "#define LLVM_NANS(NanStr) ((double)NAN) /* Double */\n" + //<< "#define LLVM_NANSF(NanStr) ((single)NAN) /* Float */\n" + << "#define LLVM_INF ((double)INFINITY) /* Double */\n" + << "#define LLVM_INFF ((float)INFINITY) /* Float */\n" + << "#define LLVM_PREFETCH(addr,rw,locality) /* PREFETCH */\n" + << "#define __ATTRIBUTE_CTOR__ \"__attribute__((constructor)) not " + "supported on this compiler\"\n" + << "#define __ATTRIBUTE_DTOR__ \"__attribute__((destructor)) not " + "supported on this compiler\"\n" + << "#endif\n\n"; + + Out << "#if !defined(__GNUC__) || __GNUC__ < 4 /* Old GCC's, or compilers " + "not GCC */ \n" + << "#define __builtin_stack_save() 0 /* not implemented */\n" + << "#define __builtin_stack_restore(X) /* noop */\n" + << "#endif\n\n"; // Output typedefs for 128-bit integers - Out << "#if defined(__GNUC__) && defined(__LP64__) /* 128-bit integer types */\n" - << "typedef int __attribute__((mode(TI))) int128_t;\n" - << "typedef unsigned __attribute__((mode(TI))) uint128_t;\n" - << "#define UINT128_C(hi, lo) (((uint128_t)(hi) << 64) | (uint128_t)(lo))\n" - << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {" - << " return UINT128_C(hi, lo); }\n" - << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t r) {" - << " return l == r; }\n" - << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t r) {" - << " return l != r; }\n" - << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t r) {" - << " return l <= r; }\n" - << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) {" - << " return l <= r; }\n" - << "static __forceinline bool llvm_icmp_uge_u128(uint128_t 
l, uint128_t r) {" - << " return l >= r; }\n" - << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) {" - << " return l >= r; }\n" - << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t r) {" - << " return l < r; }\n" - << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) {" - << " return l < r; }\n" - << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t r) {" - << " return l > r; }\n" - << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) {" - << " return l > r; }\n" - - << "#else /* manual 128-bit types */\n" - // TODO: field order should be reversed for big-endian - << "typedef struct { ulong lo; ulong hi; } uint128_t;\n" - << "typedef uint128_t int128_t;\n" - << "#define UINT128_C(hi, lo) {(lo), (hi)}\n" // only use in Static context - << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {" - << " uint128_t r; r.lo = lo; r.hi = hi; return r; }\n" - << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t r) {" - << " return l.hi == r.hi && l.lo == r.lo; }\n" - << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t r) {" - << " return l.hi != r.hi || l.lo != r.lo; }\n" - << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t r) {" - << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo <= l.lo : 0); }\n" - << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) {" - << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo <= (long)l.lo : 0); }\n" - << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t r) {" - << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo >= l.hi : 0); }\n" - << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) {" - << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo >= (long)l.lo : 0); }\n" - << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t r) {" - << " return l.hi < r.hi ? 
1 : (l.hi == r.hi ? l.lo < l.hi : 0); }\n" - << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) {" - << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo < (long)l.lo : 0); }\n" - << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t r) {" - << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo > l.hi : 0); }\n" - << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) {" - << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo > (long)l.lo : 0); }\n" - << "#define __emulate_i128\n" - << "#endif\n\n"; + Out << "#if defined(__GNUC__) && defined(__LP64__) /* 128-bit integer types " + "*/\n" + << "typedef int __attribute__((mode(TI))) int128_t;\n" + << "typedef unsigned __attribute__((mode(TI))) uint128_t;\n" + << "#define UINT128_C(hi, lo) (((uint128_t)(hi) << 64) | " + "(uint128_t)(lo))\n" + << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {" + << " return UINT128_C(hi, lo); }\n" + << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t " + "r) {" + << " return l == r; }\n" + << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t " + "r) {" + << " return l != r; }\n" + << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t " + "r) {" + << " return l <= r; }\n" + << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) " + "{" + << " return l <= r; }\n" + << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t " + "r) {" + << " return l >= r; }\n" + << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) " + "{" + << " return l >= r; }\n" + << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t " + "r) {" + << " return l < r; }\n" + << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) " + "{" + << " return l < r; }\n" + << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t " + "r) {" + << " return l > r; }\n" + << 
"static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) " + "{" + << " return l > r; }\n" + + << "#else /* manual 128-bit types */\n" + // TODO: field order should be reversed for big-endian + << "typedef struct { ulong lo; ulong hi; } uint128_t;\n" + << "typedef uint128_t int128_t;\n" + << "#define UINT128_C(hi, lo) {(lo), (hi)}\n" // only use in Static + // context + << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {" + << " uint128_t r; r.lo = lo; r.hi = hi; return r; }\n" + << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t " + "r) {" + << " return l.hi == r.hi && l.lo == r.lo; }\n" + << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t " + "r) {" + << " return l.hi != r.hi || l.lo != r.lo; }\n" + << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t " + "r) {" + << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo <= l.lo : 0); }\n" + << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) " + "{" + << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo <= " + "(long)l.lo : 0); }\n" + << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t " + "r) {" + << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo >= l.hi : 0); }\n" + << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) " + "{" + << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo >= " + "(long)l.lo : 0); }\n" + << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t " + "r) {" + << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo < l.hi : 0); }\n" + << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) " + "{" + << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo < " + "(long)l.lo : 0); }\n" + << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t " + "r) {" + << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? 
l.lo > l.hi : 0); }\n" + << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) " + "{" + << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo > " + "(long)l.lo : 0); }\n" + << "#define __emulate_i128\n" + << "#endif\n\n"; // We output GCC specific attributes to preserve 'linkonce'ness on globals. // If we aren't being compiled with GCC, just drop these attributes. Out << "#ifdef _MSC_VER /* Can only support \"linkonce\" vars with GCC */\n" - << "#define __attribute__(X)\n" - << "#endif\n\n"; + << "#define __attribute__(X)\n" + << "#endif\n\n"; } /// FindStaticTors - Given a static ctor/dtor list, unpack its contents into /// the StaticTors set. -static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){ +static void FindStaticTors(GlobalVariable *GV, + std::set<Function *> &StaticTors) { ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer()); - if (!InitList) return; + if (!InitList) + return; for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) - if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){ - if (CS->getNumOperands() != 2) return; // Not array of 2-element structs. + if (ConstantStruct *CS = + dyn_cast<ConstantStruct>(InitList->getOperand(i))) { + if (CS->getNumOperands() != 2) + return; // Not array of 2-element structs. if (CS->getOperand(1)->isNullValue()) - return; // Found a null terminator, exit printing. + return; // Found a null terminator, exit printing. Constant *FP = CS->getOperand(1); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP)) if (CE->isCast()) @@ -1769,7 +1937,8 @@ static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){ enum SpecialGlobalClass { NotSpecial = 0, - GlobalCtors, GlobalDtors, + GlobalCtors, + GlobalDtors, NotPrinted }; @@ -1786,9 +1955,8 @@ static SpecialGlobalClass getGlobalVariableClass(GlobalVariable *GV) { // Otherwise, if it is other metadata, don't print it. 
This catches things // like debug information. - if (StringRef(GV->getSection()) == "llvm.metadata") - { - //DEBUG(errs() << "Printing Metada!\n" << *GV << "\n"); + if (StringRef(GV->getSection()) == "llvm.metadata") { + // DEBUG(errs() << "Printing Metada!\n" << *GV << "\n"); return NotPrinted; } return NotSpecial; @@ -1797,7 +1965,7 @@ static SpecialGlobalClass getGlobalVariableClass(GlobalVariable *GV) { // PrintEscapedString - Print each character of the specified string, escaping // it if it is not printable or if it is an escape char. static void PrintEscapedString(const char *Str, unsigned Length, - raw_ostream &Out) { + raw_ostream &Out) { for (unsigned i = 0; i != Length; ++i) { unsigned char C = Str[i]; if (isprint(C) && C != '\\' && C != '"') @@ -1824,9 +1992,10 @@ bool CWriter::doInitialization(Module &M) { TD = new DataLayout(&M); IL = new IntrinsicLowering(*TD); - // CHECK: Looking at lib/CodeGen/IntrinsicsLowering.cpp this func not supported - // This func creates defs which are created once each call is referenced anyway - //IL->AddPrototypes(M); + // CHECK: Looking at lib/CodeGen/IntrinsicsLowering.cpp this func not + // supported This func creates defs which are created once each call is + // referenced anyway + // IL->AddPrototypes(M); #if 0 std::string Triple = TheModule->getTargetTriple(); @@ -1838,7 +2007,7 @@ bool CWriter::doInitialization(Module &M) { TAsm = Match->createMCAsmInfo(Triple); #endif TAsm = new CBEMCAsmInfo(); - MRI = new MCRegisterInfo(); + MRI = new MCRegisterInfo(); TCtx = new MCContext(TAsm, MRI, NULL); return false; } @@ -1884,17 +2053,18 @@ bool CWriter::doFinalization(Module &M) { void CWriter::generateHeader(Module &M) { // Keep track of which functions are static ctors/dtors so they can have // an attribute added to their prototypes. 
- std::set<Function*> StaticCtors, StaticDtors; - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { + std::set<Function *> StaticCtors, StaticDtors; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; + ++I) { switch (getGlobalVariableClass(&*I)) { - default: break; - case GlobalCtors: - FindStaticTors(&*I, StaticCtors); - break; - case GlobalDtors: - FindStaticTors(&*I, StaticDtors); - break; + default: + break; + case GlobalCtors: + FindStaticTors(&*I, StaticCtors); + break; + case GlobalDtors: + FindStaticTors(&*I, StaticDtors); + break; } } @@ -1904,8 +2074,9 @@ void CWriter::generateHeader(Module &M) { // Out << "#include <setjmp.h>\n"; // Unwind support // Out << "#include <limits.h>\n"; // With overflow intrinsics support. // Out << "#include <stdint.h>\n"; // Sized integer support - // Out << "#include <math.h>\n"; // definitions for some math functions and numeric constants - // Out << "#include <APInt-C.h>\n"; // Implementations of many llvm intrinsics + // Out << "#include <math.h>\n"; // definitions for some math + // functions and numeric constants Out << "#include <APInt-C.h>\n"; // + // Implementations of many llvm intrinsics // // Provide a definition for `bool' if not compiling with a C++ compiler. // Out << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n"; // Out << "\n"; @@ -1913,24 +2084,24 @@ void CWriter::generateHeader(Module &M) { // generateCompilerSpecificCode(Out, TD); Out << "\n\n/* Support for floating point constants */\n" - << "typedef ulong ConstantDoubleTy;\n" - << "typedef uint ConstantFloatTy;\n" - << "typedef struct { ulong f1; ushort f2; " - "ushort pad[3]; } ConstantFP80Ty;\n" - // This is used for both kinds of 128-bit long double; meaning differs. 
- << "typedef struct { ulong f1; ulong f2; }" - " ConstantFP128Ty;\n" - << "\n\n/* OpenCL Pragmas */\n" - << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n" - << "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n" - << "\n\n/* Global Declarations */\n"; + << "typedef ulong ConstantDoubleTy;\n" + << "typedef uint ConstantFloatTy;\n" + << "typedef struct { ulong f1; ushort f2; " + "ushort pad[3]; } ConstantFP80Ty;\n" + // This is used for both kinds of 128-bit long double; meaning differs. + << "typedef struct { ulong f1; ulong f2; }" + " ConstantFP128Ty;\n" + << "\n\n/* OpenCL Pragmas */\n" + << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n" + << "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n" + << "\n\n/* Global Declarations */\n"; // First output all the declarations for the program, because C requires // Functions & globals to be declared before they are used. if (!M.getModuleInlineAsm().empty()) { Out << "\n/* Module asm statements */\n" - << "__asm__ ("; + << "__asm__ ("; // Split the string into lines, to make it easier to read the .ll file. std::string Asm = M.getModuleInlineAsm(); @@ -1940,22 +2111,22 @@ void CWriter::generateHeader(Module &M) { // We found a newline, print the portion of the asm string from the // last newline up to this newline. 
Out << "\""; - PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine), - Out); + PrintEscapedString( + std::string(Asm.begin() + CurPos, Asm.begin() + NewLine), Out); Out << "\\n\"\n"; - CurPos = NewLine+1; + CurPos = NewLine + 1; NewLine = Asm.find_first_of('\n', CurPos); } Out << "\""; - PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out); + PrintEscapedString(std::string(Asm.begin() + CurPos, Asm.end()), Out); Out << "\");\n" - << "/* End Module asm statements */\n"; + << "/* End Module asm statements */\n"; } // collect any remaining types raw_null_ostream NullOut; - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; + ++I) { // Ignore special globals, such as debug info. if (getGlobalVariableClass(&*I)) continue; @@ -1967,8 +2138,9 @@ void CWriter::generateHeader(Module &M) { if (!M.global_empty()) { Out << "\n/* External Global Variable Declarations */\n"; for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - if (!I->isDeclaration() || isEmptyType(I->getType()->getPointerElementType())) + I != E; ++I) { + if (!I->isDeclaration() || + isEmptyType(I->getType()->getPointerElementType())) continue; if (I->hasDLLImportStorageClass()) @@ -1988,8 +2160,8 @@ void CWriter::generateHeader(Module &M) { Type *ElTy = I->getType()->getElementType(); unsigned Alignment = I->getAlignment(); - bool IsOveraligned = Alignment && - Alignment > TD->getABITypeAlignment(ElTy); + bool IsOveraligned = + Alignment && Alignment > TD->getABITypeAlignment(ElTy); // if (IsOveraligned) // Out << "__MSALIGN__(" << Alignment << ") "; printTypeName(Out, ElTy, false) << ' ' << GetValueName(&*I); @@ -2006,64 +2178,53 @@ void CWriter::generateHeader(Module &M) { Out << "\n/* Function Declarations */\n"; // Store the intrinsics which will be declared/defined below. 
- SmallVector<Function*, 16> intrinsicsToDefine; + SmallVector<Function *, 16> intrinsicsToDefine; for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { // Don't print declarations for intrinsic functions. // Store the used intrinsics, which need to be explicitly defined. if (I->isIntrinsic()) { switch (I->getIntrinsicID()) { - default: - continue; - case Intrinsic::uadd_with_overflow: - case Intrinsic::sadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::bswap: - case Intrinsic::ceil: - case Intrinsic::ctlz: - case Intrinsic::ctpop: - case Intrinsic::cttz: - case Intrinsic::fabs: - case Intrinsic::floor: - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::pow: - case Intrinsic::powi: - case Intrinsic::rint: - case Intrinsic::sqrt: - case Intrinsic::trunc: - intrinsicsToDefine.push_back(&*I); - continue; + default: + continue; + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::bswap: + case Intrinsic::ceil: + case Intrinsic::ctlz: + case Intrinsic::ctpop: + case Intrinsic::cttz: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::fma: + case Intrinsic::fmuladd: + case Intrinsic::pow: + case Intrinsic::powi: + case Intrinsic::rint: + case Intrinsic::sqrt: + case Intrinsic::trunc: + intrinsicsToDefine.push_back(&*I); + continue; } } // Skip a few functions that have already been defined in headers - if (I->getName() == "setjmp" || - I->getName() == "longjmp" || - I->getName() == "_setjmp" || - I->getName() == "siglongjmp" || - I->getName() == "sigsetjmp" || - I->getName() == "pow" || - I->getName() == "powf" || - I->getName() == "sqrt" || - I->getName() == "sqrtf" || - I->getName() == "trunc" || - 
I->getName() == "truncf" || - I->getName() == "rint" || - I->getName() == "rintf" || - I->getName() == "floor" || - I->getName() == "floorf" || - I->getName() == "ceil" || - I->getName() == "ceilf" || - I->getName() == "alloca" || - I->getName() == "_alloca" || - I->getName() == "_chkstk" || - I->getName() == "__chkstk" || - I->getName() == "___chkstk_ms") - continue; + if (I->getName() == "setjmp" || I->getName() == "longjmp" || + I->getName() == "_setjmp" || I->getName() == "siglongjmp" || + I->getName() == "sigsetjmp" || I->getName() == "pow" || + I->getName() == "powf" || I->getName() == "sqrt" || + I->getName() == "sqrtf" || I->getName() == "trunc" || + I->getName() == "truncf" || I->getName() == "rint" || + I->getName() == "rintf" || I->getName() == "floor" || + I->getName() == "floorf" || I->getName() == "ceil" || + I->getName() == "ceilf" || I->getName() == "alloca" || + I->getName() == "_alloca" || I->getName() == "_chkstk" || + I->getName() == "__chkstk" || I->getName() == "___chkstk_ms") + continue; if (I->hasDLLImportStorageClass()) Out << "__declspec(dllimport) "; @@ -2096,7 +2257,7 @@ void CWriter::generateHeader(Module &M) { if (!M.global_empty()) { Out << "\n\n/* Global Variable Definitions and Initialization */\n"; for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { + I != E; ++I) { declareOneGlobalVariable(&*I); } } @@ -2104,9 +2265,10 @@ void CWriter::generateHeader(Module &M) { // Alias declarations... 
if (!M.alias_empty()) { Out << "\n/* External Alias Declarations */\n"; - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) { - assert(!I->isDeclaration() && !isEmptyType(I->getType()->getPointerElementType())); + for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; + ++I) { + assert(!I->isDeclaration() && + !isEmptyType(I->getType()->getPointerElementType())); if (I->hasLocalLinkage()) continue; // Internal Global @@ -2121,8 +2283,8 @@ void CWriter::generateHeader(Module &M) { Type *ElTy = I->getType()->getElementType(); unsigned Alignment = I->getAlignment(); - bool IsOveraligned = Alignment && - Alignment > TD->getABITypeAlignment(ElTy); + bool IsOveraligned = + Alignment && Alignment > TD->getABITypeAlignment(ElTy); // if (IsOveraligned) // Out << "__MSALIGN__(" << Alignment << ") "; // GetValueName would resolve the alias, which is not what we want, @@ -2177,9 +2339,11 @@ void CWriter::generateHeader(Module &M) { Out << "return 1; }\n"; // Loop over all select operations - for (std::set<Type*>::iterator it = SelectDeclTypes.begin(), end = SelectDeclTypes.end(); - it != end; ++it) { - // static __forceinline Rty llvm_select_u8x4(<bool x 4> condition, <u8 x 4> iftrue, <u8 x 4> ifnot) { + for (std::set<Type *>::iterator it = SelectDeclTypes.begin(), + end = SelectDeclTypes.end(); + it != end; ++it) { + // static __forceinline Rty llvm_select_u8x4(<bool x 4> condition, <u8 x 4> + // iftrue, <u8 x 4> ifnot) { // Rty r = { // condition[0] ? iftrue[0] : ifnot[0], // condition[1] ? 
iftrue[1] : ifnot[1], @@ -2194,7 +2358,11 @@ void CWriter::generateHeader(Module &M) { printTypeString(Out, *it, false); Out << "("; if (isa<VectorType>(*it)) - printTypeNameUnaligned(Out, VectorType::get(Type::getInt1Ty((*it)->getContext()), (*it)->getVectorNumElements()), false); + printTypeNameUnaligned( + Out, + VectorType::get(Type::getInt1Ty((*it)->getContext()), + (*it)->getVectorNumElements()), + false); else Out << "bool"; Out << " condition, "; @@ -2207,19 +2375,22 @@ void CWriter::generateHeader(Module &M) { if (isa<VectorType>(*it)) { unsigned n, l = (*it)->getVectorNumElements(); for (n = 0; n < l; n++) { - Out << " r.vector[" << n << "] = condition.vector[" << n << "] ? iftrue.vector[" << n << "] : ifnot.vector[" << n << "];\n"; + Out << " r.vector[" << n << "] = condition.vector[" << n + << "] ? iftrue.vector[" << n << "] : ifnot.vector[" << n << "];\n"; } - } - else { + } else { Out << " r = condition ? iftrue : ifnot;\n"; } Out << " return r;\n}\n"; } // Loop over all compare operations - for (std::set< std::pair<CmpInst::Predicate, VectorType*> >::iterator it = CmpDeclTypes.begin(), end = CmpDeclTypes.end(); - it != end; ++it) { - // static __forceinline <bool x 4> llvm_icmp_ge_u8x4(<u8 x 4> l, <u8 x 4> r) { + for (std::set<std::pair<CmpInst::Predicate, VectorType *>>::iterator + it = CmpDeclTypes.begin(), + end = CmpDeclTypes.end(); + it != end; ++it) { + // static __forceinline <bool x 4> llvm_icmp_ge_u8x4(<u8 x 4> l, <u8 x 4> r) + // { // Rty c = { // l[0] >= r[0], // l[1] >= r[1], @@ -2229,7 +2400,8 @@ void CWriter::generateHeader(Module &M) { // return c; // } unsigned n, l = (*it).second->getVectorNumElements(); - VectorType *RTy = VectorType::get(Type::getInt1Ty((*it).second->getContext()), l); + VectorType *RTy = + VectorType::get(Type::getInt1Ty((*it).second->getContext()), l); bool isSigned = CmpInst::isSigned((*it).first); Out << "static __forceinline "; printTypeName(Out, RTy, isSigned); @@ -2249,25 +2421,38 @@ void 
CWriter::generateHeader(Module &M) { for (n = 0; n < l; n++) { Out << " c.vector[" << n << "] = "; if (CmpInst::isFPPredicate((*it).first)) { - Out << "llvm_fcmp_ " << getCmpPredicateName((*it).first) << "(l.vector[" << n << "], r.vector[" << n << "]);\n"; + Out << "llvm_fcmp_ " << getCmpPredicateName((*it).first) << "(l.vector[" + << n << "], r.vector[" << n << "]);\n"; } else { Out << "l.vector[" << n << "]"; switch ((*it).first) { - case CmpInst::ICMP_EQ: Out << " == "; break; - case CmpInst::ICMP_NE: Out << " != "; break; - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_SLE: Out << " <= "; break; - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_SGE: Out << " >= "; break; - case CmpInst::ICMP_ULT: - case CmpInst::ICMP_SLT: Out << " < "; break; - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_SGT: Out << " > "; break; - default: + case CmpInst::ICMP_EQ: + Out << " == "; + break; + case CmpInst::ICMP_NE: + Out << " != "; + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + Out << " <= "; + break; + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + Out << " >= "; + break; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_SLT: + Out << " < "; + break; + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_SGT: + Out << " > "; + break; + default: #ifndef NDEBUG - errs() << "Invalid icmp predicate!" << (*it).first; + errs() << "Invalid icmp predicate!" 
<< (*it).first; #endif - llvm_unreachable(0); + llvm_unreachable(0); } Out << "r.vector[" << n << "];\n"; } @@ -2276,9 +2461,13 @@ void CWriter::generateHeader(Module &M) { } // Loop over all (vector) cast operations - for (std::set<std::pair<CastInst::CastOps, std::pair<Type*, Type*>>>::iterator it = CastOpDeclTypes.begin(), end = CastOpDeclTypes.end(); - it != end; ++it) { - // static __forceinline <u32 x 4> llvm_ZExt_u8x4_u32x4(<u8 x 4> in) { // Src->isVector == Dst->isVector + for (std::set< + std::pair<CastInst::CastOps, std::pair<Type *, Type *>>>::iterator + it = CastOpDeclTypes.begin(), + end = CastOpDeclTypes.end(); + it != end; ++it) { + // static __forceinline <u32 x 4> llvm_ZExt_u8x4_u32x4(<u8 x 4> in) { // + // Src->isVector == Dst->isVector // Rty out = { // in[0], // in[1], @@ -2287,7 +2476,8 @@ void CWriter::generateHeader(Module &M) { // }; // return out; // } - // static __forceinline u32 llvm_BitCast_u8x4_u32(<u8 x 4> in) { // Src->bitsSize == Dst->bitsSize + // static __forceinline u32 llvm_BitCast_u8x4_u32(<u8 x 4> in) { // + // Src->bitsSize == Dst->bitsSize // union { // <u8 x 4> in; // u32 out; @@ -2300,18 +2490,18 @@ void CWriter::generateHeader(Module &M) { Type *DstTy = (*it).second.second; bool SrcSigned, DstSigned; switch (opcode) { - default: - SrcSigned = false; - DstSigned = false; - case Instruction::SIToFP: - SrcSigned = true; - DstSigned = false; - case Instruction::FPToSI: - SrcSigned = false; - DstSigned = true; - case Instruction::SExt: - SrcSigned = true; - DstSigned = true; + default: + SrcSigned = false; + DstSigned = false; + case Instruction::SIToFP: + SrcSigned = true; + DstSigned = false; + case Instruction::FPToSI: + SrcSigned = false; + DstSigned = true; + case Instruction::SExt: + SrcSigned = true; + DstSigned = true; } Out << "static __forceinline "; @@ -2350,20 +2540,34 @@ void CWriter::generateHeader(Module &M) { Out << " out;\n"; Out << " LLVM"; switch (opcode) { - case Instruction::UIToFP: Out << "UItoFP"; break; 
- case Instruction::SIToFP: Out << "SItoFP"; break; - case Instruction::Trunc: Out << "Trunc"; break; - //case Instruction::FPExt: - //case Instruction::FPTrunc: - case Instruction::ZExt: Out << "ZExt"; break; - case Instruction::FPToUI: Out << "FPtoUI"; break; - case Instruction::SExt: Out << "SExt"; break; - case Instruction::FPToSI: Out << "FPtoSI"; break; - default: - llvm_unreachable("Invalid cast opcode for i128"); + case Instruction::UIToFP: + Out << "UItoFP"; + break; + case Instruction::SIToFP: + Out << "SItoFP"; + break; + case Instruction::Trunc: + Out << "Trunc"; + break; + // case Instruction::FPExt: + // case Instruction::FPTrunc: + case Instruction::ZExt: + Out << "ZExt"; + break; + case Instruction::FPToUI: + Out << "FPtoUI"; + break; + case Instruction::SExt: + Out << "SExt"; + break; + case Instruction::FPToSI: + Out << "FPtoSI"; + break; + default: + llvm_unreachable("Invalid cast opcode for i128"); } Out << "(" << SrcTy->getPrimitiveSizeInBits() << ", &in, " - << DstTy->getPrimitiveSizeInBits() << ", &out);\n"; + << DstTy->getPrimitiveSizeInBits() << ", &out);\n"; Out << " return out;\n"; Out << "#endif\n"; Out << "}\n"; @@ -2371,9 +2575,12 @@ void CWriter::generateHeader(Module &M) { } // Loop over all simple vector operations - for (std::set<std::pair<unsigned, Type*>>::iterator it = InlineOpDeclTypes.begin(), end = InlineOpDeclTypes.end(); - it != end; ++it) { - // static __forceinline <u32 x 4> llvm_BinOp_u32x4(<u32 x 4> a, <u32 x 4> b) { + for (std::set<std::pair<unsigned, Type *>>::iterator + it = InlineOpDeclTypes.begin(), + end = InlineOpDeclTypes.end(); + it != end; ++it) { + // static __forceinline <u32 x 4> llvm_BinOp_u32x4(<u32 x 4> a, <u32 x 4> b) + // { // Rty r = { // a[0] OP b[0], // a[1] OP b[1], @@ -2417,7 +2624,7 @@ void CWriter::generateHeader(Module &M) { // C can't handle non-power-of-two integer types unsigned mask = 0; if (ElemTy->isIntegerTy()) { - IntegerType *ITy = static_cast<IntegerType*>(ElemTy); + IntegerType *ITy 
= static_cast<IntegerType *>(ElemTy); if (!ITy->isPowerOf2ByteWidth()) mask = ITy->getBitMask(); } @@ -2439,34 +2646,54 @@ void CWriter::generateHeader(Module &M) { Out << "fmodf(a.vector[" << n << "], b.vector[" << n << "])"; else if (ElemTy->isDoubleTy()) Out << "fmod(a.vector[" << n << "], b.vector[" << n << "])"; - else // all 3 flavors of long double + else // all 3 flavors of long double Out << "fmodl(a.vector[" << n << "], b.vector[" << n << "])"; } else { Out << "a.vector[" << n << "]"; switch (opcode) { - case Instruction::Add: - case Instruction::FAdd: Out << " + "; break; - case Instruction::Sub: - case Instruction::FSub: Out << " - "; break; - case Instruction::Mul: - case Instruction::FMul: Out << " * "; break; - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: Out << " % "; break; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: Out << " / "; break; - case Instruction::And: Out << " & "; break; - case Instruction::Or: Out << " | "; break; - case Instruction::Xor: Out << " ^ "; break; - case Instruction::Shl : Out << " << "; break; - case Instruction::LShr: - case Instruction::AShr: Out << " >> "; break; - default: + case Instruction::Add: + case Instruction::FAdd: + Out << " + "; + break; + case Instruction::Sub: + case Instruction::FSub: + Out << " - "; + break; + case Instruction::Mul: + case Instruction::FMul: + Out << " * "; + break; + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + Out << " % "; + break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + Out << " / "; + break; + case Instruction::And: + Out << " & "; + break; + case Instruction::Or: + Out << " | "; + break; + case Instruction::Xor: + Out << " ^ "; + break; + case Instruction::Shl: + Out << " << "; + break; + case Instruction::LShr: + case Instruction::AShr: + Out << " >> "; + break; + default: #ifndef NDEBUG - errs() << "Invalid operator type!" 
<< opcode; + errs() << "Invalid operator type!" << opcode; #endif - llvm_unreachable(0); + llvm_unreachable(0); } Out << "b.vector[" << n << "]"; } @@ -2487,24 +2714,44 @@ void CWriter::generateHeader(Module &M) { } else { Out << "a"; switch (opcode) { - case Instruction::Add: Out << " + "; break; - case Instruction::Sub: Out << " - "; break; - case Instruction::Mul: Out << " * "; break; - case Instruction::URem: - case Instruction::SRem: Out << " % "; break; - case Instruction::UDiv: - case Instruction::SDiv: Out << " / "; break; - case Instruction::And: Out << " & "; break; - case Instruction::Or: Out << " | "; break; - case Instruction::Xor: Out << " ^ "; break; - case Instruction::Shl: Out << " << "; break; - case Instruction::LShr: - case Instruction::AShr: Out << " >> "; break; - default: + case Instruction::Add: + Out << " + "; + break; + case Instruction::Sub: + Out << " - "; + break; + case Instruction::Mul: + Out << " * "; + break; + case Instruction::URem: + case Instruction::SRem: + Out << " % "; + break; + case Instruction::UDiv: + case Instruction::SDiv: + Out << " / "; + break; + case Instruction::And: + Out << " & "; + break; + case Instruction::Or: + Out << " | "; + break; + case Instruction::Xor: + Out << " ^ "; + break; + case Instruction::Shl: + Out << " << "; + break; + case Instruction::LShr: + case Instruction::AShr: + Out << " >> "; + break; + default: #ifndef NDEBUG - errs() << "Invalid operator type!" << opcode; + errs() << "Invalid operator type!" 
<< opcode; #endif - llvm_unreachable(0); + llvm_unreachable(0); } Out << "b;\n"; } @@ -2526,7 +2773,8 @@ void CWriter::generateHeader(Module &M) { } else if (opcode == Instruction::Xor) { Out << " r.hi = a.hi ^ b.hi;\n"; Out << " r.lo = a.lo ^ b.lo;\n"; - } else if (opcode == Instruction::Shl) { // reminder: undef behavior if b >= 128 + } else if (opcode == + Instruction::Shl) { // reminder: undef behavior if b >= 128 Out << " if (b.lo >= 64) {\n"; Out << " r.hi = (a.lo << (b.lo - 64));\n"; Out << " r.lo = 0;\n"; @@ -2541,26 +2789,44 @@ void CWriter::generateHeader(Module &M) { // everything that hasn't been manually implemented above Out << " LLVM"; switch (opcode) { - //case BinaryNeg: Out << "Neg"; break; - //case BinaryNot: Out << "FlipAllBits"; break; - case Instruction::Add: Out << "Add"; break; - case Instruction::Sub: Out << "Sub"; break; - case Instruction::Mul: Out << "Mul"; break; - case Instruction::URem: Out << "URem"; break; - case Instruction::SRem: Out << "SRem"; break; - case Instruction::UDiv: Out << "UDiv"; break; - case Instruction::SDiv: Out << "SDiv"; break; - //case Instruction::And: Out << "And"; break; - //case Instruction::Or: Out << "Or"; break; - //case Instruction::Xor: Out << "Xor"; break; - //case Instruction::Shl: Out << "Shl"; break; - case Instruction::LShr: Out << "LShr"; break; - case Instruction::AShr: Out << "AShr"; break; - default: + // case BinaryNeg: Out << "Neg"; break; + // case BinaryNot: Out << "FlipAllBits"; break; + case Instruction::Add: + Out << "Add"; + break; + case Instruction::Sub: + Out << "Sub"; + break; + case Instruction::Mul: + Out << "Mul"; + break; + case Instruction::URem: + Out << "URem"; + break; + case Instruction::SRem: + Out << "SRem"; + break; + case Instruction::UDiv: + Out << "UDiv"; + break; + case Instruction::SDiv: + Out << "SDiv"; + break; + // case Instruction::And: Out << "And"; break; + // case Instruction::Or: Out << "Or"; break; + // case Instruction::Xor: Out << "Xor"; break; + // case 
Instruction::Shl: Out << "Shl"; break; + case Instruction::LShr: + Out << "LShr"; + break; + case Instruction::AShr: + Out << "AShr"; + break; + default: #ifndef NDEBUG - errs() << "Invalid operator type!" << opcode; + errs() << "Invalid operator type!" << opcode; #endif - llvm_unreachable(0); + llvm_unreachable(0); } Out << "(16, &a, &b, &r);\n"; } @@ -2580,34 +2846,54 @@ void CWriter::generateHeader(Module &M) { Out << "fmodf(a, b)"; else if (ElemTy->isDoubleTy()) Out << "fmod(a, b)"; - else // all 3 flavors of long double + else // all 3 flavors of long double Out << "fmodl(a, b)"; } else { Out << "a"; switch (opcode) { - case Instruction::Add: - case Instruction::FAdd: Out << " + "; break; - case Instruction::Sub: - case Instruction::FSub: Out << " - "; break; - case Instruction::Mul: - case Instruction::FMul: Out << " * "; break; - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: Out << " % "; break; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: Out << " / "; break; - case Instruction::And: Out << " & "; break; - case Instruction::Or: Out << " | "; break; - case Instruction::Xor: Out << " ^ "; break; - case Instruction::Shl : Out << " << "; break; - case Instruction::LShr: - case Instruction::AShr: Out << " >> "; break; - default: + case Instruction::Add: + case Instruction::FAdd: + Out << " + "; + break; + case Instruction::Sub: + case Instruction::FSub: + Out << " - "; + break; + case Instruction::Mul: + case Instruction::FMul: + Out << " * "; + break; + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + Out << " % "; + break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + Out << " / "; + break; + case Instruction::And: + Out << " & "; + break; + case Instruction::Or: + Out << " | "; + break; + case Instruction::Xor: + Out << " ^ "; + break; + case Instruction::Shl: + Out << " << "; + break; + case Instruction::LShr: + case 
Instruction::AShr: + Out << " >> "; + break; + default: #ifndef NDEBUG - errs() << "Invalid operator type!" << opcode; + errs() << "Invalid operator type!" << opcode; #endif - llvm_unreachable(0); + llvm_unreachable(0); } Out << "b"; if (mask) @@ -2619,9 +2905,11 @@ void CWriter::generateHeader(Module &M) { } // Loop over all inline constructors - for (std::set<Type*>::iterator it = CtorDeclTypes.begin(), end = CtorDeclTypes.end(); - it != end; ++it) { - // static __forceinline <u32 x 4> llvm_ctor_u32x4(u32 x1, u32 x2, u32 x3, u32 x4) { + for (std::set<Type *>::iterator it = CtorDeclTypes.begin(), + end = CtorDeclTypes.end(); + it != end; ++it) { + // static __forceinline <u32 x 4> llvm_ctor_u32x4(u32 x1, u32 x2, u32 x3, + // u32 x4) { // Rty r = { // x1, x2, x3, x4 // }; @@ -2635,10 +2923,12 @@ void CWriter::generateHeader(Module &M) { StructType *STy = dyn_cast<StructType>(*it); ArrayType *ATy = dyn_cast<ArrayType>(*it); VectorType *VTy = dyn_cast<VectorType>(*it); - unsigned e = (STy ? STy->getNumElements() : (ATy ? ATy->getNumElements() : VTy->getNumElements())); + unsigned e = (STy ? STy->getNumElements() + : (ATy ? ATy->getNumElements() : VTy->getNumElements())); bool printed = false; for (unsigned i = 0; i != e; ++i) { - Type *ElTy = STy ? STy->getElementType(i) : (*it)->getSequentialElementType(); + Type *ElTy = + STy ? STy->getElementType(i) : (*it)->getSequentialElementType(); if (isEmptyType(ElTy)) Out << " /* "; else if (printed) @@ -2654,7 +2944,8 @@ void CWriter::generateHeader(Module &M) { printTypeName(Out, *it); Out << " r;"; for (unsigned i = 0; i != e; ++i) { - Type *ElTy = STy ? STy->getElementType(i) : (*it)->getSequentialElementType(); + Type *ElTy = + STy ? STy->getElementType(i) : (*it)->getSequentialElementType(); if (isEmptyType(ElTy)) continue; if (STy) @@ -2670,9 +2961,9 @@ void CWriter::generateHeader(Module &M) { } // Emit definitions of the intrinsics. 
- for (SmallVector<Function*, 16>::iterator - I = intrinsicsToDefine.begin(), - E = intrinsicsToDefine.end(); I != E; ++I) { + for (SmallVector<Function *, 16>::iterator I = intrinsicsToDefine.begin(), + E = intrinsicsToDefine.end(); + I != E; ++I) { printIntrinsicDefinition(**I, Out); } @@ -2680,7 +2971,7 @@ void CWriter::generateHeader(Module &M) { Out << "\n\n/* Function Bodies */\n"; } -void CWriter::declareOneGlobalVariable(GlobalVariable* I) { +void CWriter::declareOneGlobalVariable(GlobalVariable *I) { if (I->isDeclaration() || isEmptyType(I->getType()->getPointerElementType())) return; @@ -2702,8 +2993,7 @@ void CWriter::declareOneGlobalVariable(GlobalVariable* I) { Type *ElTy = I->getType()->getElementType(); unsigned Alignment = I->getAlignment(); - bool IsOveraligned = Alignment && - Alignment > TD->getABITypeAlignment(ElTy); + bool IsOveraligned = Alignment && Alignment > TD->getABITypeAlignment(ElTy); // if (IsOveraligned) // Out << "__MSALIGN__(" << Alignment << ") "; printTypeName(Out, ElTy, false) << ' ' << GetValueName(I); @@ -2727,13 +3017,13 @@ void CWriter::declareOneGlobalVariable(GlobalVariable* I) { // and common, so we disable this optimization. // FIXME common linkage should avoid this problem. if (!I->getInitializer()->isNullValue()) { - Out << " = " ; + Out << " = "; writeOperand(I->getInitializer(), ContextStatic); } else if (I->hasWeakLinkage()) { // We have to specify an initializer, but it doesn't have to be // complete. If the value is an aggregate, print out { 0 }, and let // the compiler figure out the rest of the zeros. - Out << " = " ; + Out << " = "; if (I->getInitializer()->getType()->isStructTy() || I->getInitializer()->getType()->isVectorTy()) { Out << "{ 0 }"; @@ -2757,7 +3047,8 @@ void CWriter::printFloatingPointConstants(Function &F) { // precision. 
// for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) - for (Instruction::op_iterator I_Op = I->op_begin(), E_Op = I->op_end(); I_Op != E_Op; ++I_Op) + for (Instruction::op_iterator I_Op = I->op_begin(), E_Op = I->op_end(); + I_Op != E_Op; ++I_Op) if (const Constant *C = dyn_cast<Constant>(I_Op)) printFloatingPointConstants(C); Out << '\n'; @@ -2780,44 +3071,39 @@ void CWriter::printFloatingPointConstants(const Constant *C) { FPConstantMap.count(FPC)) return; - FPConstantMap[FPC] = FPCounter; // Number the FP constants + FPConstantMap[FPC] = FPCounter; // Number the FP constants if (FPC->getType() == Type::getDoubleTy(FPC->getContext())) { double Val = FPC->getValueAPF().convertToDouble(); uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); - Out << "static const ConstantDoubleTy FPConstant" << FPCounter++ - << " = 0x" << utohexstr(i) - << "ULL; /* " << Val << " */\n"; + Out << "static const ConstantDoubleTy FPConstant" << FPCounter++ << " = 0x" + << utohexstr(i) << "ULL; /* " << Val << " */\n"; } else if (FPC->getType() == Type::getFloatTy(FPC->getContext())) { float Val = FPC->getValueAPF().convertToFloat(); - uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt(). 
- getZExtValue(); - Out << "static const ConstantFloatTy FPConstant" << FPCounter++ - << " = 0x" << utohexstr(i) - << "U; /* " << Val << " */\n"; + uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().getZExtValue(); + Out << "static const ConstantFloatTy FPConstant" << FPCounter++ << " = 0x" + << utohexstr(i) << "U; /* " << Val << " */\n"; } else if (FPC->getType() == Type::getX86_FP80Ty(FPC->getContext())) { // api needed to prevent premature destruction const APInt api = FPC->getValueAPF().bitcastToAPInt(); const uint64_t *p = api.getRawData(); - Out << "static const ConstantFP80Ty FPConstant" << FPCounter++ - << " = { 0x" << utohexstr(p[0]) - << "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}" - << "}; /* Long double constant */\n"; + Out << "static const ConstantFP80Ty FPConstant" << FPCounter++ << " = { 0x" + << utohexstr(p[0]) << "ULL, 0x" << utohexstr((uint16_t)p[1]) + << ",{0,0,0}" + << "}; /* Long double constant */\n"; } else if (FPC->getType() == Type::getPPC_FP128Ty(FPC->getContext()) || - FPC->getType() == Type::getFP128Ty(FPC->getContext())) { + FPC->getType() == Type::getFP128Ty(FPC->getContext())) { const APInt api = FPC->getValueAPF().bitcastToAPInt(); const uint64_t *p = api.getRawData(); - Out << "static const ConstantFP128Ty FPConstant" << FPCounter++ - << " = { 0x" - << utohexstr(p[0]) << ", 0x" << utohexstr(p[1]) - << "}; /* Long double constant */\n"; + Out << "static const ConstantFP128Ty FPConstant" << FPCounter++ << " = { 0x" + << utohexstr(p[0]) << ", 0x" << utohexstr(p[1]) + << "}; /* Long double constant */\n"; } else { llvm_unreachable("Unknown float type!"); } } - /// printSymbolTable - Run through symbol table looking for type names. If a /// type name is found, emit its declaration... /// @@ -2831,7 +3117,7 @@ void CWriter::printModuleTypes(raw_ostream &Out) { Out << "} llvmBitCastUnion;\n"; // Keep track of which types have been printed so far. 
- std::set<Type*> TypesPrinted; + std::set<Type *> TypesPrinted; // Loop over all structures then push them into the stack so they are // printed in the correct order. @@ -2840,8 +3126,9 @@ void CWriter::printModuleTypes(raw_ostream &Out) { // forward-declare all structs here first { - std::set<Type*> TypesPrinted; - for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) { + std::set<Type *> TypesPrinted; + for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); + it != end; ++it) { forwardDeclareStructs(Out, *it, TypesPrinted); } } @@ -2849,31 +3136,35 @@ void CWriter::printModuleTypes(raw_ostream &Out) { // forward-declare all function pointer typedefs (Issue #2) { - std::set<Type*> TypesPrinted; - for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) { + std::set<Type *> TypesPrinted; + for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); + it != end; ++it) { forwardDeclareFunctionTypedefs(Out, *it, TypesPrinted); } } - Out << "\n/* Types Definitions */\n"; - for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) { + for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); + it != end; ++it) { printContainedTypes(Out, *it, TypesPrinted); } Out << "\n/* Function definitions */\n"; // Question: Is UnnamedFunctionIDs ever non-empty? 
- for (DenseMap<std::pair<FunctionType*, - std::pair<AttributeList, CallingConv::ID> >, unsigned>::iterator - I = UnnamedFunctionIDs.begin(), E = UnnamedFunctionIDs.end(); - I != E; ++I) { + for (DenseMap< + std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>>, + unsigned>::iterator I = UnnamedFunctionIDs.begin(), + E = UnnamedFunctionIDs.end(); + I != E; ++I) { Out << '\n'; - std::pair<FunctionType*, std::pair<AttributeList, CallingConv::ID> > F = I->first; + std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>> F = + I->first; if (F.second.first == AttributeList() && F.second.second == CallingConv::C) - if (!TypesPrinted.insert(F.first).second) continue; // already printed this above + if (!TypesPrinted.insert(F.first).second) + continue; // already printed this above // FIXME: Removing apparently unused function call - need to check printFunctionDeclaration(Out, F.first, F.second); @@ -2881,9 +3172,9 @@ void CWriter::printModuleTypes(raw_ostream &Out) { // We may have collected some intrinsic prototypes to emit. 
// Emit them now, before the function that uses them is emitted - for (std::vector<Function*>::iterator - I = prototypesToGen.begin(), E = prototypesToGen.end(); - I != E; ++I) { + for (std::vector<Function *>::iterator I = prototypesToGen.begin(), + E = prototypesToGen.end(); + I != E; ++I) { Out << '\n'; Function *F = *I; printFunctionProto(Out, F); @@ -2891,9 +3182,12 @@ void CWriter::printModuleTypes(raw_ostream &Out) { } } -void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted) { - if (!TypesPrinted.insert(Ty).second) return; - if (isEmptyType(Ty)) return; +void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty, + std::set<Type *> &TypesPrinted) { + if (!TypesPrinted.insert(Ty).second) + return; + if (isEmptyType(Ty)) + return; for (auto I = Ty->subtype_begin(); I != Ty->subtype_end(); ++I) { forwardDeclareStructs(Out, *I, TypesPrinted); @@ -2904,9 +3198,12 @@ void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*> } } -void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted) { - if (!TypesPrinted.insert(Ty).second) return; - if (isEmptyType(Ty)) return; +void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, + std::set<Type *> &TypesPrinted) { + if (!TypesPrinted.insert(Ty).second) + return; + if (isEmptyType(Ty)) + return; for (auto I = Ty->subtype_begin(); I != Ty->subtype_end(); ++I) { forwardDeclareFunctionTypedefs(Out, *I, TypesPrinted); @@ -2921,15 +3218,17 @@ void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, std::se // this one depends on. // void CWriter::printContainedTypes(raw_ostream &Out, Type *Ty, - std::set<Type*> &TypesPrinted) { + std::set<Type *> &TypesPrinted) { // Check to see if we have already printed this struct. 
- if (!TypesPrinted.insert(Ty).second) return; + if (!TypesPrinted.insert(Ty).second) + return; // Skip empty structs - if (isEmptyType(Ty)) return; + if (isEmptyType(Ty)) + return; // Print all contained types first. - for (Type::subtype_iterator I = Ty->subtype_begin(), - E = Ty->subtype_end(); I != E; ++I) + for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end(); + I != E; ++I) printContainedTypes(Out, *I, TypesPrinted); if (StructType *ST = dyn_cast<StructType>(Ty)) { @@ -2950,22 +3249,23 @@ static inline bool isFPIntBitCast(Instruction &I) { Type *SrcTy = I.getOperand(0)->getType(); Type *DstTy = I.getType(); return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) || - (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy()); + (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy()); } void CWriter::printFunction(Function &F) { bool isKernel = false; - if (NamedMDNode * KernelMD = F.getParent()->getNamedMetadata("opencl.kernels")) { + if (NamedMDNode *KernelMD = + F.getParent()->getNamedMetadata("opencl.kernels")) { for (auto iter : KernelMD->operands()) { // DEBUG( errs() << "Kernel Metadata: " << *iter << "\n"); const MDOperand *KernelMDOp = iter->operands().begin(); Metadata *KMD = KernelMDOp->get(); - if(ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)){ + if (ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)) { Value *KMDVal = KMDVAM->getValue(); Function *KMDFunc = dyn_cast<Function>(KMDVal); - if(KMDFunc == &F) { - //DEBUG(errs() << "-->Kernel Func: " << KMDFunc->getName() << "\n"); + if (KMDFunc == &F) { + // DEBUG(errs() << "-->Kernel Func: " << KMDFunc->getName() << "\n"); isKernel = true; } } @@ -2976,12 +3276,15 @@ void CWriter::printFunction(Function &F) { bool isStructReturn = F.hasStructRetAttr(); assert(!F.isDeclaration()); - if (F.hasDLLImportStorageClass()) Out << "__declspec(dllimport) "; - if (F.hasDLLExportStorageClass()) Out << "__declspec(dllexport) "; - if (F.hasLocalLinkage()) Out << "static "; - 
printFunctionProto(Out, F.getFunctionType(), - std::make_pair(F.getAttributes(), F.getCallingConv()), - GetValueName(&F), + if (F.hasDLLImportStorageClass()) + Out << "__declspec(dllimport) "; + if (F.hasDLLExportStorageClass()) + Out << "__declspec(dllexport) "; + if (F.hasLocalLinkage()) + Out << "static "; + printFunctionProto( + Out, F.getFunctionType(), + std::make_pair(F.getAttributes(), F.getCallingConv()), GetValueName(&F), F.arg_begin(), // NOTE: replacing ArgumentList (LLVM-4) with arg iterator //&F.getArgumentList(), isKernel); @@ -2991,9 +3294,10 @@ void CWriter::printFunction(Function &F) { // If this is a struct return function, handle the result with magic. if (isStructReturn) { Type *StructTy = - cast<PointerType>(F.arg_begin()->getType())->getElementType(); + cast<PointerType>(F.arg_begin()->getType())->getElementType(); Out << " "; - printTypeName(Out, StructTy, false) << " StructReturn; /* Struct return temporary */\n"; + printTypeName(Out, StructTy, false) + << " StructReturn; /* Struct return temporary */\n"; Out << " "; printTypeName(Out, F.arg_begin()->getType(), false); @@ -3005,10 +3309,10 @@ void CWriter::printFunction(Function &F) { // print local variable information for the function for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { if (AllocaInst *AI = isDirectAlloca(&*I)) { - //DEBUG(errs() << "Processing alloca inst: " << *AI << "\n"); + // DEBUG(errs() << "Processing alloca inst: " << *AI << "\n"); unsigned Alignment = AI->getAlignment(); - bool IsOveraligned = Alignment && - Alignment > TD->getABITypeAlignment(AI->getAllocatedType()); + bool IsOveraligned = Alignment && Alignment > TD->getABITypeAlignment( + AI->getAllocatedType()); Out << " "; // if (IsOveraligned) // Out << "__MSALIGN__(" << Alignment << ") "; @@ -3017,21 +3321,22 @@ void CWriter::printFunction(Function &F) { if (IsOveraligned) Out << " __attribute__((aligned(" << Alignment << ")))"; if (AI->isArrayAllocation()) { - //DEBUG(errs() << "Alloca 
is an array allocation!\n"); - unsigned arraySize = dyn_cast<ConstantInt>(AI->getArraySize())->getZExtValue(); + // DEBUG(errs() << "Alloca is an array allocation!\n"); + unsigned arraySize = + dyn_cast<ConstantInt>(AI->getArraySize())->getZExtValue(); Out << "[" << arraySize << "]"; } Out << "; /* Address-exposed local */\n"; PrintedVar = true; - } else if (!isEmptyType(I->getType()) && - !isInlinableInst(*I)) { + } else if (!isEmptyType(I->getType()) && !isInlinableInst(*I)) { Out << " "; printTypeName(Out, I->getType(), false) << ' ' << GetValueName(&*I); Out << ";\n"; - if (isa<PHINode>(*I)) { // Print out PHI node temporaries as well... + if (isa<PHINode>(*I)) { // Print out PHI node temporaries as well... Out << " "; - printTypeName(Out, I->getType(), false) << ' ' << (GetValueName(&*I)+"__PHI_TEMPORARY"); + printTypeName(Out, I->getType(), false) + << ' ' << (GetValueName(&*I) + "__PHI_TEMPORARY"); Out << ";\n"; } PrintedVar = true; @@ -3041,7 +3346,7 @@ void CWriter::printFunction(Function &F) { // variable to hold the result of the BitCast. if (isFPIntBitCast(*I)) { Out << " llvmBitCastUnion " << GetValueName(&*I) - << "__BITCAST_TEMPORARY;\n"; + << "__BITCAST_TEMPORARY;\n"; PrintedVar = true; } } @@ -3052,11 +3357,13 @@ void CWriter::printFunction(Function &F) { // print the basic blocks // for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - std::set<BasicBlock*> VisitSet; - BasicBlock* entry = &(F.getEntryBlock()); - // starting printing from entry, then CFG traversal will print the reachable blocks. + std::set<BasicBlock *> VisitSet; + BasicBlock *entry = &(F.getEntryBlock()); + // starting printing from entry, then CFG traversal will print the reachable + // blocks. 
printBBorLoop(entry); - // for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry); BI!=BE; ++BI) { + // for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry); + // BI!=BE; ++BI) { // BasicBlock *BB = *BI; // printBBorLoop(BB); // if(VisitedBlocks.find(BB) == VisitedBlocks.end()) { @@ -3073,29 +3380,29 @@ void CWriter::printFunction(Function &F) { Out << "}\n\n"; } - -bool CWriter::extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *IndVarChain, Instruction *Branch, unsigned indent) { - //Traverse def-use chain of induction variable to make sure that - //it ends at the branch. Keep stack of all instructions leading there. - for(User *U : Inst->users()) { +bool CWriter::extractIndVarChain(Instruction *Inst, + std::stack<Instruction *> *IndVarChain, + Instruction *Branch, unsigned indent) { + // Traverse def-use chain of induction variable to make sure that + // it ends at the branch. Keep stack of all instructions leading there. + for (User *U : Inst->users()) { // DEBUG(errs() << std::string(indent, '-')); // DEBUG(errs() << "->Found user: " << *U << "\n"); - if(Instruction *UInst = dyn_cast<Instruction>(U)) { - if(UInst == Branch) { + if (Instruction *UInst = dyn_cast<Instruction>(U)) { + if (UInst == Branch) { // DEBUG(errs() << "Found correct path, returning!\n"); return true; - } - else if (isa<PHINode>(UInst)) { - // DEBUG(errs() << "Reached a PHI Node => Wrong path! Returning!\n"); + } else if (isa<PHINode>(UInst)) { + // DEBUG(errs() << "Reached a PHI Node => Wrong path! 
+ // Returning!\n"); continue; - } - else { + } else { IndVarChain->push(UInst); - if(extractIndVarChain(UInst, IndVarChain, Branch, indent+2)) { + if (extractIndVarChain(UInst, IndVarChain, Branch, indent + 2)) { return true; - } - else { - // DEBUG(errs() << "Wrong path, popping: " << *(IndVarChain->top()) << "\n"); + } else { + // DEBUG(errs() << "Wrong path, popping: " << + // *(IndVarChain->top()) << "\n"); IndVarChain->pop(); } } @@ -3105,53 +3412,61 @@ bool CWriter::extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *In return false; } -bool CWriter::findLoopBranch(BranchInst **LBranch, BasicBlock* CurBlock, BasicBlock* LHeader, std::set<BasicBlock*>*visitSet) { +bool CWriter::findLoopBranch(BranchInst **LBranch, BasicBlock *CurBlock, + BasicBlock *LHeader, + std::set<BasicBlock *> *visitSet) { bool result = false; - // DEBUG(errs() << "Finding loop branch in " << CurBlock->getName() << "!\n"); - if(BranchInst *LBranchTemp = dyn_cast<BranchInst>(CurBlock->getTerminator())) { + // DEBUG(errs() << "Finding loop branch in " << CurBlock->getName() << + // "!\n"); + if (BranchInst *LBranchTemp = + dyn_cast<BranchInst>(CurBlock->getTerminator())) { // DEBUG(errs() << "Branch: " << *LBranchTemp << "\n"); - if(LBranchTemp->isConditional()) { - if(LBranchTemp->getSuccessor(0) == LHeader || LBranchTemp->getSuccessor(1) == LHeader) { + if (LBranchTemp->isConditional()) { + if (LBranchTemp->getSuccessor(0) == LHeader || + LBranchTemp->getSuccessor(1) == LHeader) { *LBranch = LBranchTemp; // DEBUG(errs() << "Found Loop branch: " << **LBranch << "\n"); result = true; } else { - BasicBlock* NextBlock1 = LBranchTemp->getSuccessor(0); - BasicBlock* NextBlock2 = LBranchTemp->getSuccessor(1); - if(visitSet->find(NextBlock1) == visitSet->end()) { - // DEBUG(errs() << "Visiting unvisited node: " << NextBlock1->getName() << "\n"); + BasicBlock *NextBlock1 = LBranchTemp->getSuccessor(0); + BasicBlock *NextBlock2 = LBranchTemp->getSuccessor(1); + if 
(visitSet->find(NextBlock1) == visitSet->end()) { + // DEBUG(errs() << "Visiting unvisited node: " << + // NextBlock1->getName() << "\n"); visitSet->insert(NextBlock1); result |= findLoopBranch(LBranch, NextBlock1, LHeader, visitSet); } - if(visitSet->find(NextBlock2) == visitSet->end()) { - // DEBUG(errs() << "Visiting unvisited node: " << NextBlock2->getName() << "\n"); + if (visitSet->find(NextBlock2) == visitSet->end()) { + // DEBUG(errs() << "Visiting unvisited node: " << + // NextBlock2->getName() << "\n"); visitSet->insert(NextBlock2); result |= findLoopBranch(LBranch, NextBlock2, LHeader, visitSet); } } } else { - if(LBranchTemp->getSuccessor(0) == LHeader) { + if (LBranchTemp->getSuccessor(0) == LHeader) { *LBranch = LBranchTemp; // DEBUG(errs() << "Found Loop branch: " << **LBranch << "\n"); result = true; } else { BasicBlock *NextBlock = LBranchTemp->getSuccessor(0); - if(visitSet->find(NextBlock) == visitSet->end()) { - // DEBUG(errs() << "Visiting unvisited node: " << NextBlock->getName() << "\n"); + if (visitSet->find(NextBlock) == visitSet->end()) { + // DEBUG(errs() << "Visiting unvisited node: " << + // NextBlock->getName() << "\n"); visitSet->insert(NextBlock); result |= findLoopBranch(LBranch, NextBlock, LHeader, visitSet); } } } } - return result; + return result; } bool CWriter::traverseUseDefChain(Instruction *I, PHINode *PI) { // DEBUG(errs() << "traversing: " << *I << "\n"); bool result = false; - if(PHINode *PHI = dyn_cast<PHINode>(I)) { + if (PHINode *PHI = dyn_cast<PHINode>(I)) { if (PI == PHI) { // DEBUG(errs() << "returning true\n"); result = true; @@ -3162,9 +3477,9 @@ bool CWriter::traverseUseDefChain(Instruction *I, PHINode *PI) { } } else { for (Use &U : I->operands()) { - if(Instruction *UInst = dyn_cast<Instruction>(U)) { + if (Instruction *UInst = dyn_cast<Instruction>(U)) { result |= traverseUseDefChain(UInst, PI); - } + } } } return result; @@ -3186,19 +3501,20 @@ void CWriter::printLoop(Loop *L) { auto *ExitingBranch = 
ExitingBlock->getTerminator(); // DEBUG(errs() << "Exiting Branch: " << *ExitingBranch << "\n"); InductionDescriptor ID; - if (L->getLoopPreheader()==nullptr) { + if (L->getLoopPreheader() == nullptr) { // DEBUG(errs() << "Loop has no preheader!\n"); } // DEBUG(errs() << "Looking for induction variables\n"); // if (PHINode *IndVar = L->getCanonicalInductionVariable()) { // InductionVariable = IndVar; - // DEBUG(errs() << "Found canonical induction variable:\n" << *IndVar << "\n"); + // DEBUG(errs() << "Found canonical induction variable:\n" << *IndVar << + // "\n"); // } bool found = false; for (auto I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { PHINode *PHI = cast<PHINode>(I); // DEBUG(errs() << "Phi Node: " << *PHI << "\n"); - if(InductionDescriptor::isInductionPHI(PHI,L,PSE,ID)) { + if (InductionDescriptor::isInductionPHI(PHI, L, PSE, ID)) { // DEBUG(errs() << "Found induction: " << *PHI << "\n"); InductionVariable = PHI; found = true; @@ -3206,18 +3522,18 @@ void CWriter::printLoop(Loop *L) { } } - if(!found) { + if (!found) { llvm_unreachable("Couldn't find induction Variable in loop!\n"); } LInductionVars.insert(InductionVariable); - LoopIndVarsMap.insert(std::pair<Loop*, PHINode*>(L,InductionVariable)); + LoopIndVarsMap.insert(std::pair<Loop *, PHINode *>(L, InductionVariable)); Value *IV = dyn_cast<Value>(InductionVariable); std::string IVName = GetValueName(IV); Optional<Loop::LoopBounds> OLB = L->getBounds(*SE); - if(OLB.hasValue()) { + if (OLB.hasValue()) { Loop::LoopBounds LB = OLB.getValue(); Value *StartValue = &(LB.getInitialIVValue()); Instruction *StepInstruction = &(LB.getStepInst()); @@ -3225,98 +3541,130 @@ void CWriter::printLoop(Loop *L) { Value *FinalValue = &(LB.getFinalIVValue()); ICmpInst::Predicate LoopPredicate = LB.getCanonicalPredicate(); std::string BranchPredicate; - switch(LoopPredicate) { - case ICmpInst::ICMP_EQ: BranchPredicate = " == "; break; - case ICmpInst::ICMP_NE: BranchPredicate = " != "; break; - case 
ICmpInst::ICMP_ULE: - case ICmpInst::ICMP_SLE: BranchPredicate = " < "; break; - case ICmpInst::ICMP_UGE: - case ICmpInst::ICMP_SGE: BranchPredicate = " > "; break; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_SLT: BranchPredicate = " <= "; break; - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_SGT: BranchPredicate = " >= "; break; - default: llvm_unreachable("Illegal ICmp predicate"); + switch (LoopPredicate) { + case ICmpInst::ICMP_EQ: + BranchPredicate = " == "; + break; + case ICmpInst::ICMP_NE: + BranchPredicate = " != "; + break; + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SLE: + BranchPredicate = " < "; + break; + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_SGE: + BranchPredicate = " > "; + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SLT: + BranchPredicate = " <= "; + break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_SGT: + BranchPredicate = " >= "; + break; + default: + llvm_unreachable("Illegal ICmp predicate"); } - errs() << "IV: " << *IV<< "\n"; - errs() << "StartValue: " << *StartValue<< "\n"; - errs() << "StepInstruction: " << *StepInstruction<< "\n"; - errs() << "StepValue: " << *StepValue<< "\n"; - errs() << "FinalValue: " << *FinalValue<< "\n"; - errs() << "Branch Predicate: " << BranchPredicate<< "\n"; - errs() << "Direction: " << ((LB.getDirection() == Loop::LoopBounds::Direction::Increasing) - ? "increasing" : "decreasing") << "\n"; - - std::string startStr; + errs() << "IV: " << *IV << "\n"; + errs() << "StartValue: " << *StartValue << "\n"; + errs() << "StepInstruction: " << *StepInstruction << "\n"; + errs() << "StepValue: " << *StepValue << "\n"; + errs() << "FinalValue: " << *FinalValue << "\n"; + errs() << "Branch Predicate: " << BranchPredicate << "\n"; + errs() << "Direction: " + << ((LB.getDirection() == Loop::LoopBounds::Direction::Increasing) + ? 
"increasing" + : "decreasing") + << "\n"; + + std::string startStr; if (ConstantInt *startConst = dyn_cast<ConstantInt>(StartValue)) { startStr = std::to_string(startConst->getSExtValue()); } else { startStr = GetValueName(StartValue); } - std::string finalStr; + std::string finalStr; if (ConstantInt *finalConst = dyn_cast<ConstantInt>(FinalValue)) { finalStr = std::to_string(finalConst->getSExtValue()); } else { finalStr = GetValueName(FinalValue); } - std::string stepStr; + std::string stepStr; if (ConstantInt *stepConst = dyn_cast<ConstantInt>(StepValue)) { stepStr = std::to_string(stepConst->getSExtValue()); } else { stepStr = GetValueName(StepValue); } - errs() << "\n for ( " << IVName << " = " << startStr << "; " - << IVName << BranchPredicate << finalStr << "; " - << IVName << " = " << IVName << " + " << stepStr << ") {\n"; + errs() << "\n for ( " << IVName << " = " << startStr << "; " << IVName + << BranchPredicate << finalStr << "; " << IVName << " = " << IVName + << " + " << stepStr << ") {\n"; - Out << "\n for ( " << IVName << " = " << startStr << "; " - << IVName << BranchPredicate << finalStr << "; " - << IVName << " = " << IVName << " + " << stepStr << ") {\n"; + Out << "\n for ( " << IVName << " = " << startStr << "; " << IVName + << BranchPredicate << finalStr << "; " << IVName << " = " << IVName + << " + " << stepStr << ") {\n"; } else { llvm_unreachable("No Loop Bounds!"); Value *StartValue = ID.getStartValue(); const SCEV *Step = ID.getStep(); - // unsigned IterationCount = SE->getSmallConstantMaxTripCount(L); - // DEBUG(errs() << "StartValue: " << *StartValue << "\nStep: " << *Step << "\nIterationCount: " << IterationCount << "\n"); + // unsigned IterationCount = SE->getSmallConstantMaxTripCount(L); + // DEBUG(errs() << "StartValue: " << *StartValue << "\nStep: " << *Step << + // "\nIterationCount: " << IterationCount << "\n"); std::string IVOp; if (const SCEVConstant *stepConst = dyn_cast<SCEVConstant>(Step)) { - 
if(stepConst->getAPInt().isNonNegative()) { - IVOp = " + "; + if (stepConst->getAPInt().isNonNegative()) { + IVOp = " + "; } } - std::string BranchPredicate; - ICmpInst *BranchCondition = dyn_cast<ICmpInst>(dyn_cast<BranchInst>(ExitingBranch)->getCondition()); - switch(BranchCondition->getPredicate()) { - case ICmpInst::ICMP_EQ: BranchPredicate = " != "; break; - case ICmpInst::ICMP_NE: BranchPredicate = " == "; break; - case ICmpInst::ICMP_ULE: - case ICmpInst::ICMP_SLE: BranchPredicate = " > "; break; - case ICmpInst::ICMP_UGE: - case ICmpInst::ICMP_SGE: BranchPredicate = " < "; break; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_SLT: BranchPredicate = " >= "; break; - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_SGT: BranchPredicate = " <= "; break; - default: llvm_unreachable("Illegal ICmp predicate"); - } - - // DEBUG(errs() << "Branch Condition: " << *BranchCondition << "\n"); + ICmpInst *BranchCondition = + dyn_cast<ICmpInst>(dyn_cast<BranchInst>(ExitingBranch)->getCondition()); + switch (BranchCondition->getPredicate()) { + case ICmpInst::ICMP_EQ: + BranchPredicate = " != "; + break; + case ICmpInst::ICMP_NE: + BranchPredicate = " == "; + break; + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SLE: + BranchPredicate = " > "; + break; + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_SGE: + BranchPredicate = " < "; + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SLT: + BranchPredicate = " >= "; + break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_SGT: + BranchPredicate = " <= "; + break; + default: + llvm_unreachable("Illegal ICmp predicate"); + } + + // DEBUG(errs() << "Branch Condition: " << *BranchCondition << "\n"); std::string compLHS, compRHS; Value *CondOp1 = BranchCondition->getOperand(0); // DEBUG(errs() << "CondOp1: " << *CondOp1 << "\n"); if (Constant *constOp1 = dyn_cast<Constant>(CondOp1)) { - // DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n"); - compLHS = 
(constOp1->getUniqueInteger()).toString(10,1); + // DEBUG(errs() << "Condition Operand is a constant, inserting it as + // is.\n"); + compLHS = (constOp1->getUniqueInteger()).toString(10, 1); } else { // DEBUG(errs() << "Condition Operand is not a constant, "); - if(traverseUseDefChain(dyn_cast<Instruction>(CondOp1), InductionVariable)) { + if (traverseUseDefChain(dyn_cast<Instruction>(CondOp1), + InductionVariable)) { // DEBUG(errs() << "it is the IV.\n"); compLHS = GetValueName(IV); } else { @@ -3327,11 +3675,13 @@ void CWriter::printLoop(Loop *L) { Value *CondOp2 = BranchCondition->getOperand(1); // DEBUG(errs() << "CondOp2: " << *CondOp2 << "\n"); if (Constant *constOp2 = dyn_cast<Constant>(CondOp2)) { - // DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n"); - compRHS = (constOp2->getUniqueInteger()).toString(10,1); + // DEBUG(errs() << "Condition Operand is a constant, inserting it as + // is.\n"); + compRHS = (constOp2->getUniqueInteger()).toString(10, 1); } else { // DEBUG(errs() << "Condition Operand is not a constant.\n"); - if(traverseUseDefChain(dyn_cast<Instruction>(CondOp2), InductionVariable)) { + if (traverseUseDefChain(dyn_cast<Instruction>(CondOp2), + InductionVariable)) { // DEBUG(errs() << "It is the IV.\n"); compRHS = GetValueName(IV); } else { @@ -3340,24 +3690,22 @@ void CWriter::printLoop(Loop *L) { } } - std::string startStr; + std::string startStr; if (Constant *startConst = dyn_cast<Constant>(StartValue)) { - startStr = (startConst->getUniqueInteger()).toString(10,1); + startStr = (startConst->getUniqueInteger()).toString(10, 1); } else { startStr = GetValueName(StartValue); } - - // DEBUG(errs() << " for ( " << IVName << " = " << startStr << "; " - // << compLHS << BranchPredicate << compRHS << "; " + // DEBUG(errs() << " for ( " << IVName << " = " << startStr << "; " + // << compLHS << BranchPredicate << compRHS << "; " // << IVName << " = " << IVName << IVOp << *Step << ") {\n"); - Out << "\n for ( " << IVName << 
" = " << startStr << "; " - << compLHS << BranchPredicate << compRHS << "; " - << IVName << " = " << IVName << IVOp << *Step << ") {\n"; + Out << "\n for ( " << IVName << " = " << startStr << "; " << compLHS + << BranchPredicate << compRHS << "; " << IVName << " = " << IVName + << IVOp << *Step << ") {\n"; } - BasicBlock *BB = L->getHeader(); // printBBorLoop(BB); printBasicBlock(BB); @@ -3381,7 +3729,7 @@ void CWriter::printLoop(Loop *L) { } void CWriter::printBasicBlock(BasicBlock *BB) { - //DEBUG(errs() << "\n\nProcessing Basic Block: " << BB->getName() << "\n"); + // DEBUG(errs() << "\n\nProcessing Basic Block: " << BB->getName() << "\n"); Out << "\n\n/* Processing Basic Block: " << BB->getName() << " */\n"; // Don't print the label for the basic block if there are no uses, or if @@ -3400,19 +3748,19 @@ void CWriter::printBasicBlock(BasicBlock *BB) { Out << "/* " << GetValueName(BB) << ": */\n"; // Output all of the instructions in the basic block... - for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; - ++II) { + for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; ++II) { Instruction *I = &*II; - //DEBUG(errs() << "*********Processing: " << *I << "\n"); + // DEBUG(errs() << "*********Processing: " << *I << "\n"); bool skip = false; - for(Use &U : I->operands()) { + for (Use &U : I->operands()) { Value *v = U.get(); - if(PHINode *PN = dyn_cast<PHINode>(v)) { + if (PHINode *PN = dyn_cast<PHINode>(v)) { if (LInductionVars.find(PN) != LInductionVars.end()) { bool UserPHI = false; bool UserCMP = false; bool UserOTHER = false; - //// DEBUG(errs() << "Instruction uses induction variable\n"); + //// DEBUG(errs() << "Instruction uses induction + /// variable\n"); for (User *IUser : I->users()) { if (Instruction *UserInst = dyn_cast<Instruction>(IUser)) { // DEBUG(errs() << "User: " << *UserInst << "\n"); @@ -3435,28 +3783,28 @@ void CWriter::printBasicBlock(BasicBlock *BB) { if (skip) break; } - if(skip){ - // DEBUG(errs() << 
"Skipping instruction that increments Induction Variable!\n"); + if (skip) { + // DEBUG(errs() << "Skipping instruction that increments Induction + // Variable!\n"); Out << "/* Skipped induction variable use: " << *I << " */\n"; continue; } - if(PHINode *PN = dyn_cast<PHINode>(I)) { - if (LInductionVars.find(PN) != LInductionVars.end()) { + if (PHINode *PN = dyn_cast<PHINode>(I)) { + if (LInductionVars.find(PN) != LInductionVars.end()) { // DEBUG(errs() << "Skipping PHINode for Induction Variable!\n"); Out << "/* PHINode of induction variable was here */\n"; continue; } } if (!isInlinableInst(*II) && !isDirectAlloca(&*II)) { - if (!isEmptyType(II->getType()) && - !isInlineAsm(*II)) + if (!isEmptyType(II->getType()) && !isInlineAsm(*II)) outputLValue(&*II); else Out << " "; writeInstComputationInline(*II); Out << ";\n"; } else { - //DEBUG(errs() << "Skipping inlinable or direct alloca!\n"); + // DEBUG(errs() << "Skipping inlinable or direct alloca!\n"); } } @@ -3464,7 +3812,6 @@ void CWriter::printBasicBlock(BasicBlock *BB) { visit(*BB->getTerminator()); } - // Specific Instruction type classes... note that all of the casts are // necessary because we use the instruction classes as opaque types... 
// @@ -3494,11 +3841,11 @@ void CWriter::visitReturnInst(ReturnInst &I) { } void CWriter::visitSwitchInst(SwitchInst &SI) { - Value* Cond = SI.getCondition(); + Value *Cond = SI.getCondition(); unsigned NumBits = cast<IntegerType>(Cond->getType())->getBitWidth(); if (SI.getNumCases() == 0) { // unconditional branch - printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2); + printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2); printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2); Out << "\n"; @@ -3506,18 +3853,17 @@ void CWriter::visitSwitchInst(SwitchInst &SI) { Out << " switch ("; writeOperand(Cond); Out << ") {\n default:\n"; - printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2); + printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2); printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2); - // CHECK: Needs much testing for (auto Case : SI.cases()) { - ConstantInt* CaseVal = Case.getCaseValue(); - BasicBlock* Succ = Case.getCaseSuccessor(); + ConstantInt *CaseVal = Case.getCaseValue(); + BasicBlock *Succ = Case.getCaseSuccessor(); Out << " case "; writeOperand(CaseVal); Out << ":\n"; - printPHICopiesForSuccessor (SI.getParent(), Succ, 2); + printPHICopiesForSuccessor(SI.getParent(), Succ, 2); if (isGotoCodeNecessary(SI.getParent(), Succ)) printBranchToBlock(SI.getParent(), Succ, 2); else @@ -3530,18 +3876,18 @@ void CWriter::visitSwitchInst(SwitchInst &SI) { // CHECK: Needs much testing for (auto Case : SI.cases()) { Out << "if ("; - ConstantInt* CaseVal = Case.getCaseValue(); - BasicBlock* Succ = Case.getCaseSuccessor(); + ConstantInt *CaseVal = Case.getCaseValue(); + BasicBlock *Succ = Case.getCaseSuccessor(); ICmpInst *icmp = new ICmpInst(CmpInst::ICMP_EQ, Cond, CaseVal); visitICmpInst(*icmp); delete icmp; Out << ") {\n"; - printPHICopiesForSuccessor (SI.getParent(), Succ, 2); + printPHICopiesForSuccessor(SI.getParent(), Succ, 2); printBranchToBlock(SI.getParent(), Succ, 2); Out << " } else "; 
} Out << "{\n"; - printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2); + printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2); printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2); Out << " }\n"; } @@ -3563,22 +3909,23 @@ bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) { return true; if (std::next(Function::iterator(From)) != Function::iterator(To)) - return true; // Not the direct successor, we need a goto. + return true; // Not the direct successor, we need a goto. - //isa<SwitchInst>(From->getTerminator()) + // isa<SwitchInst>(From->getTerminator()) if (LI->getLoopFor(From) != LI->getLoopFor(To)) return true; return false; } -void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock, - BasicBlock *Successor, - unsigned Indent) { - Out << "/* Printing PHIs for " << CurBlock->getName() << "->" << Successor->getName() << " */\n"; +void CWriter::printPHICopiesForSuccessor(BasicBlock *CurBlock, + BasicBlock *Successor, + unsigned Indent) { + Out << "/* Printing PHIs for " << CurBlock->getName() << "->" + << Successor->getName() << " */\n"; for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) { PHINode *PN = cast<PHINode>(I); - if(LInductionVars.find(PN) == LInductionVars.end()) { + if (LInductionVars.find(PN) == LInductionVars.end()) { Out << "/* Printing phi node: " << *PN << " */\n"; // Now we have to do the printing. 
Value *IV = PN->getIncomingValueForBlock(CurBlock); @@ -3595,7 +3942,7 @@ void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock, } void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ, - unsigned Indent) { + unsigned Indent) { if (isGotoCodeNecessary(CurBB, Succ)) { Out << std::string(Indent, ' ') << " goto "; writeOperand(Succ); @@ -3603,76 +3950,89 @@ void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ, } } -void CWriter::printBBorLoop (BasicBlock *BB) { - //DEBUG(errs() << "\nPrinting: " << BB->getName() << "\n"); +void CWriter::printBBorLoop(BasicBlock *BB) { + // DEBUG(errs() << "\nPrinting: " << BB->getName() << "\n"); Out << "\n/* Printing: " << BB->getName() << " */\n"; - if(VisitedBlocks.find(BB)!=VisitedBlocks.end() && ReplicateBlocks.find(BB)==ReplicateBlocks.end()) { - //DEBUG(errs() << "This BB has already been printed and is not marked for replication! exiting!\n"); - Out << "/* This BB has already been printed and is not marked for replication! exiting! */\n"; - } else if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == BB) { - //DEBUG(errs() << "Reached block that is top of stack, return instead!\n"); + if (VisitedBlocks.find(BB) != VisitedBlocks.end() && + ReplicateBlocks.find(BB) == ReplicateBlocks.end()) { + // DEBUG(errs() << "This BB has already been printed and is not marked for + // replication! exiting!\n"); + Out << "/* This BB has already been printed and is not marked for " + "replication! exiting! */\n"; + } else if (!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == BB) { + // DEBUG(errs() << "Reached block that is top of stack, return instead!\n"); Out << "/* " << BB->getName() << " is top of stack, return instead! 
*/\n"; // ImmPostDommBlocks.pop(); } else { VisitedBlocks.insert(BB); - if(Loop *LL = LI->getLoopFor(BB)) { + if (Loop *LL = LI->getLoopFor(BB)) { if (LL->getHeader() == BB) printLoop(LL); - else + else printBasicBlock(BB); } else { printBasicBlock(BB); } } - } -bool CWriter::compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm) { +bool CWriter::compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock, + BasicBlock *ImmPostDomm) { CompVisitedBlocks.insert(CurrBlock); - //DEBUG(errs() << "--Comparing " << CurrBlock->getName() << " with " << CompBlock->getName() << "\n"); + // DEBUG(errs() << "--Comparing " << CurrBlock->getName() << " with " << + // CompBlock->getName() << "\n"); if (CurrBlock == ImmPostDomm) { - //DEBUG(errs() << "----Reached Post Dominator, returning false!\n"); + // DEBUG(errs() << "----Reached Post Dominator, returning false!\n"); return false; } else if (CurrBlock == CompBlock) { - //DEBUG(errs() << "----Found a match! " << CurrBlock->getName() << " == " << CompBlock->getName() << "\n"); + // DEBUG(errs() << "----Found a match! 
" << CurrBlock->getName() << " == " + // << CompBlock->getName() << "\n"); return true; } else { bool res = false; - for (auto succ: successors(CurrBlock)) { + for (auto succ : successors(CurrBlock)) { if (CompVisitedBlocks.find(succ) == CompVisitedBlocks.end()) { - //DEBUG(errs() << "----Visiting successor " << succ->getName() << " of " << CurrBlock->getName() << "\n"); + // DEBUG(errs() << "----Visiting successor " << succ->getName() << " of + // " << CurrBlock->getName() << "\n"); res = res || compareBlocks(succ, CompBlock, ImmPostDomm); } else { - //DEBUG(errs() << "----Skipping successor " << succ->getName() << " of " << CurrBlock->getName() << "\n"); + // DEBUG(errs() << "----Skipping successor " << succ->getName() << " of + // " << CurrBlock->getName() << "\n"); } } return res; } } -bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm) { +bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, + BasicBlock *ImmPostDomm) { if (CompBlock == ImmPostDomm) { - //DEBUG(errs() << "Reached PostDomm; returning!\n"); + // DEBUG(errs() << "Reached PostDomm; returning!\n"); return false; } FindVisitedBlocks.insert(CompBlock); - //DEBUG(errs() << "Finding match between " << CompBlock->getName() << " & " << CurrBlock->getName() << "\n"); + // DEBUG(errs() << "Finding match between " << CompBlock->getName() << " & " + // << CurrBlock->getName() << "\n"); bool compareResult = compareBlocks(CurrBlock, CompBlock, ImmPostDomm); CompVisitedBlocks.clear(); - if (compareResult){ - //DEBUG(errs() << "Match found, marking " << CompBlock->getName() << " for replication!\n"); + if (compareResult) { + // DEBUG(errs() << "Match found, marking " << CompBlock->getName() << " for + // replication!\n"); // Flag for replication ReplicateBlocks.insert(CompBlock); return true; } else { bool res = false; - for (auto succ: successors(CompBlock)) { - if(FindVisitedBlocks.find(succ) == FindVisitedBlocks.end()) { - //DEBUG(errs() << "Visiting 
successor " << succ->getName() << " of " << CompBlock->getName() << "\n"); + for (auto succ : successors(CompBlock)) { + if (FindVisitedBlocks.find(succ) == FindVisitedBlocks.end()) { + // DEBUG(errs() << "Visiting successor " << succ->getName() << " of " << + // CompBlock->getName() << "\n"); res = res || findMatch(CurrBlock, succ, ImmPostDomm); - if (res == true) break; + if (res == true) + break; } else { - //DEBUG(errs() << "Skipping successor " << succ->getName() << " of " << CompBlock->getName() << "\n"); + // DEBUG(errs() << "Skipping successor " << succ->getName() << " of " << + // CompBlock->getName() << "\n"); } } return res; @@ -3682,13 +4042,13 @@ bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock // that immediately succeeds the current one. // void CWriter::visitBranchInst(BranchInst &I) { - errs() << "Visiting Branch Instruction: " << I <<"\n"; + errs() << "Visiting Branch Instruction: " << I << "\n"; Out << "\n/* Branch: " << I << " */\n"; if (I.isConditional()) { BasicBlock *BB0 = I.getSuccessor(0); BasicBlock *BB1 = I.getSuccessor(1); - BasicBlock *ImmPostDomm = PDT->findNearestCommonDominator(BB0,BB1); + BasicBlock *ImmPostDomm = PDT->findNearestCommonDominator(BB0, BB1); // Iterate over all BBs in then & else to find a matching BB // If found, mark it for replication @@ -3696,166 +4056,189 @@ void CWriter::visitBranchInst(BranchInst &I) { findMatch(BB0, BB1, ImmPostDomm); FindVisitedBlocks.clear(); } - if(Loop *L = LI->getLoopFor(I.getParent())) { - if(L == LI->getLoopFor(BB0) && !(L == LI->getLoopFor(BB1))) { + if (Loop *L = LI->getLoopFor(I.getParent())) { + if (L == LI->getLoopFor(BB0) && !(L == LI->getLoopFor(BB1))) { errs() << "This is a loop branch!\n"; Out << "/* This is a loop branch! */\n"; - //BB0 is in the loop. Print it if it hsn't been printed - if(VisitedBlocks.find(BB0) != VisitedBlocks.end()) { + // BB0 is in the loop. 
Print it if it hsn't been printed + if (VisitedBlocks.find(BB0) != VisitedBlocks.end()) { errs() << "Branching back to header: " << BB0->getName() << "\n"; errs() << "This is the end of the loop, closing!\n"; Out << "/* Branching back to header: " << BB0->getName() << " */\n"; Out << "/* Closing loop! */\n"; - //BB0 is the loop header. CLose the loop then print BB1. - printPHICopiesForSuccessor (I.getParent(), BB0, 2); + // BB0 is the loop header. CLose the loop then print BB1. + printPHICopiesForSuccessor(I.getParent(), BB0, 2); Out << " }\n"; - printPHICopiesForSuccessor (I.getParent(), BB1, 2); + printPHICopiesForSuccessor(I.getParent(), BB1, 2); printBBorLoop(BB1); } else { - errs() << "Not branching to header! Branching to: " << BB0->getName() << "\n"; - //BB0 is not the loop header. That means we are entering loop body + errs() << "Not branching to header! Branching to: " << BB0->getName() + << "\n"; + // BB0 is not the loop header. That means we are entering loop body llvm_unreachable("loop branch unhandled!\n"); } - } else if(L == LI->getLoopFor(BB1) && !(L == LI->getLoopFor(BB0))) { + } else if (L == LI->getLoopFor(BB1) && !(L == LI->getLoopFor(BB0))) { errs() << "This is a loop branch!\n"; Out << "/* This is a loop branch! */\n"; - if(VisitedBlocks.find(BB1) != VisitedBlocks.end()) { + if (VisitedBlocks.find(BB1) != VisitedBlocks.end()) { errs() << "Branching back to header: " << BB1->getName() << "\n"; errs() << "This is the end of the loop, closing!\n"; Out << "/* Branching back to header: " << BB1->getName() << " */\n"; Out << "/* Closing loop! */\n"; - //BB0 is the loop header. CLose the loop then print BB1. - printPHICopiesForSuccessor (I.getParent(), BB1, 2); + // BB0 is the loop header. CLose the loop then print BB1. 
+ printPHICopiesForSuccessor(I.getParent(), BB1, 2); Out << " }\n"; - printPHICopiesForSuccessor (I.getParent(), BB0, 2); + printPHICopiesForSuccessor(I.getParent(), BB0, 2); printBBorLoop(BB0); } else { - errs() << "Not branching to header! Branching to: " << BB1->getName() << "\n"; - //BB1 is not the loop header. That means we are entering loop body + errs() << "Not branching to header! Branching to: " << BB1->getName() + << "\n"; + // BB1 is not the loop header. That means we are entering loop body llvm_unreachable("loop branch unhandled!\n"); } } else { errs() << "This is a conditional statement within a loop!\n"; Out << "/* This is a conditional statement within a loop! */\n"; - errs() << ImmPostDomm->getName() << " is the immediate post dominator of " << BB0->getName() << " and " << BB1->getName() << "\n"; - if(VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) { - errs() << "Not pushing " << ImmPostDomm->getName() << " because it has already been visited!\n"; + errs() << ImmPostDomm->getName() + << " is the immediate post dominator of " << BB0->getName() + << " and " << BB1->getName() << "\n"; + if (VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) { + errs() << "Not pushing " << ImmPostDomm->getName() + << " because it has already been visited!\n"; } else { errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n"; ImmPostDommBlocks.push(ImmPostDomm); } bool noElse = false; - if(BB1 == ImmPostDomm) { + if (BB1 == ImmPostDomm) { noElse = true; } Out << " if ("; writeOperand(I.getCondition(), ContextCasted); Out << ") { /* " << I << "*/\n"; - printPHICopiesForSuccessor (I.getParent(), BB0, 2); + printPHICopiesForSuccessor(I.getParent(), BB0, 2); printBBorLoop(BB0); - errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n"; - Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n"; + errs() << "Back to handling " << I.getParent()->getName() << ": " << I + << "\n"; + Out << "/* Back to 
handling " << I.getParent()->getName() << ": " << I + << " */\n"; if (!noElse) { errs() << "Printing else!\n"; Out << " } else { /*" << I << "*/\n"; - printPHICopiesForSuccessor (I.getParent(), BB1, 2); + printPHICopiesForSuccessor(I.getParent(), BB1, 2); ElseBlocks.push(BB1); ElseBranches.push(&I); printBBorLoop(BB1); - errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n"; + errs() << "Back to handling " << I.getParent()->getName() << ": " << I + << "\n"; errs() << "Check to see if else block is closed!\n"; - Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n" ; - Out << "/* Check to see if else block is closed! */\n" ; - if(!ElseBlocks.empty() && ElseBlocks.top() == BB1) { + Out << "/* Back to handling " << I.getParent()->getName() << ": " << I + << " */\n"; + Out << "/* Check to see if else block is closed! */\n"; + if (!ElseBlocks.empty() && ElseBlocks.top() == BB1) { errs() << "Else block not closed, need to close braces!\n"; - Out << "/* Else block not closed, need to close braces! */\n" ; + Out << "/* Else block not closed, need to close braces! */\n"; Out << "} /* closing " << *(ElseBranches.top()) << " */\n"; ElseBranches.pop(); ElseBlocks.pop(); } - if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == ImmPostDomm) { + if (!ImmPostDommBlocks.empty() && + ImmPostDommBlocks.top() == ImmPostDomm) { errs() << "Will now pop post dom them handle it!\n"; ImmPostDommBlocks.pop(); printBBorLoop(ImmPostDomm); } else { - errs() << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n"; + errs() + << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n"; } } else { - errs() << "No else block. Adding one for phis, then moving to " << BB1->getName() << "!\n"; - Out << "/* (3913) No else block. Adding one for phis, then moving to " << BB1->getName() << "! */\n"; + errs() << "No else block. Adding one for phis, then moving to " + << BB1->getName() << "!\n"; + Out << "/* (3913) No else block. 
Adding one for phis, then moving to " + << BB1->getName() << "! */\n"; Out << " } /* closing " << I << "*/\n"; errs() << "Will now pop post dom them handle it!\n"; ImmPostDommBlocks.pop(); Out << "else {\n"; - printPHICopiesForSuccessor (I.getParent(), BB1, 2); + printPHICopiesForSuccessor(I.getParent(), BB1, 2); Out << "}\n"; printBBorLoop(BB1); } } } else { errs() << "This is a conditional statement!\n"; - errs() << ImmPostDomm->getName() << " is the immediate post dominator of " << BB0->getName() << " and " << BB1->getName() << "\n"; - if(VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) { - errs() << "Not pushing " << ImmPostDomm->getName() << " because it has already been visited!\n"; + errs() << ImmPostDomm->getName() << " is the immediate post dominator of " + << BB0->getName() << " and " << BB1->getName() << "\n"; + if (VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) { + errs() << "Not pushing " << ImmPostDomm->getName() + << " because it has already been visited!\n"; } else { errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n"; ImmPostDommBlocks.push(ImmPostDomm); } bool noElse = false; - if(BB1 == ImmPostDomm) { + if (BB1 == ImmPostDomm) { noElse = true; } Out << " if ("; writeOperand(I.getCondition(), ContextCasted); Out << ") { /* " << I << "*/\n"; - printPHICopiesForSuccessor (I.getParent(), BB0, 2); + printPHICopiesForSuccessor(I.getParent(), BB0, 2); printBBorLoop(BB0); - errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n"; - Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n" ; + errs() << "Back to handling " << I.getParent()->getName() << ": " << I + << "\n"; + Out << "/* Back to handling " << I.getParent()->getName() << ": " << I + << " */\n"; if (!noElse) { errs() << "Printing else!\n"; - Out << "/* Printing else! */\n" ; + Out << "/* Printing else! 
*/\n"; Out << " } else { /*" << I << "*/\n"; - printPHICopiesForSuccessor (I.getParent(), BB1, 2); + printPHICopiesForSuccessor(I.getParent(), BB1, 2); ElseBlocks.push(BB1); ElseBranches.push(&I); printBBorLoop(BB1); - errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n"; + errs() << "Back to handling " << I.getParent()->getName() << ": " << I + << "\n"; errs() << "Check to see if else block is closed!\n"; - Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n"; + Out << "/* Back to handling " << I.getParent()->getName() << ": " << I + << " */\n"; Out << "/* Check to see if else block is closed! */\n"; - if(!ElseBlocks.empty() && ElseBlocks.top() == BB1) { + if (!ElseBlocks.empty() && ElseBlocks.top() == BB1) { errs() << "Else block not closed, need to close braces!\n"; Out << "/* Else block not closed, need to close braces! */\n"; Out << "} /* closing " << *(ElseBranches.top()) << " */\n"; ElseBranches.pop(); ElseBlocks.pop(); } - if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == ImmPostDomm) { + if (!ImmPostDommBlocks.empty() && + ImmPostDommBlocks.top() == ImmPostDomm) { errs() << "Will now pop post dom them handle it!\n"; ImmPostDommBlocks.pop(); printBBorLoop(ImmPostDomm); } else { - errs() << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n"; + errs() + << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n"; } } else { - errs() << "No else block. Adding one for phis, then moving to " << BB1->getName() << "!\n"; - Out << "/* (3985) No else block. Adding one for phis, then moving to " << BB1->getName() << "! */\n"; + errs() << "No else block. Adding one for phis, then moving to " + << BB1->getName() << "!\n"; + Out << "/* (3985) No else block. Adding one for phis, then moving to " + << BB1->getName() << "! 
*/\n"; Out << " } /* closing " << I << "*/\n"; errs() << "Will now pop post dom them handle it!\n"; ImmPostDommBlocks.pop(); Out << "else {\n"; - printPHICopiesForSuccessor (I.getParent(), BB1, 2); + printPHICopiesForSuccessor(I.getParent(), BB1, 2); Out << "}\n"; printBBorLoop(BB1); } } } else { errs() << "This is an unconditional branch!\n"; - BasicBlock *BB = I.getSuccessor(0); - printPHICopiesForSuccessor (I.getParent(), BB, 2); + BasicBlock *BB = I.getSuccessor(0); + printPHICopiesForSuccessor(I.getParent(), BB, 2); if (!ElseBlocks.empty() && I.getParent() == ElseBlocks.top()) { errs() << "Branch marks end of else block, need to close braces!\n"; Out << "/* Branch marks end of else block, need to close braces! */\n"; @@ -3875,13 +4258,11 @@ void CWriter::visitPHINode(PHINode &I) { if (LInductionVars.find(&I) == LInductionVars.end()) { writeOperand(&I); Out << "__PHI_TEMPORARY"; - } - else { - // DEBUG(errs() << "Skipping PHI node for induction variable!\n"); + } else { + // DEBUG(errs() << "Skipping PHI node for induction variable!\n"); } } - // NOTE: Moving LLVM-4 Binary Op functions here bool isNeg(const Value *V) { if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V)) @@ -3902,13 +4283,12 @@ bool isFNeg(const Value *V, bool IgnoreZeroSign) { return false; } - Value *getNegArgument(Value *BinOp) { return cast<BinaryOperator>(BinOp)->getOperand(1); } const Value *getNegArgument(const Value *BinOp) { - return getNegArgument(const_cast<Value*>(BinOp)); + return getNegArgument(const_cast<Value *>(BinOp)); } Value *getFNegArgument(Value *BinOp) { @@ -3916,7 +4296,7 @@ Value *getFNegArgument(Value *BinOp) { } const Value *getFNegArgument(const Value *BinOp) { - return getFNegArgument(const_cast<Value*>(BinOp)); + return getFNegArgument(const_cast<Value *>(BinOp)); } static inline bool isConstantAllOnes(const Value *V) { @@ -3928,32 +4308,27 @@ static inline bool isConstantAllOnes(const Value *V) { bool isNot(const Value *V) { if (const BinaryOperator *Bop = 
dyn_cast<BinaryOperator>(V)) return (Bop->getOpcode() == Instruction::Xor && - (isConstantAllOnes(Bop->getOperand(1)) || - isConstantAllOnes(Bop->getOperand(0)))); + (isConstantAllOnes(Bop->getOperand(1)) || + isConstantAllOnes(Bop->getOperand(0)))); return false; } - Value *getNotArgument(Value *BinOp) { assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!"); BinaryOperator *BO = cast<BinaryOperator>(BinOp); Value *Op0 = BO->getOperand(0); Value *Op1 = BO->getOperand(1); - if (isConstantAllOnes(Op0)) return Op1; + if (isConstantAllOnes(Op0)) + return Op1; assert(isConstantAllOnes(Op1)); return Op0; } const Value *getNotArgument(const Value *BinOp) { - return getNotArgument(const_cast<Value*>(BinOp)); + return getNotArgument(const_cast<Value *>(BinOp)); } - - - - - void CWriter::visitBinaryOperator(BinaryOperator &I) { // binary instructions, shift instructions, setCond instructions. assert(!I.getType()->isPointerTy()); @@ -3979,7 +4354,8 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) { // DEBUG( // if(needsCast) errs() << "****Needs Cast: \n" << I << "\n"; // else if(shouldCast) errs() << "****Should Cast: \n" << I << "\n"; - // else if(I.getType()->isVectorTy()) errs() << "****Is Vector Type: \n" << I << "\n"; + // else if(I.getType()->isVectorTy()) errs() << "****Is Vector Type: \n" + // << I << "\n"; // ); // // Type *VTy = I.getOperand(0)->getType(); @@ -4019,13 +4395,13 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) { // If this is a negation operation, print it out as such. For FP, we don't // want to print "-0.0 - X". 
- //if (BinaryOperator::isNeg(&I)) { + // if (BinaryOperator::isNeg(&I)) { if (isNeg(&I)) { Out << "-("; writeOperand(getNegArgument(&I)); Out << ")"; } - //else if (BinaryOperator::isFNeg(&I)) { + // else if (BinaryOperator::isFNeg(&I)) { else if (isFNeg(&I, true)) { Out << "-("; writeOperand(getFNegArgument(&I)); @@ -4040,7 +4416,7 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) { Out << "fmodf("; else if (I.getType() == Type::getDoubleTy(I.getContext())) Out << "fmod("; - else // all 3 flavors of long double + else // all 3 flavors of long double Out << "fmodl("; writeOperand(I.getOperand(0), ContextCasted); Out << ", "; @@ -4058,29 +4434,49 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) { writeOperandWithCast(I.getOperand(0), I.getOpcode()); switch (I.getOpcode()) { - case Instruction::Add: - case Instruction::FAdd: Out << " + "; break; - case Instruction::Sub: - case Instruction::FSub: Out << " - "; break; - case Instruction::Mul: - case Instruction::FMul: Out << " * "; break; - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: Out << " % "; break; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: Out << " / "; break; - case Instruction::And: Out << " & "; break; - case Instruction::Or: Out << " | "; break; - case Instruction::Xor: Out << " ^ "; break; - case Instruction::Shl : Out << " << "; break; - case Instruction::LShr: - case Instruction::AShr: Out << " >> "; break; - default: + case Instruction::Add: + case Instruction::FAdd: + Out << " + "; + break; + case Instruction::Sub: + case Instruction::FSub: + Out << " - "; + break; + case Instruction::Mul: + case Instruction::FMul: + Out << " * "; + break; + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + Out << " % "; + break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + Out << " / "; + break; + case Instruction::And: + Out << " & "; + break; + case Instruction::Or: + Out << 
" | "; + break; + case Instruction::Xor: + Out << " ^ "; + break; + case Instruction::Shl: + Out << " << "; + break; + case Instruction::LShr: + case Instruction::AShr: + Out << " >> "; + break; + default: #ifndef NDEBUG - errs() << "Invalid operator type!" << I; + errs() << "Invalid operator type!" << I; #endif - llvm_unreachable(0); + llvm_unreachable(0); } writeOperandWithCast(I.getOperand(1), I.getOpcode()); @@ -4090,8 +4486,8 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) { } void CWriter::visitICmpInst(ICmpInst &I) { - if (I.getType()->isVectorTy() - || I.getOperand(0)->getType()->getPrimitiveSizeInBits() > 64) { + if (I.getType()->isVectorTy() || + I.getOperand(0)->getType()->getPrimitiveSizeInBits() > 64) { Out << "llvm_icmp_" << getCmpPredicateName(I.getPredicate()) << "_"; printTypeString(Out, I.getOperand(0)->getType(), I.isSigned()); Out << "("; @@ -4100,8 +4496,10 @@ void CWriter::visitICmpInst(ICmpInst &I) { writeOperand(I.getOperand(1), ContextCasted); Out << ")"; if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) { - CmpDeclTypes.insert(std::pair<CmpInst::Predicate, VectorType*>(I.getPredicate(), VTy)); - TypedefDeclTypes.insert(I.getType()); // insert type not necessarily visible above + CmpDeclTypes.insert( + std::pair<CmpInst::Predicate, VectorType *>(I.getPredicate(), VTy)); + TypedefDeclTypes.insert( + I.getType()); // insert type not necessarily visible above } return; } @@ -4116,21 +4514,33 @@ void CWriter::visitICmpInst(ICmpInst &I) { writeOperandWithCast(I.getOperand(0), I); switch (I.getPredicate()) { - case ICmpInst::ICMP_EQ: Out << " == "; break; - case ICmpInst::ICMP_NE: Out << " != "; break; - case ICmpInst::ICMP_ULE: - case ICmpInst::ICMP_SLE: Out << " <= "; break; - case ICmpInst::ICMP_UGE: - case ICmpInst::ICMP_SGE: Out << " >= "; break; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_SLT: Out << " < "; break; - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_SGT: Out << " > "; break; - default: + case 
ICmpInst::ICMP_EQ: + Out << " == "; + break; + case ICmpInst::ICMP_NE: + Out << " != "; + break; + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SLE: + Out << " <= "; + break; + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_SGE: + Out << " >= "; + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SLT: + Out << " < "; + break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_SGT: + Out << " > "; + break; + default: #ifndef NDEBUG - errs() << "Invalid icmp predicate!" << I; + errs() << "Invalid icmp predicate!" << I; #endif - llvm_unreachable(0); + llvm_unreachable(0); } writeOperandWithCast(I.getOperand(1), I); @@ -4148,8 +4558,10 @@ void CWriter::visitFCmpInst(FCmpInst &I) { writeOperand(I.getOperand(1), ContextCasted); Out << ")"; if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) { - CmpDeclTypes.insert(std::pair<CmpInst::Predicate, VectorType*>(I.getPredicate(), VTy)); - TypedefDeclTypes.insert(I.getType()); // insert type not necessarily visible above + CmpDeclTypes.insert( + std::pair<CmpInst::Predicate, VectorType *>(I.getPredicate(), VTy)); + TypedefDeclTypes.insert( + I.getType()); // insert type not necessarily visible above } return; } @@ -4163,18 +4575,21 @@ void CWriter::visitFCmpInst(FCmpInst &I) { Out << ")"; } -static const char * getFloatBitCastField(Type *Ty) { +static const char *getFloatBitCastField(Type *Ty) { switch (Ty->getTypeID()) { - default: llvm_unreachable("Invalid Type"); - case Type::FloatTyID: return "Float"; - case Type::DoubleTyID: return "Double"; - case Type::IntegerTyID: { - unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); - if (NumBits <= 32) - return "Int32"; - else - return "Int64"; - } + default: + llvm_unreachable("Invalid Type"); + case Type::FloatTyID: + return "Float"; + case Type::DoubleTyID: + return "Double"; + case Type::IntegerTyID: { + unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); + if (NumBits <= 32) + return "Int32"; + else + return "Int64"; + } } } @@ -4183,9 
+4598,9 @@ void CWriter::visitCastInst(CastInst &I) { Type *DstTy = I.getType(); Type *SrcTy = I.getOperand(0)->getType(); - if (DstTy->isVectorTy() || SrcTy->isVectorTy() - || DstTy->getPrimitiveSizeInBits() > 64 - || SrcTy->getPrimitiveSizeInBits() > 64) { + if (DstTy->isVectorTy() || SrcTy->isVectorTy() || + DstTy->getPrimitiveSizeInBits() > 64 || + SrcTy->getPrimitiveSizeInBits() > 64) { Out << "llvm_" << I.getOpcodeName() << "_"; printTypeString(Out, SrcTy, false); Out << "_"; @@ -4193,7 +4608,9 @@ void CWriter::visitCastInst(CastInst &I) { Out << "("; writeOperand(I.getOperand(0), ContextCasted); Out << ")"; - CastOpDeclTypes.insert(std::pair<Instruction::CastOps, std::pair<Type*, Type*> >(I.getOpcode(), std::pair<Type*, Type*>(SrcTy, DstTy))); + CastOpDeclTypes.insert( + std::pair<Instruction::CastOps, std::pair<Type *, Type *>>( + I.getOpcode(), std::pair<Type *, Type *>(SrcTy, DstTy))); return; } @@ -4201,10 +4618,10 @@ void CWriter::visitCastInst(CastInst &I) { Out << '('; // These int<->float and long<->double casts need to be handled specially Out << GetValueName(&I) << "__BITCAST_TEMPORARY." - << getFloatBitCastField(I.getOperand(0)->getType()) << " = "; + << getFloatBitCastField(I.getOperand(0)->getType()) << " = "; writeOperand(I.getOperand(0), ContextCasted); Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY." - << getFloatBitCastField(I.getType()); + << getFloatBitCastField(I.getType()); Out << ')'; return; } @@ -4241,15 +4658,16 @@ void CWriter::visitSelectInst(SelectInst &I) { writeOperand(I.getFalseValue(), ContextCasted); Out << ")"; SelectDeclTypes.insert(I.getType()); - assert(I.getCondition()->getType()->isVectorTy() == I.getType()->isVectorTy()); // TODO: might be scalarty == vectorty + assert(I.getCondition()->getType()->isVectorTy() == + I.getType()->isVectorTy()); // TODO: might be scalarty == vectorty } // Returns the macro name or value of the max or min of an integer type // (as defined in limits.h). 
static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax, - raw_ostream &Out) { - const char* type; - const char* sprefix = ""; + raw_ostream &Out) { + const char *type; + const char *sprefix = ""; unsigned NumBits = Ty.getBitWidth(); if (NumBits <= 8) { @@ -4274,37 +4692,38 @@ static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax, #ifndef NDEBUG static bool isSupportedIntegerSize(IntegerType &T) { return T.getBitWidth() == 8 || T.getBitWidth() == 16 || - T.getBitWidth() == 32 || T.getBitWidth() == 64 || - T.getBitWidth() == 128; + T.getBitWidth() == 32 || T.getBitWidth() == 64 || + T.getBitWidth() == 128; } #endif -void CWriter::printIntrinsicDefinition(FunctionType *funT, - unsigned Opcode, std::string OpName, raw_ostream &Out) { +void CWriter::printIntrinsicDefinition(FunctionType *funT, unsigned Opcode, + std::string OpName, raw_ostream &Out) { Type *retT = funT->getReturnType(); Type *elemT = funT->getParamType(0); IntegerType *elemIntT = dyn_cast<IntegerType>(elemT); char i, numParams = funT->getNumParams(); bool isSigned; switch (Opcode) { - default: - isSigned = false; - break; - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - isSigned = true; - break; + default: + isSigned = false; + break; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + isSigned = true; + break; } assert(numParams > 0 && numParams < 26); if (isa<VectorType>(retT)) { // this looks general, but is only actually used for ctpop, ctlz, cttz - Type* *devecFunParams = (Type**)alloca(sizeof(Type*) * numParams); + Type **devecFunParams = (Type **)alloca(sizeof(Type *) * numParams); for (i = 0; i < numParams; i++) { devecFunParams[(int)i] = funT->params()[(int)i]->getScalarType(); } - FunctionType *devecFunT = FunctionType::get(funT->getReturnType()->getScalarType(), + FunctionType *devecFunT = FunctionType::get( + 
funT->getReturnType()->getScalarType(), makeArrayRef(devecFunParams, numParams), funT->isVarArg()); printIntrinsicDefinition(devecFunT, Opcode, OpName + "_devec", Out); } @@ -4321,19 +4740,20 @@ void CWriter::printIntrinsicDefinition(FunctionType *funT, Out << "("; for (i = 0; i < numParams; i++) { switch (Opcode) { - // optional intrinsic validity assertion checks - default: - // default case: assume all parameters must have the same type - assert(elemT == funT->getParamType(i)); - break; - case Intrinsic::ctlz: - case Intrinsic::cttz: - case Intrinsic::powi: - break; + // optional intrinsic validity assertion checks + default: + // default case: assume all parameters must have the same type + assert(elemT == funT->getParamType(i)); + break; + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::powi: + break; } printTypeNameUnaligned(Out, funT->getParamType(i), isSigned); Out << " " << (char)('a' + i); - if (i != numParams - 1) Out << ", "; + if (i != numParams - 1) + Out << ", "; } Out << ") {\n "; printTypeName(Out, retT); @@ -4346,106 +4766,106 @@ void CWriter::printIntrinsicDefinition(FunctionType *funT, Out << (char)('a' + j); if (isa<VectorType>(funT->params()[j])) Out << ".vector[" << (int)i << "]"; - if (j != numParams - 1) Out << ", "; + if (j != numParams - 1) + Out << ", "; } Out << ");\n"; } - } - else if (elemIntT) { + } else if (elemIntT) { // handle integer ops assert(isSupportedIntegerSize(*elemIntT) && - "CBackend does not support arbitrary size integers."); + "CBackend does not support arbitrary size integers."); switch (Opcode) { - default: + default: #ifndef NDEBUG - errs() << "Unsupported Intrinsic!" << Opcode; + errs() << "Unsupported Intrinsic!" 
<< Opcode; #endif - llvm_unreachable(0); + llvm_unreachable(0); - case Intrinsic::uadd_with_overflow: - // r.field0 = a + b; - // r.field1 = (r.field0 < a); - assert(cast<StructType>(retT)->getElementType(0) == elemT); - Out << " r.field0 = a + b;\n"; - Out << " r.field1 = (a >= -b);\n"; - break; + case Intrinsic::uadd_with_overflow: + // r.field0 = a + b; + // r.field1 = (r.field0 < a); + assert(cast<StructType>(retT)->getElementType(0) == elemT); + Out << " r.field0 = a + b;\n"; + Out << " r.field1 = (a >= -b);\n"; + break; - case Intrinsic::sadd_with_overflow: - // r.field0 = a + b; - // r.field1 = (b > 0 && a > XX_MAX - b) || - // (b < 0 && a < XX_MIN - b); - assert(cast<StructType>(retT)->getElementType(0) == elemT); - Out << " r.field0 = a + b;\n"; - Out << " r.field1 = (b >= 0 ? a > "; - printLimitValue(*elemIntT, true, true, Out); - Out << " - b : a < "; - printLimitValue(*elemIntT, true, false, Out); - Out << " - b);\n"; - break; + case Intrinsic::sadd_with_overflow: + // r.field0 = a + b; + // r.field1 = (b > 0 && a > XX_MAX - b) || + // (b < 0 && a < XX_MIN - b); + assert(cast<StructType>(retT)->getElementType(0) == elemT); + Out << " r.field0 = a + b;\n"; + Out << " r.field1 = (b >= 0 ? a > "; + printLimitValue(*elemIntT, true, true, Out); + Out << " - b : a < "; + printLimitValue(*elemIntT, true, false, Out); + Out << " - b);\n"; + break; - case Intrinsic::usub_with_overflow: - assert(cast<StructType>(retT)->getElementType(0) == elemT); - Out << " r.field0 = a - b;\n"; - Out << " r.field1 = (a < b);\n"; - break; + case Intrinsic::usub_with_overflow: + assert(cast<StructType>(retT)->getElementType(0) == elemT); + Out << " r.field0 = a - b;\n"; + Out << " r.field1 = (a < b);\n"; + break; - case Intrinsic::ssub_with_overflow: - assert(cast<StructType>(retT)->getElementType(0) == elemT); - Out << " r.field0 = a - b;\n"; - Out << " r.field1 = (b <= 0 ? 
a > "; - printLimitValue(*elemIntT, true, true, Out); - Out << " + b : a < "; - printLimitValue(*elemIntT, true, false, Out); - Out << " + b);\n"; - break; + case Intrinsic::ssub_with_overflow: + assert(cast<StructType>(retT)->getElementType(0) == elemT); + Out << " r.field0 = a - b;\n"; + Out << " r.field1 = (b <= 0 ? a > "; + printLimitValue(*elemIntT, true, true, Out); + Out << " + b : a < "; + printLimitValue(*elemIntT, true, false, Out); + Out << " + b);\n"; + break; - case Intrinsic::umul_with_overflow: - assert(cast<StructType>(retT)->getElementType(0) == elemT); - Out << " r.field1 = LLVMMul_uov(8 * sizeof(a), &a, &b, &r.field0);\n"; - break; + case Intrinsic::umul_with_overflow: + assert(cast<StructType>(retT)->getElementType(0) == elemT); + Out << " r.field1 = LLVMMul_uov(8 * sizeof(a), &a, &b, &r.field0);\n"; + break; - case Intrinsic::smul_with_overflow: - assert(cast<StructType>(retT)->getElementType(0) == elemT); - Out << " r.field1 = LLVMMul_sov(8 * sizeof(a), &a, &b, &r.field0);\n"; - break; + case Intrinsic::smul_with_overflow: + assert(cast<StructType>(retT)->getElementType(0) == elemT); + Out << " r.field1 = LLVMMul_sov(8 * sizeof(a), &a, &b, &r.field0);\n"; + break; - case Intrinsic::bswap: - assert(retT == elemT); - Out << " LLVMFlipAllBits(8 * sizeof(a), &a, &r);\n"; - break; + case Intrinsic::bswap: + assert(retT == elemT); + Out << " LLVMFlipAllBits(8 * sizeof(a), &a, &r);\n"; + break; - case Intrinsic::ctpop: - assert(retT == elemT); - Out << " r = "; - if (retT->getPrimitiveSizeInBits() > 64) - Out << "llvm_ctor_u128(0, "; - Out << "LLVMCountPopulation(8 * sizeof(a), &a)"; - if (retT->getPrimitiveSizeInBits() > 64) - Out << ")"; - Out << ";\n"; - break; + case Intrinsic::ctpop: + assert(retT == elemT); + Out << " r = "; + if (retT->getPrimitiveSizeInBits() > 64) + Out << "llvm_ctor_u128(0, "; + Out << "LLVMCountPopulation(8 * sizeof(a), &a)"; + if (retT->getPrimitiveSizeInBits() > 64) + Out << ")"; + Out << ";\n"; + break; - case 
Intrinsic::ctlz: - assert(retT == elemT); - Out << " (void)b;\n r = "; - if (retT->getPrimitiveSizeInBits() > 64) - Out << "llvm_ctor_u128(0, "; - Out << "LLVMCountLeadingZeros(8 * sizeof(a), &a)"; - if (retT->getPrimitiveSizeInBits() > 64) - Out << ")"; - Out << ";\n"; - break; + case Intrinsic::ctlz: + assert(retT == elemT); + Out << " (void)b;\n r = "; + if (retT->getPrimitiveSizeInBits() > 64) + Out << "llvm_ctor_u128(0, "; + Out << "LLVMCountLeadingZeros(8 * sizeof(a), &a)"; + if (retT->getPrimitiveSizeInBits() > 64) + Out << ")"; + Out << ";\n"; + break; - case Intrinsic::cttz: - assert(retT == elemT); - Out << " (void)b;\n r = "; - if (retT->getPrimitiveSizeInBits() > 64) - Out << "llvm_ctor_u128(0, "; - Out << "LLVMCountTrailingZeros(8 * sizeof(a), &a)"; - if (retT->getPrimitiveSizeInBits() > 64) - Out << ")"; - Out << ";\n"; - break; + case Intrinsic::cttz: + assert(retT == elemT); + Out << " (void)b;\n r = "; + if (retT->getPrimitiveSizeInBits() > 64) + Out << "llvm_ctor_u128(0, "; + Out << "LLVMCountTrailingZeros(8 * sizeof(a), &a)"; + if (retT->getPrimitiveSizeInBits() > 64) + Out << ")"; + Out << ";\n"; + break; } } else { @@ -4468,49 +4888,48 @@ void CWriter::printIntrinsicDefinition(FunctionType *funT, } switch (Opcode) { - default: + default: #ifndef NDEBUG - errs() << "Unsupported Intrinsic!" << Opcode; + errs() << "Unsupported Intrinsic!" 
<< Opcode; #endif - llvm_unreachable(0); - - case Intrinsic::ceil: - Out << " r = ceil" << suffix << "(a);\n"; - break; + llvm_unreachable(0); - case Intrinsic::fabs: - Out << " r = fabs" << suffix << "(a);\n"; - break; + case Intrinsic::ceil: + Out << " r = ceil" << suffix << "(a);\n"; + break; - case Intrinsic::floor: - Out << " r = floor" << suffix << "(a);\n"; - break; + case Intrinsic::fabs: + Out << " r = fabs" << suffix << "(a);\n"; + break; - case Intrinsic::fma: - Out << " r = fma" << suffix << "(a, b, c);\n"; - break; + case Intrinsic::floor: + Out << " r = floor" << suffix << "(a);\n"; + break; - case Intrinsic::fmuladd: - Out << " r = a * b + c;\n"; - break; + case Intrinsic::fma: + Out << " r = fma" << suffix << "(a, b, c);\n"; + break; - case Intrinsic::pow: - case Intrinsic::powi: - Out << " r = pow" << suffix << "(a, b);\n"; - break; + case Intrinsic::fmuladd: + Out << " r = a * b + c;\n"; + break; - case Intrinsic::rint: - Out << " r = rint" << suffix << "(a);\n"; - break; + case Intrinsic::pow: + case Intrinsic::powi: + Out << " r = pow" << suffix << "(a, b);\n"; + break; - case Intrinsic::sqrt: - Out << " r = sqrt" << suffix << "(a);\n"; - break; + case Intrinsic::rint: + Out << " r = rint" << suffix << "(a);\n"; + break; - case Intrinsic::trunc: - Out << " r = trunc" << suffix << "(a);\n"; - break; + case Intrinsic::sqrt: + Out << " r = sqrt" << suffix << "(a);\n"; + break; + case Intrinsic::trunc: + Out << " r = trunc" << suffix << "(a);\n"; + break; } } @@ -4528,73 +4947,74 @@ void CWriter::lowerIntrinsics(Function &F) { // Examine all the instructions in this function to find the intrinsics that // need to be lowered. 
for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) if (CallInst *CI = dyn_cast<CallInst>(I++)) if (Function *F = CI->getCalledFunction()) switch (F->getIntrinsicID()) { - case Intrinsic::not_intrinsic: - case Intrinsic::vastart: - case Intrinsic::vacopy: - case Intrinsic::vaend: - case Intrinsic::returnaddress: - case Intrinsic::frameaddress: - case Intrinsic::setjmp: - case Intrinsic::longjmp: - case Intrinsic::sigsetjmp: - case Intrinsic::siglongjmp: - case Intrinsic::prefetch: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse_cmp_ps: - case Intrinsic::x86_sse2_cmp_sd: - case Intrinsic::x86_sse2_cmp_pd: - case Intrinsic::ppc_altivec_lvsl: - case Intrinsic::uadd_with_overflow: - case Intrinsic::sadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::bswap: - case Intrinsic::ceil: - case Intrinsic::ctlz: - case Intrinsic::ctpop: - case Intrinsic::cttz: - case Intrinsic::fabs: - case Intrinsic::floor: - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::pow: - case Intrinsic::powi: - case Intrinsic::rint: - case Intrinsic::sqrt: - case Intrinsic::trunc: - case Intrinsic::trap: - case Intrinsic::stackprotector: - case Intrinsic::dbg_value: - case Intrinsic::dbg_declare: - // We directly implement these intrinsics - break; - default: - // All other intrinsic calls we must lower. 
- BasicBlock::iterator Before = E; - if (CI != &BB->front()) - Before = std::prev(BasicBlock::iterator(CI)); - - IL->LowerIntrinsicCall(CI); - if (Before != E) { // Move iterator to instruction after call - I = Before; ++I; - } else { - I = BB->begin(); - } - // If the intrinsic got lowered to another call, and that call has - // a definition then we need to make sure its prototype is emitted - // before any calls to it. - if (CallInst *Call = dyn_cast<CallInst>(I)) - if (Function *NewF = Call->getCalledFunction()) - if (!NewF->isDeclaration()) - prototypesToGen.push_back(NewF); - - break; + case Intrinsic::not_intrinsic: + case Intrinsic::vastart: + case Intrinsic::vacopy: + case Intrinsic::vaend: + case Intrinsic::returnaddress: + case Intrinsic::frameaddress: + case Intrinsic::setjmp: + case Intrinsic::longjmp: + case Intrinsic::sigsetjmp: + case Intrinsic::siglongjmp: + case Intrinsic::prefetch: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse_cmp_ps: + case Intrinsic::x86_sse2_cmp_sd: + case Intrinsic::x86_sse2_cmp_pd: + case Intrinsic::ppc_altivec_lvsl: + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::bswap: + case Intrinsic::ceil: + case Intrinsic::ctlz: + case Intrinsic::ctpop: + case Intrinsic::cttz: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::fma: + case Intrinsic::fmuladd: + case Intrinsic::pow: + case Intrinsic::powi: + case Intrinsic::rint: + case Intrinsic::sqrt: + case Intrinsic::trunc: + case Intrinsic::trap: + case Intrinsic::stackprotector: + case Intrinsic::dbg_value: + case Intrinsic::dbg_declare: + // We directly implement these intrinsics + break; + default: + // All other intrinsic calls we must lower. 
+ BasicBlock::iterator Before = E; + if (CI != &BB->front()) + Before = std::prev(BasicBlock::iterator(CI)); + + IL->LowerIntrinsicCall(CI); + if (Before != E) { // Move iterator to instruction after call + I = Before; + ++I; + } else { + I = BB->begin(); + } + // If the intrinsic got lowered to another call, and that call has + // a definition then we need to make sure its prototype is emitted + // before any calls to it. + if (CallInst *Call = dyn_cast<CallInst>(I)) + if (Function *NewF = Call->getCalledFunction()) + if (!NewF->isDeclaration()) + prototypesToGen.push_back(NewF); + + break; } } @@ -4610,8 +5030,8 @@ void CWriter::visitCallInst(CallInst &I) { Value *Callee = I.getCalledValue(); - PointerType *PTy = cast<PointerType>(Callee->getType()); - FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); + PointerType *PTy = cast<PointerType>(Callee->getType()); + FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); // If this is a call to a struct-return function, assign to the first // parameter instead of passing it to the call. @@ -4625,11 +5045,14 @@ void CWriter::visitCallInst(CallInst &I) { Out << " = "; } - if (I.isTailCall()) Out << " /*tail*/ "; + if (I.isTailCall()) + Out << " /*tail*/ "; // If this is an indirect call to a struct return function, we need to cast // the pointer. Ditto for indirect calls with byval arguments. - bool NeedsCast = (hasByVal || isStructRet || I.getCallingConv() != CallingConv::C) && !isa<Function>(Callee); + bool NeedsCast = + (hasByVal || isStructRet || I.getCallingConv() != CallingConv::C) && + !isa<Function>(Callee); // GCC is a real PITA. It does not permit codegening casts of functions to // function pointers if they are in a call (it generates a trap instruction @@ -4653,11 +5076,13 @@ void CWriter::visitCallInst(CallInst &I) { if (NeedsCast) { // Ok, just cast the pointer type. 
Out << "(("; - printTypeName(Out, I.getCalledValue()->getType()->getPointerElementType(), false, std::make_pair(PAL, I.getCallingConv())); + printTypeName(Out, I.getCalledValue()->getType()->getPointerElementType(), + false, std::make_pair(PAL, I.getCallingConv())); Out << "*)(void*)"; } writeOperand(Callee, ContextCasted); - if (NeedsCast) Out << ')'; + if (NeedsCast) + Out << ')'; Out << '('; @@ -4671,7 +5096,7 @@ void CWriter::visitCallInst(CallInst &I) { CallSite CS(&I); CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); unsigned ArgNo = 0; - if (isStructRet) { // Skip struct return argument. + if (isStructRet) { // Skip struct return argument. ++AI; ++ArgNo; } @@ -4689,16 +5114,18 @@ void CWriter::visitCallInst(CallInst &I) { } for (; AI != AE; ++AI, ++ArgNo) { - if (PrintedArg) Out << ", "; + if (PrintedArg) + Out << ", "; if (ArgNo < NumDeclaredParams && (*AI)->getType() != FTy->getParamType(ArgNo)) { Out << '('; - printTypeNameUnaligned(Out, FTy->getParamType(ArgNo), - /*isSigned=*/PAL.hasAttribute(ArgNo+1, Attribute::SExt)); + printTypeNameUnaligned( + Out, FTy->getParamType(ArgNo), + /*isSigned=*/PAL.hasAttribute(ArgNo + 1, Attribute::SExt)); Out << ')'; } // Check if the argument is expected to be passed by value. - if (I.getAttributes().hasAttribute(ArgNo+1, Attribute::ByVal)) + if (I.getAttributes().hasAttribute(ArgNo + 1, Attribute::ByVal)) writeOperandDeref(*AI); else writeOperand(*AI, ContextCasted); @@ -4712,175 +5139,191 @@ void CWriter::visitCallInst(CallInst &I) { bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID) { switch (ID) { - default: { + default: { #ifndef NDEBUG - errs() << "Unknown LLVM intrinsic! " << I; + errs() << "Unknown LLVM intrinsic! 
" << I; #endif - llvm_unreachable(0); - return false; - } - - case Intrinsic::dbg_value: - case Intrinsic::dbg_declare: - return true; // ignore these intrinsics - case Intrinsic::vastart: - Out << "0; "; - - Out << "va_start(*(va_list*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ", "; - // Output the last argument to the enclosing function. - if (I.getParent()->getParent()->arg_empty()) - Out << "vararg_dummy_arg"; - else - writeOperand(&*(I.getParent()->getParent()->arg_end() - 1)); - Out << ')'; - return true; - case Intrinsic::vaend: - if (!isa<ConstantPointerNull>(I.getArgOperand(0))) { - Out << "0; va_end(*(va_list*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ')'; - } else { - Out << "va_end(*(va_list*)0)"; - } - return true; - case Intrinsic::vacopy: - Out << "0; "; - Out << "va_copy(*(va_list*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ", *(va_list*)"; - writeOperand(I.getArgOperand(1), ContextCasted); - Out << ')'; - return true; - case Intrinsic::returnaddress: - Out << "__builtin_return_address("; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ')'; - return true; - case Intrinsic::frameaddress: - Out << "__builtin_frame_address("; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ')'; - return true; - case Intrinsic::setjmp: - Out << "setjmp(*(jmp_buf*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ')'; - return true; - case Intrinsic::longjmp: - Out << "longjmp(*(jmp_buf*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ", "; - writeOperand(I.getArgOperand(1), ContextCasted); - Out << ')'; - return true; - case Intrinsic::sigsetjmp: - Out << "sigsetjmp(*(sigjmp_buf*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ','; - writeOperand(I.getArgOperand(1), ContextCasted); - Out << ')'; - return true; - case Intrinsic::siglongjmp: - Out << "siglongjmp(*(sigjmp_buf*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ", "; - 
writeOperand(I.getArgOperand(1), ContextCasted); - Out << ')'; - return true; - case Intrinsic::prefetch: - Out << "LLVM_PREFETCH((const void *)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ", "; - writeOperand(I.getArgOperand(1), ContextCasted); - Out << ", "; - writeOperand(I.getArgOperand(2), ContextCasted); - Out << ")"; - return true; - case Intrinsic::stacksave: - // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save() - // to work around GCC bugs (see PR1809). - Out << "0; *((void**)&" << GetValueName(&I) - << ") = __builtin_stack_save()"; - return true; - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse_cmp_ps: - case Intrinsic::x86_sse2_cmp_sd: - case Intrinsic::x86_sse2_cmp_pd: - Out << '('; - printTypeName(Out, I.getType()); - Out << ')'; - // Multiple GCC builtins multiplex onto this intrinsic. - switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) { - default: llvm_unreachable("Invalid llvm.x86.sse.cmp!"); - case 0: Out << "__builtin_ia32_cmpeq"; break; - case 1: Out << "__builtin_ia32_cmplt"; break; - case 2: Out << "__builtin_ia32_cmple"; break; - case 3: Out << "__builtin_ia32_cmpunord"; break; - case 4: Out << "__builtin_ia32_cmpneq"; break; - case 5: Out << "__builtin_ia32_cmpnlt"; break; - case 6: Out << "__builtin_ia32_cmpnle"; break; - case 7: Out << "__builtin_ia32_cmpord"; break; - } - if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd) - Out << 'p'; - else - Out << 's'; - if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps) - Out << 's'; - else - Out << 'd'; - - Out << "("; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ", "; - writeOperand(I.getArgOperand(1), ContextCasted); - Out << ")"; - return true; - case Intrinsic::ppc_altivec_lvsl: - Out << '('; - printTypeName(Out, I.getType()); - Out << ')'; - Out << "__builtin_altivec_lvsl(0, (void*)"; - writeOperand(I.getArgOperand(0), ContextCasted); - Out << ")"; - return true; - case 
Intrinsic::stackprotector: - writeOperandDeref(I.getArgOperand(1)); - Out << " = "; - writeOperand(I.getArgOperand(0), ContextCasted); - return true; - case Intrinsic::uadd_with_overflow: - case Intrinsic::sadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::bswap: - case Intrinsic::ceil: - case Intrinsic::ctlz: - case Intrinsic::ctpop: - case Intrinsic::cttz: - case Intrinsic::fabs: - case Intrinsic::floor: - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::pow: - case Intrinsic::powi: - case Intrinsic::rint: - case Intrinsic::sqrt: - case Intrinsic::trap: - case Intrinsic::trunc: - return false; // these use the normal function call emission + llvm_unreachable(0); + return false; + } + + case Intrinsic::dbg_value: + case Intrinsic::dbg_declare: + return true; // ignore these intrinsics + case Intrinsic::vastart: + Out << "0; "; + + Out << "va_start(*(va_list*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ", "; + // Output the last argument to the enclosing function. 
+ if (I.getParent()->getParent()->arg_empty()) + Out << "vararg_dummy_arg"; + else + writeOperand(&*(I.getParent()->getParent()->arg_end() - 1)); + Out << ')'; + return true; + case Intrinsic::vaend: + if (!isa<ConstantPointerNull>(I.getArgOperand(0))) { + Out << "0; va_end(*(va_list*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ')'; + } else { + Out << "va_end(*(va_list*)0)"; + } + return true; + case Intrinsic::vacopy: + Out << "0; "; + Out << "va_copy(*(va_list*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ", *(va_list*)"; + writeOperand(I.getArgOperand(1), ContextCasted); + Out << ')'; + return true; + case Intrinsic::returnaddress: + Out << "__builtin_return_address("; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ')'; + return true; + case Intrinsic::frameaddress: + Out << "__builtin_frame_address("; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ')'; + return true; + case Intrinsic::setjmp: + Out << "setjmp(*(jmp_buf*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ')'; + return true; + case Intrinsic::longjmp: + Out << "longjmp(*(jmp_buf*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ", "; + writeOperand(I.getArgOperand(1), ContextCasted); + Out << ')'; + return true; + case Intrinsic::sigsetjmp: + Out << "sigsetjmp(*(sigjmp_buf*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ','; + writeOperand(I.getArgOperand(1), ContextCasted); + Out << ')'; + return true; + case Intrinsic::siglongjmp: + Out << "siglongjmp(*(sigjmp_buf*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ", "; + writeOperand(I.getArgOperand(1), ContextCasted); + Out << ')'; + return true; + case Intrinsic::prefetch: + Out << "LLVM_PREFETCH((const void *)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ", "; + writeOperand(I.getArgOperand(1), ContextCasted); + Out << ", "; + writeOperand(I.getArgOperand(2), ContextCasted); + Out << ")"; + return true; + 
case Intrinsic::stacksave: + // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save() + // to work around GCC bugs (see PR1809). + Out << "0; *((void**)&" << GetValueName(&I) << ") = __builtin_stack_save()"; + return true; + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse_cmp_ps: + case Intrinsic::x86_sse2_cmp_sd: + case Intrinsic::x86_sse2_cmp_pd: + Out << '('; + printTypeName(Out, I.getType()); + Out << ')'; + // Multiple GCC builtins multiplex onto this intrinsic. + switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) { + default: + llvm_unreachable("Invalid llvm.x86.sse.cmp!"); + case 0: + Out << "__builtin_ia32_cmpeq"; + break; + case 1: + Out << "__builtin_ia32_cmplt"; + break; + case 2: + Out << "__builtin_ia32_cmple"; + break; + case 3: + Out << "__builtin_ia32_cmpunord"; + break; + case 4: + Out << "__builtin_ia32_cmpneq"; + break; + case 5: + Out << "__builtin_ia32_cmpnlt"; + break; + case 6: + Out << "__builtin_ia32_cmpnle"; + break; + case 7: + Out << "__builtin_ia32_cmpord"; + break; + } + if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd) + Out << 'p'; + else + Out << 's'; + if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps) + Out << 's'; + else + Out << 'd'; + + Out << "("; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ", "; + writeOperand(I.getArgOperand(1), ContextCasted); + Out << ")"; + return true; + case Intrinsic::ppc_altivec_lvsl: + Out << '('; + printTypeName(Out, I.getType()); + Out << ')'; + Out << "__builtin_altivec_lvsl(0, (void*)"; + writeOperand(I.getArgOperand(0), ContextCasted); + Out << ")"; + return true; + case Intrinsic::stackprotector: + writeOperandDeref(I.getArgOperand(1)); + Out << " = "; + writeOperand(I.getArgOperand(0), ContextCasted); + return true; + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::umul_with_overflow: 
+ case Intrinsic::smul_with_overflow: + case Intrinsic::bswap: + case Intrinsic::ceil: + case Intrinsic::ctlz: + case Intrinsic::ctpop: + case Intrinsic::cttz: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::fma: + case Intrinsic::fmuladd: + case Intrinsic::pow: + case Intrinsic::powi: + case Intrinsic::rint: + case Intrinsic::sqrt: + case Intrinsic::trap: + case Intrinsic::trunc: + return false; // these use the normal function call emission } } -//This converts the llvm constraint string to something gcc is expecting. -//TODO: work out platform independent constraints and factor those out +// This converts the llvm constraint string to something gcc is expecting. +// TODO: work out platform independent constraints and factor those out // of the per target tables // handle multiple constraint codes -std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) { +std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo &c) { return TargetLowering::AsmOperandInfo(c).ConstraintCode; #if 0 assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle"); @@ -4917,7 +5360,7 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) { #endif } -//TODO: import logic from AsmPrinter.cpp +// TODO: import logic from AsmPrinter.cpp static std::string gccifyAsm(std::string asmstr) { for (std::string::size_type i = 0; i != asmstr.size(); ++i) if (asmstr[i] == '\n') @@ -4928,27 +5371,28 @@ static std::string gccifyAsm(std::string asmstr) { if (asmstr[i + 1] == '{') { std::string::size_type a = asmstr.find_first_of(':', i + 1); std::string::size_type b = asmstr.find_first_of('}', i + 1); - std::string n = "%" + - asmstr.substr(a + 1, b - a - 1) + - asmstr.substr(i + 2, a - i - 2); + std::string n = "%" + asmstr.substr(a + 1, b - a - 1) + + asmstr.substr(i + 2, a - i - 2); asmstr.replace(i, b - i + 1, n); i += n.size() - 1; } else asmstr.replace(i, 1, "%"); + } else if (asmstr[i] == '%') // grr + { + 
asmstr.replace(i, 1, "%%"); + ++i; } - else if (asmstr[i] == '%')//grr - { asmstr.replace(i, 1, "%%"); ++i;} return asmstr; } -//TODO: assumptions about what consume arguments from the call are likely wrong +// TODO: assumptions about what consume arguments from the call are likely wrong // handle communitivity void CWriter::visitInlineAsm(CallInst &CI) { - InlineAsm* as = cast<InlineAsm>(CI.getCalledValue()); + InlineAsm *as = cast<InlineAsm>(CI.getCalledValue()); InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints(); - std::vector<std::pair<Value*, int> > ResultVals; + std::vector<std::pair<Value *, int>> ResultVals; if (CI.getType() == Type::getVoidTy(CI.getContext())) ; else if (StructType *ST = dyn_cast<StructType>(CI.getType())) { @@ -4967,16 +5411,18 @@ void CWriter::visitInlineAsm(CallInst &CI) { // Convert over all the output constraints. for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), - E = Constraints.end(); I != E; ++I) { + E = Constraints.end(); + I != E; ++I) { if (I->Type != InlineAsm::isOutput) { ++ValueCount; - continue; // Ignore non-output constraints. + continue; // Ignore non-output constraints. } assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); std::string C = InterpretASMConstraint(*I); - if (C.empty()) continue; + if (C.empty()) + continue; if (!IsFirst) { Out << ", "; @@ -4991,10 +5437,10 @@ void CWriter::visitInlineAsm(CallInst &CI) { DestVal = ResultVals[ValueCount].first; DestValNo = ResultVals[ValueCount].second; } else - DestVal = CI.getArgOperand(ValueCount-ResultVals.size()); + DestVal = CI.getArgOperand(ValueCount - ResultVals.size()); if (I->isEarlyClobber) - C = "&"+C; + C = "&" + C; Out << "\"=" << C << "\"(" << GetValueName(DestVal); if (DestValNo != -1) @@ -5003,21 +5449,22 @@ void CWriter::visitInlineAsm(CallInst &CI) { ++ValueCount; } - // Convert over all the input constraints. 
Out << "\n :"; IsFirst = true; ValueCount = 0; for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), - E = Constraints.end(); I != E; ++I) { + E = Constraints.end(); + I != E; ++I) { if (I->Type != InlineAsm::isInput) { ++ValueCount; - continue; // Ignore non-input constraints. + continue; // Ignore non-input constraints. } assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); std::string C = InterpretASMConstraint(*I); - if (C.empty()) continue; + if (C.empty()) + continue; if (!IsFirst) { Out << ", "; @@ -5025,7 +5472,7 @@ void CWriter::visitInlineAsm(CallInst &CI) { } assert(ValueCount >= ResultVals.size() && "Input can't refer to result"); - Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size()); + Value *SrcVal = CI.getArgOperand(ValueCount - ResultVals.size()); Out << "\"" << C << "\"("; if (!I->isIndirect) @@ -5038,13 +5485,15 @@ void CWriter::visitInlineAsm(CallInst &CI) { // Convert over the clobber constraints. IsFirst = true; for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), - E = Constraints.end(); I != E; ++I) { + E = Constraints.end(); + I != E; ++I) { if (I->Type != InlineAsm::isClobber) - continue; // Ignore non-input constraints. + continue; // Ignore non-input constraints. 
assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); std::string C = InterpretASMConstraint(*I); - if (C.empty()) continue; + if (C.empty()) + continue; if (!IsFirst) { Out << ", "; @@ -5062,21 +5511,22 @@ void CWriter::visitAllocaInst(AllocaInst &I) { printTypeName(Out, I.getType()); Out << ") alloca(sizeof("; printTypeName(Out, I.getType()->getElementType()); - if (I.isArrayAllocation()) { - Out << ") * (" ; + if (I.isArrayAllocation()) { + Out << ") * ("; writeOperand(I.getArraySize(), ContextCasted); } Out << "))"; } void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, - gep_type_iterator E, bool isArrayType, GetElementPtrInst *GEPI) { - //DEBUG(errs() << "Printing GEP\n"); - //DEBUG(errs() << "\tPtr: " << *Ptr << "\n"); - //DEBUG(errs() << "\tGEPI: " << *GEPI <<"\n"); + gep_type_iterator E, bool isArrayType, + GetElementPtrInst *GEPI) { + // DEBUG(errs() << "Printing GEP\n"); + // DEBUG(errs() << "\tPtr: " << *Ptr << "\n"); + // DEBUG(errs() << "\tGEPI: " << *GEPI <<"\n"); // If there are no indices, just print out the pointer. if (I == E) { - //DEBUG(errs() << "I==E: Calling writeOperand()\n"); + // DEBUG(errs() << "I==E: Calling writeOperand()\n"); writeOperand(Ptr); return; } @@ -5087,7 +5537,7 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, VectorType *LastIndexIsVector = 0; { for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI) - //LastIndexIsVector = dyn_cast<VectorType>(TmpI.getCurTy()); + // LastIndexIsVector = dyn_cast<VectorType>(TmpI.getCurTy()); // CHECK: This change needs thorough testing LastIndexIsVector = dyn_cast<VectorType>(TmpI.getIndexedType()); } @@ -5096,53 +5546,55 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, // If the last index is into a vector, we can't print it as &a[i][j] because // we can't index into a vector with j in GCC. 
Instead, emit this as // (((float*)&a[i])+j) - // TODO: this is no longer true now that we don't represent vectors using gcc-extentions + // TODO: this is no longer true now that we don't represent vectors using + // gcc-extentions if (LastIndexIsVector) { - //DEBUG(errs() << "LastIndexIsVector\n"); + // DEBUG(errs() << "LastIndexIsVector\n"); Out << "(("; - printTypeName(Out, PointerType::getUnqual(LastIndexIsVector->getElementType())); + printTypeName(Out, + PointerType::getUnqual(LastIndexIsVector->getElementType())); Out << ")("; } - bool isArrayAccess = false; + bool isArrayAccess = false; if (GEPStack.size() > 0 && GEPStack.top() == GEPI) { - //DEBUG(errs() << "Processing load-specific GEP\n"); + // DEBUG(errs() << "Processing load-specific GEP\n"); GEPStack.pop(); isArrayAccess = true; } else { - //DEBUG(errs() << "I'm hereee!\n"); + // DEBUG(errs() << "I'm hereee!\n"); Out << '&'; } - //DEBUG(errs() << "Here!\n"); + // DEBUG(errs() << "Here!\n"); // If the first index is 0 (very typical) we can do a number of // simplifications to clean up the code. Value *FirstOp = I.getOperand(); - //DEBUG(errs() << "FirstOp: " << *(I.getOperand()) << "\n"); + // DEBUG(errs() << "FirstOp: " << *(I.getOperand()) << "\n"); if (!isa<Constant>(FirstOp) || !cast<Constant>(FirstOp)->isNullValue()) { - //DEBUG(errs() << "Calling writeoperand()\n"); + // DEBUG(errs() << "Calling writeoperand()\n"); // First index isn't simple, print it the hard way. writeOperand(Ptr, ContextNormal, isArrayAccess); } else { - ++I; // Skip the zero index. - //DEBUG(errs() << "Skipping zero index\n"); + ++I; // Skip the zero index. + // DEBUG(errs() << "Skipping zero index\n"); // Okay, emit the first operand. If Ptr is something that is already address // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead. 
if (isAddressExposed(Ptr)) { - //DEBUG(errs() << "Address exposed; calling writeoperandinternal()\n"); + // DEBUG(errs() << "Address exposed; calling writeoperandinternal()\n"); writeOperandInternal(Ptr); } - //else if (I != E && (I.getCurTy())->isStructTy()) { + // else if (I != E && (I.getCurTy())->isStructTy()) { // NOTE: This change needs to be tested more - else if (I != E && (I.isStruct()) ) { - //DEBUG(errs() << "Not address exposed; is struct type\n"); + else if (I != E && (I.isStruct())) { + // DEBUG(errs() << "Not address exposed; is struct type\n"); // If we didn't already emit the first operand, see if we can print it as // P->f instead of "P[0].f" writeOperand(Ptr); Out << "->field" << cast<ConstantInt>(I.getOperand())->getZExtValue(); - ++I; // eat the struct index as well. + ++I; // eat the struct index as well. } else { - //DEBUG(errs() << "In else; emitting *P\n"); + // DEBUG(errs() << "In else; emitting *P\n"); // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic. 
Out << "(*"; writeOperand(Ptr); @@ -5153,28 +5605,32 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, Type *Agg = GEPI->getSourceElementType(); unsigned CurIdx = 1; for (; I != E; ++CurIdx, ++I) { - assert(I.getOperand()->getType()->isIntegerTy()); // TODO: indexing a Vector with a Vector is valid, but we don't support it here - //DEBUG(errs() << "Type: " << *Agg << "; operand: " << *(I.getOperand()) << "\n"); - if ((Agg->isStructTy())){ - //DEBUG(errs() << "Found a struct\n"); + assert(I.getOperand() + ->getType() + ->isIntegerTy()); // TODO: indexing a Vector with a Vector is + // valid, but we don't support it here + // DEBUG(errs() << "Type: " << *Agg << "; operand: " << *(I.getOperand()) << + // "\n"); + if ((Agg->isStructTy())) { + // DEBUG(errs() << "Found a struct\n"); Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue(); } else if (Agg->isArrayTy()) { - //DEBUG(errs() << "Found an array!\n"); + // DEBUG(errs() << "Found an array!\n"); Out << ".array["; writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); Out << ']'; } else if (!Agg->isVectorTy()) { - //DEBUG(errs() << "Not a vector!\n"); + // DEBUG(errs() << "Not a vector!\n"); Out << '['; writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); Out << ']'; } else { - //DEBUG(errs() << "In else!\n"); + // DEBUG(errs() << "In else!\n"); // If the last index is into a vector, then print it out as "+j)". This // works with the 'LastIndexIsVector' code above. if (isa<Constant>(I.getOperand()) && cast<Constant>(I.getOperand())->isNullValue()) { - Out << "))"; // avoid "+0". + Out << "))"; // avoid "+0". 
} else { Out << ")+("; writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); @@ -5182,248 +5638,246 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, } } CompositeType *CT = dyn_cast<CompositeType>(Agg); - if (!CT || CT->isPointerTy()) - { - //DEBUG(errs() << "Something wrong!!\n"); + if (!CT || CT->isPointerTy()) { + // DEBUG(errs() << "Something wrong!!\n"); break; } - Value* Index = GEPI->getOperand(CurIdx); + Value *Index = GEPI->getOperand(CurIdx); if (!CT->indexValid(Index)) - if (!CT || CT->isPointerTy()) - { - //DEBUG(errs() << "Something wrong 2!!\n"); + if (!CT || CT->isPointerTy()) { + // DEBUG(errs() << "Something wrong 2!!\n"); break; } Agg = CT->getTypeAtIndex(Index); } Out << ")"; - //DEBUG(errs() << "Leaving printGEPExpression\n"); - } + // DEBUG(errs() << "Leaving printGEPExpression\n"); +} - void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType, - bool IsVolatile, unsigned Alignment /*bytes*/) { - //DEBUG(errs() << *OperandType << "; " << *Operand << "\n"); - bool arrayAccess = false; - if(isa<GetElementPtrInst>(Operand)) { - //DEBUG(errs() << "ISA Get Element Pointer!\n"); - arrayAccess = true; - GEPStack.push(dyn_cast<GetElementPtrInst>(Operand)); - } - // if (isAddressExposed(Operand)) { - // DEBUG(errs() << "Is address exposed!!\n"); - // writeOperandInternal(Operand); - // return; - // } - - bool IsUnaligned = Alignment && - Alignment < TD->getABITypeAlignment(OperandType); - if (!arrayAccess) { - if (!IsUnaligned) - Out << '*'; - - else if (IsUnaligned) { - Out << "__UNALIGNED_LOAD__("; - printTypeNameUnaligned(Out, OperandType, false); - if (IsVolatile) Out << " volatile"; - Out << ", " << Alignment << ", "; - } +void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType, + bool IsVolatile, unsigned Alignment /*bytes*/) { + // DEBUG(errs() << *OperandType << "; " << *Operand << "\n"); + bool arrayAccess = false; + if (isa<GetElementPtrInst>(Operand)) { + // DEBUG(errs() << "ISA Get Element 
Pointer!\n"); + arrayAccess = true; + GEPStack.push(dyn_cast<GetElementPtrInst>(Operand)); + } + // if (isAddressExposed(Operand)) { + // DEBUG(errs() << "Is address exposed!!\n"); + // writeOperandInternal(Operand); + // return; + // } - else if (IsVolatile) { - Out << "("; - printTypeName(Out, OperandType, false); - Out << "volatile"; - Out << "*)"; - } + bool IsUnaligned = + Alignment && Alignment < TD->getABITypeAlignment(OperandType); + if (!arrayAccess) { + if (!IsUnaligned) + Out << '*'; + + else if (IsUnaligned) { + Out << "__UNALIGNED_LOAD__("; + printTypeNameUnaligned(Out, OperandType, false); + if (IsVolatile) + Out << " volatile"; + Out << ", " << Alignment << ", "; } - writeOperand(Operand,ContextNormal, arrayAccess ); - - if (IsUnaligned) { - Out << ")"; + else if (IsVolatile) { + Out << "("; + printTypeName(Out, OperandType, false); + Out << "volatile"; + Out << "*)"; } } - void CWriter::visitLoadInst(LoadInst &I) { - //DEBUG(errs() << "Visiting Load instruction!\n"); - // DEBUG(errs() << "Visiting load: " << I << "\n"); - writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(), - I.getAlignment()); + writeOperand(Operand, ContextNormal, arrayAccess); + if (IsUnaligned) { + Out << ")"; } +} - void CWriter::visitStoreInst(StoreInst &I) { - //DEBUG(errs() << "Visiting store instruction!\n"); - writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(), - I.isVolatile(), I.getAlignment()); - Out << " = "; - Value *Operand = I.getOperand(0); - unsigned BitMask = 0; - if (IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType())) - if (!ITy->isPowerOf2ByteWidth()) - // We have a bit width that doesn't match an even power-of-2 byte - // size. Consequently we must & the value with the type's bit mask - BitMask = ITy->getBitMask(); - if (BitMask) - Out << "(("; - writeOperand(Operand, BitMask ? 
ContextNormal : ContextCasted); - if (BitMask) - Out << ") & " << BitMask << ")"; - } +void CWriter::visitLoadInst(LoadInst &I) { + // DEBUG(errs() << "Visiting Load instruction!\n"); + // DEBUG(errs() << "Visiting load: " << I << "\n"); + writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(), + I.getAlignment()); +} - void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) { - // DEBUG(errs() <<"Visiting GEP: " << I << "\n"); - printGEPExpression(I.getPointerOperand(), gep_type_begin(I), - gep_type_end(I), I.getSourceElementType()->isArrayTy(), &I); - } +void CWriter::visitStoreInst(StoreInst &I) { + // DEBUG(errs() << "Visiting store instruction!\n"); + writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(), + I.isVolatile(), I.getAlignment()); + Out << " = "; + Value *Operand = I.getOperand(0); + unsigned BitMask = 0; + if (IntegerType *ITy = dyn_cast<IntegerType>(Operand->getType())) + if (!ITy->isPowerOf2ByteWidth()) + // We have a bit width that doesn't match an even power-of-2 byte + // size. Consequently we must & the value with the type's bit mask + BitMask = ITy->getBitMask(); + if (BitMask) + Out << "(("; + writeOperand(Operand, BitMask ? ContextNormal : ContextCasted); + if (BitMask) + Out << ") & " << BitMask << ")"; +} - void CWriter::visitVAArgInst(VAArgInst &I) { - Out << "va_arg(*(va_list*)"; - writeOperand(I.getOperand(0), ContextCasted); - Out << ", "; - printTypeName(Out, I.getType()); - Out << ");\n "; - } +void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) { + // DEBUG(errs() <<"Visiting GEP: " << I << "\n"); + printGEPExpression(I.getPointerOperand(), gep_type_begin(I), gep_type_end(I), + I.getSourceElementType()->isArrayTy(), &I); +} - void CWriter::visitInsertElementInst(InsertElementInst &I) { - // Start by copying the entire aggregate value into the result variable. 
- writeOperand(I.getOperand(0)); - Type *EltTy = I.getType()->getElementType(); - assert(I.getOperand(1)->getType() == EltTy); - if (isEmptyType(EltTy)) return; - - // Then do the insert to update the field. - Out << ";\n "; - Out << GetValueName(&I) << ".vector["; - writeOperand(I.getOperand(2)); - Out << "] = "; - writeOperand(I.getOperand(1), ContextCasted); - } +void CWriter::visitVAArgInst(VAArgInst &I) { + Out << "va_arg(*(va_list*)"; + writeOperand(I.getOperand(0), ContextCasted); + Out << ", "; + printTypeName(Out, I.getType()); + Out << ");\n "; +} - void CWriter::visitExtractElementInst(ExtractElementInst &I) { - assert(!isEmptyType(I.getType())); - if (isa<UndefValue>(I.getOperand(0))) { - Out << "("; - printTypeName(Out, I.getType()); - Out << ") 0/*UNDEF*/"; - } else { - Out << "("; - writeOperand(I.getOperand(0)); - Out << ").vector["; - writeOperand(I.getOperand(1)); - Out << "]"; - } - } +void CWriter::visitInsertElementInst(InsertElementInst &I) { + // Start by copying the entire aggregate value into the result variable. + writeOperand(I.getOperand(0)); + Type *EltTy = I.getType()->getElementType(); + assert(I.getOperand(1)->getType() == EltTy); + if (isEmptyType(EltTy)) + return; - // <result> = shufflevector <n x <ty>> <v1>, <n x <ty>> <v2>, <m x i32> <mask> - // ; yields <m x <ty>> - void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) { - VectorType *VT = SVI.getType(); - Type *EltTy = VT->getElementType(); - VectorType *InputVT = cast<VectorType>(SVI.getOperand(0)->getType()); - assert(!isEmptyType(VT)); - assert(InputVT->getElementType() == VT->getElementType()); + // Then do the insert to update the field. 
+ Out << ";\n "; + Out << GetValueName(&I) << ".vector["; + writeOperand(I.getOperand(2)); + Out << "] = "; + writeOperand(I.getOperand(1), ContextCasted); +} - CtorDeclTypes.insert(VT); - Out << "llvm_ctor_"; - printTypeString(Out, VT, false); +void CWriter::visitExtractElementInst(ExtractElementInst &I) { + assert(!isEmptyType(I.getType())); + if (isa<UndefValue>(I.getOperand(0))) { Out << "("; + printTypeName(Out, I.getType()); + Out << ") 0/*UNDEF*/"; + } else { + Out << "("; + writeOperand(I.getOperand(0)); + Out << ").vector["; + writeOperand(I.getOperand(1)); + Out << "]"; + } +} + +// <result> = shufflevector <n x <ty>> <v1>, <n x <ty>> <v2>, <m x i32> <mask> +// ; yields <m x <ty>> +void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) { + VectorType *VT = SVI.getType(); + Type *EltTy = VT->getElementType(); + VectorType *InputVT = cast<VectorType>(SVI.getOperand(0)->getType()); + assert(!isEmptyType(VT)); + assert(InputVT->getElementType() == VT->getElementType()); - Constant *Zero = Constant::getNullValue(EltTy); - unsigned NumElts = VT->getNumElements(); - unsigned NumInputElts = InputVT->getNumElements(); // n - for (unsigned i = 0; i != NumElts; ++i) { - if (i) Out << ", "; - int SrcVal = SVI.getMaskValue(i); - if ((unsigned)SrcVal >= NumInputElts * 2) { - Out << "/*undef*/"; + CtorDeclTypes.insert(VT); + Out << "llvm_ctor_"; + printTypeString(Out, VT, false); + Out << "("; + + Constant *Zero = Constant::getNullValue(EltTy); + unsigned NumElts = VT->getNumElements(); + unsigned NumInputElts = InputVT->getNumElements(); // n + for (unsigned i = 0; i != NumElts; ++i) { + if (i) + Out << ", "; + int SrcVal = SVI.getMaskValue(i); + if ((unsigned)SrcVal >= NumInputElts * 2) { + Out << "/*undef*/"; + printConstant(Zero, ContextCasted); + } else { + // If SrcVal belongs [0, n - 1], it extracts value from <v1> + // If SrcVal belongs [n, 2 * n - 1], it extracts value from <v2> + // In C++, the value false is converted to zero and the value true is + // 
converted to one + Value *Op = SVI.getOperand((unsigned)SrcVal >= NumInputElts); + if (isa<Instruction>(Op)) { + // Do an extractelement of this value from the appropriate input. + Out << "("; + writeOperand(Op); + Out << ").vector["; + Out << ((unsigned)SrcVal >= NumInputElts ? SrcVal - NumInputElts + : SrcVal); + Out << "]"; + } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) { printConstant(Zero, ContextCasted); } else { - // If SrcVal belongs [0, n - 1], it extracts value from <v1> - // If SrcVal belongs [n, 2 * n - 1], it extracts value from <v2> - // In C++, the value false is converted to zero and the value true is - // converted to one - Value *Op = SVI.getOperand((unsigned)SrcVal >= NumInputElts); - if (isa<Instruction>(Op)) { - // Do an extractelement of this value from the appropriate input. - Out << "("; - writeOperand(Op); - Out << ").vector["; - Out << ((unsigned)SrcVal >= NumInputElts ? SrcVal - NumInputElts : SrcVal); - Out << "]"; - } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) { - printConstant(Zero, ContextCasted); - } else { - printConstant(cast<ConstantVector>(Op)->getOperand(SrcVal & - (NumElts-1)), - ContextNormal); - } + printConstant( + cast<ConstantVector>(Op)->getOperand(SrcVal & (NumElts - 1)), + ContextNormal); } } - Out << ")"; } + Out << ")"; +} + +void CWriter::visitInsertValueInst(InsertValueInst &IVI) { + // Start by copying the entire aggregate value into the result variable. + writeOperand(IVI.getOperand(0)); + Type *EltTy = IVI.getOperand(1)->getType(); + if (isEmptyType(EltTy)) + return; + + // Then do the insert to update the field. 
+ Out << ";\n "; + Out << GetValueName(&IVI); + for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end(); i != e; + ++i) { + Type *IndexedTy = ExtractValueInst::getIndexedType( + IVI.getOperand(0)->getType(), makeArrayRef(b, i)); + assert(IndexedTy); + if (IndexedTy->isArrayTy()) + Out << ".array[" << *i << "]"; + else + Out << ".field" << *i; + } + Out << " = "; + writeOperand(IVI.getOperand(1), ContextCasted); +} - void CWriter::visitInsertValueInst(InsertValueInst &IVI) { - // Start by copying the entire aggregate value into the result variable. - writeOperand(IVI.getOperand(0)); - Type *EltTy = IVI.getOperand(1)->getType(); - if (isEmptyType(EltTy)) return; - - // Then do the insert to update the field. - Out << ";\n "; - Out << GetValueName(&IVI); - for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end(); - i != e; ++i) { - Type *IndexedTy = - ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(), - makeArrayRef(b, i)); - assert(IndexedTy); +void CWriter::visitExtractValueInst(ExtractValueInst &EVI) { + Out << "("; + if (isa<UndefValue>(EVI.getOperand(0))) { + Out << "("; + printTypeName(Out, EVI.getType()); + Out << ") 0/*UNDEF*/"; + } else { + writeOperand(EVI.getOperand(0)); + for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end(); + i != e; ++i) { + Type *IndexedTy = ExtractValueInst::getIndexedType( + EVI.getOperand(0)->getType(), makeArrayRef(b, i)); if (IndexedTy->isArrayTy()) Out << ".array[" << *i << "]"; else Out << ".field" << *i; } - Out << " = "; - writeOperand(IVI.getOperand(1), ContextCasted); - } - - void CWriter::visitExtractValueInst(ExtractValueInst &EVI) { - Out << "("; - if (isa<UndefValue>(EVI.getOperand(0))) { - Out << "("; - printTypeName(Out, EVI.getType()); - Out << ") 0/*UNDEF*/"; - } else { - writeOperand(EVI.getOperand(0)); - for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end(); - i != e; ++i) { - Type *IndexedTy = - 
ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(), - makeArrayRef(b, i)); - if (IndexedTy->isArrayTy()) - Out << ".array[" << *i << "]"; - else - Out << ".field" << *i; - } - } - Out << ")"; } + Out << ")"; +} - //===----------------------------------------------------------------------===// - // External Interface declaration - //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// External Interface declaration +//===----------------------------------------------------------------------===// - bool CTargetMachine::addPassesToEmitFile(PassManagerBase &PM, - raw_pwrite_stream &Out, - raw_pwrite_stream *Out2, - CodeGenFileType FileType, - bool DisableVerify, - MachineModuleInfo *MMI){ +bool CTargetMachine::addPassesToEmitFile( + PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *Out2, + CodeGenFileType FileType, bool DisableVerify, MachineModuleInfo *MMI) { - if (FileType != TargetMachine::CGFT_AssemblyFile) return true; + if (FileType != TargetMachine::CGFT_AssemblyFile) + return true; - PM.add(createGCLoweringPass()); - PM.add(createLowerInvokePass()); - PM.add(createCFGSimplificationPass()); // clean up after lower invoke. - PM.add(new CWriter(Out)); - return false; - } + PM.add(createGCLoweringPass()); + PM.add(createLowerInvokePass()); + PM.add(createCFGSimplificationPass()); // clean up after lower invoke. 
+ PM.add(new CWriter(Out)); + return false; +} diff --git a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h index 07f3b80af5d7fc4eda35df068e59eb6b7e79202d..33d936d9d09026e961d8bf723263c32baa0bd390 100644 --- a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h +++ b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h @@ -1,25 +1,34 @@ #include "CTargetMachine.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/InstVisitor.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -29,21 +38,12 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Pass.h" -#include "llvm/IR/LegacyPassManager.h" #include 
"llvm/Support/FormattedStream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/IR/Dominators.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/Mem2Reg.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/IVUsers.h" #include "llvm/Transforms/Utils/LoopSimplify.h" -#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/Mem2Reg.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" #include <set> #include <stack> @@ -55,290 +55,304 @@ #define PRIVATE_ADDRSPACE 5 namespace { - using namespace llvm; - - class CBEMCAsmInfo : public MCAsmInfo { - public: - CBEMCAsmInfo() { - PrivateGlobalPrefix = ""; - } - }; - - /// CWriter - This class is the main chunk of code that converts an LLVM - /// module to a C translation unit. 
- class CWriter : public FunctionPass, public InstVisitor<CWriter> { - std::string _Out; - raw_string_ostream Out; - raw_pwrite_stream &FileOut; - IntrinsicLowering *IL; - LoopInfo *LI; - PostDominatorTree *PDT; - DominatorTree *DT; - ScalarEvolution *SE; - IVUsers *IU; - AssumptionCache *AC; - - const Module *TheModule; - const MCAsmInfo* TAsm; - const MCRegisterInfo *MRI; - const MCObjectFileInfo *MOFI; - MCContext *TCtx; - const DataLayout* TD; - - std::map<const ConstantFP *, unsigned> FPConstantMap; - std::set<const Argument*> ByValParams; - - // Set for storing all loop induction variables - std::set<PHINode*> LInductionVars; - std::map<Loop*, PHINode*> LoopIndVarsMap; - - unsigned FPCounter; - unsigned OpaqueCounter; - - DenseMap<const Value*, unsigned> AnonValueNumbers; - unsigned NextAnonValueNumber; - - /// UnnamedStructIDs - This contains a unique ID for each struct that is - /// either anonymous or has no name. - DenseMap<StructType*, unsigned> UnnamedStructIDs; - unsigned NextAnonStructNumber; - - std::set<Type*> TypedefDeclTypes; - std::set<Type*> SelectDeclTypes; - std::set<std::pair<CmpInst::Predicate, VectorType*>> CmpDeclTypes; - std::set<std::pair<CastInst::CastOps, std::pair<Type*, Type*>>> CastOpDeclTypes; - std::set<std::pair<unsigned, Type*>> InlineOpDeclTypes; - std::set<Type*> CtorDeclTypes; - - DenseMap<std::pair<FunctionType*, std::pair<AttributeList, CallingConv::ID>>, unsigned> UnnamedFunctionIDs; - unsigned NextFunctionNumber; - - // This is used to keep track of intrinsics that get generated to a lowered - // function. 
We must generate the prototypes before the function body which - // will only be expanded on first use - std::vector<Function*> prototypesToGen; - - // Set for keeping track of visited blocks to avoid goto when possible - std::set<BasicBlock*> VisitedBlocks; - std::set<BasicBlock*> CompVisitedBlocks; - std::set<BasicBlock*> FindVisitedBlocks; - std::set<BasicBlock*> ReplicateBlocks; - std::stack<BasicBlock*> ImmPostDommBlocks; - std::stack<BasicBlock*> ElseBlocks; - std::stack<BranchInst*> ElseBranches; - std::stack<GetElementPtrInst*> GEPStack; - public: - static char ID; - explicit CWriter(raw_pwrite_stream &o) - : FunctionPass(ID), Out(_Out), FileOut(o), IL(0), LI(0), - TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0), - OpaqueCounter(0), NextAnonValueNumber(0), - NextAnonStructNumber(0), NextFunctionNumber(0), PDT(0) { - FPCounter = 0; - } - - virtual StringRef getPassName() const { return "C backend"; } - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - // Adding PDT pass to avoid code duplication - AU.addRequired<PostDominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<AssumptionCacheTracker>(); -// AU.addRequiredID(LoopSimplifyID); -// AU.addRequired<LoopSimplifyPass>(); - -// AU.addRequired<IVUsersWrapperPass>(); - //AU.addRequired<PromotePass>(); - AU.setPreservesCFG(); - } - - virtual bool doInitialization(Module &M); - virtual bool doFinalization(Module &M); - virtual bool runOnFunction(Function &F); - - private: - - void generateHeader(Module &M); - void declareOneGlobalVariable(GlobalVariable* I); - - void forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted); - void forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, - std::set<Type*> &TypesPrinted); - - raw_ostream &printFunctionProto(raw_ostream &Out, FunctionType *Ty, - //std::pair<AttributeSet, CallingConv::ID> Attrs, - 
std::pair<AttributeList, CallingConv::ID> Attrs, - const std::string &Name, - Function::arg_iterator ArgList, - //Function::ArgumentListType *ArgList, - bool isKernel); - - raw_ostream &printFunctionProto(raw_ostream &Out, Function *F) { - bool isKernel = false; - if (NamedMDNode * KernelMD = F->getParent()->getNamedMetadata("opencl.kernels")) { - for (auto iter : KernelMD->operands()) { - const MDOperand *KernelMDOp = iter->operands().begin(); - Metadata *KMD = KernelMDOp->get(); - if(ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)){ - Value *KMDVal = KMDVAM->getValue(); - Function *KMDFunc = dyn_cast<Function>(KMDVal); - if(KMDFunc == F) { - isKernel = true; - } +using namespace llvm; + +class CBEMCAsmInfo : public MCAsmInfo { +public: + CBEMCAsmInfo() { PrivateGlobalPrefix = ""; } +}; + +/// CWriter - This class is the main chunk of code that converts an LLVM +/// module to a C translation unit. +class CWriter : public FunctionPass, public InstVisitor<CWriter> { + std::string _Out; + raw_string_ostream Out; + raw_pwrite_stream &FileOut; + IntrinsicLowering *IL; + LoopInfo *LI; + PostDominatorTree *PDT; + DominatorTree *DT; + ScalarEvolution *SE; + IVUsers *IU; + AssumptionCache *AC; + + const Module *TheModule; + const MCAsmInfo *TAsm; + const MCRegisterInfo *MRI; + const MCObjectFileInfo *MOFI; + MCContext *TCtx; + const DataLayout *TD; + + std::map<const ConstantFP *, unsigned> FPConstantMap; + std::set<const Argument *> ByValParams; + + // Set for storing all loop induction variables + std::set<PHINode *> LInductionVars; + std::map<Loop *, PHINode *> LoopIndVarsMap; + + unsigned FPCounter; + unsigned OpaqueCounter; + + DenseMap<const Value *, unsigned> AnonValueNumbers; + unsigned NextAnonValueNumber; + + /// UnnamedStructIDs - This contains a unique ID for each struct that is + /// either anonymous or has no name. 
+ DenseMap<StructType *, unsigned> UnnamedStructIDs; + unsigned NextAnonStructNumber; + + std::set<Type *> TypedefDeclTypes; + std::set<Type *> SelectDeclTypes; + std::set<std::pair<CmpInst::Predicate, VectorType *>> CmpDeclTypes; + std::set<std::pair<CastInst::CastOps, std::pair<Type *, Type *>>> + CastOpDeclTypes; + std::set<std::pair<unsigned, Type *>> InlineOpDeclTypes; + std::set<Type *> CtorDeclTypes; + + DenseMap<std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>>, + unsigned> + UnnamedFunctionIDs; + unsigned NextFunctionNumber; + + // This is used to keep track of intrinsics that get generated to a lowered + // function. We must generate the prototypes before the function body which + // will only be expanded on first use + std::vector<Function *> prototypesToGen; + + // Set for keeping track of visited blocks to avoid goto when possible + std::set<BasicBlock *> VisitedBlocks; + std::set<BasicBlock *> CompVisitedBlocks; + std::set<BasicBlock *> FindVisitedBlocks; + std::set<BasicBlock *> ReplicateBlocks; + std::stack<BasicBlock *> ImmPostDommBlocks; + std::stack<BasicBlock *> ElseBlocks; + std::stack<BranchInst *> ElseBranches; + std::stack<GetElementPtrInst *> GEPStack; + +public: + static char ID; + explicit CWriter(raw_pwrite_stream &o) + : FunctionPass(ID), Out(_Out), FileOut(o), IL(0), LI(0), TheModule(0), + TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0), OpaqueCounter(0), + NextAnonValueNumber(0), NextAnonStructNumber(0), NextFunctionNumber(0), + PDT(0) { + FPCounter = 0; + } + + virtual StringRef getPassName() const { return "C backend"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + // Adding PDT pass to avoid code duplication + AU.addRequired<PostDominatorTreeWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + // AU.addRequiredID(LoopSimplifyID); + // 
AU.addRequired<LoopSimplifyPass>(); + + // AU.addRequired<IVUsersWrapperPass>(); + // AU.addRequired<PromotePass>(); + AU.setPreservesCFG(); + } + + virtual bool doInitialization(Module &M); + virtual bool doFinalization(Module &M); + virtual bool runOnFunction(Function &F); + +private: + void generateHeader(Module &M); + void declareOneGlobalVariable(GlobalVariable *I); + + void forwardDeclareStructs(raw_ostream &Out, Type *Ty, + std::set<Type *> &TypesPrinted); + void forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, + std::set<Type *> &TypesPrinted); + + raw_ostream & + printFunctionProto(raw_ostream &Out, FunctionType *Ty, + // std::pair<AttributeSet, CallingConv::ID> Attrs, + std::pair<AttributeList, CallingConv::ID> Attrs, + const std::string &Name, Function::arg_iterator ArgList, + // Function::ArgumentListType *ArgList, + bool isKernel); + + raw_ostream &printFunctionProto(raw_ostream &Out, Function *F) { + bool isKernel = false; + if (NamedMDNode *KernelMD = + F->getParent()->getNamedMetadata("opencl.kernels")) { + for (auto iter : KernelMD->operands()) { + const MDOperand *KernelMDOp = iter->operands().begin(); + Metadata *KMD = KernelMDOp->get(); + if (ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)) { + Value *KMDVal = KMDVAM->getValue(); + Function *KMDFunc = dyn_cast<Function>(KMDVal); + if (KMDFunc == F) { + isKernel = true; } } } - - return printFunctionProto(Out, F->getFunctionType(), std::make_pair(F->getAttributes(), F->getCallingConv()), GetValueName(F), NULL, isKernel); - } - raw_ostream &printFunctionDeclaration(raw_ostream &Out, FunctionType *Ty, - std::pair<AttributeList, CallingConv::ID> PAL = std::make_pair(AttributeList(), CallingConv::C)); - raw_ostream &printStructDeclaration(raw_ostream &Out, StructType *Ty); - raw_ostream &printArrayDeclaration(raw_ostream &Out, ArrayType *Ty); - raw_ostream &printVectorDeclaration(raw_ostream &Out, VectorType *Ty); - - raw_ostream &printTypeName(raw_ostream &Out, Type *Ty, - bool 
isSigned = false, - std::pair<AttributeList, CallingConv::ID> - PAL = std::make_pair(AttributeList(), CallingConv::C)); - raw_ostream &printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool isSigned = false); - raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned); - raw_ostream &printTypeString(raw_ostream &Out, Type *Ty, bool isSigned); - - std::string getStructName(StructType *ST); - std::string getFunctionName(FunctionType *FT, - std::pair<AttributeList, CallingConv::ID> PAL - = std::make_pair(AttributeList(), CallingConv::C)); - std::string getArrayName(ArrayType *AT); - std::string getVectorName(VectorType *VT, bool Aligned); - - enum OperandContext { - ContextNormal, - ContextCasted, - // Casted context means the type-cast will be implicit, - // such as the RHS of a `var = RHS;` expression - // or inside a struct initializer expression - ContextStatic - // Static context means that it is being used in as a static initializer - // (also implies ContextCasted) - }; - - void writeOperandDeref(Value *Operand); - void writeOperand(Value *Operand, enum OperandContext Context = ContextNormal, bool arrayAccess = false); - void writeInstComputationInline(Instruction &I); - void writeOperandInternal(Value *Operand, enum OperandContext Context = ContextNormal); - void writeOperandWithCast(Value* Operand, unsigned Opcode); - void opcodeNeedsCast(unsigned Opcode, bool &shouldCast, bool &castIsSigned); - - void writeOperandWithCast(Value* Operand, ICmpInst &I); - bool writeInstructionCast(Instruction &I); - void writeMemoryAccess(Value *Operand, Type *OperandType, - bool IsVolatile, unsigned Alignment); - - std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c); - - void lowerIntrinsics(Function &F); - /// Prints the definition of the intrinsic function F. Supports the - /// intrinsics which need to be explicitly defined in the CBackend. 
- void printIntrinsicDefinition(Function &F, raw_ostream &Out); - void printIntrinsicDefinition(FunctionType *funT, - unsigned Opcode, std::string OpName, - raw_ostream &Out); - - void printModuleTypes(raw_ostream &Out); - void printContainedTypes(raw_ostream &Out, Type *Ty, std::set<Type*> &); - - void printFloatingPointConstants(Function &F); - void printFloatingPointConstants(const Constant *C); - - void printFunction(Function &); - void printBasicBlock(BasicBlock *BB); - void printLoop(Loop *L); - - void printCast(unsigned opcode, Type *SrcTy, Type *DstTy); - void printConstant(Constant *CPV, enum OperandContext Context); - void printConstantWithCast(Constant *CPV, unsigned Opcode); - bool printConstExprCast(ConstantExpr *CE); - void printConstantArray(ConstantArray *CPA, enum OperandContext Context); - void printConstantVector(ConstantVector *CV, enum OperandContext Context); - void printConstantDataSequential(ConstantDataSequential *CDS, enum OperandContext Context); - bool printConstantString(Constant *C, enum OperandContext Context); - - bool isEmptyType(Type *Ty) const; - bool isAddressExposed(Value *V) const; - bool isInlinableInst(Instruction &I) const; - AllocaInst *isDirectAlloca(Value *V) const; - bool isInlineAsm(Instruction& I) const; - - // Instruction visitation functions - friend class InstVisitor<CWriter>; - - void visitReturnInst(ReturnInst &I); - void visitBranchInst(BranchInst &I); - void visitSwitchInst(SwitchInst &I); - void visitIndirectBrInst(IndirectBrInst &I); - void visitInvokeInst(InvokeInst &I) { - llvm_unreachable("Lowerinvoke pass didn't work!"); - } - void visitResumeInst(ResumeInst &I) { - llvm_unreachable("DwarfEHPrepare pass didn't work!"); - } - void visitUnreachableInst(UnreachableInst &I); - - void visitPHINode(PHINode &I); - void visitBinaryOperator(BinaryOperator &I); - void visitICmpInst(ICmpInst &I); - void visitFCmpInst(FCmpInst &I); - - void visitCastInst (CastInst &I); - void visitSelectInst(SelectInst &I); - void 
visitCallInst (CallInst &I); - void visitInlineAsm(CallInst &I); - bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID); - - void visitAllocaInst(AllocaInst &I); - void visitLoadInst (LoadInst &I); - void visitStoreInst (StoreInst &I); - void visitGetElementPtrInst(GetElementPtrInst &I); - void visitVAArgInst (VAArgInst &I); - - void visitInsertElementInst(InsertElementInst &I); - void visitExtractElementInst(ExtractElementInst &I); - void visitShuffleVectorInst(ShuffleVectorInst &SVI); - - void visitInsertValueInst(InsertValueInst &I); - void visitExtractValueInst(ExtractValueInst &I); - void visitInstruction(Instruction &I) { + return printFunctionProto( + Out, F->getFunctionType(), + std::make_pair(F->getAttributes(), F->getCallingConv()), + GetValueName(F), NULL, isKernel); + } + + raw_ostream & + printFunctionDeclaration(raw_ostream &Out, FunctionType *Ty, + std::pair<AttributeList, CallingConv::ID> PAL = + std::make_pair(AttributeList(), CallingConv::C)); + raw_ostream &printStructDeclaration(raw_ostream &Out, StructType *Ty); + raw_ostream &printArrayDeclaration(raw_ostream &Out, ArrayType *Ty); + raw_ostream &printVectorDeclaration(raw_ostream &Out, VectorType *Ty); + + raw_ostream &printTypeName(raw_ostream &Out, Type *Ty, bool isSigned = false, + std::pair<AttributeList, CallingConv::ID> PAL = + std::make_pair(AttributeList(), + CallingConv::C)); + raw_ostream &printTypeNameUnaligned(raw_ostream &Out, Type *Ty, + bool isSigned = false); + raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned); + raw_ostream &printTypeString(raw_ostream &Out, Type *Ty, bool isSigned); + + std::string getStructName(StructType *ST); + std::string getFunctionName(FunctionType *FT, + std::pair<AttributeList, CallingConv::ID> PAL = + std::make_pair(AttributeList(), + CallingConv::C)); + std::string getArrayName(ArrayType *AT); + std::string getVectorName(VectorType *VT, bool Aligned); + + enum OperandContext { + ContextNormal, + ContextCasted, + // Casted 
context means the type-cast will be implicit, + // such as the RHS of a `var = RHS;` expression + // or inside a struct initializer expression + ContextStatic + // Static context means that it is being used in as a static initializer + // (also implies ContextCasted) + }; + + void writeOperandDeref(Value *Operand); + void writeOperand(Value *Operand, enum OperandContext Context = ContextNormal, + bool arrayAccess = false); + void writeInstComputationInline(Instruction &I); + void writeOperandInternal(Value *Operand, + enum OperandContext Context = ContextNormal); + void writeOperandWithCast(Value *Operand, unsigned Opcode); + void opcodeNeedsCast(unsigned Opcode, bool &shouldCast, bool &castIsSigned); + + void writeOperandWithCast(Value *Operand, ICmpInst &I); + bool writeInstructionCast(Instruction &I); + void writeMemoryAccess(Value *Operand, Type *OperandType, bool IsVolatile, + unsigned Alignment); + + std::string InterpretASMConstraint(InlineAsm::ConstraintInfo &c); + + void lowerIntrinsics(Function &F); + /// Prints the definition of the intrinsic function F. Supports the + /// intrinsics which need to be explicitly defined in the CBackend. 
+ void printIntrinsicDefinition(Function &F, raw_ostream &Out); + void printIntrinsicDefinition(FunctionType *funT, unsigned Opcode, + std::string OpName, raw_ostream &Out); + + void printModuleTypes(raw_ostream &Out); + void printContainedTypes(raw_ostream &Out, Type *Ty, std::set<Type *> &); + + void printFloatingPointConstants(Function &F); + void printFloatingPointConstants(const Constant *C); + + void printFunction(Function &); + void printBasicBlock(BasicBlock *BB); + void printLoop(Loop *L); + + void printCast(unsigned opcode, Type *SrcTy, Type *DstTy); + void printConstant(Constant *CPV, enum OperandContext Context); + void printConstantWithCast(Constant *CPV, unsigned Opcode); + bool printConstExprCast(ConstantExpr *CE); + void printConstantArray(ConstantArray *CPA, enum OperandContext Context); + void printConstantVector(ConstantVector *CV, enum OperandContext Context); + void printConstantDataSequential(ConstantDataSequential *CDS, + enum OperandContext Context); + bool printConstantString(Constant *C, enum OperandContext Context); + + bool isEmptyType(Type *Ty) const; + bool isAddressExposed(Value *V) const; + bool isInlinableInst(Instruction &I) const; + AllocaInst *isDirectAlloca(Value *V) const; + bool isInlineAsm(Instruction &I) const; + + // Instruction visitation functions + friend class InstVisitor<CWriter>; + + void visitReturnInst(ReturnInst &I); + void visitBranchInst(BranchInst &I); + void visitSwitchInst(SwitchInst &I); + void visitIndirectBrInst(IndirectBrInst &I); + void visitInvokeInst(InvokeInst &I) { + llvm_unreachable("Lowerinvoke pass didn't work!"); + } + void visitResumeInst(ResumeInst &I) { + llvm_unreachable("DwarfEHPrepare pass didn't work!"); + } + void visitUnreachableInst(UnreachableInst &I); + + void visitPHINode(PHINode &I); + void visitBinaryOperator(BinaryOperator &I); + void visitICmpInst(ICmpInst &I); + void visitFCmpInst(FCmpInst &I); + + void visitCastInst(CastInst &I); + void visitSelectInst(SelectInst &I); + void 
visitCallInst(CallInst &I); + void visitInlineAsm(CallInst &I); + bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID); + + void visitAllocaInst(AllocaInst &I); + void visitLoadInst(LoadInst &I); + void visitStoreInst(StoreInst &I); + void visitGetElementPtrInst(GetElementPtrInst &I); + void visitVAArgInst(VAArgInst &I); + + void visitInsertElementInst(InsertElementInst &I); + void visitExtractElementInst(ExtractElementInst &I); + void visitShuffleVectorInst(ShuffleVectorInst &SVI); + + void visitInsertValueInst(InsertValueInst &I); + void visitExtractValueInst(ExtractValueInst &I); + void visitInstruction(Instruction &I) { #ifndef NDEBUG - errs() << "C Writer does not know about " << I; + errs() << "C Writer does not know about " << I; #endif - llvm_unreachable(0); - } - - void outputLValue(Instruction *I) { - Out << " " << GetValueName(I) << " = "; - } - - bool extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *IndVarChain, Instruction *Branch, unsigned indent); - - bool traverseUseDefChain(Instruction*I, PHINode*PI); - bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To); - void printPHICopiesForSuccessor(BasicBlock *CurBlock, - BasicBlock *Successor, unsigned Indent); - void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock, - unsigned Indent); - void printGEPExpression(Value *Ptr, gep_type_iterator I, gep_type_iterator E, bool isArrayType, GetElementPtrInst*); - - - bool findLoopBranch(BranchInst **LBranch, BasicBlock* CurBlock, BasicBlock* LHeader, std::set<BasicBlock*>*visitSet); - std::string GetValueName(Value *Operand); - void printBBorLoop(BasicBlock *BB); - - bool compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm); - bool findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm); - }; -} + llvm_unreachable(0); + } + + void outputLValue(Instruction *I) { Out << " " << GetValueName(I) << " = "; } + + bool extractIndVarChain(Instruction *Inst, + std::stack<Instruction 
*> *IndVarChain, + Instruction *Branch, unsigned indent); + + bool traverseUseDefChain(Instruction *I, PHINode *PI); + bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To); + void printPHICopiesForSuccessor(BasicBlock *CurBlock, BasicBlock *Successor, + unsigned Indent); + void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock, + unsigned Indent); + void printGEPExpression(Value *Ptr, gep_type_iterator I, gep_type_iterator E, + bool isArrayType, GetElementPtrInst *); + + bool findLoopBranch(BranchInst **LBranch, BasicBlock *CurBlock, + BasicBlock *LHeader, std::set<BasicBlock *> *visitSet); + std::string GetValueName(Value *Operand); + void printBBorLoop(BasicBlock *BB); + + bool compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock, + BasicBlock *ImmPostDomm); + bool findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, + BasicBlock *ImmPostDomm); +}; +} // namespace diff --git a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h index b6c02fc46186f78ee085ff0d7fb050ad2002743f..1c61289817d148365742f89f3f3999500283bd8a 100644 --- a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h +++ b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h @@ -14,23 +14,21 @@ #ifndef CTARGETMACHINE_H #define CTARGETMACHINE_H -#include "llvm/Target/TargetMachine.h" -#include "llvm/IR/DataLayout.h" #include "llvm/ADT/Optional.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { struct CTargetMachine : public TargetMachine { - - // NOTE: Interface change - CTargetMachine(const Target &T, const Triple &TargetTriple, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - Optional<CodeModel::Model> CM, - CodeGenOpt::Level OL, bool JIT) - - : TargetMachine(T, "", TargetTriple, CPU, FS, Options) { } + + // NOTE: Interface change + CTargetMachine(const Target &T, const Triple &TargetTriple, StringRef 
CPU, + StringRef FS, const TargetOptions &Options, + Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT) + + : TargetMachine(T, "", TargetTriple, CPU, FS, Options) {} /// Add passes to the specified pass manager to get the specified file /// emitted. Typically this will involve several steps of code generation. @@ -38,21 +36,20 @@ struct CTargetMachine : public TargetMachine { /*bool addPassesToEmitFile( PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, bool DisableVerify = true, AnalysisID StartBefore = nullptr, - AnalysisID StartAfter = nullptr, AnalysisID StopBefore = nullptr, + AnalysisID StartAfter = nullptr, AnalysisID StopBefore = nullptr, AnalysisID StopAfter = nullptr) override; //MachineFunctionInitializer *MFInitializer = nullptr) override; */ virtual bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out, - raw_pwrite_stream *Out2, CodeGenFileType FileType, + raw_pwrite_stream *Out2, + CodeGenFileType FileType, bool DisableVerify = true, - MachineModuleInfo *MMI = nullptr) override; - + MachineModuleInfo *MMI = nullptr) override; }; extern Target TheCBackendTarget; -} // End llvm namespace - +} // namespace llvm #endif diff --git a/hpvm/projects/llvm-cbe/test/APInt-C.cpp b/hpvm/projects/llvm-cbe/test/APInt-C.cpp index 0dec791c141d94939bf0762120be55ce184dcfb6..c44440985a0b50a57bd25e1995d39cd904ec32c5 100644 --- a/hpvm/projects/llvm-cbe/test/APInt-C.cpp +++ b/hpvm/projects/llvm-cbe/test/APInt-C.cpp @@ -1,9 +1,9 @@ // This file is a part of Julia. 
License is MIT: http://julialang.org/license #include "llvm-version.h" -#include <llvm/ADT/ArrayRef.h> -#include <llvm/ADT/APInt.h> #include <llvm/ADT/APFloat.h> +#include <llvm/ADT/APInt.h> +#include <llvm/ADT/ArrayRef.h> #include <llvm/Support/MathExtras.h> #include "APInt-C.h" @@ -12,524 +12,539 @@ using namespace llvm; #if JL_LLVM_VERSION >= 30900 -inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { - return alignTo(Value, Align, Skew); +inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align, + uint64_t Skew = 0) { + return alignTo(Value, Align, Skew); } #endif /* create "APInt s" from "integerPart *ps" */ -#define CREATE(s) \ - APInt s; \ - if ((numbits % integerPartWidth) != 0) { \ - /* use LLT_ALIGN to round the memory area up to the nearest integerPart-sized chunk */ \ - unsigned nbytes = RoundUpToAlignment(numbits, integerPartWidth) / host_char_bit; \ - integerPart *data_a64 = (integerPart*)alloca(nbytes); \ - /* TODO: this memcpy assumes little-endian, - * for big-endian, need to align the copy to the other end */ \ - memcpy(data_a64, p##s, RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); \ - s = APInt(numbits, makeArrayRef(data_a64, nbytes / sizeof(integerPart))); \ - } \ - else { \ - s = APInt(numbits, makeArrayRef(p##s, numbits / integerPartWidth)); \ - } +#define CREATE(s) \ + APInt s; \ + if ((numbits % integerPartWidth) != 0) { \ + /* use LLT_ALIGN to round the memory area up to the nearest \ \ + * integerPart-sized chunk */ \ + unsigned nbytes = \ + RoundUpToAlignment(numbits, integerPartWidth) / host_char_bit; \ + integerPart *data_a64 = (integerPart *)alloca(nbytes); \ + /* TODO: this memcpy assumes little-endian, \ for big-endian, need to \ + * align the copy to the other end */ \ + memcpy(data_a64, p##s, \ + RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); \ + s = APInt(numbits, makeArrayRef(data_a64, nbytes / sizeof(integerPart))); \ + } else { \ + s = APInt(numbits, 
makeArrayRef(p##s, numbits / integerPartWidth)); \ + } /* assign to "integerPart *pr" from "APInt a" */ -#define ASSIGN(r, a) \ - if (numbits <= 8) \ - *(uint8_t*)p##r = a.getZExtValue(); \ - else if (numbits <= 16) \ - *(uint16_t*)p##r = a.getZExtValue(); \ - else if (numbits <= 32) \ - *(uint32_t*)p##r = a.getZExtValue(); \ - else if (numbits <= 64) \ - *(uint64_t*)p##r = a.getZExtValue(); \ - else \ - memcpy(p##r, a.getRawData(), RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); \ - -extern "C" JL_DLLEXPORT -void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr) { - APInt z(numbits, 0); - CREATE(a) - z -= a; - ASSIGN(r, z) -} - -extern "C" JL_DLLEXPORT -void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a += b; - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a -= b; - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a *= b; - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.sdiv(b); - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.udiv(b); - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.srem(b); - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.urem(b); - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pb) { - CREATE(a) - CREATE(b) - return a.eq(b); -} - -extern "C" 
JL_DLLEXPORT -int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb) { - CREATE(a) - CREATE(b) - return a.ne(b); -} - -extern "C" JL_DLLEXPORT -int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb) { - CREATE(a) - CREATE(b) - return a.slt(b); -} - -extern "C" JL_DLLEXPORT -int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb) { - CREATE(a) - CREATE(b) - return a.ult(b); -} - -extern "C" JL_DLLEXPORT -int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb) { - CREATE(a) - CREATE(b) - return a.sle(b); -} - -extern "C" JL_DLLEXPORT -int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb) { - CREATE(a) - CREATE(b) - return a.ule(b); -} - -extern "C" JL_DLLEXPORT -void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a &= b; - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a |= b; - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a ^= b; - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.shl(b); - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.lshr(b); - ASSIGN(r, a) -} -extern "C" JL_DLLEXPORT -void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.ashr(b); - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr) { - CREATE(a) - a.flipAllBits(); - ASSIGN(r, a) -} - -extern "C" JL_DLLEXPORT -int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - 
bool Overflow; - a = a.uadd_ov(b, Overflow); - ASSIGN(r, a) - return Overflow; -} - -extern "C" JL_DLLEXPORT -int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - bool Overflow; - a = a.sadd_ov(b, Overflow); - ASSIGN(r, a) - return Overflow; -} - -extern "C" JL_DLLEXPORT -int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - bool Overflow; - a = a.usub_ov(b, Overflow); - ASSIGN(r, a) - return Overflow; -} - -extern "C" JL_DLLEXPORT -int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - bool Overflow; - a = a.ssub_ov(b, Overflow); - ASSIGN(r, a) - return Overflow; -} - -extern "C" JL_DLLEXPORT -int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - bool Overflow; - a = a.smul_ov(b, Overflow); - ASSIGN(r, a) - return Overflow; -} - -extern "C" JL_DLLEXPORT -int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - bool Overflow; - a = a.umul_ov(b, Overflow); - ASSIGN(r, a) - return Overflow; -} - -extern "C" JL_DLLEXPORT -int LLVMDiv_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - bool Overflow; - a = a.sdiv_ov(b, Overflow); - ASSIGN(r, a) - return Overflow; -} - -extern "C" JL_DLLEXPORT -int LLVMDiv_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.udiv(b); - ASSIGN(r, a) - // unsigned division cannot overflow - return false; -} - -extern "C" JL_DLLEXPORT -int LLVMRem_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - a = a.srem(b); - ASSIGN(r, a) - // signed remainder cannot overflow - return false; -} - -extern "C" JL_DLLEXPORT -int LLVMRem_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - 
CREATE(a) - CREATE(b) - a = a.urem(b); - ASSIGN(r, a) - // unsigned remainder cannot overflow - return false; -} - -extern "C" JL_DLLEXPORT -void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr) { - CREATE(a) - a = a.byteSwap(); - ASSIGN(r, a) -} - -void LLVMFPtoInt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr, bool isSigned, bool *isExact) { - double Val; - if (numbits == 32) - Val = *(float*)pa; - else if (numbits == 64) - Val = *(double*)pa; - else - jl_error("FPtoSI: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); - unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; - if (onumbits <= 64) { // fast-path, if possible - if (isSigned) { - int64_t ia = Val; - memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian - if (isExact) { - // check whether the conversion was lossless - int64_t ia2 = ia < 0 ? -1 : 0; - memcpy(&ia2, pr, onumbytes); - *isExact = (Val == (double)ia2 && ia == ia2); - } - } - else { - uint64_t ia = Val; - memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian - if (isExact) { - // check whether the conversion was lossless - uint64_t ia2 = 0; - memcpy(&ia2, pr, onumbytes); - *isExact = (Val == (double)ia2 && ia == ia2); - } - } +#define ASSIGN(r, a) \ + if (numbits <= 8) \ + *(uint8_t *)p##r = a.getZExtValue(); \ + else if (numbits <= 16) \ + *(uint16_t *)p##r = a.getZExtValue(); \ + else if (numbits <= 32) \ + *(uint32_t *)p##r = a.getZExtValue(); \ + else if (numbits <= 64) \ + *(uint64_t *)p##r = a.getZExtValue(); \ + else \ + memcpy(p##r, a.getRawData(), \ + RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); + +extern "C" JL_DLLEXPORT void LLVMNeg(unsigned numbits, integerPart *pa, + integerPart *pr) { + APInt z(numbits, 0); + CREATE(a) + z -= a; + ASSIGN(r, z) +} + +extern "C" JL_DLLEXPORT void LLVMAdd(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a += b; + 
ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMSub(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a -= b; + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMMul(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a *= b; + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMSDiv(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.sdiv(b); + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMUDiv(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.udiv(b); + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMSRem(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.srem(b); + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMURem(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.urem(b); + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT int LLVMICmpEQ(unsigned numbits, integerPart *pa, + integerPart *pb) { + CREATE(a) + CREATE(b) + return a.eq(b); +} + +extern "C" JL_DLLEXPORT int LLVMICmpNE(unsigned numbits, integerPart *pa, + integerPart *pb) { + CREATE(a) + CREATE(b) + return a.ne(b); +} + +extern "C" JL_DLLEXPORT int LLVMICmpSLT(unsigned numbits, integerPart *pa, + integerPart *pb) { + CREATE(a) + CREATE(b) + return a.slt(b); +} + +extern "C" JL_DLLEXPORT int LLVMICmpULT(unsigned numbits, integerPart *pa, + integerPart *pb) { + CREATE(a) + CREATE(b) + return a.ult(b); +} + +extern "C" JL_DLLEXPORT int LLVMICmpSLE(unsigned numbits, integerPart *pa, + integerPart *pb) { + CREATE(a) + CREATE(b) + return a.sle(b); +} + +extern "C" JL_DLLEXPORT int LLVMICmpULE(unsigned numbits, integerPart *pa, + integerPart *pb) { + CREATE(a) + CREATE(b) + return a.ule(b); +} + +extern "C" JL_DLLEXPORT void LLVMAnd(unsigned numbits, integerPart *pa, + integerPart *pb, 
integerPart *pr) { + CREATE(a) + CREATE(b) + a &= b; + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMOr(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a |= b; + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMXor(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a ^= b; + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMShl(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.shl(b); + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMLShr(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.lshr(b); + ASSIGN(r, a) +} +extern "C" JL_DLLEXPORT void LLVMAShr(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.ashr(b); + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT void LLVMFlipAllBits(unsigned numbits, integerPart *pa, + integerPart *pr) { + CREATE(a) + a.flipAllBits(); + ASSIGN(r, a) +} + +extern "C" JL_DLLEXPORT int LLVMAdd_uov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + bool Overflow; + a = a.uadd_ov(b, Overflow); + ASSIGN(r, a) + return Overflow; +} + +extern "C" JL_DLLEXPORT int LLVMAdd_sov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + bool Overflow; + a = a.sadd_ov(b, Overflow); + ASSIGN(r, a) + return Overflow; +} + +extern "C" JL_DLLEXPORT int LLVMSub_uov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + bool Overflow; + a = a.usub_ov(b, Overflow); + ASSIGN(r, a) + return Overflow; +} + +extern "C" JL_DLLEXPORT int LLVMSub_sov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + bool Overflow; + a = a.ssub_ov(b, Overflow); + ASSIGN(r, a) + return Overflow; +} + +extern "C" JL_DLLEXPORT int 
LLVMMul_sov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + bool Overflow; + a = a.smul_ov(b, Overflow); + ASSIGN(r, a) + return Overflow; +} + +extern "C" JL_DLLEXPORT int LLVMMul_uov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + bool Overflow; + a = a.umul_ov(b, Overflow); + ASSIGN(r, a) + return Overflow; +} + +extern "C" JL_DLLEXPORT int LLVMDiv_sov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + bool Overflow; + a = a.sdiv_ov(b, Overflow); + ASSIGN(r, a) + return Overflow; +} + +extern "C" JL_DLLEXPORT int LLVMDiv_uov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.udiv(b); + ASSIGN(r, a) + // unsigned division cannot overflow + return false; +} + +extern "C" JL_DLLEXPORT int LLVMRem_sov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.srem(b); + ASSIGN(r, a) + // signed remainder cannot overflow + return false; +} + +extern "C" JL_DLLEXPORT int LLVMRem_uov(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + a = a.urem(b); + ASSIGN(r, a) + // unsigned remainder cannot overflow + return false; +} + +extern "C" JL_DLLEXPORT void LLVMByteSwap(unsigned numbits, integerPart *pa, + integerPart *pr) { + CREATE(a) + a = a.byteSwap(); + ASSIGN(r, a) +} + +void LLVMFPtoInt(unsigned numbits, integerPart *pa, unsigned onumbits, + integerPart *pr, bool isSigned, bool *isExact) { + double Val; + if (numbits == 32) + Val = *(float *)pa; + else if (numbits == 64) + Val = *(double *)pa; + else + jl_error("FPtoSI: runtime floating point intrinsics are not implemented " + "for bit sizes other than 32 and 64"); + unsigned onumbytes = + RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; + if (onumbits <= 64) { // fast-path, if possible + if (isSigned) { + 
int64_t ia = Val; + memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian + if (isExact) { + // check whether the conversion was lossless + int64_t ia2 = ia < 0 ? -1 : 0; + memcpy(&ia2, pr, onumbytes); + *isExact = (Val == (double)ia2 && ia == ia2); + } + } else { + uint64_t ia = Val; + memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian + if (isExact) { + // check whether the conversion was lossless + uint64_t ia2 = 0; + memcpy(&ia2, pr, onumbytes); + *isExact = (Val == (double)ia2 && ia == ia2); + } } - else { - APFloat a(Val); - bool isVeryExact; - APFloat::roundingMode rounding_mode = APFloat::rmNearestTiesToEven; - unsigned nbytes = RoundUpToAlignment(onumbits, integerPartWidth) / host_char_bit; - integerPart *parts = (integerPart*)alloca(nbytes); - APFloat::opStatus status = a.convertToInteger(parts, onumbits, isSigned, rounding_mode, &isVeryExact); - memcpy(pr, parts, onumbytes); - if (isExact) - *isExact = (status == APFloat::opOK); - } -} - -extern "C" JL_DLLEXPORT -void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - LLVMFPtoInt(numbits, pa, onumbits, pr, true, NULL); -} - -extern "C" JL_DLLEXPORT -void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - LLVMFPtoInt(numbits, pa, onumbits, pr, false, NULL); -} - -extern "C" JL_DLLEXPORT -int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - bool isExact; - LLVMFPtoInt(numbits, pa, onumbits, pr, true, &isExact); - return isExact; -} - -extern "C" JL_DLLEXPORT -int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - bool isExact; - LLVMFPtoInt(numbits, pa, onumbits, pr, false, &isExact); - return isExact; -} - -extern "C" JL_DLLEXPORT -void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - CREATE(a) - double val = a.roundToDouble(true); - if (onumbits == 32) - *(float*)pr = val; - else if (onumbits == 64) - 
*(double*)pr = val; - else - jl_error("SItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); -} - -extern "C" JL_DLLEXPORT -void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - CREATE(a) - double val = a.roundToDouble(false); - if (onumbits == 32) - *(float*)pr = val; - else if (onumbits == 64) - *(double*)pr = val; - else - jl_error("UItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); -} - -extern "C" JL_DLLEXPORT -void LLVMSExt(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - assert(inumbits < onumbits); - unsigned inumbytes = RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit; - unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; - int bits = (0 - inumbits) % host_char_bit; - int signbit = (inumbits - 1) % host_char_bit; - int sign = ((unsigned char*)pa)[inumbytes - 1] & (1 << signbit) ? -1 : 0; - // copy over the input bytes - memcpy(pr, pa, inumbytes); - if (bits) { - // sign-extend the partial byte - ((signed char*)pr)[inumbytes - 1] = ((signed char*)pa)[inumbytes - 1] << bits >> bits; - } - // sign-extend the rest of the bytes - memset((char*)pr + inumbytes, sign, onumbytes - inumbytes); -} - -extern "C" JL_DLLEXPORT -void LLVMZExt(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - assert(inumbits < onumbits); - unsigned inumbytes = RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit; - unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; - int bits = (0 - inumbits) % host_char_bit; - // copy over the input bytes - memcpy(pr, pa, inumbytes); - if (bits) { - // zero the remaining bits of the partial byte - ((unsigned char*)pr)[inumbytes - 1] = ((unsigned char*)pa)[inumbytes - 1] << bits >> bits; - } - // zero-extend the rest of the bytes - memset((char*)pr + inumbytes, 0, onumbytes - inumbytes); -} - 
-extern "C" JL_DLLEXPORT -void LLVMTrunc(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) { - assert(inumbits > onumbits); - unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; - memcpy(pr, pa, onumbytes); -} - -extern "C" JL_DLLEXPORT -unsigned countTrailingZeros_8(uint8_t Val) { + } else { + APFloat a(Val); + bool isVeryExact; + APFloat::roundingMode rounding_mode = APFloat::rmNearestTiesToEven; + unsigned nbytes = + RoundUpToAlignment(onumbits, integerPartWidth) / host_char_bit; + integerPart *parts = (integerPart *)alloca(nbytes); + APFloat::opStatus status = a.convertToInteger(parts, onumbits, isSigned, + rounding_mode, &isVeryExact); + memcpy(pr, parts, onumbytes); + if (isExact) + *isExact = (status == APFloat::opOK); + } +} + +extern "C" JL_DLLEXPORT void LLVMFPtoSI(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr) { + LLVMFPtoInt(numbits, pa, onumbits, pr, true, NULL); +} + +extern "C" JL_DLLEXPORT void LLVMFPtoUI(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr) { + LLVMFPtoInt(numbits, pa, onumbits, pr, false, NULL); +} + +extern "C" JL_DLLEXPORT int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, + unsigned onumbits, + integerPart *pr) { + bool isExact; + LLVMFPtoInt(numbits, pa, onumbits, pr, true, &isExact); + return isExact; +} + +extern "C" JL_DLLEXPORT int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, + unsigned onumbits, + integerPart *pr) { + bool isExact; + LLVMFPtoInt(numbits, pa, onumbits, pr, false, &isExact); + return isExact; +} + +extern "C" JL_DLLEXPORT void LLVMSItoFP(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr) { + CREATE(a) + double val = a.roundToDouble(true); + if (onumbits == 32) + *(float *)pr = val; + else if (onumbits == 64) + *(double *)pr = val; + else + jl_error("SItoFP: runtime floating point intrinsics are not implemented " + "for bit sizes other than 32 and 64"); +} + +extern "C" 
JL_DLLEXPORT void LLVMUItoFP(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr) { + CREATE(a) + double val = a.roundToDouble(false); + if (onumbits == 32) + *(float *)pr = val; + else if (onumbits == 64) + *(double *)pr = val; + else + jl_error("UItoFP: runtime floating point intrinsics are not implemented " + "for bit sizes other than 32 and 64"); +} + +extern "C" JL_DLLEXPORT void LLVMSExt(unsigned inumbits, integerPart *pa, + unsigned onumbits, integerPart *pr) { + assert(inumbits < onumbits); + unsigned inumbytes = + RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit; + unsigned onumbytes = + RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; + int bits = (0 - inumbits) % host_char_bit; + int signbit = (inumbits - 1) % host_char_bit; + int sign = ((unsigned char *)pa)[inumbytes - 1] & (1 << signbit) ? -1 : 0; + // copy over the input bytes + memcpy(pr, pa, inumbytes); + if (bits) { + // sign-extend the partial byte + ((signed char *)pr)[inumbytes - 1] = + ((signed char *)pa)[inumbytes - 1] << bits >> bits; + } + // sign-extend the rest of the bytes + memset((char *)pr + inumbytes, sign, onumbytes - inumbytes); +} + +extern "C" JL_DLLEXPORT void LLVMZExt(unsigned inumbits, integerPart *pa, + unsigned onumbits, integerPart *pr) { + assert(inumbits < onumbits); + unsigned inumbytes = + RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit; + unsigned onumbytes = + RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; + int bits = (0 - inumbits) % host_char_bit; + // copy over the input bytes + memcpy(pr, pa, inumbytes); + if (bits) { + // zero the remaining bits of the partial byte + ((unsigned char *)pr)[inumbytes - 1] = + ((unsigned char *)pa)[inumbytes - 1] << bits >> bits; + } + // zero-extend the rest of the bytes + memset((char *)pr + inumbytes, 0, onumbytes - inumbytes); +} + +extern "C" JL_DLLEXPORT void LLVMTrunc(unsigned inumbits, integerPart *pa, + unsigned onumbits, integerPart *pr) { + 
assert(inumbits > onumbits); + unsigned onumbytes = + RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; + memcpy(pr, pa, onumbytes); +} + +extern "C" JL_DLLEXPORT unsigned countTrailingZeros_8(uint8_t Val) { #if JL_LLVM_VERSION >= 30500 - return countTrailingZeros(Val); + return countTrailingZeros(Val); #else - return CountTrailingZeros_32(Val); + return CountTrailingZeros_32(Val); #endif } -extern "C" JL_DLLEXPORT -unsigned countTrailingZeros_16(uint16_t Val) { +extern "C" JL_DLLEXPORT unsigned countTrailingZeros_16(uint16_t Val) { #if JL_LLVM_VERSION >= 30500 - return countTrailingZeros(Val); + return countTrailingZeros(Val); #else - return CountTrailingZeros_32(Val); + return CountTrailingZeros_32(Val); #endif } -extern "C" JL_DLLEXPORT -unsigned countTrailingZeros_32(uint32_t Val) { +extern "C" JL_DLLEXPORT unsigned countTrailingZeros_32(uint32_t Val) { #if JL_LLVM_VERSION >= 30500 - return countTrailingZeros(Val); + return countTrailingZeros(Val); #else - return CountTrailingZeros_32(Val); + return CountTrailingZeros_32(Val); #endif } -extern "C" JL_DLLEXPORT -unsigned countTrailingZeros_64(uint64_t Val) { +extern "C" JL_DLLEXPORT unsigned countTrailingZeros_64(uint64_t Val) { #if JL_LLVM_VERSION >= 30500 - return countTrailingZeros(Val); + return countTrailingZeros(Val); #else - return CountTrailingZeros_64(Val); + return CountTrailingZeros_64(Val); #endif } -extern "C" JL_DLLEXPORT -void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { - CREATE(a) - CREATE(b) - APInt r = a.srem(b); - if (a.isNegative() != b.isNegative()) { - r = (b + r).srem(b); - } - ASSIGN(r, r) +extern "C" JL_DLLEXPORT void jl_LLVMSMod(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + CREATE(a) + CREATE(b) + APInt r = a.srem(b); + if (a.isNegative() != b.isNegative()) { + r = (b + r).srem(b); + } + ASSIGN(r, r) } -extern "C" JL_DLLEXPORT -void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, 
integerPart *pr) { - unsigned numbytes = RoundUpToAlignment(numbits, host_char_bit) / host_char_bit; - int signbit = (numbits - 1) % host_char_bit; - int sign = ((unsigned char*)pa)[numbytes - 1] & (1 << signbit) ? -1 : 0; - if (sign) - LLVMNeg(numbits, pa, pr); - else - memcpy(pr, pa, numbytes); +extern "C" JL_DLLEXPORT void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr) { + unsigned numbytes = + RoundUpToAlignment(numbits, host_char_bit) / host_char_bit; + int signbit = (numbits - 1) % host_char_bit; + int sign = ((unsigned char *)pa)[numbytes - 1] & (1 << signbit) ? -1 : 0; + if (sign) + LLVMNeg(numbits, pa, pr); + else + memcpy(pr, pa, numbytes); } -extern "C" JL_DLLEXPORT -unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa) { - CREATE(a) - return a.countPopulation(); +extern "C" JL_DLLEXPORT unsigned LLVMCountPopulation(unsigned numbits, + integerPart *pa) { + CREATE(a) + return a.countPopulation(); } -extern "C" JL_DLLEXPORT -unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa) { - CREATE(a) - return a.countTrailingOnes(); +extern "C" JL_DLLEXPORT unsigned LLVMCountTrailingOnes(unsigned numbits, + integerPart *pa) { + CREATE(a) + return a.countTrailingOnes(); } -extern "C" JL_DLLEXPORT -unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa) { - CREATE(a) - return a.countTrailingZeros(); +extern "C" JL_DLLEXPORT unsigned LLVMCountTrailingZeros(unsigned numbits, + integerPart *pa) { + CREATE(a) + return a.countTrailingZeros(); } -extern "C" JL_DLLEXPORT -unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa) { - CREATE(a) - return a.countLeadingOnes(); +extern "C" JL_DLLEXPORT unsigned LLVMCountLeadingOnes(unsigned numbits, + integerPart *pa) { + CREATE(a) + return a.countLeadingOnes(); } -extern "C" JL_DLLEXPORT -unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa) { - CREATE(a) - return a.countLeadingZeros(); +extern "C" JL_DLLEXPORT unsigned 
LLVMCountLeadingZeros(unsigned numbits, + integerPart *pa) { + CREATE(a) + return a.countLeadingZeros(); } \ No newline at end of file diff --git a/hpvm/projects/llvm-cbe/test/APInt-C.h b/hpvm/projects/llvm-cbe/test/APInt-C.h index 793bc123003f81308a28c9ff154b4f6fd77b9ab2..873227caf38926f24e7529d62c8110d5fa2d64ef 100644 --- a/hpvm/projects/llvm-cbe/test/APInt-C.h +++ b/hpvm/projects/llvm-cbe/test/APInt-C.h @@ -15,41 +15,70 @@ typedef void integerPart; #endif JL_DLLEXPORT void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr); -JL_DLLEXPORT void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr); - -JL_DLLEXPORT void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); - -JL_DLLEXPORT void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr); +JL_DLLEXPORT 
void LLVMByteSwap(unsigned numbits, integerPart *pa, + integerPart *pr); + +JL_DLLEXPORT void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); + +JL_DLLEXPORT void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT void LLVMFlipAllBits(unsigned numbits, integerPart *pa, + integerPart *pr); JL_DLLEXPORT int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pr); JL_DLLEXPORT int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb); -JL_DLLEXPORT int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb); -JL_DLLEXPORT int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb); -JL_DLLEXPORT int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb); -JL_DLLEXPORT int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb); - -JL_DLLEXPORT int LLVMAdd_uov(unsigned 
numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMDiv_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMDiv_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMRem_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT int LLVMRem_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); +JL_DLLEXPORT int LLVMICmpSLT(unsigned numbits, integerPart *pa, + integerPart *pb); +JL_DLLEXPORT int LLVMICmpULT(unsigned numbits, integerPart *pa, + integerPart *pb); +JL_DLLEXPORT int LLVMICmpSLE(unsigned numbits, integerPart *pa, + integerPart *pb); +JL_DLLEXPORT int LLVMICmpULE(unsigned numbits, integerPart *pa, + integerPart *pb); + +JL_DLLEXPORT int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMDiv_sov(unsigned numbits, integerPart *pa, integerPart *pb, + 
integerPart *pr); +JL_DLLEXPORT int LLVMDiv_uov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMRem_sov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); +JL_DLLEXPORT int LLVMRem_uov(unsigned numbits, integerPart *pa, integerPart *pb, + integerPart *pr); JL_DLLEXPORT unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa); JL_DLLEXPORT unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa); @@ -57,30 +86,40 @@ JL_DLLEXPORT unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa); JL_DLLEXPORT unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa); JL_DLLEXPORT unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa); -JL_DLLEXPORT void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); -JL_DLLEXPORT void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); -JL_DLLEXPORT void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); -JL_DLLEXPORT void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); -JL_DLLEXPORT void LLVMSExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); -JL_DLLEXPORT void LLVMZExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); -JL_DLLEXPORT void LLVMTrunc(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); - -JL_DLLEXPORT int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); -JL_DLLEXPORT int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr); - -JL_DLLEXPORT void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); -JL_DLLEXPORT void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr); +JL_DLLEXPORT void LLVMFPtoSI(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr); +JL_DLLEXPORT void 
LLVMFPtoUI(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr); +JL_DLLEXPORT void LLVMSItoFP(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr); +JL_DLLEXPORT void LLVMUItoFP(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr); +JL_DLLEXPORT void LLVMSExt(unsigned numbits, integerPart *pa, unsigned onumbits, + integerPart *pr); +JL_DLLEXPORT void LLVMZExt(unsigned numbits, integerPart *pa, unsigned onumbits, + integerPart *pr); +JL_DLLEXPORT void LLVMTrunc(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr); + +JL_DLLEXPORT int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr); +JL_DLLEXPORT int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, + unsigned onumbits, integerPart *pr); + +JL_DLLEXPORT void jl_LLVMSMod(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr); +JL_DLLEXPORT void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, + integerPart *pb, integerPart *pr); JL_DLLEXPORT unsigned countTrailingZeros_8(uint8_t Val); JL_DLLEXPORT unsigned countTrailingZeros_16(uint16_t Val); JL_DLLEXPORT unsigned countTrailingZeros_32(uint32_t Val); JL_DLLEXPORT unsigned countTrailingZeros_64(uint64_t Val); -//uint8_t getSwappedBytes_8(uint8_t Value); // no-op -//uint16_t getSwappedBytes_16(uint16_t Value); -//uint32_t getSwappedBytes_32(uint32_t Value); -//uint64_t getSwappedBytes_64(uint64_t Value); - +// uint8_t getSwappedBytes_8(uint8_t Value); // no-op +// uint16_t getSwappedBytes_16(uint16_t Value); +// uint32_t getSwappedBytes_32(uint32_t Value); +// uint64_t getSwappedBytes_64(uint64_t Value); #ifdef __cplusplus } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test001.c b/hpvm/projects/llvm-cbe/test/cfiles/test001.c index 817d7ca8cae09d11e57848ee7d3fdb9a7931d19a..8606d141ba73ddce2a598e85c6a787d715b1a5e2 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test001.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test001.c @@ 
-11,7 +11,4 @@ // //===----------------------------------------------------------------------===// -int main() -{ - return 6; -} +int main() { return 6; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test002.c b/hpvm/projects/llvm-cbe/test/cfiles/test002.c index 9af3c34ee82cf9517f0f4ed4015a239fdace5cfb..aeb02526f8b2bda1b0bae293d1f006c6a4622641 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test002.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test002.c @@ -8,14 +8,13 @@ //===----------------------------------------------------------------------===// // // This code tests to see that the CBE will execute a for loop correctly. -// *TW +// *TW // //===----------------------------------------------------------------------===// -int main() -{ - int i, x = 0; - for (i = 0; i < 6; i++) - ++x; - return x; +int main() { + int i, x = 0; + for (i = 0; i < 6; i++) + ++x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test003.c b/hpvm/projects/llvm-cbe/test/cfiles/test003.c index 4aa8eb6bfb6e4a4e4d67f5fd4f5847cc608d6cae..bfeaef5db7a85f23c746b90f17461fb10dfd87e8 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test003.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test003.c @@ -11,13 +11,11 @@ // *TW //===----------------------------------------------------------------------===// -int main() -{ - int i = 0, x = 0; - while (i < 6) { - ++x; - ++i; - } - return x; -} - +int main() { + int i = 0, x = 0; + while (i < 6) { + ++x; + ++i; + } + return x; +} diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test004.c b/hpvm/projects/llvm-cbe/test/cfiles/test004.c index ba619f09bbaab461723a6f85dca1dfbb28ceac41..35a5a02d83091093a1b251bbb5a7158b11d93244 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test004.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test004.c @@ -7,15 +7,15 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute an if/else statement correctly. 
-// *TW +// This code tests to see that the CBE will execute an if/else statement +// correctly. *TW //===----------------------------------------------------------------------===// int main() { - int x = 3; - x += 3; - if (x == 6) - return x; - else - return 0; + int x = 3; + x += 3; + if (x == 6) + return x; + else + return 0; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test005.c b/hpvm/projects/llvm-cbe/test/cfiles/test005.c index 8b9323a97e3a27cfb4cc45b17ba26b39c96a180c..a287f075cd3b152e84a0bd24ce35097c5bb231b7 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test005.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test005.c @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// int main() { - int i, j, x = 0; - for (i = 0; i < 3; i++) - for (j = 0; j < 2; j++) - ++x; - - return x; + int i, j, x = 0; + for (i = 0; i < 3; i++) + for (j = 0; j < 2; j++) + ++x; + + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test006.c b/hpvm/projects/llvm-cbe/test/cfiles/test006.c index b513d75d4ab163388f15f156d1387d5b71dfcdf4..fe901d6d19cd2dabd11f66623d1f1ca3d0cf55b9 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test006.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test006.c @@ -7,18 +7,18 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a nested while loop correctly. -// *TW +// This code tests to see that the CBE will execute a nested while loop +// correctly. 
*TW //===----------------------------------------------------------------------===// int main() { - int i = 0, j = 0, x = 0; - while (i < 6) { - while (j < 6) { - ++x; - ++j; - } - ++i; + int i = 0, j = 0, x = 0; + while (i < 6) { + while (j < 6) { + ++x; + ++j; } - return x; + ++i; + } + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test007.c b/hpvm/projects/llvm-cbe/test/cfiles/test007.c index 50c895d18192844c38c53c6e706eb2c4f163713d..b4ff4365db7ad0f48dc9fa2171a818757ba899c1 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test007.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test007.c @@ -7,27 +7,27 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a switch statement correctly. -// *TW +// This code tests to see that the CBE will execute a switch statement +// correctly. *TW //===----------------------------------------------------------------------===// int main() { - char var = 'x'; - - switch (var) { - case 'z' : - return 0; - break; - case 'y' : - return 1; - break; - case 'x' : - return 6; - break; - case 'w' : - return 7; - break; - default : - return 100; - } + char var = 'x'; + + switch (var) { + case 'z': + return 0; + break; + case 'y': + return 1; + break; + case 'x': + return 6; + break; + case 'w': + return 7; + break; + default: + return 100; + } } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test008.c b/hpvm/projects/llvm-cbe/test/cfiles/test008.c index 283b8f73bafe45c6225e5270249c458b9a75a80d..f054263e0b5490d25b16c53c082d7b0dfbd1793f 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test008.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test008.c @@ -12,18 +12,18 @@ //===----------------------------------------------------------------------===// struct test { - int var1; - int var2; - int var3; + int var1; + int var2; + int var3; }; int main() { - struct test variable; + struct test variable; - variable.var2 = 5; - variable.var3 = 6; - 
variable.var1 = 9; - - return variable.var3; + variable.var2 = 5; + variable.var3 = 6; + variable.var1 = 9; + + return variable.var3; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test009.c b/hpvm/projects/llvm-cbe/test/cfiles/test009.c index a46509105cb73430794e55eb9d5af6d0da98ff6f..1b2fc327e2c7fd67ba1520dbecebd4803507c600 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test009.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test009.c @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// int main() { - int example[10]; - int i; - for (i = 0;i < 10; ++i) { - example[i] = i; - } - return example[6]; + int example[10]; + int i; + for (i = 0; i < 10; ++i) { + example[i] = i; + } + return example[6]; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test010.c b/hpvm/projects/llvm-cbe/test/cfiles/test010.c index e3841e64d3e41aa923201427f6913c3e30a650c9..21c6fdd0c7b6ed0a6c346d01a8e8836a4b2050a5 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test010.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test010.c @@ -7,37 +7,37 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a nested switch statement correctly. -// *TW +// This code tests to see that the CBE will execute a nested switch statement +// correctly. 
*TW //===----------------------------------------------------------------------===// int main() { - char var = 'x', var2; - switch (var) { - case 'z' : - return 0; - break; - case 'y' : - return 1; - break; - case 'x' : - var2 = 'b'; - - switch (var2) { - case 'a' : - return 10; - break; - case 'b' : - return 6; - break; - default : - return 18; - } + char var = 'x', var2; + switch (var) { + case 'z': + return 0; + break; + case 'y': + return 1; + break; + case 'x': + var2 = 'b'; - case 'w' : - return 7; - break; - default : - return 100; - } + switch (var2) { + case 'a': + return 10; + break; + case 'b': + return 6; + break; + default: + return 18; + } + + case 'w': + return 7; + break; + default: + return 100; + } } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test011.c b/hpvm/projects/llvm-cbe/test/cfiles/test011.c index aa0ee7229f512c25e2794372eac697c85d35b531..9ff808b7096c728794ed472349b472d5ce61b952 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test011.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test011.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,13 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle addition between two variables. -// *TW +// This code tests to see that the CBE can handle addition between two +// variables. 
*TW //===------------------------------------------------------------------------===// -int main() -{ - int i = 2, t = 4, x = 0; - x = i+t; +int main() { + int i = 2, t = 4, x = 0; + x = i + t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test012.c b/hpvm/projects/llvm-cbe/test/cfiles/test012.c index 403c635686a51eb493c0ca224c043b6aa6c2fce6..60689156c5bcd5c835aebb0a9c5e0e8d7612d164 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test012.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test012.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,14 @@ // //===----------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle subtraction between two variables. +// This code tests to see that the CBE can handle subtraction between two +// variables. 
// *TW //===----------------------------------------------------------------------------===// -int main() -{ - int i = 8, t = 2, x = 0; - x = i-t; +int main() { + int i = 8, t = 2, x = 0; + x = i - t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test013.c b/hpvm/projects/llvm-cbe/test/cfiles/test013.c index 444d4676b78a2f5324cb9bbccfac67bfcf9330aa..9bb5dc492bc251f11c152eb2ea7b506c3354430c 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test013.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test013.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,13 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle multiplication between two variables. -// *TW +// This code tests to see that the CBE can handle multiplication between two +// variables. 
*TW //===------------------------------------------------------------------------------===// -int main() -{ - int i = 3, t = 2, x = 0; - x = i*t; +int main() { + int i = 3, t = 2, x = 0; + x = i * t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test014.c b/hpvm/projects/llvm-cbe/test/cfiles/test014.c index e1dc6931f9e989ed867f8abf93dfa5f57042de5c..cbc0ad52d407bfc768a56b7105d4b93a7d2bdaf7 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test014.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test014.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,13 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle division between two variables. -// *TW +// This code tests to see that the CBE can handle division between two +// variables. 
*TW //===------------------------------------------------------------------------------===// -int main() -{ - int i = 30, t = 5, x = 0; - x = i/t; +int main() { + int i = 30, t = 5, x = 0; + x = i / t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test015.c b/hpvm/projects/llvm-cbe/test/cfiles/test015.c index e4c2a5c03b28ca481dd3709e18567237cc12a660..81c2f22808e4f4efcb7a4d031faf6a7e2e197f37 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test015.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test015.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,10 +12,9 @@ // *TW //===------------------------------------------------------------------------------===// -int main() -{ - int i = 26, t = 20, x = 0; - x = i%t; +int main() { + int i = 26, t = 20, x = 0; + x = i % t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test016.c b/hpvm/projects/llvm-cbe/test/cfiles/test016.c index 0841840ebc31ba622a4538f328b658b0bf52e08c..bb5bc64fff2b798375e2c2470e6538d5009c7719 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test016.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test016.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +13,12 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - char ch; + char ch; - if(sizeof(+ch) == 4) { - return 6; - } - return 1; + if (sizeof(+ch) == 4) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test017.c b/hpvm/projects/llvm-cbe/test/cfiles/test017.c index 
0535862b3057ea1cf7ac7ba2801a563a85d75dbe..a87abcd1e8f3311be495deb7bcf369f01ceeaa7f 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test017.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test017.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -13,11 +14,11 @@ int main() { - signed int a = 10; - signed int b = -a; + signed int a = 10; + signed int b = -a; - if(b == -10) { - return 6; - } - return 1; + if (b == -10) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test018.c b/hpvm/projects/llvm-cbe/test/cfiles/test018.c index c02efa9d0e914b96a6d491769e1a22e2e2747047..ea38b291393f20192f1885bdd702ef321b6929f0 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test018.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test018.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,15 +8,15 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle the incremental (++a) operator. -// *TW +// This code tests to see that the CBE can handle the incremental (++a) +// operator. 
*TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 5; + int x = 5; - ++x; + ++x; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test019.c b/hpvm/projects/llvm-cbe/test/cfiles/test019.c index 1975bb9c5b3e0eaae7a1417310da435f3df8a0d6..484fe0481656cba546bee1565e110f1a0dc90327 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test019.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test019.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,15 +8,15 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle the decremental (--a) operator. -// *TW +// This code tests to see that the CBE can handle the decremental (--a) +// operator. *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 7; - - --x; + int x = 7; - return x; + --x; + + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test020.c b/hpvm/projects/llvm-cbe/test/cfiles/test020.c index a68801708d9b628dae8dc3b5dba130f86436bdb6..98ed7f1701cdfdf442e6253706f2bc2f1f30227f 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test020.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test020.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 3; + int x = 6; + int y = 3; - if(x > y){ - return x; - } - return 1; + if (x > y) { + return x; + } + return 1; } - diff 
--git a/hpvm/projects/llvm-cbe/test/cfiles/test021.c b/hpvm/projects/llvm-cbe/test/cfiles/test021.c index 93eed31d9bbfc11ad0559dcaf649df9e1f9206c1..0c5e63a462482f6b2f5cc392c2508a3076c3c3e6 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test021.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test021.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; + int x = 6; + int y = 6; - if(x >= y){ - return x; - } - return 1; + if (x >= y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test022.c b/hpvm/projects/llvm-cbe/test/cfiles/test022.c index 895069a83bc7b1c1df9d6f27784187940b493b35..1578e158914dd68f5b99c4c69b36e19f23217939 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test022.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test022.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 12; + int x = 6; + int y = 12; - - if(x < y){ - return x; - } - return 1; + if (x < y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test023.c b/hpvm/projects/llvm-cbe/test/cfiles/test023.c index 52348d3e1690624aa712ec6735e811f4ab958055..bc309ddb015a9af75cfcd3f09a7c5f5e093ba981 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test023.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test023.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C 
----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; + int x = 6; + int y = 6; - if(x <= y){ - return x; - } - return 1; + if (x <= y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test024.c b/hpvm/projects/llvm-cbe/test/cfiles/test024.c index 2c90879b87e3646db441c47e114b04f05de134a8..782d41a47880e3078af3d3775923870efba0a915 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test024.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test024.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; + int x = 6; + int y = 6; - if(x == y){ - return x; - } - return 1; + if (x == y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test025.c b/hpvm/projects/llvm-cbe/test/cfiles/test025.c index 153cb4013477a27ad95248d38c62aa45bb2d5206..26bedf78ca25c8da2a1ba9c12694ddbdba087033 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test025.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test025.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 2; + int x = 6; + int y = 2; - if(x != y){ - return x; - } - return 1; 
+ if (x != y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test026.c b/hpvm/projects/llvm-cbe/test/cfiles/test026.c index 874c06957d200d2402974e3928aae339f5c2d16c..cf0b3e6ae94f24c8392a7b6f91a7dade1c1a6613 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test026.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test026.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { -int x = 6; -int y = 6; -int z = 6; + int x = 6; + int y = 6; + int z = 6; - if(x == y && x == z){ - return 6; - } - return 1; + if (x == y && x == z) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test027.c b/hpvm/projects/llvm-cbe/test/cfiles/test027.c index d1322597c34d5d3d284ae8d4de203b4bc769f998..f1e0adb31dc38ed4d35a25590281e3a9ac505474 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test027.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test027.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; - int z = 6; + int x = 6; + int y = 6; + int z = 6; - if(x == y || x != z){ - return 6; - } - return 1; + if (x == y || x != z) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test028.c b/hpvm/projects/llvm-cbe/test/cfiles/test028.c index ce77d792f3b2e75d3795784a3a932c37c120c764..7e2ecdcf3f66c4637a15b2d2d19ddd3b5e740469 100644 --- 
a/hpvm/projects/llvm-cbe/test/cfiles/test028.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test028.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = -7; - unsigned int b = 0; + unsigned int a = -7; + unsigned int b = 0; - b = ~a; - if( b == 6){ - return 6; - } - return 1; + b = ~a; + if (b == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test029.c b/hpvm/projects/llvm-cbe/test/cfiles/test029.c index b7ac93ecf5f275ba5129a9bed6988b694cc0ca39..34d1ff5c8be474ccc629e9d10c51ca30b5cb8c10 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test029.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test029.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +13,15 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 6; //0110 - unsigned int b = 15; //1111 - unsigned int c = 0; + unsigned int a = 6; // 0110 + unsigned int b = 15; // 1111 + unsigned int c = 0; - c = a&b; - if(c == 6){ - return 6; - } - return 1; + c = a & b; + if (c == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test030.c b/hpvm/projects/llvm-cbe/test/cfiles/test030.c index 333ce5aa01915623200c600249ebb2377782a139..a88c910f8f25d85785ed5c5f03578157fc13b47d 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test030.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test030.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C 
----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +13,15 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { -unsigned int a = 2; -unsigned int b = 4; -unsigned int c = 0; + unsigned int a = 2; + unsigned int b = 4; + unsigned int c = 0; - c = a|b; - if(c == 6){ - return 6; - } - return 1; + c = a | b; + if (c == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test031.c b/hpvm/projects/llvm-cbe/test/cfiles/test031.c index 69d0dab0e1ff78ff13fce90e3ae68de355b7cf19..6e13a9f03fae2a6b47e6fa6000e7fcea7a74b8d1 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test031.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test031.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +13,15 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 9; //1001 - unsigned int b = 15; //1111 - unsigned int c = 0; + unsigned int a = 9; // 1001 + unsigned int b = 15; // 1111 + unsigned int c = 0; - - c = a^b; - if(c == 6){ - return 6; - } - return 1; + c = a ^ b; + if (c == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test032.c b/hpvm/projects/llvm-cbe/test/cfiles/test032.c index ae63e2c4d26864d0bec5dc89dc6a80174e89c985..a98ab650e98bd0e147825bca50dca1bfcaea5809 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test032.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test032.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler 
Infrastructure // @@ -12,16 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 3; //0011 - unsigned int b = 0; + unsigned int a = 3; // 0011 + unsigned int b = 0; - - b = a << 1; //0110 - if(b == 6){ - return 6; - } - return 1; + b = a << 1; // 0110 + if (b == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test033.c b/hpvm/projects/llvm-cbe/test/cfiles/test033.c index 1bb96d21bdef67392305cf8631aa7243ab77cb98..81b4177184b79eb5e282cafea5b00c60ce48a5a4 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test033.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test033.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 13; //1100 - unsigned int b = 0; + unsigned int a = 13; // 1100 + unsigned int b = 0; - b = a >> 1; //0110 - if(b == 6){ - return 6; - } - return 1; + b = a >> 1; // 0110 + if (b == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test034.c b/hpvm/projects/llvm-cbe/test/cfiles/test034.c index dd9106b0be38c88c2a9a10a5c937c2c373ed5eed..977bf40358d94bb0bb10a66938b4f21b500aab57 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test034.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test034.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 3; - int b = 3; - - 
a+=b; - if(a == 6){ - return 6; - } - return 1; + int a = 3; + int b = 3; + + a += b; + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test035.c b/hpvm/projects/llvm-cbe/test/cfiles/test035.c index d1c0ae391f15a0a8b3a118b2435667329ca86a85..8a7f23e17b1bc6f7569cc3352d6898ea033e40a5 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test035.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test035.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 9; - int b = 3; + int a = 9; + int b = 3; - a-=b; - if(a == 6){ - return 6; - } - return 1; + a -= b; + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test036.c b/hpvm/projects/llvm-cbe/test/cfiles/test036.c index d8d5a1957c84d7afa5cc9d96a6afd64297172f21..019722660fb4641ddfbe13d3d5fcec7aa06102c6 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test036.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test036.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -11,15 +12,14 @@ // Compound Multiplication Assignment(a*=b) operator. 
// *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 2; - int b = 3; + int a = 2; + int b = 3; - a*=b; - if(a == 6){ - return 6; - } - return 1; + a *= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test037.c b/hpvm/projects/llvm-cbe/test/cfiles/test037.c index 5bf5ee705a44ec929a70d0176690733bcc1fbcb8..2363c91ce91768e7dee329eec5db68beedf8076f 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test037.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test037.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 30; - int b = 5; + int a = 30; + int b = 5; - a/=b; - if(a == 6){ - return 6; - } - return 1; + a /= b; + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test038.c b/hpvm/projects/llvm-cbe/test/cfiles/test038.c index efbe23460710f9f7108b18ba5e60ace5168338a6..1d6aa395aac2994f6bca35de131941154650038b 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test038.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test038.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 20; - int b = 14; + int a = 20; + int b = 14; - a%=b; - if(a == 6){ - return 6; - } - return 1; + a %= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test039.c 
b/hpvm/projects/llvm-cbe/test/cfiles/test039.c index 112d7f69700f6d7bfad21bc58d91506f4d95b68e..53d4fcb9133bdf57967034a7ad02b231088c59ad 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test039.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test039.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 6; //0110 - unsigned int b = 15; //1111 + unsigned int a = 6; // 0110 + unsigned int b = 15; // 1111 - a&=b; - if(a == 6){ - return 6; - } - return 1; + a &= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test040.c b/hpvm/projects/llvm-cbe/test/cfiles/test040.c index 5285fb73ecf5d3572c9cbf279be94debc6043e85..d174e7e88041eaca97dc2acb13d218c7ea8baba9 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test040.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test040.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 2; - unsigned int b = 4; + unsigned int a = 2; + unsigned int b = 4; - a|=b; - if(a == 6){ - return 6; - } - return 1; + a |= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test041.c b/hpvm/projects/llvm-cbe/test/cfiles/test041.c index f04e682356dff0739053b0e91857805b952c8aec..45f64966d499f07f09a6b3904a8c2cd9fc9c71a0 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test041.c +++ 
b/hpvm/projects/llvm-cbe/test/cfiles/test041.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 9; //1001 - unsigned int b = 15; //1111 + unsigned int a = 9; // 1001 + unsigned int b = 15; // 1111 - a^=b; - if(a == 6){ - return 6; - } - return 1; + a ^= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test042.c b/hpvm/projects/llvm-cbe/test/cfiles/test042.c index 5b4f12d80882f347efbe1ae59103f7eaf672c464..ec2547370b90902fffb0e61564bb02c782e20fd9 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test042.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test042.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 3; //0011 + unsigned int a = 3; // 0011 - a <<= 1; //0110 - if( a == 6){ - return 6; - } - return 1; + a <<= 1; // 0110 + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test043.c b/hpvm/projects/llvm-cbe/test/cfiles/test043.c index 3b42179304a1741c268d8adf90192bafc5a2ba98..6aeb7bd17c9f40b6f86cdf1fd8ea2dbe520ce554 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test043.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test043.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The 
LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 13; //1100 + unsigned int a = 13; // 1100 - a >>= 1; //0110 - if(a == 6){ - return 6; - } - return 1; + a >>= 1; // 0110 + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test044.c b/hpvm/projects/llvm-cbe/test/cfiles/test044.c index dbb9d31ad940421d1d33d21af916eccd672ac2a8..f9b7c2d4632326b81ca526a82765152039269fba 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test044.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test044.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,17 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a char. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// char. 
*TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - char a = 'A' ; //65 - int ia = 0; + char a = 'A'; // 65 + int ia = 0; - ia = a; - ia-=59; + ia = a; + ia -= 59; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test045.c b/hpvm/projects/llvm-cbe/test/cfiles/test045.c index 50aaa8effcd3994d1dd47213d25748b1293f49f0..c8b57993a7edcbc691c641b1405f6f3ae137b65e 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test045.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test045.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - signed char a = 'A'; - int ia = 0; + signed char a = 'A'; + int ia = 0; - ia = a; - ia-=59; + ia = a; + ia -= 59; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test046.c b/hpvm/projects/llvm-cbe/test/cfiles/test046.c index ea57085caf034bdaf554169c439d21cabdfc1606..edbfe837fe615cea8ce58d5e7732da49632cf66c 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test046.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test046.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned char a = 'A'; - int ia = 0; + unsigned char a = 'A'; + int ia = 0; - ia = a; - ia-=59; + ia = a; + ia -= 59; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test047.c b/hpvm/projects/llvm-cbe/test/cfiles/test047.c index 
2b90d14c7f9b195cdb94611925644cd4debb99ea..476cea234f53c18c684c9069c0675fc2effe48d3 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test047.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test047.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,13 +8,12 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning an int. -// *TW +// This code tests to see that the CBE can handle declaring and returning an +// int. *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 6; - return a; + int a = 6; + return a; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test048.c b/hpvm/projects/llvm-cbe/test/cfiles/test048.c index c30694ff502502de722f99e2f8a21cfe79ddf17c..ee3966afccc9d7e3f0aaff62aec16142e28a601e 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test048.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test048.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,11 +13,11 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - short int a = 6; - int ia = 0; - ia = (int)a; + short int a = 6; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test049.c b/hpvm/projects/llvm-cbe/test/cfiles/test049.c index bb4a0801981e734a36518cf406fc8edd2213d0cd..5f29feffc05704adf39078d49177ab5edb5cffcf 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test049.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test049.c @@ -1,4 +1,5 @@ -//===-- 
CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,11 +13,11 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - long int a = 6; - int ia = 0; - ia = (int)a; + long int a = 6; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test050.c b/hpvm/projects/llvm-cbe/test/cfiles/test050.c index f69c7cee23cbc47535ce653f28a26305195541e4..aa49757a320855970290cdef405a495052405ffe 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test050.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test050.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,11 +13,11 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - signed int a = 6; - int ia = 0; - ia = (int)a; + signed int a = 6; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test051.c b/hpvm/projects/llvm-cbe/test/cfiles/test051.c index 61f1e03d57d03c6a298ed50880d074e4f58e9e9a..0334eafdf30b2be0c6cde7dfeadaaf074d943608 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test051.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test051.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,12 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 6; + unsigned int a = 6; - int ia = 0; - ia = (int)a; 
+ int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test052.c b/hpvm/projects/llvm-cbe/test/cfiles/test052.c index 48e1ce67f8edf77d03d1a9c75d72352b26f06511..3230b192b7b70080bb12318c983b86dcc4b3159b 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test052.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test052.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a float. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// float. *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - float a = 6.0; + float a = 6.0; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test053.c b/hpvm/projects/llvm-cbe/test/cfiles/test053.c index 86dd5691a77f96fcd6e8568d22e93ef3a160872b..4ea19186428065a3813addb8537d5331c2709015 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test053.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test053.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a double. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// double. 
*TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - double a = 6.0; + double a = 6.0; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test054.c b/hpvm/projects/llvm-cbe/test/cfiles/test054.c index 4c86601412f5db9c6ec818f6029374dacaeebb60..caa7d00080554531f087c950da49975854b2d4aa 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test054.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test054.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long double. -// *TW +// This code tests to see that the CBE can handle declaring and returning a long +// double. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long double a = 6.0; + long double a = 6.0; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test055.c b/hpvm/projects/llvm-cbe/test/cfiles/test055.c index cd7891acfe29906d1c0f6b6e9462ba2a95d8e747..4b85082d3353dea43862a1a0db736f1c43bf91fd 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test055.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test055.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a short. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// short. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - short a = 6; + short a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test056.c b/hpvm/projects/llvm-cbe/test/cfiles/test056.c index b12df1df990921aa6586cb6c4328733098c89386..305f044be1a342cd921f5bdc9ab22d938e172103 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test056.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test056.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed short. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed short. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed short a = 6; + signed short a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test057.c b/hpvm/projects/llvm-cbe/test/cfiles/test057.c index 50678081ec9ba22cc6274de0c6be137f913704bb..280ec876bda3a5ffb004d0eb14afa47250253c7d 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test057.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test057.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning an unsigned short. -// *TW +// This code tests to see that the CBE can handle declaring and returning an +// unsigned short. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned short a = 6; + unsigned short a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test058.c b/hpvm/projects/llvm-cbe/test/cfiles/test058.c index cdbfac068fe5aa9639691dcf869d016213e7dff7..f5404bd8336012b80e9f3d02074b4bd390924a84 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test058.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test058.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed short int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed short int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed short int a = 6; + signed short int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test059.c b/hpvm/projects/llvm-cbe/test/cfiles/test059.c index 4de964a13ec97e47ff09e618ac6e9c232d9acf35..13b3ac08797e64625e89a9404cd35a0c27d21203 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test059.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test059.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a unsigned short int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// unsigned short int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned short int a = 6; + unsigned short int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test060.c b/hpvm/projects/llvm-cbe/test/cfiles/test060.c index a0a6e16949f5730787137cbdf0bf5284ae6d292a..ecb393f2f368e5137d164a2996837db204c2f9f4 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test060.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test060.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long a = 6; + long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test061.c b/hpvm/projects/llvm-cbe/test/cfiles/test061.c index d1bf812aa0c3312a6b0dfafd2e59866ec5bdc236..ac7cadd45fe6e5148c41f38dee679ee8bddad2e3 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test061.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test061.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long a = 6; + signed long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test062.c b/hpvm/projects/llvm-cbe/test/cfiles/test062.c index 077ace8b321d7c9bd6a865bf0b2adb9bf892a3be..eaaf59853f711197b7049a993c48b939cbfab608 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test062.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test062.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a unsigned long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// unsigned long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long a = 6; + unsigned long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test063.c b/hpvm/projects/llvm-cbe/test/cfiles/test063.c index 78fbe390f5e05fbbf35f149bf6dc3f56ecd69549..fa6cd18e88bef646c55391a68564355302e55775 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test063.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test063.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed long int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed long int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long int a = 6; + signed long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test064.c b/hpvm/projects/llvm-cbe/test/cfiles/test064.c index c26a3da001557d18686ca20ec4de52bdd8e5e765..05a72b4b9a937ed87262c6ad40a9a24238b201dd 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test064.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test064.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +13,12 @@ // *TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long int a = 6; + unsigned long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test065.c b/hpvm/projects/llvm-cbe/test/cfiles/test065.c index d9b299752c54e9238c6e2171342bb6f1c470163b..76958db4c2fe457f52cccac60f8dd3f49f8a868d 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test065.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test065.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a long +// long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long long a = 6; + long long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test066.c b/hpvm/projects/llvm-cbe/test/cfiles/test066.c index b4adc62240751fac572c9c1cede33279f51c7c90..10ec61f56ec72432bc43d8ae8af85226cd3f08e8 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test066.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test066.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long long int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a long +// long int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long long int a = 6; + long long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test067.c b/hpvm/projects/llvm-cbe/test/cfiles/test067.c index 9d786b521063454dc42ba609fc742091ba3df1bb..e90cc8caea23b2baec248752eb84fe3c9afd3479 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test067.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test067.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed long long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed long long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long long a = 6; + signed long long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test068.c b/hpvm/projects/llvm-cbe/test/cfiles/test068.c index 1f72ecd1b7fa39c845c159ad3b9d86ce44d72547..5c0daa8a157d2ada06f4a1a4f2c66e1f3ac35354 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test068.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test068.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a unsigned long long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// unsigned long long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long long a = 6; + unsigned long long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test069.c b/hpvm/projects/llvm-cbe/test/cfiles/test069.c index bc611f13c1552f42c38cbf054134fe8fc6f37e24..6cae210ec65e8c1fe9c3827e49ed1147cfe42d22 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test069.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test069.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +13,12 @@ // *TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long long int a = 6; + signed long long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test070.c b/hpvm/projects/llvm-cbe/test/cfiles/test070.c index 94c42bd8b5b4afee99d0cf18bec1806ceead963e..e9b55e232f54c9ac5ba6eea1e10a3babb88fb791 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test070.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test070.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning an unsigned long long int. -// *TW +// This code tests to see that the CBE can handle declaring and returning an +// unsigned long long int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long long int a = 6; + unsigned long long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test071.c b/hpvm/projects/llvm-cbe/test/cfiles/test071.c index 3e090147c7e09ed0ce208e305659083a17a31f81..357bc1e53330345808b5cf966bd9bbd4827f89af 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test071.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test071.c @@ -8,13 +8,13 @@ //===----------------------------------------------------------------------===// // // This code tests to see that the CBE will execute an if statement correctly. -// *TW +// *TW // //===----------------------------------------------------------------------===// int main() { - int x = 6; - if (x == 6) - return x; - return 0; + int x = 6; + if (x == 6) + return x; + return 0; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test072.c b/hpvm/projects/llvm-cbe/test/cfiles/test072.c index 7c7cbcb391bb51a53e20bfae8aabb23cf0bc2ef6..87cbd91bb0591e452c9ada49c94968ab63c84c64 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test072.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test072.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,18 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute an else-if statement correctly. -// *TW +// This code tests to see that the CBE will execute an else-if statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x = 6; - if (x == 4) { - return 2; - } else if (x == 6){ - return 6; - } else { - return 8; - } + int x = 6; + if (x == 4) { + return 2; + } else if (x == 6) { + return 6; + } else { + return 8; + } } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test073.c b/hpvm/projects/llvm-cbe/test/cfiles/test073.c index 006a7348e87c6259a41227f731542cdfe1f931d2..2e664c4c73bfe827019ff0cc3aae9e9f4037155d 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test073.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test073.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a do-while statement correctly. -// *TW +// This code tests to see that the CBE will execute a do-while statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x = 0; - do { - x++; - } while (x < 6); + int x = 0; + do { + x++; + } while (x < 6); - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test074.c b/hpvm/projects/llvm-cbe/test/cfiles/test074.c index bb3ff37858bdc25554481a48810c597f7b2f176e..903af81861c08822d9b9c9078bd0cd14d977f612 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test074.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test074.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,18 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a break/continue statement correctly. -// *TW +// This code tests to see that the CBE will execute a break/continue statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x; - for (x=0; x<=25; x++) { - if (x == 6) - break; - if (x < 15) - continue; - } - return x; + int x; + for (x = 0; x <= 25; x++) { + if (x == 6) + break; + if (x < 15) + continue; + } + return x; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test075.c b/hpvm/projects/llvm-cbe/test/cfiles/test075.c index a0601622c2f897e513615bcbd9b0a91176a26a5b..55562b99efb1414e10e139188919ee1c03e9133e 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test075.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test075.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,22 +8,21 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a Goto-Label statement correctly. -// *TW +// This code tests to see that the CBE will execute a Goto-Label statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x = 0; - goto label; - - for(;;) { - x = 10; - return x; - } + int x = 0; + goto label; - label: - x = 6; - return x; + for (;;) { + x = 10; + return x; + } +label: + x = 6; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test076.c b/hpvm/projects/llvm-cbe/test/cfiles/test076.c index d5f149eb3b51471ce23d8b9baa3186d58e509b44..faf56a3e37e14594fbfbfc3894e504989fcfdea1 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test076.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test076.c @@ -15,8 +15,8 @@ int main() { - int x = 6, y = 0, *ip = 0; - ip = &x; - y = *ip; - return y; + int x = 6, y = 0, *ip = 0; + ip = &x; + y = *ip; + return y; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test077.c b/hpvm/projects/llvm-cbe/test/cfiles/test077.c index a6e1fc7985b1bb5c9d1ac6943533d5c523a7b18d..771463d5afe5dafd1b3975697d807ab3767b3915 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test077.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test077.c @@ -14,11 +14,11 @@ //===----------------------------------------------------------------------===// int main() { - char x = 'a', y = 'b', *cp; - cp = &x; - y = *cp; - if (y == 'a'){ - return 6; - } - return 1; + char x = 'a', y = 'b', *cp; + cp = &x; + y = *cp; + if (y == 'a') { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test078.c b/hpvm/projects/llvm-cbe/test/cfiles/test078.c index cc60c18e34b3ecf4a18401fe604d1b45d1f8d1b1..f511a93fd34e3f62ea2d9a515cc6dd70449dc0b3 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test078.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test078.c @@ -15,9 +15,9 @@ #include <stddef.h> int main() { - int *ptr = NULL; - if (ptr == 0){ - return 6; - } - return 1; + int *ptr = NULL; + if (ptr == 0) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test079.c b/hpvm/projects/llvm-cbe/test/cfiles/test079.c index 
fd1ea110398c435cb9276fd064f6190aba0b5470..12b3477e32ae06f3be29a89b0fc18bac338a0b57 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test079.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test079.c @@ -14,12 +14,11 @@ //===----------------------------------------------------------------------===// int main() { - double x = 6, y = 0, *dp; - dp = &x; - y = *dp; - if (y == 6){ - return 6; - } - return 1; + double x = 6, y = 0, *dp; + dp = &x; + y = *dp; + if (y == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test080.c b/hpvm/projects/llvm-cbe/test/cfiles/test080.c index b7fb855bf45dc87a03bd9ca24a785516250ead4b..9b42fab5d93a392064b8a22bf97d325cd0ae24cc 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test080.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test080.c @@ -14,11 +14,11 @@ //===----------------------------------------------------------------------===// int main() { - float x = 6, y = 0, *fp; - fp = &x; - y = *fp; - if (y == 6){ - return 6; - } - return 1; + float x = 6, y = 0, *fp; + fp = &x; + y = *fp; + if (y == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test081.c b/hpvm/projects/llvm-cbe/test/cfiles/test081.c index 6efcad46eadbb23dcc85b7f5aefa43855383bbe9..e032f57f519165cb7a35578a68babe127edea66f 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test081.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test081.c @@ -7,17 +7,16 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly use the address-of value (&) -// variable and and return the value-at address (*) variable from integer 'num'. -// *TW +// This code tests to see that the CBE will properly use the address-of value +// (&) variable and and return the value-at address (*) variable from integer +// 'num'. 
*TW // //===----------------------------------------------------------------------===// -int main(){ - int *ptr; - int num = 6; - ptr = # - int deref = *ptr; - return deref; - +int main() { + int *ptr; + int num = 6; + ptr = # + int deref = *ptr; + return deref; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test082.c b/hpvm/projects/llvm-cbe/test/cfiles/test082.c index e30bb7a2f192adc03fd646bc625340c847cfe92c..7a7bf109eb1dc2b29ab1e66c4b1b804df6e586e3 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test082.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test082.c @@ -13,13 +13,13 @@ // //===----------------------------------------------------------------------===// -struct Number{ - int price; +struct Number { + int price; }; -int main(){ - struct Number a; - struct Number* ptr = &a; - ptr->price = 6; - return ptr->price; +int main() { + struct Number a; + struct Number *ptr = &a; + ptr->price = 6; + return ptr->price; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test083.c b/hpvm/projects/llvm-cbe/test/cfiles/test083.c index 5dc920edf485b69c20b78886a4eb2229af9151ad..58eb4c14a3dcd37623a1bd972705ac0e4cd46703 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test083.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test083.c @@ -13,12 +13,12 @@ // //===----------------------------------------------------------------------===// -int main(){ - int *ip; - int a[2]; - a[0] = 1; - a[1] = 6; - ip = &a[1]; +int main() { + int *ip; + int a[2]; + a[0] = 1; + a[1] = 6; + ip = &a[1]; - return *ip; + return *ip; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test084.c b/hpvm/projects/llvm-cbe/test/cfiles/test084.c index 6f5b3ad6d9cc526c609719308fa1da9a8ab6ab47..3a67fc1ef9cef5b1340e68eb41d93500000c5a26 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test084.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test084.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C 
+//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly increment a pointer via int. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly increment a pointer via +// int. This example works by subtracting two mem. addresses and adding 2 to +// return 6. *TW // //===---------------------------------------------------------------------------===// @@ -20,9 +21,9 @@ int main() { intptr_t inc0 = 0, inc1 = 0, diff = 0, a = 100; intptr_t *p = &a; inc0 = (intptr_t)p; - ++(*p++); //++(*p++); + ++(*p++); //++(*p++); inc1 = (intptr_t)p; - diff = inc1-inc0; + diff = inc1 - inc0; diff += 2; return diff; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test085.c b/hpvm/projects/llvm-cbe/test/cfiles/test085.c index 01e8d65e6cbb83bc21b46ff9c493284f8e41d2cd..04c47b83d6bce8e9cb8dba4deff3171ece95b46c 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test085.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test085.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly decrement a pointer via int. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly decrement a pointer via +// int. This example works by subtracting two mem. addresses and adding 2 to +// return 6. 
*TW // //===---------------------------------------------------------------------------===// @@ -22,8 +23,7 @@ int main() { inc0 = (intptr_t)p; --(*p--); //--(*p--); inc1 = (intptr_t)p; - diff = inc0-inc1; + diff = inc0 - inc1; diff += 2; return diff; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test086.c b/hpvm/projects/llvm-cbe/test/cfiles/test086.c index 72e7f03901df7570e5e134c4707c20e8fada74a5..32e33e992378733e721082dc94c339b64bc1cd81 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test086.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test086.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly increment a pointer via char. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly increment a pointer via +// char. This example works by subtracting two mem. addresses and adding 2 to +// return 6. 
*TW // //===---------------------------------------------------------------------------===// @@ -24,5 +25,5 @@ int main() { // diff = inc1-inc0; // diff += 2; // return diff; - return 6; //TODO + return 6; // TODO } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test087.c b/hpvm/projects/llvm-cbe/test/cfiles/test087.c index 29291167906a5cb9fd3aedaa0d3523eaa54d5bbd..6c983a65d62b9a71c9c2be11a8107e734628f999 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test087.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test087.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly decrement a pointer via char. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly decrement a pointer via +// char. This example works by subtracting two mem. addresses and adding 2 to +// return 6. 
*TW //===---------------------------------------------------------------------------===// int main() { @@ -23,5 +24,5 @@ int main() { // diff = inc0-inc1; // diff += 2; // return diff; - return 6; //TODO + return 6; // TODO } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test088.c b/hpvm/projects/llvm-cbe/test/cfiles/test088.c index 938237bea9774b7c9e52b36ae20d814bb563507c..7cefca1537290d57c2adce83321b848ca82fcbe3 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test088.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test088.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===---------------------------------------------------------------------------===// -int main(){ - int a[2][2]; - int *ip; - a[0][0] = 0; - a[0][1] = 1; - a[1][0] = 3; - a[1][1] = 6; - ip = &a[1][1]; +int main() { + int a[2][2]; + int *ip; + a[0][0] = 0; + a[0][1] = 1; + a[1][0] = 3; + a[1][1] = 6; + ip = &a[1][1]; - return *ip; + return *ip; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test089.c b/hpvm/projects/llvm-cbe/test/cfiles/test089.c index 925c3bb56ba77bb395641197dd4b5cef231d369e..59b20d5b45ba6c4d4d0cd70e04d0fd99d0253964 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test089.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test089.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,20 +8,20 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute data-packing in a structure correctly. -// *TW +// This code tests to see that the CBE will execute data-packing in a structure +// correctly. 
*TW //===------------------------------------------------------------------------------===// #pragma pack(push) #pragma pack(1) -struct DataSize{ - char Data2; - char Data3; - int Data1; +struct DataSize { + char Data2; + char Data3; + int Data1; }; -int main(){ - struct DataSize example; - return sizeof(example); +int main() { + struct DataSize example; + return sizeof(example); } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test090.c b/hpvm/projects/llvm-cbe/test/cfiles/test090.c index 021a05e8a002bcf2320df59c7e39c2963e52c756..d3e64ff5b9b21a68147c0a0aab69d74d05fc93e4 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test090.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test090.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,19 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a union and check the data size correctly. -// *TW +// This code tests to see that the CBE will execute a union and check the data +// size correctly. 
*TW //===------------------------------------------------------------------------------===// -union Data{ - int i; - float f; - char str[8]; +union Data { + int i; + float f; + char str[8]; }; -int main(){ - union Data data; - int datasize = sizeof(data) - 2; +int main() { + union Data data; + int datasize = sizeof(data) - 2; - return datasize; + return datasize; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test091.c b/hpvm/projects/llvm-cbe/test/cfiles/test091.c index dce59d85d5b788696deb7e0b4e0a97e69cdea0e8..557286e1ddd2326f912d4ae788218fce536a0a25 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test091.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test091.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,17 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will access and return union members correctly. -// *TW +// This code tests to see that the CBE will access and return union members +// correctly. 
*TW //===------------------------------------------------------------------------------===// -union Data{ - char unit1[6]; - char unit2; - char unit3; +union Data { + char unit1[6]; + char unit2; + char unit3; }; -int main(){ - union Data data; - return sizeof(data); +int main() { + union Data data; + return sizeof(data); } - - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test092.c b/hpvm/projects/llvm-cbe/test/cfiles/test092.c index 3b197f21a5f8964daf0ac427955df96faa9feec2..8018bca7eecd1b4196f822bc354108e6b5e8dc27 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test092.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test092.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,29 +8,25 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will pass a structure into a function correctly. -// *TW +// This code tests to see that the CBE will pass a structure into a function +// correctly. 
*TW //===------------------------------------------------------------------------------===// int k = 0; -struct test{ - int i; - float f; +struct test { + int i; + float f; }; -void funct(struct test example){ - k = example.i; -} +void funct(struct test example) { k = example.i; } -int main(){ - struct test example; +int main() { + struct test example; - example.i = 6; - example.f = 6.0; - funct(example); + example.i = 6; + example.f = 6.0; + funct(example); - return k; + return k; } - - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test093.c b/hpvm/projects/llvm-cbe/test/cfiles/test093.c index 3553edea3a5fdb8680feaf2297ab32938ca2c608..9a6188e7d4d13b8e5b73a2e0cf832cb3ddb0f0ba 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test093.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test093.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,19 +12,19 @@ // *TW //===------------------------------------------------------------------------------===// -struct layer1{ - int depth1; - char name1[20]; +struct layer1 { + int depth1; + char name1[20]; }; -struct layer2{ - int depth2; - char name2[20]; - struct layer1 layer_data; -}layer2_data; +struct layer2 { + int depth2; + char name2[20]; + struct layer1 layer_data; +} layer2_data; -int main(){ - struct layer2 layer2_data = {1, "test", {6, "test2"}}; +int main() { + struct layer2 layer2_data = {1, "test", {6, "test2"}}; - return layer2_data.layer_data.depth1; + return layer2_data.layer_data.depth1; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test094.c b/hpvm/projects/llvm-cbe/test/cfiles/test094.c index 2568c9c3537d9cedce0cb36e86a414c068493504..8faf3330cc9f360debb2434f2720cfead79be20e 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test094.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test094.c @@ -1,4 +1,5 @@ -//===-- 
CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,18 +13,17 @@ //===------------------------------------------------------------------------------===// typedef struct test { - int var1; - int var2; - int var3; -}testrename; + int var1; + int var2; + int var3; +} testrename; -int main(){ - testrename variable; +int main() { + testrename variable; - variable.var2 = 5; - variable.var3 = 6; - variable.var1 = 9; + variable.var2 = 5; + variable.var3 = 6; + variable.var1 = 9; - return variable.var3; + return variable.var3; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test095.c b/hpvm/projects/llvm-cbe/test/cfiles/test095.c index 21db27203416db2f9454ce203eed555299465a40..b622c4b94c071548e734f4dd4ceec7097b5b90a2 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test095.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test095.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,17 +12,16 @@ // *TW //===------------------------------------------------------------------------------===// -struct Shows - { - char show[20]; - int runlength; - int rating; +struct Shows { + char show[20]; + int runlength; + int rating; }; -int main(){ -struct Shows b1[3] = { - {"Big Bang Theory",22,6}, - {"NCIS",45,9}, - }; - return b1[0].rating; +int main() { + struct Shows b1[3] = { + {"Big Bang Theory", 22, 6}, + {"NCIS", 45, 9}, + }; + return b1[0].rating; } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test096.c b/hpvm/projects/llvm-cbe/test/cfiles/test096.c index 81661df1212b75da06f9eabcc9e64c82118172ad..35982e134131b895bcf15cbb22bda76e482d1e0b 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test096.c +++ 
b/hpvm/projects/llvm-cbe/test/cfiles/test096.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,18 +8,18 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a self referencing structure. -// *TW +// This code tests to see that the CBE will execute a self referencing +// structure. *TW //===------------------------------------------------------------------------------===// #include <stdio.h> //for NULL -struct data{ - int a; - struct data *ptr; +struct data { + int a; + struct data *ptr; }; -int main(){ - struct data p=(struct data){.a=3,.ptr=&(struct data){.a=6,.ptr=NULL}}; - return p.ptr->a; +int main() { + struct data p = + (struct data){.a = 3, .ptr = &(struct data){.a = 6, .ptr = NULL}}; + return p.ptr->a; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test097.c b/hpvm/projects/llvm-cbe/test/cfiles/test097.c index a42e36b6cb43551113d7c38984895a07a479ffc4..6e0f8145b0909b0c6c6b3f26e09633b8ccc58b12 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test097.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test097.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,17 +12,16 @@ // *TW //===------------------------------------------------------------------------------===// -int addby2 ( int x ); +int addby2(int x); -int main( ){ - int n ; - n = addby2 ( 4 ) ; - return n; +int main() { + int n; + n = addby2(4); + return n; } -int addby2(int x){ - int p ; - p = x + 2 ; - return ( p ) ; +int addby2(int x) { + int p; + p = x + 2; + return (p); } - diff --git 
a/hpvm/projects/llvm-cbe/test/cfiles/test098.c b/hpvm/projects/llvm-cbe/test/cfiles/test098.c index 70de117e51a9064e638354fe78072e93a635c904..d8594b5a7615b6be6fcc0cb7a04b9e5ff972acd3 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test098.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test098.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,18 +12,18 @@ // *TW //===------------------------------------------------------------------------------===// -int subtrby2 ( int x ); +int subtrby2(int x); static int eight = 8; static int two = 2; -int main( ){ - int n ; - n = subtrby2 ( eight ) ; - return n; +int main() { + int n; + n = subtrby2(eight); + return n; } -int subtrby2(int x){ - int p ; - p = x - two ; - return ( p ) ; +int subtrby2(int x) { + int p; + p = x - two; + return (p); } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test099.c b/hpvm/projects/llvm-cbe/test/cfiles/test099.c index 1c4713262eeaf6042f7af0ee7e6e41547f226bb2..c4ab77522b27cdf29251409e707bb6548891e9a7 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test099.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test099.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,9 +13,8 @@ //===------------------------------------------------------------------------------===// int main() { - register int counter = 0; - counter += 6; + register int counter = 0; + counter += 6; - return 6; + return 6; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test100.c b/hpvm/projects/llvm-cbe/test/cfiles/test100.c index db2cd9ea604e3a5aa1b64eaf4159ae9f1fe2700c..2b6a07912d94388827c9cde38e997ca96249b269 100644 --- 
a/hpvm/projects/llvm-cbe/test/cfiles/test100.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test100.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,20 +12,19 @@ // *TW //===------------------------------------------------------------------------------===// -int fibonaci(int i){ - if(i == 0){ - return 0; - } - if(i == 1){ - return 1; - } - return fibonaci(i-1) + fibonaci(i-2); +int fibonaci(int i) { + if (i == 0) { + return 0; + } + if (i == 1) { + return 1; + } + return fibonaci(i - 1) + fibonaci(i - 2); } -int main(){ - int returnval; - returnval = fibonaci(6) - 2; +int main() { + int returnval; + returnval = fibonaci(6) - 2; - return returnval; + return returnval; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test101.c b/hpvm/projects/llvm-cbe/test/cfiles/test101.c index 50d18d3ec33746d58a24cf342247e717d926d31a..ffffeb592072391026a4b0c3a705e8c63db235fd 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test101.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test101.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -14,24 +15,26 @@ unsigned int fastfib(unsigned int n); -int main(){ - return fastfib(6) - 2; -} +int main() { return fastfib(6) - 2; } -unsigned int fastfib(unsigned int n){ - unsigned int a[3]; - unsigned int *p=a; - unsigned int i; +unsigned int fastfib(unsigned int n) { + unsigned int a[3]; + unsigned int *p = a; + unsigned int i; - for(i=0; i<=n; ++i) { - if(i<2) *p=i; - else{ - if(p==a) *p=*(a+1)+*(a+2); - else if(p==a+1) *p=*a+*(a+2); - else *p=*a+*(a+1); - } - if(++p>a+2) p=a; + for (i = 0; i <= n; ++i) { + if (i < 2) + *p = i; + else { + if (p == a) + *p = *(a 
+ 1) + *(a + 2); + else if (p == a + 1) + *p = *a + *(a + 2); + else + *p = *a + *(a + 1); } - return p==a?*(p+2):*(p-1); + if (++p > a + 2) + p = a; + } + return p == a ? *(p + 2) : *(p - 1); } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test102.c b/hpvm/projects/llvm-cbe/test/cfiles/test102.c index 572ea0310334592c668e6266da7c364d39a80ebb..44247c6231a26acfca041e6896bcbb300d2bc6f5 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test102.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test102.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test103.c b/hpvm/projects/llvm-cbe/test/cfiles/test103.c index 6e2329021d257f46fb2b818e68f932e90899b8d8..e751c2d8a4e3c2249921b15c833ae0e99a47d10a 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test103.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test103.c @@ -15,9 +15,8 @@ #define B 3 #define C A + B -int main(){ +int main() { - int x = C; - return x; + int x = C; + return x; } - diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test104.c b/hpvm/projects/llvm-cbe/test/cfiles/test104.c index 88884d68575f413784f039a1685430c8e1dce56e..43c29dedb685484fd779d0565eaf3d30f97c160a 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/test104.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test104.c @@ -12,13 +12,11 @@ // //===----------------------------------------------------------------------===// -int tail (int n) { +int tail(int n) { if (n == 6) return n; else - return tail(n+1); + return tail(n + 1); } -int main(){ - return tail(0); -} +int main() { return tail(0); } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test105.c b/hpvm/projects/llvm-cbe/test/cfiles/test105.c index 7e830d55c55182e5d995a8841c41132555c54ee4..79ab340aef5c7db27c06d076efa95bb85fb5a964 100644 --- 
a/hpvm/projects/llvm-cbe/test/cfiles/test105.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/test105.c @@ -12,13 +12,11 @@ // //===----------------------------------------------------------------------===// -int head(int n){ - if(n == 6) +int head(int n) { + if (n == 6) return n; else - return head(n+1); + return head(n + 1); } -int main(){ - return head(0); -} +int main() { return head(0); } diff --git a/hpvm/projects/llvm-cbe/test/cfiles/testbad.c b/hpvm/projects/llvm-cbe/test/cfiles/testbad.c index a7456dc2b52888358ae2e7fce0da5b0c799c9b45..a8a9bca17c49e5ebed010beeeb987ac5904b3b10 100644 --- a/hpvm/projects/llvm-cbe/test/cfiles/testbad.c +++ b/hpvm/projects/llvm-cbe/test/cfiles/testbad.c @@ -11,7 +11,4 @@ // //===----------------------------------------------------------------------===// -int main() -{ - return 25; -} +int main() { return 25; } diff --git a/hpvm/projects/llvm-cbe/test/dtypes.h b/hpvm/projects/llvm-cbe/test/dtypes.h index 3ab8d8b1c5399d17cfb052d144d8356783574027..00f1b417bfc5c2c5c4a499516346896c1ad21c75 100644 --- a/hpvm/projects/llvm-cbe/test/dtypes.h +++ b/hpvm/projects/llvm-cbe/test/dtypes.h @@ -26,22 +26,21 @@ #if !defined(_COMPILER_MINGW_) -#define strtoull _strtoui64 -#define strtoll _strtoi64 -#define strcasecmp _stricmp -#define strncasecmp _strnicmp -#define snprintf _snprintf -#define stat _stat +#define strtoull _strtoui64 +#define strtoll _strtoi64 +#define strcasecmp _stricmp +#define strncasecmp _strnicmp +#define snprintf _snprintf +#define stat _stat -#define STDIN_FILENO 0 -#define STDOUT_FILENO 1 -#define STDERR_FILENO 2 +#define STDIN_FILENO 0 +#define STDOUT_FILENO 1 +#define STDERR_FILENO 2 #endif /* !_COMPILER_MINGW_ */ #endif /* _OS_WINDOWS_ */ - /* This file defines sane integer types for our target platforms. 
This library only runs on machines with the following characteristics: @@ -56,86 +55,86 @@ #ifdef _OS_WINDOWS_ #define STDCALL __stdcall -# ifdef LIBRARY_EXPORTS -# define JL_DLLEXPORT __declspec(dllexport) -# else -# define JL_DLLEXPORT __declspec(dllimport) -# endif +#ifdef LIBRARY_EXPORTS +#define JL_DLLEXPORT __declspec(dllexport) +#else +#define JL_DLLEXPORT __declspec(dllimport) +#endif #else #define STDCALL -#define JL_DLLEXPORT __attribute__ ((visibility("default"))) +#define JL_DLLEXPORT __attribute__((visibility("default"))) #endif #ifdef _OS_LINUX_ #include <endian.h> -#define LITTLE_ENDIAN __LITTLE_ENDIAN -#define BIG_ENDIAN __BIG_ENDIAN -#define PDP_ENDIAN __PDP_ENDIAN -#define BYTE_ORDER __BYTE_ORDER +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#define PDP_ENDIAN __PDP_ENDIAN +#define BYTE_ORDER __BYTE_ORDER #endif #if defined(__APPLE__) || defined(__FreeBSD__) #include <machine/endian.h> -#define __LITTLE_ENDIAN LITTLE_ENDIAN -#define __BIG_ENDIAN BIG_ENDIAN -#define __PDP_ENDIAN PDP_ENDIAN -#define __BYTE_ORDER BYTE_ORDER +#define __LITTLE_ENDIAN LITTLE_ENDIAN +#define __BIG_ENDIAN BIG_ENDIAN +#define __PDP_ENDIAN PDP_ENDIAN +#define __BYTE_ORDER BYTE_ORDER #endif #ifdef _OS_WINDOWS_ -#define __LITTLE_ENDIAN 1234 -#define __BIG_ENDIAN 4321 -#define __PDP_ENDIAN 3412 -#define __BYTE_ORDER __LITTLE_ENDIAN +#define __LITTLE_ENDIAN 1234 +#define __BIG_ENDIAN 4321 +#define __PDP_ENDIAN 3412 +#define __BYTE_ORDER __LITTLE_ENDIAN #define __FLOAT_WORD_ORDER __LITTLE_ENDIAN -#define LITTLE_ENDIAN __LITTLE_ENDIAN -#define BIG_ENDIAN __BIG_ENDIAN -#define PDP_ENDIAN __PDP_ENDIAN -#define BYTE_ORDER __BYTE_ORDER +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#define PDP_ENDIAN __PDP_ENDIAN +#define BYTE_ORDER __BYTE_ORDER #endif #define LLT_ALLOC(n) malloc(n) -#define LLT_REALLOC(p,n) realloc((p),(n)) +#define LLT_REALLOC(p, n) realloc((p), (n)) #define LLT_FREE(x) free(x) #if defined(_OS_WINDOWS_) && 
defined(_COMPILER_INTEL_) -# define STATIC_INLINE static -# define INLINE +#define STATIC_INLINE static +#define INLINE #elif defined(_OS_WINDOWS_) && defined(_COMPILER_MICROSOFT_) -# define STATIC_INLINE static __inline -# define INLINE __inline +#define STATIC_INLINE static __inline +#define INLINE __inline #else -# define STATIC_INLINE static inline -# define INLINE inline +#define STATIC_INLINE static inline +#define INLINE inline #endif #if defined(_OS_WINDOWS_) && !defined(_COMPILER_MINGW_) -# define NOINLINE __declspec(noinline) -# define NOINLINE_DECL(f) __declspec(noinline) f +#define NOINLINE __declspec(noinline) +#define NOINLINE_DECL(f) __declspec(noinline) f #else -# define NOINLINE __attribute__((noinline)) -# define NOINLINE_DECL(f) f __attribute__((noinline)) +#define NOINLINE __attribute__((noinline)) +#define NOINLINE_DECL(f) f __attribute__((noinline)) #endif #ifdef _COMPILER_MICROSOFT_ -# ifdef _P64 -# define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(8)) x -# else -# define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(4)) x -# endif +#ifdef _P64 +#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(8)) x +#else +#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(4)) x +#endif #elif defined(__GNUC__) -# define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) x __attribute__ ((aligned (sizeof(void*)))) +#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) x __attribute__((aligned(sizeof(void *)))) #else -# define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) +#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) #endif typedef int bool_t; -typedef unsigned char byte_t; /* 1 byte */ +typedef unsigned char byte_t; /* 1 byte */ #ifdef _P64 #define TOP_BIT 0x8000000000000000 #define NBITS 64 -typedef uint64_t uint_t; // preferred int type on platform +typedef uint64_t uint_t; // preferred int type on platform typedef int64_t int_t; #else #define TOP_BIT 0x80000000 @@ -144,17 +143,16 @@ typedef uint32_t uint_t; typedef int32_t int_t; #endif -STATIC_INLINE unsigned int next_power_of_two(unsigned int val) 
-{ - /* this function taken from libuv src/unix/core.c */ - val -= 1; - val |= val >> 1; - val |= val >> 2; - val |= val >> 4; - val |= val >> 8; - val |= val >> 16; - val += 1; - return val; +STATIC_INLINE unsigned int next_power_of_two(unsigned int val) { + /* this function taken from libuv src/unix/core.c */ + val -= 1; + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + val += 1; + return val; } #define LLT_ALIGN(x, sz) (((x) + (sz)-1) & -(sz)) @@ -162,22 +160,22 @@ STATIC_INLINE unsigned int next_power_of_two(unsigned int val) // branch prediction annotations #ifdef __GNUC__ #define __unlikely(x) __builtin_expect(!!(x), 0) -#define __likely(x) __builtin_expect(!!(x), 1) +#define __likely(x) __builtin_expect(!!(x), 1) #else #define __unlikely(x) (x) -#define __likely(x) (x) +#define __likely(x) (x) #endif #define DBL_MAXINT 9007199254740992LL #define FLT_MAXINT 16777216 -#define U64_MAX 18446744073709551615ULL -#define S64_MAX 9223372036854775807LL -#define S64_MIN (-S64_MAX - 1LL) -#define BIT63 0x8000000000000000LL -#define U32_MAX 4294967295L -#define S32_MAX 2147483647L -#define S32_MIN (-S32_MAX - 1L) -#define BIT31 0x80000000 +#define U64_MAX 18446744073709551615ULL +#define S64_MAX 9223372036854775807LL +#define S64_MIN (-S64_MAX - 1LL) +#define BIT63 0x8000000000000000LL +#define U32_MAX 4294967295L +#define S32_MAX 2147483647L +#define S32_MIN (-S32_MAX - 1L) +#define BIT31 0x80000000 #define D_PNAN ((double)+NAN) #define D_NNAN ((double)-NAN) @@ -188,17 +186,27 @@ STATIC_INLINE unsigned int next_power_of_two(unsigned int val) #define F_PINF ((float)+INFINITY) #define F_NINF ((float)-INFINITY) -typedef enum { T_INT8, T_UINT8, T_INT16, T_UINT16, T_INT32, T_UINT32, - T_INT64, T_UINT64, T_FLOAT, T_DOUBLE } numerictype_t; - -#define N_NUMTYPES ((int)T_DOUBLE+1) +typedef enum { + T_INT8, + T_UINT8, + T_INT16, + T_UINT16, + T_INT32, + T_UINT32, + T_INT64, + T_UINT64, + T_FLOAT, + T_DOUBLE +} numerictype_t; + 
+#define N_NUMTYPES ((int)T_DOUBLE + 1) #ifdef _P64 -# define T_PTRDIFF T_INT64 -# define T_SIZE T_UINT64 +#define T_PTRDIFF T_INT64 +#define T_SIZE T_UINT64 #else -# define T_PTRDIFF T_INT32 -# define T_SIZE T_UINT32 +#define T_PTRDIFF T_INT32 +#define T_SIZE T_UINT32 #endif #endif /* DTYPES_H */ \ No newline at end of file diff --git a/hpvm/projects/llvm-cbe/test/platform.h b/hpvm/projects/llvm-cbe/test/platform.h index 0b7c6bcbbdd700d90adc40f495c9210241fb32be..8db68aae7bceb0506c2f8620a971a2d86fdb1695 100644 --- a/hpvm/projects/llvm-cbe/test/platform.h +++ b/hpvm/projects/llvm-cbe/test/platform.h @@ -30,8 +30,8 @@ */ /******************************************************************************* -* Compiler * -*******************************************************************************/ + * Compiler * + *******************************************************************************/ /* * Notes: @@ -60,8 +60,8 @@ #endif /******************************************************************************* -* OS * -*******************************************************************************/ + * OS * + *******************************************************************************/ #if defined(__FreeBSD__) #define _OS_FREEBSD_ @@ -74,12 +74,14 @@ #endif /******************************************************************************* -* Architecture * -*******************************************************************************/ + * Architecture * + *******************************************************************************/ -#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \ + defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) #define _CPU_X86_64_ -#elif defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(_X86_) +#elif defined(i386) || defined(__i386) || defined(__i386__) 
|| \ + defined(_M_IX86) || defined(_X86_) #define _CPU_X86_ #elif defined(__aarch64__) #define _CPU_AARCH64_ @@ -92,22 +94,22 @@ #endif #if defined(_CPU_X86_64_) -# define _P64 +#define _P64 #elif defined(_CPU_X86_) -# define _P32 +#define _P32 #elif defined(_OS_WINDOWS_) /* Not sure how to determine pointer size on Windows running ARM. */ -# if _WIN64 -# define _P64 -# else -# define _P32 -# endif +#if _WIN64 +#define _P64 +#else +#define _P32 +#endif #elif __SIZEOF_POINTER__ == 8 -# define _P64 +#define _P64 #elif __SIZEOF_POINTER__ == 4 -# define _P32 +#define _P32 #else -# error pointer size not known for your platform / compiler +#error pointer size not known for your platform / compiler #endif #endif /* !PLATFORM_H */ \ No newline at end of file diff --git a/hpvm/projects/llvm-cbe/test/selectionsort/main.c b/hpvm/projects/llvm-cbe/test/selectionsort/main.c index 47cf877d34daea4d6ea2cb45fec6053e04cfbec5..65848e6adab8d6eb1e6b75ece1b720849d510623 100644 --- a/hpvm/projects/llvm-cbe/test/selectionsort/main.c +++ b/hpvm/projects/llvm-cbe/test/selectionsort/main.c @@ -1,39 +1,34 @@ #include <stdio.h> -int main() -{ - int array[100], n, c, d, position, swap; - - printf("Enter number of elements\n"); - scanf("%d", &n); - - printf("Enter %d integers\n", n); - - for (c = 0; c < n; c++) - scanf("%d", &array[c]); - - for (c = 0; c < (n - 1); c++) - { - position = c; - - for (d = c +1; d < n; d++) - { - if (array[position] > array[d]) - position = d; - } - if (position != c) - { - swap = array[c]; - array[c] = array[position]; - array[position] = swap; - } - } - - printf("Sorted list in ascending order:\n"); - - for (c = 0; c < n; c++) - printf("%d\n", array[c]); - - return 0; +int main() { + int array[100], n, c, d, position, swap; + + printf("Enter number of elements\n"); + scanf("%d", &n); + + printf("Enter %d integers\n", n); + + for (c = 0; c < n; c++) + scanf("%d", &array[c]); + + for (c = 0; c < (n - 1); c++) { + position = c; + + for (d = c + 1; d < n; d++) { + if 
(array[position] > array[d]) + position = d; + } + if (position != c) { + swap = array[c]; + array[c] = array[position]; + array[position] = swap; + } + } + + printf("Sorted list in ascending order:\n"); + + for (c = 0; c < n; c++) + printf("%d\n", array[c]); + + return 0; } - diff --git a/hpvm/projects/llvm-cbe/test/test001.c b/hpvm/projects/llvm-cbe/test/test001.c index 817d7ca8cae09d11e57848ee7d3fdb9a7931d19a..8606d141ba73ddce2a598e85c6a787d715b1a5e2 100644 --- a/hpvm/projects/llvm-cbe/test/test001.c +++ b/hpvm/projects/llvm-cbe/test/test001.c @@ -11,7 +11,4 @@ // //===----------------------------------------------------------------------===// -int main() -{ - return 6; -} +int main() { return 6; } diff --git a/hpvm/projects/llvm-cbe/test/test002.c b/hpvm/projects/llvm-cbe/test/test002.c index 9af3c34ee82cf9517f0f4ed4015a239fdace5cfb..aeb02526f8b2bda1b0bae293d1f006c6a4622641 100644 --- a/hpvm/projects/llvm-cbe/test/test002.c +++ b/hpvm/projects/llvm-cbe/test/test002.c @@ -8,14 +8,13 @@ //===----------------------------------------------------------------------===// // // This code tests to see that the CBE will execute a for loop correctly. 
-// *TW +// *TW // //===----------------------------------------------------------------------===// -int main() -{ - int i, x = 0; - for (i = 0; i < 6; i++) - ++x; - return x; +int main() { + int i, x = 0; + for (i = 0; i < 6; i++) + ++x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test003.c b/hpvm/projects/llvm-cbe/test/test003.c index 4aa8eb6bfb6e4a4e4d67f5fd4f5847cc608d6cae..bfeaef5db7a85f23c746b90f17461fb10dfd87e8 100644 --- a/hpvm/projects/llvm-cbe/test/test003.c +++ b/hpvm/projects/llvm-cbe/test/test003.c @@ -11,13 +11,11 @@ // *TW //===----------------------------------------------------------------------===// -int main() -{ - int i = 0, x = 0; - while (i < 6) { - ++x; - ++i; - } - return x; -} - +int main() { + int i = 0, x = 0; + while (i < 6) { + ++x; + ++i; + } + return x; +} diff --git a/hpvm/projects/llvm-cbe/test/test004.c b/hpvm/projects/llvm-cbe/test/test004.c index ba619f09bbaab461723a6f85dca1dfbb28ceac41..35a5a02d83091093a1b251bbb5a7158b11d93244 100644 --- a/hpvm/projects/llvm-cbe/test/test004.c +++ b/hpvm/projects/llvm-cbe/test/test004.c @@ -7,15 +7,15 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute an if/else statement correctly. -// *TW +// This code tests to see that the CBE will execute an if/else statement +// correctly. 
*TW //===----------------------------------------------------------------------===// int main() { - int x = 3; - x += 3; - if (x == 6) - return x; - else - return 0; + int x = 3; + x += 3; + if (x == 6) + return x; + else + return 0; } diff --git a/hpvm/projects/llvm-cbe/test/test005.c b/hpvm/projects/llvm-cbe/test/test005.c index 8b9323a97e3a27cfb4cc45b17ba26b39c96a180c..a287f075cd3b152e84a0bd24ce35097c5bb231b7 100644 --- a/hpvm/projects/llvm-cbe/test/test005.c +++ b/hpvm/projects/llvm-cbe/test/test005.c @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// int main() { - int i, j, x = 0; - for (i = 0; i < 3; i++) - for (j = 0; j < 2; j++) - ++x; - - return x; + int i, j, x = 0; + for (i = 0; i < 3; i++) + for (j = 0; j < 2; j++) + ++x; + + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test006.c b/hpvm/projects/llvm-cbe/test/test006.c index b513d75d4ab163388f15f156d1387d5b71dfcdf4..fe901d6d19cd2dabd11f66623d1f1ca3d0cf55b9 100644 --- a/hpvm/projects/llvm-cbe/test/test006.c +++ b/hpvm/projects/llvm-cbe/test/test006.c @@ -7,18 +7,18 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a nested while loop correctly. -// *TW +// This code tests to see that the CBE will execute a nested while loop +// correctly. 
*TW //===----------------------------------------------------------------------===// int main() { - int i = 0, j = 0, x = 0; - while (i < 6) { - while (j < 6) { - ++x; - ++j; - } - ++i; + int i = 0, j = 0, x = 0; + while (i < 6) { + while (j < 6) { + ++x; + ++j; } - return x; + ++i; + } + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test007.c b/hpvm/projects/llvm-cbe/test/test007.c index 50c895d18192844c38c53c6e706eb2c4f163713d..b4ff4365db7ad0f48dc9fa2171a818757ba899c1 100644 --- a/hpvm/projects/llvm-cbe/test/test007.c +++ b/hpvm/projects/llvm-cbe/test/test007.c @@ -7,27 +7,27 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a switch statement correctly. -// *TW +// This code tests to see that the CBE will execute a switch statement +// correctly. *TW //===----------------------------------------------------------------------===// int main() { - char var = 'x'; - - switch (var) { - case 'z' : - return 0; - break; - case 'y' : - return 1; - break; - case 'x' : - return 6; - break; - case 'w' : - return 7; - break; - default : - return 100; - } + char var = 'x'; + + switch (var) { + case 'z': + return 0; + break; + case 'y': + return 1; + break; + case 'x': + return 6; + break; + case 'w': + return 7; + break; + default: + return 100; + } } diff --git a/hpvm/projects/llvm-cbe/test/test008.c b/hpvm/projects/llvm-cbe/test/test008.c index 283b8f73bafe45c6225e5270249c458b9a75a80d..f054263e0b5490d25b16c53c082d7b0dfbd1793f 100644 --- a/hpvm/projects/llvm-cbe/test/test008.c +++ b/hpvm/projects/llvm-cbe/test/test008.c @@ -12,18 +12,18 @@ //===----------------------------------------------------------------------===// struct test { - int var1; - int var2; - int var3; + int var1; + int var2; + int var3; }; int main() { - struct test variable; + struct test variable; - variable.var2 = 5; - variable.var3 = 6; - variable.var1 = 9; - - return variable.var3; + variable.var2 = 5; + 
variable.var3 = 6; + variable.var1 = 9; + + return variable.var3; } diff --git a/hpvm/projects/llvm-cbe/test/test009.c b/hpvm/projects/llvm-cbe/test/test009.c index a46509105cb73430794e55eb9d5af6d0da98ff6f..1b2fc327e2c7fd67ba1520dbecebd4803507c600 100644 --- a/hpvm/projects/llvm-cbe/test/test009.c +++ b/hpvm/projects/llvm-cbe/test/test009.c @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// int main() { - int example[10]; - int i; - for (i = 0;i < 10; ++i) { - example[i] = i; - } - return example[6]; + int example[10]; + int i; + for (i = 0; i < 10; ++i) { + example[i] = i; + } + return example[6]; } diff --git a/hpvm/projects/llvm-cbe/test/test010.c b/hpvm/projects/llvm-cbe/test/test010.c index e3841e64d3e41aa923201427f6913c3e30a650c9..21c6fdd0c7b6ed0a6c346d01a8e8836a4b2050a5 100644 --- a/hpvm/projects/llvm-cbe/test/test010.c +++ b/hpvm/projects/llvm-cbe/test/test010.c @@ -7,37 +7,37 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a nested switch statement correctly. -// *TW +// This code tests to see that the CBE will execute a nested switch statement +// correctly. 
*TW //===----------------------------------------------------------------------===// int main() { - char var = 'x', var2; - switch (var) { - case 'z' : - return 0; - break; - case 'y' : - return 1; - break; - case 'x' : - var2 = 'b'; - - switch (var2) { - case 'a' : - return 10; - break; - case 'b' : - return 6; - break; - default : - return 18; - } + char var = 'x', var2; + switch (var) { + case 'z': + return 0; + break; + case 'y': + return 1; + break; + case 'x': + var2 = 'b'; - case 'w' : - return 7; - break; - default : - return 100; - } + switch (var2) { + case 'a': + return 10; + break; + case 'b': + return 6; + break; + default: + return 18; + } + + case 'w': + return 7; + break; + default: + return 100; + } } diff --git a/hpvm/projects/llvm-cbe/test/test011.c b/hpvm/projects/llvm-cbe/test/test011.c index aa0ee7229f512c25e2794372eac697c85d35b531..9ff808b7096c728794ed472349b472d5ce61b952 100644 --- a/hpvm/projects/llvm-cbe/test/test011.c +++ b/hpvm/projects/llvm-cbe/test/test011.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,13 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle addition between two variables. -// *TW +// This code tests to see that the CBE can handle addition between two +// variables. 
*TW //===------------------------------------------------------------------------===// -int main() -{ - int i = 2, t = 4, x = 0; - x = i+t; +int main() { + int i = 2, t = 4, x = 0; + x = i + t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test012.c b/hpvm/projects/llvm-cbe/test/test012.c index 403c635686a51eb493c0ca224c043b6aa6c2fce6..60689156c5bcd5c835aebb0a9c5e0e8d7612d164 100644 --- a/hpvm/projects/llvm-cbe/test/test012.c +++ b/hpvm/projects/llvm-cbe/test/test012.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,14 @@ // //===----------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle subtraction between two variables. +// This code tests to see that the CBE can handle subtraction between two +// variables. // *TW //===----------------------------------------------------------------------------===// -int main() -{ - int i = 8, t = 2, x = 0; - x = i-t; +int main() { + int i = 8, t = 2, x = 0; + x = i - t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test013.c b/hpvm/projects/llvm-cbe/test/test013.c index 444d4676b78a2f5324cb9bbccfac67bfcf9330aa..9bb5dc492bc251f11c152eb2ea7b506c3354430c 100644 --- a/hpvm/projects/llvm-cbe/test/test013.c +++ b/hpvm/projects/llvm-cbe/test/test013.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,13 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle multiplication between two variables. 
-// *TW +// This code tests to see that the CBE can handle multiplication between two +// variables. *TW //===------------------------------------------------------------------------------===// -int main() -{ - int i = 3, t = 2, x = 0; - x = i*t; +int main() { + int i = 3, t = 2, x = 0; + x = i * t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test014.c b/hpvm/projects/llvm-cbe/test/test014.c index e1dc6931f9e989ed867f8abf93dfa5f57042de5c..cbc0ad52d407bfc768a56b7105d4b93a7d2bdaf7 100644 --- a/hpvm/projects/llvm-cbe/test/test014.c +++ b/hpvm/projects/llvm-cbe/test/test014.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +8,13 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle division between two variables. -// *TW +// This code tests to see that the CBE can handle division between two +// variables. 
*TW //===------------------------------------------------------------------------------===// -int main() -{ - int i = 30, t = 5, x = 0; - x = i/t; +int main() { + int i = 30, t = 5, x = 0; + x = i / t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test015.c b/hpvm/projects/llvm-cbe/test/test015.c index e4c2a5c03b28ca481dd3709e18567237cc12a660..81c2f22808e4f4efcb7a4d031faf6a7e2e197f37 100644 --- a/hpvm/projects/llvm-cbe/test/test015.c +++ b/hpvm/projects/llvm-cbe/test/test015.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,10 +12,9 @@ // *TW //===------------------------------------------------------------------------------===// -int main() -{ - int i = 26, t = 20, x = 0; - x = i%t; +int main() { + int i = 26, t = 20, x = 0; + x = i % t; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test016.c b/hpvm/projects/llvm-cbe/test/test016.c index 0841840ebc31ba622a4538f328b658b0bf52e08c..bb5bc64fff2b798375e2c2470e6538d5009c7719 100644 --- a/hpvm/projects/llvm-cbe/test/test016.c +++ b/hpvm/projects/llvm-cbe/test/test016.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +13,12 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - char ch; + char ch; - if(sizeof(+ch) == 4) { - return 6; - } - return 1; + if (sizeof(+ch) == 4) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test017.c b/hpvm/projects/llvm-cbe/test/test017.c index 0535862b3057ea1cf7ac7ba2801a563a85d75dbe..a87abcd1e8f3311be495deb7bcf369f01ceeaa7f 100644 --- a/hpvm/projects/llvm-cbe/test/test017.c 
+++ b/hpvm/projects/llvm-cbe/test/test017.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -13,11 +14,11 @@ int main() { - signed int a = 10; - signed int b = -a; + signed int a = 10; + signed int b = -a; - if(b == -10) { - return 6; - } - return 1; + if (b == -10) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test018.c b/hpvm/projects/llvm-cbe/test/test018.c index c02efa9d0e914b96a6d491769e1a22e2e2747047..ea38b291393f20192f1885bdd702ef321b6929f0 100644 --- a/hpvm/projects/llvm-cbe/test/test018.c +++ b/hpvm/projects/llvm-cbe/test/test018.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,15 +8,15 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle the incremental (++a) operator. -// *TW +// This code tests to see that the CBE can handle the incremental (++a) +// operator. 
*TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 5; + int x = 5; - ++x; + ++x; - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test019.c b/hpvm/projects/llvm-cbe/test/test019.c index 1975bb9c5b3e0eaae7a1417310da435f3df8a0d6..484fe0481656cba546bee1565e110f1a0dc90327 100644 --- a/hpvm/projects/llvm-cbe/test/test019.c +++ b/hpvm/projects/llvm-cbe/test/test019.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,15 +8,15 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle the decremental (--a) operator. -// *TW +// This code tests to see that the CBE can handle the decremental (--a) +// operator. *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 7; - - --x; + int x = 7; - return x; + --x; + + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test020.c b/hpvm/projects/llvm-cbe/test/test020.c index a68801708d9b628dae8dc3b5dba130f86436bdb6..98ed7f1701cdfdf442e6253706f2bc2f1f30227f 100644 --- a/hpvm/projects/llvm-cbe/test/test020.c +++ b/hpvm/projects/llvm-cbe/test/test020.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 3; + int x = 6; + int y = 3; - if(x > y){ - return x; - } - return 1; + if (x > y) { + return x; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test021.c 
b/hpvm/projects/llvm-cbe/test/test021.c index 93eed31d9bbfc11ad0559dcaf649df9e1f9206c1..0c5e63a462482f6b2f5cc392c2508a3076c3c3e6 100644 --- a/hpvm/projects/llvm-cbe/test/test021.c +++ b/hpvm/projects/llvm-cbe/test/test021.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; + int x = 6; + int y = 6; - if(x >= y){ - return x; - } - return 1; + if (x >= y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test022.c b/hpvm/projects/llvm-cbe/test/test022.c index 895069a83bc7b1c1df9d6f27784187940b493b35..1578e158914dd68f5b99c4c69b36e19f23217939 100644 --- a/hpvm/projects/llvm-cbe/test/test022.c +++ b/hpvm/projects/llvm-cbe/test/test022.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 12; + int x = 6; + int y = 12; - - if(x < y){ - return x; - } - return 1; + if (x < y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test023.c b/hpvm/projects/llvm-cbe/test/test023.c index 52348d3e1690624aa712ec6735e811f4ab958055..bc309ddb015a9af75cfcd3f09a7c5f5e093ba981 100644 --- a/hpvm/projects/llvm-cbe/test/test023.c +++ b/hpvm/projects/llvm-cbe/test/test023.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler 
Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; + int x = 6; + int y = 6; - if(x <= y){ - return x; - } - return 1; + if (x <= y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test024.c b/hpvm/projects/llvm-cbe/test/test024.c index 2c90879b87e3646db441c47e114b04f05de134a8..782d41a47880e3078af3d3775923870efba0a915 100644 --- a/hpvm/projects/llvm-cbe/test/test024.c +++ b/hpvm/projects/llvm-cbe/test/test024.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; + int x = 6; + int y = 6; - if(x == y){ - return x; - } - return 1; + if (x == y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test025.c b/hpvm/projects/llvm-cbe/test/test025.c index 153cb4013477a27ad95248d38c62aa45bb2d5206..26bedf78ca25c8da2a1ba9c12694ddbdba087033 100644 --- a/hpvm/projects/llvm-cbe/test/test025.c +++ b/hpvm/projects/llvm-cbe/test/test025.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 2; + int x = 6; + int y = 2; - if(x != y){ - return x; - } - return 1; + if (x != y) { + return x; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test026.c b/hpvm/projects/llvm-cbe/test/test026.c index 
874c06957d200d2402974e3928aae339f5c2d16c..cf0b3e6ae94f24c8392a7b6f91a7dade1c1a6613 100644 --- a/hpvm/projects/llvm-cbe/test/test026.c +++ b/hpvm/projects/llvm-cbe/test/test026.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { -int x = 6; -int y = 6; -int z = 6; + int x = 6; + int y = 6; + int z = 6; - if(x == y && x == z){ - return 6; - } - return 1; + if (x == y && x == z) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test027.c b/hpvm/projects/llvm-cbe/test/test027.c index d1322597c34d5d3d284ae8d4de203b4bc769f998..f1e0adb31dc38ed4d35a25590281e3a9ac505474 100644 --- a/hpvm/projects/llvm-cbe/test/test027.c +++ b/hpvm/projects/llvm-cbe/test/test027.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int x = 6; - int y = 6; - int z = 6; + int x = 6; + int y = 6; + int z = 6; - if(x == y || x != z){ - return 6; - } - return 1; + if (x == y || x != z) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test028.c b/hpvm/projects/llvm-cbe/test/test028.c index ce77d792f3b2e75d3795784a3a932c37c120c764..7e2ecdcf3f66c4637a15b2d2d19ddd3b5e740469 100644 --- a/hpvm/projects/llvm-cbe/test/test028.c +++ b/hpvm/projects/llvm-cbe/test/test028.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C 
+//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = -7; - unsigned int b = 0; + unsigned int a = -7; + unsigned int b = 0; - b = ~a; - if( b == 6){ - return 6; - } - return 1; + b = ~a; + if (b == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test029.c b/hpvm/projects/llvm-cbe/test/test029.c index b7ac93ecf5f275ba5129a9bed6988b694cc0ca39..34d1ff5c8be474ccc629e9d10c51ca30b5cb8c10 100644 --- a/hpvm/projects/llvm-cbe/test/test029.c +++ b/hpvm/projects/llvm-cbe/test/test029.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +13,15 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 6; //0110 - unsigned int b = 15; //1111 - unsigned int c = 0; + unsigned int a = 6; // 0110 + unsigned int b = 15; // 1111 + unsigned int c = 0; - c = a&b; - if(c == 6){ - return 6; - } - return 1; + c = a & b; + if (c == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test030.c b/hpvm/projects/llvm-cbe/test/test030.c index 333ce5aa01915623200c600249ebb2377782a139..a88c910f8f25d85785ed5c5f03578157fc13b47d 100644 --- a/hpvm/projects/llvm-cbe/test/test030.c +++ b/hpvm/projects/llvm-cbe/test/test030.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +13,15 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { -unsigned int a = 2; -unsigned int 
b = 4; -unsigned int c = 0; + unsigned int a = 2; + unsigned int b = 4; + unsigned int c = 0; - c = a|b; - if(c == 6){ - return 6; - } - return 1; + c = a | b; + if (c == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test031.c b/hpvm/projects/llvm-cbe/test/test031.c index 69d0dab0e1ff78ff13fce90e3ae68de355b7cf19..6e13a9f03fae2a6b47e6fa6000e7fcea7a74b8d1 100644 --- a/hpvm/projects/llvm-cbe/test/test031.c +++ b/hpvm/projects/llvm-cbe/test/test031.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +13,15 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 9; //1001 - unsigned int b = 15; //1111 - unsigned int c = 0; + unsigned int a = 9; // 1001 + unsigned int b = 15; // 1111 + unsigned int c = 0; - - c = a^b; - if(c == 6){ - return 6; - } - return 1; + c = a ^ b; + if (c == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test032.c b/hpvm/projects/llvm-cbe/test/test032.c index ae63e2c4d26864d0bec5dc89dc6a80174e89c985..a98ab650e98bd0e147825bca50dca1bfcaea5809 100644 --- a/hpvm/projects/llvm-cbe/test/test032.c +++ b/hpvm/projects/llvm-cbe/test/test032.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 3; //0011 - unsigned int b = 0; + unsigned int a = 3; // 0011 + unsigned int b = 0; - - b = a << 1; //0110 - if(b == 6){ - return 6; - } - return 1; + b = a << 1; // 0110 + if (b == 6) { + return 6; + } + return 
1; } - diff --git a/hpvm/projects/llvm-cbe/test/test033.c b/hpvm/projects/llvm-cbe/test/test033.c index 1bb96d21bdef67392305cf8631aa7243ab77cb98..81b4177184b79eb5e282cafea5b00c60ce48a5a4 100644 --- a/hpvm/projects/llvm-cbe/test/test033.c +++ b/hpvm/projects/llvm-cbe/test/test033.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 13; //1100 - unsigned int b = 0; + unsigned int a = 13; // 1100 + unsigned int b = 0; - b = a >> 1; //0110 - if(b == 6){ - return 6; - } - return 1; + b = a >> 1; // 0110 + if (b == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test034.c b/hpvm/projects/llvm-cbe/test/test034.c index dd9106b0be38c88c2a9a10a5c937c2c373ed5eed..977bf40358d94bb0bb10a66938b4f21b500aab57 100644 --- a/hpvm/projects/llvm-cbe/test/test034.c +++ b/hpvm/projects/llvm-cbe/test/test034.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 3; - int b = 3; - - a+=b; - if(a == 6){ - return 6; - } - return 1; + int a = 3; + int b = 3; + + a += b; + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test035.c b/hpvm/projects/llvm-cbe/test/test035.c index d1c0ae391f15a0a8b3a118b2435667329ca86a85..8a7f23e17b1bc6f7569cc3352d6898ea033e40a5 100644 --- a/hpvm/projects/llvm-cbe/test/test035.c +++ b/hpvm/projects/llvm-cbe/test/test035.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp 
- Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 9; - int b = 3; + int a = 9; + int b = 3; - a-=b; - if(a == 6){ - return 6; - } - return 1; + a -= b; + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test036.c b/hpvm/projects/llvm-cbe/test/test036.c index d8d5a1957c84d7afa5cc9d96a6afd64297172f21..019722660fb4641ddfbe13d3d5fcec7aa06102c6 100644 --- a/hpvm/projects/llvm-cbe/test/test036.c +++ b/hpvm/projects/llvm-cbe/test/test036.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -11,15 +12,14 @@ // Compound Multiplication Assignment(a*=b) operator. 
// *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 2; - int b = 3; + int a = 2; + int b = 3; - a*=b; - if(a == 6){ - return 6; - } - return 1; + a *= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test037.c b/hpvm/projects/llvm-cbe/test/test037.c index 5bf5ee705a44ec929a70d0176690733bcc1fbcb8..2363c91ce91768e7dee329eec5db68beedf8076f 100644 --- a/hpvm/projects/llvm-cbe/test/test037.c +++ b/hpvm/projects/llvm-cbe/test/test037.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 30; - int b = 5; + int a = 30; + int b = 5; - a/=b; - if(a == 6){ - return 6; - } - return 1; + a /= b; + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test038.c b/hpvm/projects/llvm-cbe/test/test038.c index efbe23460710f9f7108b18ba5e60ace5168338a6..1d6aa395aac2994f6bca35de131941154650038b 100644 --- a/hpvm/projects/llvm-cbe/test/test038.c +++ b/hpvm/projects/llvm-cbe/test/test038.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 20; - int b = 14; + int a = 20; + int b = 14; - a%=b; - if(a == 6){ - return 6; - } - return 1; + a %= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test039.c b/hpvm/projects/llvm-cbe/test/test039.c index 
112d7f69700f6d7bfad21bc58d91506f4d95b68e..53d4fcb9133bdf57967034a7ad02b231088c59ad 100644 --- a/hpvm/projects/llvm-cbe/test/test039.c +++ b/hpvm/projects/llvm-cbe/test/test039.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 6; //0110 - unsigned int b = 15; //1111 + unsigned int a = 6; // 0110 + unsigned int b = 15; // 1111 - a&=b; - if(a == 6){ - return 6; - } - return 1; + a &= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test040.c b/hpvm/projects/llvm-cbe/test/test040.c index 5285fb73ecf5d3572c9cbf279be94debc6043e85..d174e7e88041eaca97dc2acb13d218c7ea8baba9 100644 --- a/hpvm/projects/llvm-cbe/test/test040.c +++ b/hpvm/projects/llvm-cbe/test/test040.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 2; - unsigned int b = 4; + unsigned int a = 2; + unsigned int b = 4; - a|=b; - if(a == 6){ - return 6; - } - return 1; + a |= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test041.c b/hpvm/projects/llvm-cbe/test/test041.c index f04e682356dff0739053b0e91857805b952c8aec..45f64966d499f07f09a6b3904a8c2cd9fc9c71a0 100644 --- a/hpvm/projects/llvm-cbe/test/test041.c +++ b/hpvm/projects/llvm-cbe/test/test041.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- 
CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,15 +13,14 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 9; //1001 - unsigned int b = 15; //1111 + unsigned int a = 9; // 1001 + unsigned int b = 15; // 1111 - a^=b; - if(a == 6){ - return 6; - } - return 1; + a ^= b; + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test042.c b/hpvm/projects/llvm-cbe/test/test042.c index 5b4f12d80882f347efbe1ae59103f7eaf672c464..ec2547370b90902fffb0e61564bb02c782e20fd9 100644 --- a/hpvm/projects/llvm-cbe/test/test042.c +++ b/hpvm/projects/llvm-cbe/test/test042.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 3; //0011 + unsigned int a = 3; // 0011 - a <<= 1; //0110 - if( a == 6){ - return 6; - } - return 1; + a <<= 1; // 0110 + if (a == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test043.c b/hpvm/projects/llvm-cbe/test/test043.c index 3b42179304a1741c268d8adf90192bafc5a2ba98..6aeb7bd17c9f40b6f86cdf1fd8ea2dbe520ce554 100644 --- a/hpvm/projects/llvm-cbe/test/test043.c +++ b/hpvm/projects/llvm-cbe/test/test043.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 13; //1100 + unsigned int 
a = 13; // 1100 - a >>= 1; //0110 - if(a == 6){ - return 6; - } - return 1; + a >>= 1; // 0110 + if (a == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test044.c b/hpvm/projects/llvm-cbe/test/test044.c index dbb9d31ad940421d1d33d21af916eccd672ac2a8..f9b7c2d4632326b81ca526a82765152039269fba 100644 --- a/hpvm/projects/llvm-cbe/test/test044.c +++ b/hpvm/projects/llvm-cbe/test/test044.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,17 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a char. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// char. *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - char a = 'A' ; //65 - int ia = 0; + char a = 'A'; // 65 + int ia = 0; - ia = a; - ia-=59; + ia = a; + ia -= 59; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test045.c b/hpvm/projects/llvm-cbe/test/test045.c index 50aaa8effcd3994d1dd47213d25748b1293f49f0..c8b57993a7edcbc691c641b1405f6f3ae137b65e 100644 --- a/hpvm/projects/llvm-cbe/test/test045.c +++ b/hpvm/projects/llvm-cbe/test/test045.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - signed char a = 'A'; - int ia = 0; + signed char a = 'A'; + int ia = 0; - ia = a; - ia-=59; + ia = a; + ia -= 59; - return ia; + return ia; } diff --git 
a/hpvm/projects/llvm-cbe/test/test046.c b/hpvm/projects/llvm-cbe/test/test046.c index ea57085caf034bdaf554169c439d21cabdfc1606..edbfe837fe615cea8ce58d5e7732da49632cf66c 100644 --- a/hpvm/projects/llvm-cbe/test/test046.c +++ b/hpvm/projects/llvm-cbe/test/test046.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,13 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned char a = 'A'; - int ia = 0; + unsigned char a = 'A'; + int ia = 0; - ia = a; - ia-=59; + ia = a; + ia -= 59; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test047.c b/hpvm/projects/llvm-cbe/test/test047.c index 2b90d14c7f9b195cdb94611925644cd4debb99ea..476cea234f53c18c684c9069c0675fc2effe48d3 100644 --- a/hpvm/projects/llvm-cbe/test/test047.c +++ b/hpvm/projects/llvm-cbe/test/test047.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,13 +8,12 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning an int. -// *TW +// This code tests to see that the CBE can handle declaring and returning an +// int. 
*TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - int a = 6; - return a; + int a = 6; + return a; } - diff --git a/hpvm/projects/llvm-cbe/test/test048.c b/hpvm/projects/llvm-cbe/test/test048.c index c30694ff502502de722f99e2f8a21cfe79ddf17c..ee3966afccc9d7e3f0aaff62aec16142e28a601e 100644 --- a/hpvm/projects/llvm-cbe/test/test048.c +++ b/hpvm/projects/llvm-cbe/test/test048.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,11 +13,11 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - short int a = 6; - int ia = 0; - ia = (int)a; + short int a = 6; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test049.c b/hpvm/projects/llvm-cbe/test/test049.c index bb4a0801981e734a36518cf406fc8edd2213d0cd..5f29feffc05704adf39078d49177ab5edb5cffcf 100644 --- a/hpvm/projects/llvm-cbe/test/test049.c +++ b/hpvm/projects/llvm-cbe/test/test049.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,11 +13,11 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - long int a = 6; - int ia = 0; - ia = (int)a; + long int a = 6; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test050.c b/hpvm/projects/llvm-cbe/test/test050.c index f69c7cee23cbc47535ce653f28a26305195541e4..aa49757a320855970290cdef405a495052405ffe 100644 --- a/hpvm/projects/llvm-cbe/test/test050.c +++ b/hpvm/projects/llvm-cbe/test/test050.c @@ -1,4 +1,5 @@ -//===-- 
CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,11 +13,11 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - signed int a = 6; - int ia = 0; - ia = (int)a; + signed int a = 6; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test051.c b/hpvm/projects/llvm-cbe/test/test051.c index 61f1e03d57d03c6a298ed50880d074e4f58e9e9a..0334eafdf30b2be0c6cde7dfeadaaf074d943608 100644 --- a/hpvm/projects/llvm-cbe/test/test051.c +++ b/hpvm/projects/llvm-cbe/test/test051.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +13,12 @@ // *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned int a = 6; + unsigned int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test052.c b/hpvm/projects/llvm-cbe/test/test052.c index 48e1ce67f8edf77d03d1a9c75d72352b26f06511..3230b192b7b70080bb12318c983b86dcc4b3159b 100644 --- a/hpvm/projects/llvm-cbe/test/test052.c +++ b/hpvm/projects/llvm-cbe/test/test052.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a float. 
-// *TW +// This code tests to see that the CBE can handle declaring and returning a +// float. *TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - float a = 6.0; + float a = 6.0; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test053.c b/hpvm/projects/llvm-cbe/test/test053.c index 86dd5691a77f96fcd6e8568d22e93ef3a160872b..4ea19186428065a3813addb8537d5331c2709015 100644 --- a/hpvm/projects/llvm-cbe/test/test053.c +++ b/hpvm/projects/llvm-cbe/test/test053.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a double. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// double. 
*TW //===------------------------------------------------------------------------===// -int main(){ +int main() { - double a = 6.0; + double a = 6.0; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test054.c b/hpvm/projects/llvm-cbe/test/test054.c index 4c86601412f5db9c6ec818f6029374dacaeebb60..caa7d00080554531f087c950da49975854b2d4aa 100644 --- a/hpvm/projects/llvm-cbe/test/test054.c +++ b/hpvm/projects/llvm-cbe/test/test054.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long double. -// *TW +// This code tests to see that the CBE can handle declaring and returning a long +// double. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long double a = 6.0; + long double a = 6.0; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test055.c b/hpvm/projects/llvm-cbe/test/test055.c index cd7891acfe29906d1c0f6b6e9462ba2a95d8e747..4b85082d3353dea43862a1a0db736f1c43bf91fd 100644 --- a/hpvm/projects/llvm-cbe/test/test055.c +++ b/hpvm/projects/llvm-cbe/test/test055.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a short. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// short. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - short a = 6; + short a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test056.c b/hpvm/projects/llvm-cbe/test/test056.c index b12df1df990921aa6586cb6c4328733098c89386..305f044be1a342cd921f5bdc9ab22d938e172103 100644 --- a/hpvm/projects/llvm-cbe/test/test056.c +++ b/hpvm/projects/llvm-cbe/test/test056.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed short. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed short. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed short a = 6; + signed short a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test057.c b/hpvm/projects/llvm-cbe/test/test057.c index 50678081ec9ba22cc6274de0c6be137f913704bb..280ec876bda3a5ffb004d0eb14afa47250253c7d 100644 --- a/hpvm/projects/llvm-cbe/test/test057.c +++ b/hpvm/projects/llvm-cbe/test/test057.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning an unsigned short. -// *TW +// This code tests to see that the CBE can handle declaring and returning an +// unsigned short. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned short a = 6; + unsigned short a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test058.c b/hpvm/projects/llvm-cbe/test/test058.c index cdbfac068fe5aa9639691dcf869d016213e7dff7..f5404bd8336012b80e9f3d02074b4bd390924a84 100644 --- a/hpvm/projects/llvm-cbe/test/test058.c +++ b/hpvm/projects/llvm-cbe/test/test058.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed short int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed short int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed short int a = 6; + signed short int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test059.c b/hpvm/projects/llvm-cbe/test/test059.c index 4de964a13ec97e47ff09e618ac6e9c232d9acf35..13b3ac08797e64625e89a9404cd35a0c27d21203 100644 --- a/hpvm/projects/llvm-cbe/test/test059.c +++ b/hpvm/projects/llvm-cbe/test/test059.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a unsigned short int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// unsigned short int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned short int a = 6; + unsigned short int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test060.c b/hpvm/projects/llvm-cbe/test/test060.c index a0a6e16949f5730787137cbdf0bf5284ae6d292a..ecb393f2f368e5137d164a2996837db204c2f9f4 100644 --- a/hpvm/projects/llvm-cbe/test/test060.c +++ b/hpvm/projects/llvm-cbe/test/test060.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long a = 6; + long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test061.c b/hpvm/projects/llvm-cbe/test/test061.c index d1bf812aa0c3312a6b0dfafd2e59866ec5bdc236..ac7cadd45fe6e5148c41f38dee679ee8bddad2e3 100644 --- a/hpvm/projects/llvm-cbe/test/test061.c +++ b/hpvm/projects/llvm-cbe/test/test061.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long a = 6; + signed long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test062.c b/hpvm/projects/llvm-cbe/test/test062.c index 077ace8b321d7c9bd6a865bf0b2adb9bf892a3be..eaaf59853f711197b7049a993c48b939cbfab608 100644 --- a/hpvm/projects/llvm-cbe/test/test062.c +++ b/hpvm/projects/llvm-cbe/test/test062.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a unsigned long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// unsigned long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long a = 6; + unsigned long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test063.c b/hpvm/projects/llvm-cbe/test/test063.c index 78fbe390f5e05fbbf35f149bf6dc3f56ecd69549..fa6cd18e88bef646c55391a68564355302e55775 100644 --- a/hpvm/projects/llvm-cbe/test/test063.c +++ b/hpvm/projects/llvm-cbe/test/test063.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed long int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed long int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long int a = 6; + signed long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test064.c b/hpvm/projects/llvm-cbe/test/test064.c index c26a3da001557d18686ca20ec4de52bdd8e5e765..05a72b4b9a937ed87262c6ad40a9a24238b201dd 100644 --- a/hpvm/projects/llvm-cbe/test/test064.c +++ b/hpvm/projects/llvm-cbe/test/test064.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +13,12 @@ // *TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long int a = 6; + unsigned long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test065.c b/hpvm/projects/llvm-cbe/test/test065.c index d9b299752c54e9238c6e2171342bb6f1c470163b..76958db4c2fe457f52cccac60f8dd3f49f8a868d 100644 --- a/hpvm/projects/llvm-cbe/test/test065.c +++ b/hpvm/projects/llvm-cbe/test/test065.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a long +// long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long long a = 6; + long long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test066.c b/hpvm/projects/llvm-cbe/test/test066.c index b4adc62240751fac572c9c1cede33279f51c7c90..10ec61f56ec72432bc43d8ae8af85226cd3f08e8 100644 --- a/hpvm/projects/llvm-cbe/test/test066.c +++ b/hpvm/projects/llvm-cbe/test/test066.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a long long int. -// *TW +// This code tests to see that the CBE can handle declaring and returning a long +// long int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - long long int a = 6; + long long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test067.c b/hpvm/projects/llvm-cbe/test/test067.c index 9d786b521063454dc42ba609fc742091ba3df1bb..e90cc8caea23b2baec248752eb84fe3c9afd3479 100644 --- a/hpvm/projects/llvm-cbe/test/test067.c +++ b/hpvm/projects/llvm-cbe/test/test067.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a signed long long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// signed long long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long long a = 6; + signed long long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test068.c b/hpvm/projects/llvm-cbe/test/test068.c index 1f72ecd1b7fa39c845c159ad3b9d86ce44d72547..5c0daa8a157d2ada06f4a1a4f2c66e1f3ac35354 100644 --- a/hpvm/projects/llvm-cbe/test/test068.c +++ b/hpvm/projects/llvm-cbe/test/test068.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning a unsigned long long. -// *TW +// This code tests to see that the CBE can handle declaring and returning a +// unsigned long long. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long long a = 6; + unsigned long long a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } - diff --git a/hpvm/projects/llvm-cbe/test/test069.c b/hpvm/projects/llvm-cbe/test/test069.c index bc611f13c1552f42c38cbf054134fe8fc6f37e24..6cae210ec65e8c1fe9c3827e49ed1147cfe42d22 100644 --- a/hpvm/projects/llvm-cbe/test/test069.c +++ b/hpvm/projects/llvm-cbe/test/test069.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +13,12 @@ // *TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - signed long long int a = 6; + signed long long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test070.c b/hpvm/projects/llvm-cbe/test/test070.c index 94c42bd8b5b4afee99d0cf18bec1806ceead963e..e9b55e232f54c9ac5ba6eea1e10a3babb88fb791 100644 --- a/hpvm/projects/llvm-cbe/test/test070.c +++ b/hpvm/projects/llvm-cbe/test/test070.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===-------------------------------------------------------------------------------===// // -// This code tests to see that the CBE can handle declaring and returning an unsigned long long int. -// *TW +// This code tests to see that the CBE can handle declaring and returning an +// unsigned long long int. 
*TW //===-------------------------------------------------------------------------------===// -int main(){ +int main() { - unsigned long long int a = 6; + unsigned long long int a = 6; - int ia = 0; - ia = (int)a; + int ia = 0; + ia = (int)a; - return ia; + return ia; } diff --git a/hpvm/projects/llvm-cbe/test/test071.c b/hpvm/projects/llvm-cbe/test/test071.c index 3e090147c7e09ed0ce208e305659083a17a31f81..357bc1e53330345808b5cf966bd9bbd4827f89af 100644 --- a/hpvm/projects/llvm-cbe/test/test071.c +++ b/hpvm/projects/llvm-cbe/test/test071.c @@ -8,13 +8,13 @@ //===----------------------------------------------------------------------===// // // This code tests to see that the CBE will execute an if statement correctly. -// *TW +// *TW // //===----------------------------------------------------------------------===// int main() { - int x = 6; - if (x == 6) - return x; - return 0; + int x = 6; + if (x == 6) + return x; + return 0; } diff --git a/hpvm/projects/llvm-cbe/test/test072.c b/hpvm/projects/llvm-cbe/test/test072.c index 7c7cbcb391bb51a53e20bfae8aabb23cf0bc2ef6..87cbd91bb0591e452c9ada49c94968ab63c84c64 100644 --- a/hpvm/projects/llvm-cbe/test/test072.c +++ b/hpvm/projects/llvm-cbe/test/test072.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,18 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute an else-if statement correctly. -// *TW +// This code tests to see that the CBE will execute an else-if statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x = 6; - if (x == 4) { - return 2; - } else if (x == 6){ - return 6; - } else { - return 8; - } + int x = 6; + if (x == 4) { + return 2; + } else if (x == 6) { + return 6; + } else { + return 8; + } } - diff --git a/hpvm/projects/llvm-cbe/test/test073.c b/hpvm/projects/llvm-cbe/test/test073.c index 006a7348e87c6259a41227f731542cdfe1f931d2..2e664c4c73bfe827019ff0cc3aae9e9f4037155d 100644 --- a/hpvm/projects/llvm-cbe/test/test073.c +++ b/hpvm/projects/llvm-cbe/test/test073.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,16 +8,16 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a do-while statement correctly. -// *TW +// This code tests to see that the CBE will execute a do-while statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x = 0; - do { - x++; - } while (x < 6); + int x = 0; + do { + x++; + } while (x < 6); - return x; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test074.c b/hpvm/projects/llvm-cbe/test/test074.c index bb3ff37858bdc25554481a48810c597f7b2f176e..903af81861c08822d9b9c9078bd0cd14d977f612 100644 --- a/hpvm/projects/llvm-cbe/test/test074.c +++ b/hpvm/projects/llvm-cbe/test/test074.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,18 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a break/continue statement correctly. -// *TW +// This code tests to see that the CBE will execute a break/continue statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x; - for (x=0; x<=25; x++) { - if (x == 6) - break; - if (x < 15) - continue; - } - return x; + int x; + for (x = 0; x <= 25; x++) { + if (x == 6) + break; + if (x < 15) + continue; + } + return x; } - diff --git a/hpvm/projects/llvm-cbe/test/test075.c b/hpvm/projects/llvm-cbe/test/test075.c index a0601622c2f897e513615bcbd9b0a91176a26a5b..55562b99efb1414e10e139188919ee1c03e9133e 100644 --- a/hpvm/projects/llvm-cbe/test/test075.c +++ b/hpvm/projects/llvm-cbe/test/test075.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//--------------------===// // // The LLVM Compiler Infrastructure // @@ -7,22 +8,21 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a Goto-Label statement correctly. -// *TW +// This code tests to see that the CBE will execute a Goto-Label statement +// correctly. 
*TW // //===---------------------------------------------------------------------------===// int main() { - int x = 0; - goto label; - - for(;;) { - x = 10; - return x; - } + int x = 0; + goto label; - label: - x = 6; - return x; + for (;;) { + x = 10; + return x; + } +label: + x = 6; + return x; } diff --git a/hpvm/projects/llvm-cbe/test/test076.c b/hpvm/projects/llvm-cbe/test/test076.c index d5f149eb3b51471ce23d8b9baa3186d58e509b44..faf56a3e37e14594fbfbfc3894e504989fcfdea1 100644 --- a/hpvm/projects/llvm-cbe/test/test076.c +++ b/hpvm/projects/llvm-cbe/test/test076.c @@ -15,8 +15,8 @@ int main() { - int x = 6, y = 0, *ip = 0; - ip = &x; - y = *ip; - return y; + int x = 6, y = 0, *ip = 0; + ip = &x; + y = *ip; + return y; } diff --git a/hpvm/projects/llvm-cbe/test/test077.c b/hpvm/projects/llvm-cbe/test/test077.c index a6e1fc7985b1bb5c9d1ac6943533d5c523a7b18d..771463d5afe5dafd1b3975697d807ab3767b3915 100644 --- a/hpvm/projects/llvm-cbe/test/test077.c +++ b/hpvm/projects/llvm-cbe/test/test077.c @@ -14,11 +14,11 @@ //===----------------------------------------------------------------------===// int main() { - char x = 'a', y = 'b', *cp; - cp = &x; - y = *cp; - if (y == 'a'){ - return 6; - } - return 1; + char x = 'a', y = 'b', *cp; + cp = &x; + y = *cp; + if (y == 'a') { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test078.c b/hpvm/projects/llvm-cbe/test/test078.c index cc60c18e34b3ecf4a18401fe604d1b45d1f8d1b1..f511a93fd34e3f62ea2d9a515cc6dd70449dc0b3 100644 --- a/hpvm/projects/llvm-cbe/test/test078.c +++ b/hpvm/projects/llvm-cbe/test/test078.c @@ -15,9 +15,9 @@ #include <stddef.h> int main() { - int *ptr = NULL; - if (ptr == 0){ - return 6; - } - return 1; + int *ptr = NULL; + if (ptr == 0) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test079.c b/hpvm/projects/llvm-cbe/test/test079.c index fd1ea110398c435cb9276fd064f6190aba0b5470..12b3477e32ae06f3be29a89b0fc18bac338a0b57 100644 --- 
a/hpvm/projects/llvm-cbe/test/test079.c +++ b/hpvm/projects/llvm-cbe/test/test079.c @@ -14,12 +14,11 @@ //===----------------------------------------------------------------------===// int main() { - double x = 6, y = 0, *dp; - dp = &x; - y = *dp; - if (y == 6){ - return 6; - } - return 1; + double x = 6, y = 0, *dp; + dp = &x; + y = *dp; + if (y == 6) { + return 6; + } + return 1; } - diff --git a/hpvm/projects/llvm-cbe/test/test080.c b/hpvm/projects/llvm-cbe/test/test080.c index b7fb855bf45dc87a03bd9ca24a785516250ead4b..9b42fab5d93a392064b8a22bf97d325cd0ae24cc 100644 --- a/hpvm/projects/llvm-cbe/test/test080.c +++ b/hpvm/projects/llvm-cbe/test/test080.c @@ -14,11 +14,11 @@ //===----------------------------------------------------------------------===// int main() { - float x = 6, y = 0, *fp; - fp = &x; - y = *fp; - if (y == 6){ - return 6; - } - return 1; + float x = 6, y = 0, *fp; + fp = &x; + y = *fp; + if (y == 6) { + return 6; + } + return 1; } diff --git a/hpvm/projects/llvm-cbe/test/test081.c b/hpvm/projects/llvm-cbe/test/test081.c index 6efcad46eadbb23dcc85b7f5aefa43855383bbe9..e032f57f519165cb7a35578a68babe127edea66f 100644 --- a/hpvm/projects/llvm-cbe/test/test081.c +++ b/hpvm/projects/llvm-cbe/test/test081.c @@ -7,17 +7,16 @@ // //===----------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly use the address-of value (&) -// variable and and return the value-at address (*) variable from integer 'num'. -// *TW +// This code tests to see that the CBE will properly use the address-of value +// (&) variable and and return the value-at address (*) variable from integer +// 'num'. 
*TW // //===----------------------------------------------------------------------===// -int main(){ - int *ptr; - int num = 6; - ptr = &num; - int deref = *ptr; - return deref; - +int main() { + int *ptr; + int num = 6; + ptr = &num; + int deref = *ptr; + return deref; } diff --git a/hpvm/projects/llvm-cbe/test/test082.c b/hpvm/projects/llvm-cbe/test/test082.c index e30bb7a2f192adc03fd646bc625340c847cfe92c..7a7bf109eb1dc2b29ab1e66c4b1b804df6e586e3 100644 --- a/hpvm/projects/llvm-cbe/test/test082.c +++ b/hpvm/projects/llvm-cbe/test/test082.c @@ -13,13 +13,13 @@ // //===----------------------------------------------------------------------===// -struct Number{ - int price; +struct Number { + int price; }; -int main(){ - struct Number a; - struct Number* ptr = &a; - ptr->price = 6; - return ptr->price; +int main() { + struct Number a; + struct Number *ptr = &a; + ptr->price = 6; + return ptr->price; } diff --git a/hpvm/projects/llvm-cbe/test/test083.c b/hpvm/projects/llvm-cbe/test/test083.c index 5dc920edf485b69c20b78886a4eb2229af9151ad..58eb4c14a3dcd37623a1bd972705ac0e4cd46703 100644 --- a/hpvm/projects/llvm-cbe/test/test083.c +++ b/hpvm/projects/llvm-cbe/test/test083.c @@ -13,12 +13,12 @@ // //===----------------------------------------------------------------------===// -int main(){ - int *ip; - int a[2]; - a[0] = 1; - a[1] = 6; - ip = &a[1]; +int main() { + int *ip; + int a[2]; + a[0] = 1; + a[1] = 6; + ip = &a[1]; - return *ip; + return *ip; } diff --git a/hpvm/projects/llvm-cbe/test/test084.c b/hpvm/projects/llvm-cbe/test/test084.c index 6f5b3ad6d9cc526c609719308fa1da9a8ab6ab47..3a67fc1ef9cef5b1340e68eb41d93500000c5a26 100644 --- a/hpvm/projects/llvm-cbe/test/test084.c +++ b/hpvm/projects/llvm-cbe/test/test084.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ //
//===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly increment a pointer via int. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly increment a pointer via +// int. This example works by subtracting two mem. addresses and adding 2 to +// return 6. *TW // //===---------------------------------------------------------------------------===// @@ -20,9 +21,9 @@ int main() { intptr_t inc0 = 0, inc1 = 0, diff = 0, a = 100; intptr_t *p = &a; inc0 = (intptr_t)p; - ++(*p++); //++(*p++); + ++(*p++); //++(*p++); inc1 = (intptr_t)p; - diff = inc1-inc0; + diff = inc1 - inc0; diff += 2; return diff; } diff --git a/hpvm/projects/llvm-cbe/test/test085.c b/hpvm/projects/llvm-cbe/test/test085.c index 01e8d65e6cbb83bc21b46ff9c493284f8e41d2cd..04c47b83d6bce8e9cb8dba4deff3171ece95b46c 100644 --- a/hpvm/projects/llvm-cbe/test/test085.c +++ b/hpvm/projects/llvm-cbe/test/test085.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly decrement a pointer via int. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly decrement a pointer via +// int. This example works by subtracting two mem. addresses and adding 2 to +// return 6. 
*TW // //===---------------------------------------------------------------------------===// @@ -22,8 +23,7 @@ int main() { inc0 = (intptr_t)p; --(*p--); //--(*p--); inc1 = (intptr_t)p; - diff = inc0-inc1; + diff = inc0 - inc1; diff += 2; return diff; } - diff --git a/hpvm/projects/llvm-cbe/test/test086.c b/hpvm/projects/llvm-cbe/test/test086.c index 72e7f03901df7570e5e134c4707c20e8fada74a5..32e33e992378733e721082dc94c339b64bc1cd81 100644 --- a/hpvm/projects/llvm-cbe/test/test086.c +++ b/hpvm/projects/llvm-cbe/test/test086.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly increment a pointer via char. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly increment a pointer via +// char. This example works by subtracting two mem. addresses and adding 2 to +// return 6. 
*TW // //===---------------------------------------------------------------------------===// @@ -24,5 +25,5 @@ int main() { // diff = inc1-inc0; // diff += 2; // return diff; - return 6; //TODO + return 6; // TODO } diff --git a/hpvm/projects/llvm-cbe/test/test087.c b/hpvm/projects/llvm-cbe/test/test087.c index 29291167906a5cb9fd3aedaa0d3523eaa54d5bbd..6c983a65d62b9a71c9c2be11a8107e734628f999 100644 --- a/hpvm/projects/llvm-cbe/test/test087.c +++ b/hpvm/projects/llvm-cbe/test/test087.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,9 @@ // //===---------------------------------------------------------------------------===// // -// This code tests to see that the CBE will properly decrement a pointer via char. -// This example works by subtracting two mem. addresses and adding 2 to return 6. -// *TW +// This code tests to see that the CBE will properly decrement a pointer via +// char. This example works by subtracting two mem. addresses and adding 2 to +// return 6. 
*TW //===---------------------------------------------------------------------------===// int main() { @@ -23,5 +24,5 @@ int main() { // diff = inc0-inc1; // diff += 2; // return diff; - return 6; //TODO + return 6; // TODO } diff --git a/hpvm/projects/llvm-cbe/test/test088.c b/hpvm/projects/llvm-cbe/test/test088.c index 938237bea9774b7c9e52b36ae20d814bb563507c..7cefca1537290d57c2adce83321b848ca82fcbe3 100644 --- a/hpvm/projects/llvm-cbe/test/test088.c +++ b/hpvm/projects/llvm-cbe/test/test088.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//-------------------===// // // The LLVM Compiler Infrastructure // @@ -12,14 +13,14 @@ // *TW //===---------------------------------------------------------------------------===// -int main(){ - int a[2][2]; - int *ip; - a[0][0] = 0; - a[0][1] = 1; - a[1][0] = 3; - a[1][1] = 6; - ip = &a[1][1]; +int main() { + int a[2][2]; + int *ip; + a[0][0] = 0; + a[0][1] = 1; + a[1][0] = 3; + a[1][1] = 6; + ip = &a[1][1]; - return *ip; + return *ip; } diff --git a/hpvm/projects/llvm-cbe/test/test089.c b/hpvm/projects/llvm-cbe/test/test089.c index 925c3bb56ba77bb395641197dd4b5cef231d369e..59b20d5b45ba6c4d4d0cd70e04d0fd99d0253964 100644 --- a/hpvm/projects/llvm-cbe/test/test089.c +++ b/hpvm/projects/llvm-cbe/test/test089.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,20 +8,20 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute data-packing in a structure correctly. -// *TW +// This code tests to see that the CBE will execute data-packing in a structure +// correctly. 
*TW //===------------------------------------------------------------------------------===// #pragma pack(push) #pragma pack(1) -struct DataSize{ - char Data2; - char Data3; - int Data1; +struct DataSize { + char Data2; + char Data3; + int Data1; }; -int main(){ - struct DataSize example; - return sizeof(example); +int main() { + struct DataSize example; + return sizeof(example); } diff --git a/hpvm/projects/llvm-cbe/test/test090.c b/hpvm/projects/llvm-cbe/test/test090.c index 021a05e8a002bcf2320df59c7e39c2963e52c756..d3e64ff5b9b21a68147c0a0aab69d74d05fc93e4 100644 --- a/hpvm/projects/llvm-cbe/test/test090.c +++ b/hpvm/projects/llvm-cbe/test/test090.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,19 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a union and check the data size correctly. -// *TW +// This code tests to see that the CBE will execute a union and check the data +// size correctly. 
*TW //===------------------------------------------------------------------------------===// -union Data{ - int i; - float f; - char str[8]; +union Data { + int i; + float f; + char str[8]; }; -int main(){ - union Data data; - int datasize = sizeof(data) - 2; +int main() { + union Data data; + int datasize = sizeof(data) - 2; - return datasize; + return datasize; } diff --git a/hpvm/projects/llvm-cbe/test/test091.c b/hpvm/projects/llvm-cbe/test/test091.c index dce59d85d5b788696deb7e0b4e0a97e69cdea0e8..557286e1ddd2326f912d4ae788218fce536a0a25 100644 --- a/hpvm/projects/llvm-cbe/test/test091.c +++ b/hpvm/projects/llvm-cbe/test/test091.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,19 +8,17 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will access and return union members correctly. -// *TW +// This code tests to see that the CBE will access and return union members +// correctly. 
*TW //===------------------------------------------------------------------------------===// -union Data{ - char unit1[6]; - char unit2; - char unit3; +union Data { + char unit1[6]; + char unit2; + char unit3; }; -int main(){ - union Data data; - return sizeof(data); +int main() { + union Data data; + return sizeof(data); } - - diff --git a/hpvm/projects/llvm-cbe/test/test092.c b/hpvm/projects/llvm-cbe/test/test092.c index 3b197f21a5f8964daf0ac427955df96faa9feec2..8018bca7eecd1b4196f822bc354108e6b5e8dc27 100644 --- a/hpvm/projects/llvm-cbe/test/test092.c +++ b/hpvm/projects/llvm-cbe/test/test092.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,29 +8,25 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will pass a structure into a function correctly. -// *TW +// This code tests to see that the CBE will pass a structure into a function +// correctly. 
*TW //===------------------------------------------------------------------------------===// int k = 0; -struct test{ - int i; - float f; +struct test { + int i; + float f; }; -void funct(struct test example){ - k = example.i; -} +void funct(struct test example) { k = example.i; } -int main(){ - struct test example; +int main() { + struct test example; - example.i = 6; - example.f = 6.0; - funct(example); + example.i = 6; + example.f = 6.0; + funct(example); - return k; + return k; } - - diff --git a/hpvm/projects/llvm-cbe/test/test093.c b/hpvm/projects/llvm-cbe/test/test093.c index 3553edea3a5fdb8680feaf2297ab32938ca2c608..9a6188e7d4d13b8e5b73a2e0cf832cb3ddb0f0ba 100644 --- a/hpvm/projects/llvm-cbe/test/test093.c +++ b/hpvm/projects/llvm-cbe/test/test093.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,19 +12,19 @@ // *TW //===------------------------------------------------------------------------------===// -struct layer1{ - int depth1; - char name1[20]; +struct layer1 { + int depth1; + char name1[20]; }; -struct layer2{ - int depth2; - char name2[20]; - struct layer1 layer_data; -}layer2_data; +struct layer2 { + int depth2; + char name2[20]; + struct layer1 layer_data; +} layer2_data; -int main(){ - struct layer2 layer2_data = {1, "test", {6, "test2"}}; +int main() { + struct layer2 layer2_data = {1, "test", {6, "test2"}}; - return layer2_data.layer_data.depth1; + return layer2_data.layer_data.depth1; } diff --git a/hpvm/projects/llvm-cbe/test/test094.c b/hpvm/projects/llvm-cbe/test/test094.c index 2568c9c3537d9cedce0cb36e86a414c068493504..8faf3330cc9f360debb2434f2720cfead79be20e 100644 --- a/hpvm/projects/llvm-cbe/test/test094.c +++ b/hpvm/projects/llvm-cbe/test/test094.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C 
----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,18 +13,17 @@ //===------------------------------------------------------------------------------===// typedef struct test { - int var1; - int var2; - int var3; -}testrename; + int var1; + int var2; + int var3; +} testrename; -int main(){ - testrename variable; +int main() { + testrename variable; - variable.var2 = 5; - variable.var3 = 6; - variable.var1 = 9; + variable.var2 = 5; + variable.var3 = 6; + variable.var1 = 9; - return variable.var3; + return variable.var3; } - diff --git a/hpvm/projects/llvm-cbe/test/test095.c b/hpvm/projects/llvm-cbe/test/test095.c index 21db27203416db2f9454ce203eed555299465a40..b622c4b94c071548e734f4dd4ceec7097b5b90a2 100644 --- a/hpvm/projects/llvm-cbe/test/test095.c +++ b/hpvm/projects/llvm-cbe/test/test095.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,17 +12,16 @@ // *TW //===------------------------------------------------------------------------------===// -struct Shows - { - char show[20]; - int runlength; - int rating; +struct Shows { + char show[20]; + int runlength; + int rating; }; -int main(){ -struct Shows b1[3] = { - {"Big Bang Theory",22,6}, - {"NCIS",45,9}, - }; - return b1[0].rating; +int main() { + struct Shows b1[3] = { + {"Big Bang Theory", 22, 6}, + {"NCIS", 45, 9}, + }; + return b1[0].rating; } diff --git a/hpvm/projects/llvm-cbe/test/test096.c b/hpvm/projects/llvm-cbe/test/test096.c index 81661df1212b75da06f9eabcc9e64c82118172ad..35982e134131b895bcf15cbb22bda76e482d1e0b 100644 --- a/hpvm/projects/llvm-cbe/test/test096.c +++ b/hpvm/projects/llvm-cbe/test/test096.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C 
----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,18 +8,18 @@ // //===------------------------------------------------------------------------------===// // -// This code tests to see that the CBE will execute a self referencing structure. -// *TW +// This code tests to see that the CBE will execute a self referencing +// structure. *TW //===------------------------------------------------------------------------------===// #include <stdio.h> //for NULL -struct data{ - int a; - struct data *ptr; +struct data { + int a; + struct data *ptr; }; -int main(){ - struct data p=(struct data){.a=3,.ptr=&(struct data){.a=6,.ptr=NULL}}; - return p.ptr->a; +int main() { + struct data p = + (struct data){.a = 3, .ptr = &(struct data){.a = 6, .ptr = NULL}}; + return p.ptr->a; } - diff --git a/hpvm/projects/llvm-cbe/test/test097.c b/hpvm/projects/llvm-cbe/test/test097.c index a42e36b6cb43551113d7c38984895a07a479ffc4..6e0f8145b0909b0c6c6b3f26e09633b8ccc58b12 100644 --- a/hpvm/projects/llvm-cbe/test/test097.c +++ b/hpvm/projects/llvm-cbe/test/test097.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,17 +12,16 @@ // *TW //===------------------------------------------------------------------------------===// -int addby2 ( int x ); +int addby2(int x); -int main( ){ - int n ; - n = addby2 ( 4 ) ; - return n; +int main() { + int n; + n = addby2(4); + return n; } -int addby2(int x){ - int p ; - p = x + 2 ; - return ( p ) ; +int addby2(int x) { + int p; + p = x + 2; + return (p); } - diff --git a/hpvm/projects/llvm-cbe/test/test098.c b/hpvm/projects/llvm-cbe/test/test098.c index 70de117e51a9064e638354fe78072e93a635c904..d8594b5a7615b6be6fcc0cb7a04b9e5ff972acd3 100644 --- 
a/hpvm/projects/llvm-cbe/test/test098.c +++ b/hpvm/projects/llvm-cbe/test/test098.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,18 +12,18 @@ // *TW //===------------------------------------------------------------------------------===// -int subtrby2 ( int x ); +int subtrby2(int x); static int eight = 8; static int two = 2; -int main( ){ - int n ; - n = subtrby2 ( eight ) ; - return n; +int main() { + int n; + n = subtrby2(eight); + return n; } -int subtrby2(int x){ - int p ; - p = x - two ; - return ( p ) ; +int subtrby2(int x) { + int p; + p = x - two; + return (p); } diff --git a/hpvm/projects/llvm-cbe/test/test099.c b/hpvm/projects/llvm-cbe/test/test099.c index 1c4713262eeaf6042f7af0ee7e6e41547f226bb2..c4ab77522b27cdf29251409e707bb6548891e9a7 100644 --- a/hpvm/projects/llvm-cbe/test/test099.c +++ b/hpvm/projects/llvm-cbe/test/test099.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,9 +13,8 @@ //===------------------------------------------------------------------------------===// int main() { - register int counter = 0; - counter += 6; + register int counter = 0; + counter += 6; - return 6; + return 6; } - diff --git a/hpvm/projects/llvm-cbe/test/test100.c b/hpvm/projects/llvm-cbe/test/test100.c index db2cd9ea604e3a5aa1b64eaf4159ae9f1fe2700c..2b6a07912d94388827c9cde38e997ca96249b269 100644 --- a/hpvm/projects/llvm-cbe/test/test100.c +++ b/hpvm/projects/llvm-cbe/test/test100.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C 
+//----------------------===// // // The LLVM Compiler Infrastructure // @@ -11,20 +12,19 @@ // *TW //===------------------------------------------------------------------------------===// -int fibonaci(int i){ - if(i == 0){ - return 0; - } - if(i == 1){ - return 1; - } - return fibonaci(i-1) + fibonaci(i-2); +int fibonaci(int i) { + if (i == 0) { + return 0; + } + if (i == 1) { + return 1; + } + return fibonaci(i - 1) + fibonaci(i - 2); } -int main(){ - int returnval; - returnval = fibonaci(6) - 2; +int main() { + int returnval; + returnval = fibonaci(6) - 2; - return returnval; + return returnval; } - diff --git a/hpvm/projects/llvm-cbe/test/test101.c b/hpvm/projects/llvm-cbe/test/test101.c index 50d18d3ec33746d58a24cf342247e717d926d31a..ffffeb592072391026a4b0c3a705e8c63db235fd 100644 --- a/hpvm/projects/llvm-cbe/test/test101.c +++ b/hpvm/projects/llvm-cbe/test/test101.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // @@ -14,24 +15,26 @@ unsigned int fastfib(unsigned int n); -int main(){ - return fastfib(6) - 2; -} +int main() { return fastfib(6) - 2; } -unsigned int fastfib(unsigned int n){ - unsigned int a[3]; - unsigned int *p=a; - unsigned int i; +unsigned int fastfib(unsigned int n) { + unsigned int a[3]; + unsigned int *p = a; + unsigned int i; - for(i=0; i<=n; ++i) { - if(i<2) *p=i; - else{ - if(p==a) *p=*(a+1)+*(a+2); - else if(p==a+1) *p=*a+*(a+2); - else *p=*a+*(a+1); - } - if(++p>a+2) p=a; + for (i = 0; i <= n; ++i) { + if (i < 2) + *p = i; + else { + if (p == a) + *p = *(a + 1) + *(a + 2); + else if (p == a + 1) + *p = *a + *(a + 2); + else + *p = *a + *(a + 1); } - return p==a?*(p+2):*(p-1); + if (++p > a + 2) + p = a; + } + return p == a ? 
*(p + 2) : *(p - 1); } - diff --git a/hpvm/projects/llvm-cbe/test/test102.c b/hpvm/projects/llvm-cbe/test/test102.c index 572ea0310334592c668e6266da7c364d39a80ebb..44247c6231a26acfca041e6896bcbb300d2bc6f5 100644 --- a/hpvm/projects/llvm-cbe/test/test102.c +++ b/hpvm/projects/llvm-cbe/test/test102.c @@ -1,4 +1,5 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===// +//===-- CBackend.cpp - Library for converting LLVM code to C +//----------------------===// // // The LLVM Compiler Infrastructure // diff --git a/hpvm/projects/llvm-cbe/test/test103.c b/hpvm/projects/llvm-cbe/test/test103.c index 6e2329021d257f46fb2b818e68f932e90899b8d8..e751c2d8a4e3c2249921b15c833ae0e99a47d10a 100644 --- a/hpvm/projects/llvm-cbe/test/test103.c +++ b/hpvm/projects/llvm-cbe/test/test103.c @@ -15,9 +15,8 @@ #define B 3 #define C A + B -int main(){ +int main() { - int x = C; - return x; + int x = C; + return x; } - diff --git a/hpvm/projects/llvm-cbe/test/test104.c b/hpvm/projects/llvm-cbe/test/test104.c index 88884d68575f413784f039a1685430c8e1dce56e..43c29dedb685484fd779d0565eaf3d30f97c160a 100644 --- a/hpvm/projects/llvm-cbe/test/test104.c +++ b/hpvm/projects/llvm-cbe/test/test104.c @@ -12,13 +12,11 @@ // //===----------------------------------------------------------------------===// -int tail (int n) { +int tail(int n) { if (n == 6) return n; else - return tail(n+1); + return tail(n + 1); } -int main(){ - return tail(0); -} +int main() { return tail(0); } diff --git a/hpvm/projects/llvm-cbe/test/test105.c b/hpvm/projects/llvm-cbe/test/test105.c index 7e830d55c55182e5d995a8841c41132555c54ee4..79ab340aef5c7db27c06d076efa95bb85fb5a964 100644 --- a/hpvm/projects/llvm-cbe/test/test105.c +++ b/hpvm/projects/llvm-cbe/test/test105.c @@ -12,13 +12,11 @@ // //===----------------------------------------------------------------------===// -int head(int n){ - if(n == 6) +int head(int n) { + if (n == 6) return n; else - return head(n+1); + return head(n + 1); } 
-int main(){ - return head(0); -} +int main() { return head(0); } diff --git a/hpvm/projects/llvm-cbe/test/testbad.c b/hpvm/projects/llvm-cbe/test/testbad.c index a7456dc2b52888358ae2e7fce0da5b0c799c9b45..a8a9bca17c49e5ebed010beeeb987ac5904b3b10 100644 --- a/hpvm/projects/llvm-cbe/test/testbad.c +++ b/hpvm/projects/llvm-cbe/test/testbad.c @@ -11,7 +11,4 @@ // //===----------------------------------------------------------------------===// -int main() -{ - return 25; -} +int main() { return 25; } diff --git a/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp b/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp index 6c4750182516934ae627d7b25baa8e8e98daa6ba..87a67a7364aa8b5132d7e82fd9c4e1005d3ce6fa 100644 --- a/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp +++ b/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp @@ -192,8 +192,8 @@ int main(int argc, char **argv) { initializeLowerIntrinsicsPass(*Registry); initializeUnreachableBlockElimLegacyPassPass(*Registry); - // Adding necessary passes for loop generation - initializeLoopInfoWrapperPassPass(*Registry); + // Adding necessary passes for loop generation + initializeLoopInfoWrapperPassPass(*Registry); initializePostDominatorTreeWrapperPassPass(*Registry); initializeScalarEvolutionWrapperPassPass(*Registry); initializeDominatorTreeWrapperPassPass(*Registry); diff --git a/hpvm/projects/visc-rt/device_abstraction.h b/hpvm/projects/visc-rt/device_abstraction.h index 68748c7ab73d316c7bf296e67d88c0114b4cac81..7e77d100deb6b23b6ed9ca994796cd1cb108b0d4 100644 --- a/hpvm/projects/visc-rt/device_abstraction.h +++ b/hpvm/projects/visc-rt/device_abstraction.h @@ -1,14 +1,13 @@ #ifndef __DEVICE_ABSTRACTION__ #define __DEVICE_ABSTRACTION__ +#include <fstream> +#include <iostream> #include <stdio.h> #include <stdlib.h> -#include <time.h> -#include <time.h> #include <thread> +#include <time.h> #include <vector> -#include <iostream> -#include <fstream> #define MIN_INTERVAL 2 #define MAX_INTERVAL 8 @@ -23,14 +22,13 @@ 
std::vector<unsigned> Intervals; // simulation volatile bool executionEnd = false; - void initializeDeviceStatusIntervals() { unsigned sz = 0; unsigned tmp = 0; - const char *fn = - "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/deviceStatusSwitchIntervals.txt"; + const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/" + "deviceStatusSwitchIntervals.txt"; std::ifstream infile; infile.open(fn); if (!infile.is_open()) { @@ -55,10 +53,11 @@ void initializeDeviceStatusIntervals() { std::cout << "Failed to open " << fn << " for writing\n"; return; } - sz = 1 + rand()%NUM_INTERVALS; - outfile << sz; + sz = 1 + rand() % NUM_INTERVALS; + outfile << sz; for (unsigned i = 0; i < sz; i++) { - Intervals.push_back(MIN_INTERVAL + rand()%(MAX_INTERVAL - MIN_INTERVAL)); + Intervals.push_back(MIN_INTERVAL + + rand() % (MAX_INTERVAL - MIN_INTERVAL)); outfile << Intervals[i]; } outfile.close(); @@ -71,12 +70,11 @@ void updateDeviceStatus() { unsigned i = 0; while (!executionEnd) { - std::this_thread::sleep_for (std::chrono::seconds(Intervals[i])); + std::this_thread::sleep_for(std::chrono::seconds(Intervals[i])); deviceStatus = !deviceStatus; std::cout << "Changed device status to " << deviceStatus << "\n"; - i = (i+1) % Intervals.size(); + i = (i + 1) % Intervals.size(); } - } #endif // __DEVICE_ABSTRACTION__ diff --git a/hpvm/projects/visc-rt/policy.h b/hpvm/projects/visc-rt/policy.h index 4bd6fa046967a7a1632e89941b155695ee139718..d50e65868b376bfbcc3d4bd00d4919db677722b8 100644 --- a/hpvm/projects/visc-rt/policy.h +++ b/hpvm/projects/visc-rt/policy.h @@ -1,23 +1,21 @@ #ifndef __POLICY__ #define __POLICY__ -#include <string> #include "device_abstraction.h" +#include <string> - /************************* Policies *************************************/ +/************************* Policies *************************************/ class Policy { - public: - virtual int getVersion(const char *, int64_t) = 0; - virtual ~Policy() {}; +public: + virtual int getVersion(const char 
*, int64_t) = 0; + virtual ~Policy(){}; }; class ConstPolicy : public Policy { public: - ConstPolicy(int deviceID): deviceID(deviceID) {} + ConstPolicy(int deviceID) : deviceID(deviceID) {} - int getVersion(const char *, int64_t) override { - return deviceID; - } + int getVersion(const char *, int64_t) override { return deviceID; } private: int deviceID; @@ -26,16 +24,17 @@ private: class NodePolicy : public Policy { virtual int getVersion(const char *name, int64_t it) override { std::string s(name); - //std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" }; + // std::string NodeNames[1] = { + // "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" }; std::string NodeNames[] = { - "WrapperGaussianSmoothing_cloned", - "WrapperlaplacianEstimate_cloned", - "WrapperComputeZeroCrossings_cloned", - "WrapperComputeGradient_cloned", - "WrapperComputeMaxGradient_cloned", - "WrapperRejectZeroCrossings_cloned", + "WrapperGaussianSmoothing_cloned", + "WrapperlaplacianEstimate_cloned", + "WrapperComputeZeroCrossings_cloned", + "WrapperComputeGradient_cloned", + "WrapperComputeMaxGradient_cloned", + "WrapperRejectZeroCrossings_cloned", }; - //if (!s.compare(NodeNames[4])) { + // if (!s.compare(NodeNames[4])) { // std::cout << s << ": CPU" << "\n"; // return 0; //} @@ -55,11 +54,10 @@ class IterationPolicy : public Policy { class DeviceStatusPolicy : public Policy { virtual int getVersion(const char *name, int64_t it) override { if (deviceStatus) { - //std::cout << "Returning GPU\n"; + // std::cout << "Returning GPU\n"; return 2; - } - else { - //std::cout << "Returning CPU\n"; + } else { + // std::cout << "Returning CPU\n"; return 0; } } @@ -98,12 +96,12 @@ public: userTargetDeviceChoice = 1; end = false; userTargetDeviceChoiceThread = - std::thread(&InteractivePolicy::updateUserTargetChoice, this); + std::thread(&InteractivePolicy::updateUserTargetChoice, this); } ~InteractivePolicy() { end = true; - userTargetDeviceChoiceThread.join(); + 
userTargetDeviceChoiceThread.join(); } }; diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/visc-rt/visc-rt.cpp index eff618548f3405b668249791738015043a537f17..53d3b516f22b59857b1a17aecba32a6b723998f0 100644 --- a/hpvm/projects/visc-rt/visc-rt.cpp +++ b/hpvm/projects/visc-rt/visc-rt.cpp @@ -1,43 +1,44 @@ -#include <iostream> -#include <string> -#include <pthread.h> -#include <cstdlib> +#include <CL/cl.h> +#include <cassert> #include <cstdio> +#include <cstdlib> #include <cstring> -#include <cassert> +#include <iostream> #include <map> -#include <CL/cl.h> +#include <pthread.h> +#include <string> #include <unistd.h> #if _POSIX_VERSION >= 200112L -# include <sys/time.h> +#include <sys/time.h> #endif #include "visc-rt.h" #ifndef DEBUG_BUILD -#define DEBUG(s) {} +#define DEBUG(s) \ + {} #else #define DEBUG(s) s #endif -#define BILLION 1000000000LL +#define BILLION 1000000000LL using namespace std; typedef struct { pthread_t threadID; - std::vector<pthread_t>* threads; - // Map from InputPort to Size - std::map<unsigned, uint64_t>* ArgInPortSizeMap; - //std::vector<uint64_t>* BindInSizes; - std::vector<unsigned>* BindInSourcePort; - std::vector<uint64_t>* BindOutSizes; - std::vector<uint64_t>* EdgeSizes; - std::vector<CircularBuffer<uint64_t>*>* BindInputBuffers; - std::vector<CircularBuffer<uint64_t>*>* BindOutputBuffers; - std::vector<CircularBuffer<uint64_t>*>* EdgeBuffers; - std::vector<CircularBuffer<uint64_t>*>* isLastInputBuffers; + std::vector<pthread_t> *threads; + // Map from InputPort to Size + std::map<unsigned, uint64_t> *ArgInPortSizeMap; + // std::vector<uint64_t>* BindInSizes; + std::vector<unsigned> *BindInSourcePort; + std::vector<uint64_t> *BindOutSizes; + std::vector<uint64_t> *EdgeSizes; + std::vector<CircularBuffer<uint64_t> *> *BindInputBuffers; + std::vector<CircularBuffer<uint64_t> *> *BindOutputBuffers; + std::vector<CircularBuffer<uint64_t> *> *EdgeBuffers; + std::vector<CircularBuffer<uint64_t> *> *isLastInputBuffers; } 
DFNodeContext_X86; typedef struct { @@ -48,7 +49,7 @@ typedef struct { } DFNodeContext_OCL; cl_context globalOCLContext; -cl_device_id* clDevices; +cl_device_id *clDevices; cl_command_queue globalCommandQue; Policy *policy = NULL; @@ -60,10 +61,10 @@ pthread_mutex_t ocl_mtx; #define NUM_TESTS 1 visc_TimerSet kernel_timer; -static inline void checkErr(cl_int err, cl_int success, const char * name) { +static inline void checkErr(cl_int err, cl_int success, const char *name) { if (err != success) { - cout << "ERROR: " << name << flush << "\n"; - cout << "ErrorCode: " << err << flush << "\n"; + cout << "ERROR: " << name << flush << "\n"; + cout << "ErrorCode: " << err << flush << "\n"; exit(EXIT_FAILURE); } } @@ -71,16 +72,17 @@ static inline void checkErr(cl_int err, cl_int success, const char * name) { /************************* Policies *************************************/ void llvm_visc_policy_init() { cout << "Initializing policy object ...\n"; -// policy = new NodePolicy(); -// policy = new IterationPolicy(); -// policy = new DeviceStatusPolicy(); + // policy = new NodePolicy(); + // policy = new IterationPolicy(); + // policy = new DeviceStatusPolicy(); // policy = new InteractivePolicy(); policy = new ConstPolicy(0); cout << "DONE: Initializing policy object.\n"; } void llvm_visc_policy_clear() { - if (policy) free(policy); + if (policy) + free(policy); } int llvm_visc_policy_getVersion(const char *name, int64_t i) { @@ -111,58 +113,65 @@ void llvm_visc_deviceAbstraction_end() { } void llvm_visc_deviceAbstraction_waitOnDeviceStatus() { - while (!deviceStatus) { }; + while (!deviceStatus) { + }; return; } /************************* Depth Stack Routines ***************************/ -void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY, - uint64_t iY, uint64_t limitZ, uint64_t iZ) { - DEBUG(cout << "Pushing node information on stack:\n"); - DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY << ", "<< 
limitZ <<")\n"); - DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", "<< iZ <<")\n"); - DFGDepth nodeInfo (n, limitX, iX, limitY, iY, limitZ, iZ); - pthread_mutex_lock(&ocl_mtx); - DStack.push_back(nodeInfo); - DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); +void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, + uint64_t limitY, uint64_t iY, uint64_t limitZ, + uint64_t iZ) { + DEBUG(cout << "Pushing node information on stack:\n"); + DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY + << ", " << limitZ << ")\n"); + DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", " << iZ << ")\n"); + DFGDepth nodeInfo(n, limitX, iX, limitY, iY, limitZ, iZ); + pthread_mutex_lock(&ocl_mtx); + DStack.push_back(nodeInfo); + DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); + pthread_mutex_unlock(&ocl_mtx); } void llvm_visc_x86_dstack_pop() { - DEBUG(cout << "Popping from depth stack\n"); - pthread_mutex_lock(&ocl_mtx); - DStack.pop_back(); - DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); + DEBUG(cout << "Popping from depth stack\n"); + pthread_mutex_lock(&ocl_mtx); + DStack.pop_back(); + DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); + pthread_mutex_unlock(&ocl_mtx); } uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) { - DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n"); - pthread_mutex_lock(&ocl_mtx); - unsigned size = DStack.size(); - DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n"); - uint64_t result = DStack[size-level-1].getDimLimit(dim); - pthread_mutex_unlock(&ocl_mtx); - return result; + DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level + << flush << "\n"); + pthread_mutex_lock(&ocl_mtx); + unsigned size = DStack.size(); + DEBUG(cout << "\t Return: " << DStack[size - 
level - 1].getDimLimit(dim) + << flush << "\n"); + uint64_t result = DStack[size - level - 1].getDimLimit(dim); + pthread_mutex_unlock(&ocl_mtx); + return result; } uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) { - DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n"); - pthread_mutex_lock(&ocl_mtx); - unsigned size = DStack.size(); - DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n"); - uint64_t result = DStack[size-level-1].getDimInstance(dim); - pthread_mutex_unlock(&ocl_mtx); - return result; + DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " + << level << flush << "\n"); + pthread_mutex_lock(&ocl_mtx); + unsigned size = DStack.size(); + DEBUG(cout << "\t Return: " << DStack[size - level - 1].getDimInstance(dim) + << flush << "\n"); + uint64_t result = DStack[size - level - 1].getDimInstance(dim); + pthread_mutex_unlock(&ocl_mtx); + return result; } /********************** Memory Tracking Routines **************************/ -void llvm_visc_track_mem(void* ptr, size_t size) { +void llvm_visc_track_mem(void *ptr, size_t size) { DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); - if(MTE != NULL) { + MemTrackerEntry *MTE = MTracker.lookup(ptr); + if (MTE != NULL) { DEBUG(cout << "ID " << ptr << " already present in the MemTracker Table\n"); return; } @@ -171,25 +180,28 @@ void llvm_visc_track_mem(void* ptr, size_t size) { DEBUG(MTracker.print()); } -void llvm_visc_untrack_mem(void* ptr) { +void llvm_visc_untrack_mem(void *ptr) { DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); - if(MTE == NULL) { - cout << "WARNING: Trying to remove ID " << ptr << " not present in the MemTracker Table\n"; + MemTrackerEntry *MTE = MTracker.lookup(ptr); + if (MTE == NULL) { + cout << "WARNING: Trying to remove ID " << ptr + << " not 
present in the MemTracker Table\n"; return; } DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n"); - if(MTE->getLocation() == MemTrackerEntry::DEVICE) - clReleaseMemObject((cl_mem) MTE->getAddress()); + if (MTE->getLocation() == MemTrackerEntry::DEVICE) + clReleaseMemObject((cl_mem)MTE->getAddress()); MTracker.remove(ptr); DEBUG(MTracker.print()); } - -static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) { +static void *llvm_visc_ocl_request_mem(void *ptr, size_t size, + DFNodeContext_OCL *Context, bool isInput, + bool isOutput) { pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clOCLContext << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); + DEBUG(cout << "[OCL] Request memory: " << ptr + << " for context: " << Context->clOCLContext << flush << "\n"); + MemTrackerEntry *MTE = MTracker.lookup(ptr); if (MTE == NULL) { MTracker.print(); cout << "ERROR: Requesting memory not present in Table\n"; @@ -197,89 +209,91 @@ static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL } // If already on device if (MTE->getLocation() == MemTrackerEntry::DEVICE && - ((DFNodeContext_OCL*)MTE->getContext())->clOCLContext == Context->clOCLContext) { - DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); + ((DFNodeContext_OCL *)MTE->getContext())->clOCLContext == + Context->clOCLContext) { + DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush + << "\n"); pthread_mutex_unlock(&ocl_mtx); return MTE->getAddress(); } - - DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); + + DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush + << "\n"); DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n"); // Else copy and update the latest copy cl_mem_flags clFlags; cl_int errcode; - if(isInput && isOutput) 
clFlags = CL_MEM_READ_WRITE; - else if(isInput) clFlags = CL_MEM_READ_ONLY; - else if(isOutput) clFlags = CL_MEM_WRITE_ONLY; - else clFlags = CL_MEM_READ_ONLY; + if (isInput && isOutput) + clFlags = CL_MEM_READ_WRITE; + else if (isInput) + clFlags = CL_MEM_READ_ONLY; + else if (isOutput) + clFlags = CL_MEM_WRITE_ONLY; + else + clFlags = CL_MEM_READ_ONLY; visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); - //pthread_mutex_lock(&ocl_mtx); - cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + cl_mem d_input = + clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); - DEBUG(cout<< "\nMemory allocated on device: " << d_input << flush << "\n"); - if(isInput) { + DEBUG(cout << "\nMemory allocated on device: " << d_input << flush << "\n"); + if (isInput) { DEBUG(cout << "\tCopying ..."); - //pthread_mutex_lock(&ocl_mtx); - errcode = clEnqueueWriteBuffer(Context->clCommandQue, - d_input, - CL_TRUE, - 0, - size,MTE->getAddress(), - 0,NULL,NULL); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + errcode = clEnqueueWriteBuffer(Context->clCommandQue, d_input, CL_TRUE, 0, + size, MTE->getAddress(), 0, NULL, NULL); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device"); } visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); DEBUG(cout << " done\n"); - MTE->update(MemTrackerEntry::DEVICE, (void*) d_input, Context); + MTE->update(MemTrackerEntry::DEVICE, (void *)d_input, Context); DEBUG(cout << "Updated Table\n"); DEBUG(MTracker.print()); pthread_mutex_unlock(&ocl_mtx); return d_input; } -void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) { +void *llvm_visc_x86_argument_ptr(void *ptr, size_t size) { return llvm_visc_request_mem(ptr, size); } -void* 
llvm_visc_request_mem(void* ptr, size_t size) { +void *llvm_visc_request_mem(void *ptr, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); - if(MTE == NULL) { + MemTrackerEntry *MTE = MTracker.lookup(ptr); + if (MTE == NULL) { cout << "ERROR: Requesting memory not present in Table\n"; pthread_mutex_unlock(&ocl_mtx); exit(EXIT_FAILURE); } // If already on host - if(MTE->getLocation() == MemTrackerEntry::HOST) { - DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); + if (MTE->getLocation() == MemTrackerEntry::HOST) { + DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush + << "\n"); pthread_mutex_unlock(&ocl_mtx); return MTE->getAddress(); } // Else copy from device and update table - DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); + DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush + << "\n"); DEBUG(cout << "\tCopying ..."); visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue, - (cl_mem) MTE->getAddress(), - CL_TRUE, - 0, - size, - ptr, - 0, NULL, NULL); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clEnqueueReadBuffer( + ((DFNodeContext_OCL *)MTE->getContext())->clCommandQue, + (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL); + // pthread_mutex_unlock(&ocl_mtx); visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); DEBUG(cout << " done\n"); checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output"); DEBUG(cout << "Free mem object on device\n"); - clReleaseMemObject((cl_mem) MTE->getAddress()); + clReleaseMemObject((cl_mem)MTE->getAddress()); DEBUG(cout << "Updated Table\n"); MTE->update(MemTrackerEntry::HOST, ptr); DEBUG(MTracker.print()); @@ 
-289,63 +303,57 @@ void* llvm_visc_request_mem(void* ptr, size_t size) { /*************************** Timer Routines **********************************/ -static int is_async(enum visc_TimerID timer) -{ - return (timer == visc_TimerID_KERNEL) || - (timer == visc_TimerID_COPY_ASYNC); +static int is_async(enum visc_TimerID timer) { + return (timer == visc_TimerID_KERNEL) || (timer == visc_TimerID_COPY_ASYNC); } -static int is_blocking(enum visc_TimerID timer) -{ +static int is_blocking(enum visc_TimerID timer) { return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE); } #define INVALID_TIMERID visc_TimerID_LAST -static int asyncs_outstanding(struct visc_TimerSet* timers) -{ +static int asyncs_outstanding(struct visc_TimerSet *timers) { return (timers->async_markers != NULL) && - (timers->async_markers->timerID != INVALID_TIMERID); + (timers->async_markers->timerID != INVALID_TIMERID); } static struct visc_async_time_marker_list * -get_last_async(struct visc_TimerSet* timers) -{ +get_last_async(struct visc_TimerSet *timers) { /* Find the last event recorded thus far */ - struct visc_async_time_marker_list * last_event = timers->async_markers; - if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { - while(last_event->next != NULL && - last_event->next->timerID != INVALID_TIMERID) + struct visc_async_time_marker_list *last_event = timers->async_markers; + if (last_event != NULL && last_event->timerID != INVALID_TIMERID) { + while (last_event->next != NULL && + last_event->next->timerID != INVALID_TIMERID) last_event = last_event->next; return last_event; } else return NULL; } -static void insert_marker(struct visc_TimerSet* tset, enum visc_TimerID timer) -{ +static void insert_marker(struct visc_TimerSet *tset, enum visc_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct visc_async_time_marker_list ** new_event = &(tset->async_markers); + struct visc_async_time_marker_list **new_event = &(tset->async_markers); - while(*new_event != NULL && 
(*new_event)->timerID != INVALID_TIMERID) { + while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } - if(*new_event == NULL) { - *new_event = (struct visc_async_time_marker_list *) - malloc(sizeof(struct visc_async_time_marker_list)); + if (*new_event == NULL) { + *new_event = (struct visc_async_time_marker_list *)malloc( + sizeof(struct visc_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* - // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event -#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) -fprintf(stderr, "Creating Marker [%d]\n", timer); - *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Creating User Event Object!\n"); + // I don't think this is needed at all. I believe clEnqueueMarker 'creates' +the event #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) fprintf(stderr, "Creating +Marker [%d]\n", timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, +&ciErrNum); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Creating User +Event Object!\n"); } - ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Setting User Event Status!\n"); + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), +CL_QUEUED); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Setting User +Event Status!\n"); } #endif */ @@ -355,38 +363,38 @@ fprintf(stderr, "Creating Marker [%d]\n", timer); /* valid event handle now aquired: insert the event record */ (*new_event)->label = NULL; (*new_event)->timerID = timer; - //pthread_mutex_lock(&ocl_mtx); - ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + ciErrNum = + clEnqueueMarker(globalCommandQue, 
(cl_event *)(*new_event)->marker); + // pthread_mutex_unlock(&ocl_mtx); if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); + fprintf(stderr, "Error Enqueueing Marker!\n"); } - } -static void insert_submarker(struct visc_TimerSet* tset, char *label, enum visc_TimerID timer) -{ +static void insert_submarker(struct visc_TimerSet *tset, char *label, + enum visc_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct visc_async_time_marker_list ** new_event = &(tset->async_markers); + struct visc_async_time_marker_list **new_event = &(tset->async_markers); - while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } - if(*new_event == NULL) { - *new_event = (struct visc_async_time_marker_list *) - malloc(sizeof(struct visc_async_time_marker_list)); + if (*new_event == NULL) { + *new_event = (struct visc_async_time_marker_list *)malloc( + sizeof(struct visc_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); - *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Creating User Event Object!\n"); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, +&ciErrNum); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Creating User +Event Object!\n"); } - ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Setting User Event Status!\n"); + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), +CL_QUEUED); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Setting User +Event Status!\n"); } #endif */ @@ -396,44 +404,49 @@ fprintf(stderr, "Creating SubMarker %s[%d]\n", label, 
timer); /* valid event handle now aquired: insert the event record */ (*new_event)->label = label; (*new_event)->timerID = timer; - //pthread_mutex_lock(&ocl_mtx); - ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + ciErrNum = + clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); + // pthread_mutex_unlock(&ocl_mtx); if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); + fprintf(stderr, "Error Enqueueing Marker!\n"); } - } - /* Assumes that all recorded events have completed */ -static visc_Timestamp record_async_times(struct visc_TimerSet* tset) -{ - struct visc_async_time_marker_list * next_interval = NULL; - struct visc_async_time_marker_list * last_marker = get_last_async(tset); +static visc_Timestamp record_async_times(struct visc_TimerSet *tset) { + struct visc_async_time_marker_list *next_interval = NULL; + struct visc_async_time_marker_list *last_marker = get_last_async(tset); visc_Timestamp total_async_time = 0; - for(next_interval = tset->async_markers; next_interval != last_marker; - next_interval = next_interval->next) { - cl_ulong command_start=0, command_end=0; + for (next_interval = tset->async_markers; next_interval != last_marker; + next_interval = next_interval->next) { + cl_ulong command_start = 0, command_end = 0; cl_int ciErrNum = CL_SUCCESS; - ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL); + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), + CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &command_start, NULL); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); } - ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL); + ciErrNum = 
clGetEventProfilingInfo( + *((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &command_end, NULL); if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum); + fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", + ciErrNum); } - visc_Timestamp interval = (visc_Timestamp) (((double)(command_end - command_start))); + visc_Timestamp interval = + (visc_Timestamp)(((double)(command_end - command_start))); tset->timers[next_interval->timerID].elapsed += interval; if (next_interval->label != NULL) { - struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; + struct visc_SubTimer *subtimer = + tset->sub_timer_list[next_interval->timerID]->subtimer_list; while (subtimer != NULL) { - if ( strcmp(subtimer->label, next_interval->label) == 0) { + if (strcmp(subtimer->label, next_interval->label) == 0) { subtimer->timer.elapsed += interval; break; } @@ -444,50 +457,42 @@ static visc_Timestamp record_async_times(struct visc_TimerSet* tset) next_interval->timerID = INVALID_TIMERID; } - if(next_interval != NULL) + if (next_interval != NULL) next_interval->timerID = INVALID_TIMERID; return total_async_time; } -static void -accumulate_time(visc_Timestamp *accum, - visc_Timestamp start, - visc_Timestamp end) -{ +static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start, + visc_Timestamp end) { #if _POSIX_VERSION >= 200112L *accum += end - start; #else -# error "Timestamps not implemented for this system" +#error "Timestamps not implemented for this system" #endif } #if _POSIX_VERSION >= 200112L -static visc_Timestamp get_time() -{ +static visc_Timestamp get_time() { struct timespec tv; clock_gettime(CLOCK_MONOTONIC, &tv); - return (visc_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec); + return (visc_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); } #else -# error "no supported time libraries are available on this platform" +#error "no 
supported time libraries are available on this platform" #endif -void -visc_ResetTimer(struct visc_Timer *timer) -{ +void visc_ResetTimer(struct visc_Timer *timer) { timer->state = visc_Timer_STOPPED; #if _POSIX_VERSION >= 200112L timer->elapsed = 0; #else -# error "visc_ResetTimer: not implemented for this system" +#error "visc_ResetTimer: not implemented for this system" #endif } -void -visc_StartTimer(struct visc_Timer *timer) -{ +void visc_StartTimer(struct visc_Timer *timer) { if (timer->state != visc_Timer_STOPPED) { // FIXME: Removing warning statement to avoid printing this error // fputs("Ignoring attempt to start a running timer\n", stderr); @@ -503,13 +508,12 @@ visc_StartTimer(struct visc_Timer *timer) timer->init = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "visc_StartTimer: not implemented for this system" +#error "visc_StartTimer: not implemented for this system" #endif } -void -visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) -{ +void visc_StartTimerAndSubTimer(struct visc_Timer *timer, + struct visc_Timer *subtimer) { unsigned int numNotStopped = 0x3; // 11 if (timer->state != visc_Timer_STOPPED) { @@ -521,7 +525,7 @@ visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer numNotStopped &= 0x2; // Zero out 2^0 } if (numNotStopped == 0x0) { - //fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + // fputs("Ignoring attempt to start running timer and subtimer\n", stderr); return; } @@ -542,18 +546,15 @@ visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer } } #else -# error "visc_StartTimer: not implemented for this system" +#error "visc_StartTimer: not implemented for this system" #endif - } -void -visc_StopTimer(struct visc_Timer *timer) -{ +void visc_StopTimer(struct visc_Timer *timer) { visc_Timestamp fini; if (timer->state != visc_Timer_RUNNING) { - //fputs("Ignoring attempt to stop a stopped timer\n", stderr); + // fputs("Ignoring 
attempt to stop a stopped timer\n", stderr); return; } @@ -566,14 +567,15 @@ visc_StopTimer(struct visc_Timer *timer) fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "visc_StopTimer: not implemented for this system" +#error "visc_StopTimer: not implemented for this system" #endif accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; } -void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) { +void visc_StopTimerAndSubTimer(struct visc_Timer *timer, + struct visc_Timer *subtimer) { visc_Timestamp fini; @@ -587,11 +589,10 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subt numNotRunning &= 0x2; // Zero out 2^0 } if (numNotRunning == 0x0) { - //fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + // fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); return; } - timer->state = visc_Timer_STOPPED; subtimer->state = visc_Timer_STOPPED; @@ -602,7 +603,7 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subt fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "visc_StopTimer: not implemented for this system" +#error "visc_StopTimer: not implemented for this system" #endif if (numNotRunning & 0x2) { @@ -614,13 +615,10 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subt accumulate_time(&subtimer->elapsed, subtimer->init, fini); subtimer->init = fini; } - } /* Get the elapsed time in seconds. 
*/ -double -visc_GetElapsedTime(struct visc_Timer *timer) -{ +double visc_GetElapsedTime(struct visc_Timer *timer) { double ret; if (timer->state != visc_Timer_STOPPED) { @@ -630,14 +628,12 @@ visc_GetElapsedTime(struct visc_Timer *timer) #if _POSIX_VERSION >= 200112L ret = timer->elapsed / 1e9; #else -# error "visc_GetElapsedTime: not implemented for this system" +#error "visc_GetElapsedTime: not implemented for this system" #endif return ret; } -void -visc_InitializeTimerSet(struct visc_TimerSet *timers) -{ +void visc_InitializeTimerSet(struct visc_TimerSet *timers) { int n; timers->wall_begin = get_time(); @@ -651,25 +647,25 @@ visc_InitializeTimerSet(struct visc_TimerSet *timers) } } +void visc_AddSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID visc_Category) { -void -visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category) { - - struct visc_SubTimer *subtimer = (struct visc_SubTimer *) malloc - (sizeof(struct visc_SubTimer)); + struct visc_SubTimer *subtimer = + (struct visc_SubTimer *)malloc(sizeof(struct visc_SubTimer)); int len = strlen(label); - subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + subtimer->label = (char *)malloc(sizeof(char) * (len + 1)); sprintf(subtimer->label, "%s", label); visc_ResetTimer(&subtimer->timer); subtimer->next = NULL; - struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[visc_Category]; + struct visc_SubTimerList *subtimerlist = + timers->sub_timer_list[visc_Category]; if (subtimerlist == NULL) { - subtimerlist = (struct visc_SubTimerList *) calloc - (1, sizeof(struct visc_SubTimerList)); + subtimerlist = + (struct visc_SubTimerList *)calloc(1, sizeof(struct visc_SubTimerList)); subtimerlist->subtimer_list = subtimer; timers->sub_timer_list[visc_Category] = subtimerlist; } else { @@ -680,22 +676,22 @@ visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID vi } element->next = subtimer; } - } -void 
-visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) -{ - //cerr << "Switch to timer: " << timer << flush << "\n"; +void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) { + // cerr << "Switch to timer: " << timer << flush << "\n"; /* Stop the currently running timer */ if (timers->current != visc_TimerID_NONE) { - struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; + struct visc_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + struct visc_SubTimer *currSubTimer = + (subtimerlist != NULL) ? subtimerlist->current : NULL; - if (!is_async(timers->current) ) { + if (!is_async(timers->current)) { if (timers->current != timer) { if (currSubTimer != NULL) { - visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + visc_StopTimerAndSubTimer(&timers->timers[timers->current], + &currSubTimer->timer); } else { visc_StopTimer(&timers->timers[timers->current]); } @@ -717,30 +713,31 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(timer) ) ) { + if (asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(timer))) { - struct visc_async_time_marker_list * last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; cl_int async_done = CL_COMPLETE; - ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), + 
CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), + &async_done, NULL); if (ciErrNum != CL_SUCCESS) { fprintf(stdout, "Error Querying EventInfo1!\n"); } - - if(is_blocking(timer)) { + if (is_blocking(timer)) { /* Async operations completed after previous CPU operations: * overlapped time is the total CPU time since this set of async * operations were first issued */ // timer to switch to is COPY or NONE - if(async_done != CL_COMPLETE) { + if (async_done != CL_COMPLETE) { accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); + timers->async_begin, currentTime); } /* Wait on async operation completion */ @@ -753,25 +750,27 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - if(async_done == CL_COMPLETE) { - //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time); + if (async_done == CL_COMPLETE) { + // fprintf(stderr, "Async_done: total_async_type = %lld\n", + // total_async_time); timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; } } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == CL_COMPLETE ) { + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. 
Current Not Async (not KERNEL/COPY_ASYNC) but there are + // outstanding so something is deeper in stack + if (async_done == CL_COMPLETE) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); + timers->timers[visc_TimerID_OVERLAP].elapsed += + record_async_times(timers); } } /* Start the new timer */ if (timer != visc_TimerID_NONE) { - if(!is_async(timer)) { + if (!is_async(timer)) { visc_StartTimer(&timers->timers[timer]); } else { // toSwitchTo Is Async (KERNEL/COPY_ASYNC) @@ -780,13 +779,13 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) insert_marker(timers, timer); timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { + } else if (!is_async(timers->current)) { /* Previous asyncs still in flight, but a previous SwitchTo * already marked the end of the most recent async operation, * so we can rename that marker as the beginning of this async * operation */ - struct visc_async_time_marker_list * last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); last_event->label = NULL; last_event->timerID = timer; } @@ -796,20 +795,21 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) } } timers->current = timer; - } -void -visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category) -{ - struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct visc_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; +void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID category) { + struct visc_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + struct visc_SubTimer *curr = + (subtimerlist != NULL) ? 
subtimerlist->current : NULL; if (timers->current != visc_TimerID_NONE) { - if (!is_async(timers->current) ) { + if (!is_async(timers->current)) { if (timers->current != category) { if (curr != NULL) { - visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); + visc_StopTimerAndSubTimer(&timers->timers[timers->current], + &curr->timer); } else { visc_StopTimer(&timers->timers[timers->current]); } @@ -831,32 +831,35 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(category) ) ) { + if (asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(category))) { - struct visc_async_time_marker_list * last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; cl_int async_done = CL_COMPLETE; - ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), + CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), + &async_done, NULL); if (ciErrNum != CL_SUCCESS) { fprintf(stdout, "Error Querying EventInfo2!\n"); } - if(is_blocking(category)) { + if (is_blocking(category)) { /* Async operations completed after previous CPU operations: * overlapped time is the total CPU time since this set of async * operations were first issued */ // timer to switch to is COPY or NONE - // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP - // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization - // it seems that the extra sync 
wall time isn't being recorded anywhere - if(async_done != CL_COMPLETE) + // if it hasn't already finished, then just take now and use that as the + // elapsed time in OVERLAP anything happening after now isn't OVERLAP + // because everything is being stopped to wait for synchronization it + // seems that the extra sync wall time isn't being recorded anywhere + if (async_done != CL_COMPLETE) accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); + timers->async_begin, currentTime); /* Wait on async operation completion */ ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); @@ -867,19 +870,21 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - // If it did finish, then accumulate all the async time that did happen into OVERLAP - // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. - if(async_done == CL_COMPLETE /*cudaSuccess*/) + // If it did finish, then accumulate all the async time that did happen + // into OVERLAP the immediately preceding EventSynchronize theoretically + // didn't have any effect since it was already completed. + if (async_done == CL_COMPLETE /*cudaSuccess*/) timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == CL_COMPLETE /*cudaSuccess*/) { + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. 
Current Not Async (not KERNEL/COPY_ASYNC) but there are + // outstanding so something is deeper in stack + if (async_done == CL_COMPLETE /*cudaSuccess*/) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); + timers->timers[visc_TimerID_OVERLAP].elapsed += + record_async_times(timers); } // else, this isn't blocking, so just check the next time around } @@ -900,7 +905,7 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer /* Start the new timer */ if (category != visc_TimerID_NONE) { - if(!is_async(category)) { + if (!is_async(category)) { if (subtimerlist != NULL) { subtimerlist->current = subtimer; } @@ -922,18 +927,19 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer /* No asyncs outstanding, insert a fresh async marker */ insert_submarker(timers, label, category); timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { + } else if (!is_async(timers->current)) { /* Previous asyncs still in flight, but a previous SwitchTo * already marked the end of the most recent async operation, * so we can rename that marker as the beginning of this async * operation */ - struct visc_async_time_marker_list * last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); last_event->timerID = category; last_event->label = label; } // else, marker for switchToThis was already inserted - //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running + // toSwitchto is already asynchronous, but if current/prev state is async + // too, then DRIVER is already running if (!is_async(timers->current)) { visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); } @@ -943,39 +949,41 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer timers->current 
= category; } -void -visc_PrintTimerSet(struct visc_TimerSet *timers) -{ +void visc_PrintTimerSet(struct visc_TimerSet *timers) { visc_Timestamp wall_end = get_time(); struct visc_Timer *t = timers->timers; - struct visc_SubTimer* sub = NULL; + struct visc_SubTimer *sub = NULL; int maxSubLength; const char *categories[] = { - "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap", - "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", - "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc", - "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack" + "IO", "Kernel", "Copy", "Driver", + "Copy Async", "Compute", "Overlap", "Init_Ctx", + "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", + "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", + "Misc", "Pthread_Create", "Arg_Pack", "Arg_Unpack", + "Computation", "Output_Pack", "Output_Unpack" }; const int maxCategoryLength = 20; int i; - for(i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format - if(visc_GetElapsedTime(&t[i]) != 0 || true) { + for (i = 1; i < visc_TimerID_LAST; + ++i) { // exclude NONE and OVRELAP from this format + if (visc_GetElapsedTime(&t[i]) != 0 || true) { // Print Category Timer - printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], visc_GetElapsedTime(&t[i])); + printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1], + visc_GetElapsedTime(&t[i])); if (timers->sub_timer_list[i] != NULL) { sub = timers->sub_timer_list[i]->subtimer_list; maxSubLength = 0; while (sub != NULL) { // Find longest SubTimer label - if (strlen(sub->label) > (unsigned long) maxSubLength) { + if (strlen(sub->label) > (unsigned long)maxSubLength) { maxSubLength = strlen(sub->label); } sub = sub->next; @@ -983,47 +991,47 @@ visc_PrintTimerSet(struct visc_TimerSet *timers) // Fit to Categories if (maxSubLength <= maxCategoryLength) { - maxSubLength = maxCategoryLength; + maxSubLength = maxCategoryLength; } sub = 
timers->sub_timer_list[i]->subtimer_list; // Print SubTimers while (sub != NULL) { - printf(" -%-*s: %.9f\n", maxSubLength, sub->label, visc_GetElapsedTime(&sub->timer)); + printf(" -%-*s: %.9f\n", maxSubLength, sub->label, + visc_GetElapsedTime(&sub->timer)); sub = sub->next; } } } } - if(visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0) - printf("CPU/Kernel Overlap: %.9f\n", visc_GetElapsedTime(&t[visc_TimerID_OVERLAP])); + if (visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %.9f\n", + visc_GetElapsedTime(&t[visc_TimerID_OVERLAP])); - float walltime = (wall_end - timers->wall_begin)/ 1e9; + float walltime = (wall_end - timers->wall_begin) / 1e9; printf("Timer Wall Time: %.9f\n", walltime); - } -void visc_DestroyTimerSet(struct visc_TimerSet * timers) -{ +void visc_DestroyTimerSet(struct visc_TimerSet *timers) { /* clean up all of the async event markers */ - struct visc_async_time_marker_list* event = timers->async_markers; - while(event != NULL) { + struct visc_async_time_marker_list *event = timers->async_markers; + while (event != NULL) { cl_int ciErrNum = CL_SUCCESS; ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); if (ciErrNum != CL_SUCCESS) { - //fprintf(stderr, "Error Waiting for Events!\n"); + // fprintf(stderr, "Error Waiting for Events!\n"); } - ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) ); + ciErrNum = clReleaseEvent(*((cl_event *)(event)->marker)); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Release Events!\n"); } free((event)->marker); - struct visc_async_time_marker_list* next = ((event)->next); + struct visc_async_time_marker_list *next = ((event)->next); free(event); @@ -1032,7 +1040,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet * timers) } int i = 0; - for(i = 0; i < visc_TimerID_LAST; ++i) { + for (i = 0; i < visc_TimerID_LAST; ++i) { if (timers->sub_timer_list[i] != NULL) { struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; struct visc_SubTimer 
*prev = NULL; @@ -1051,194 +1059,210 @@ void visc_DestroyTimerSet(struct visc_TimerSet * timers) #define BUFFER_SIZE 1 // Launch API for a streaming dataflow graph -void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void* args) { - DFNodeContext_X86* Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86)); +void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) { + DFNodeContext_X86 *Context = + (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86)); Context->threads = new std::vector<pthread_t>(); Context->ArgInPortSizeMap = new std::map<unsigned, uint64_t>(); - //Context->BindInSizes = new std::vector<uint64_t>(); + // Context->BindInSizes = new std::vector<uint64_t>(); Context->BindInSourcePort = new std::vector<unsigned>(); Context->BindOutSizes = new std::vector<uint64_t>(); Context->EdgeSizes = new std::vector<uint64_t>(); - Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); - Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); - Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t>*>(); - Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); + Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t> *>(); + Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t> *>(); + Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t> *>(); + Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t> *>(); - DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args << flush << "\n"); + DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args + << flush << "\n"); LaunchFunc(args, Context); return Context; } // Push API for a streaming dataflow graph -void llvm_visc_streamPush(void* graphID, void* args) { - DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; +void 
llvm_visc_streamPush(void *graphID, void *args) { + DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args + << flush << "\n"); + DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; unsigned offset = 0; - for (unsigned i=0; i< Ctx->ArgInPortSizeMap->size(); i++) { + for (unsigned i = 0; i < Ctx->ArgInPortSizeMap->size(); i++) { uint64_t element; - memcpy(&element, (char*)args+offset, Ctx->ArgInPortSizeMap->at(i)); + memcpy(&element, (char *)args + offset, Ctx->ArgInPortSizeMap->at(i)); offset += Ctx->ArgInPortSizeMap->at(i); - for(unsigned j=0; j<Ctx->BindInputBuffers->size();j++) { - if(Ctx->BindInSourcePort->at(j) == i) { + for (unsigned j = 0; j < Ctx->BindInputBuffers->size(); j++) { + if (Ctx->BindInSourcePort->at(j) == i) { // Push to all bind buffers connected to parent node at this port - //DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); + // DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element); } } } // Push 0 in isLastInput buffers of all child nodes - for (CircularBuffer<uint64_t>* buffer: *(Ctx->isLastInputBuffers)) + for (CircularBuffer<uint64_t> *buffer : *(Ctx->isLastInputBuffers)) llvm_visc_bufferPush(buffer, 0); } // Pop API for a streaming dataflow graph -void* llvm_visc_streamPop(void* graphID) { +void *llvm_visc_streamPop(void *graphID) { DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; + DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; unsigned totalBytes = 0; - for(uint64_t size: *(Ctx->BindOutSizes)) - totalBytes+= size; - void* output = malloc(totalBytes); + for (uint64_t size : *(Ctx->BindOutSizes)) + totalBytes += size; + void *output = malloc(totalBytes); unsigned offset = 0; - for (unsigned i=0; i< Ctx->BindOutputBuffers->size(); i++) { + for (unsigned i = 0; i < Ctx->BindOutputBuffers->size(); i++) { uint64_t element = 
llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i)); - //DEBUG(cout << "\tPopped Value " << element << " from buffer\n"); - memcpy((char*)output+offset, &element, Ctx->BindOutSizes->at(i)); + // DEBUG(cout << "\tPopped Value " << element << " from buffer\n"); + memcpy((char *)output + offset, &element, Ctx->BindOutSizes->at(i)); offset += Ctx->BindOutSizes->at(i); } return output; } // Wait API for a streaming dataflow graph -void llvm_visc_streamWait(void* graphID) { +void llvm_visc_streamWait(void *graphID) { DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; + DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; // Push garbage to all other input buffers - for (unsigned i=0; i< Ctx->BindInputBuffers->size(); i++) { + for (unsigned i = 0; i < Ctx->BindInputBuffers->size(); i++) { uint64_t element = 0; - //DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); + // DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element); } // Push 1 in isLastInput buffers of all child nodes - for (unsigned i=0; i < Ctx->isLastInputBuffers->size(); i++) + for (unsigned i = 0; i < Ctx->isLastInputBuffers->size(); i++) llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1); llvm_visc_freeThreads(graphID); } // Create a buffer and return the bufferID -void* llvm_visc_createBindInBuffer(void* graphID, uint64_t size, unsigned inArgPort) { - DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn"); +void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size, + unsigned inArgPort) { + DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID + << ", Size: " << size << flush << "\n"); + DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; 
+ CircularBuffer<uint64_t> *bufferID = + new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn"); DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); Context->BindInputBuffers->push_back(bufferID); (*(Context->ArgInPortSizeMap))[inArgPort] = size; Context->BindInSourcePort->push_back(inArgPort); - //Context->BindInSizes->push_back(size); + // Context->BindInSizes->push_back(size); return bufferID; } -void* llvm_visc_createBindOutBuffer(void* graphID, uint64_t size) { - DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size()); - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut"); +void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) { + DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID + << ", Size: " << size << flush << "\n"); + DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; + // Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size()); + CircularBuffer<uint64_t> *bufferID = + new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut"); DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); Context->BindOutputBuffers->push_back(bufferID); Context->BindOutSizes->push_back(size); return bufferID; } -void* llvm_visc_createEdgeBuffer(void* graphID, uint64_t size) { - DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size()); - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge"); +void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) { + DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size + << flush << "\n"); + DFNodeContext_X86 *Context = (DFNodeContext_X86 
*)graphID; + // Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size()); + CircularBuffer<uint64_t> *bufferID = + new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge"); DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); Context->EdgeBuffers->push_back(bufferID); Context->EdgeSizes->push_back(size); return bufferID; } -void* llvm_visc_createLastInputBuffer(void* graphID, uint64_t size) { - DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size()); - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput"); +void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) { + DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID + << ", Size: " << size << flush << "\n"); + DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; + // Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size()); + CircularBuffer<uint64_t> *bufferID = + new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput"); DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); Context->isLastInputBuffers->push_back(bufferID); return bufferID; } -// Free buffers -void llvm_visc_freeBuffers(void* graphID) { +// Free buffers +void llvm_visc_freeBuffers(void *graphID) { DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - for(CircularBuffer<uint64_t>* bufferID: *(Context->BindInputBuffers)) + DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; + for (CircularBuffer<uint64_t> *bufferID : *(Context->BindInputBuffers)) delete bufferID; - for(CircularBuffer<uint64_t>* bufferID: *(Context->BindOutputBuffers)) + for (CircularBuffer<uint64_t> *bufferID : *(Context->BindOutputBuffers)) delete bufferID; - for(CircularBuffer<uint64_t>* bufferID: 
*(Context->EdgeBuffers)) + for (CircularBuffer<uint64_t> *bufferID : *(Context->EdgeBuffers)) delete bufferID; - for(CircularBuffer<uint64_t>* bufferID: *(Context->isLastInputBuffers)) + for (CircularBuffer<uint64_t> *bufferID : *(Context->isLastInputBuffers)) delete bufferID; } // Pop an element from the buffer -uint64_t llvm_visc_bufferPop(void* bufferID) { - CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID; +uint64_t llvm_visc_bufferPop(void *bufferID) { + CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID; return buffer->pop(); } // Push an element into the buffer -void llvm_visc_bufferPush(void* bufferID, uint64_t element) { - CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID; +void llvm_visc_bufferPush(void *bufferID, uint64_t element) { + CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID; buffer->push(element); } // Create a thread -void llvm_visc_createThread(void* graphID, void* (*Func)(void*), void* arguments) { - DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; +void llvm_visc_createThread(void *graphID, void *(*Func)(void *), + void *arguments) { + DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func + << ", Args: " << arguments << flush << "\n"); + DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; int err; pthread_t threadID; - if((err = pthread_create(&threadID, NULL, Func, arguments)) != 0) + if ((err = pthread_create(&threadID, NULL, Func, arguments)) != 0) cout << "Failed to create thread. 
Error code = " << err << flush << "\n"; Ctx->threads->push_back(threadID); } // Wait for thread to finish -void llvm_visc_freeThreads(void* graphID) { +void llvm_visc_freeThreads(void *graphID) { DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; - for(pthread_t thread: *(Ctx->threads)) + DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; + for (pthread_t thread : *(Ctx->threads)) pthread_join(thread, NULL); } /************************ OPENCL & PTHREAD API ********************************/ -void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) { - DFNodeContext_X86 *Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86)); - //int err; - //if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0) - //cout << "Failed to create pthread. Error code = " << err << flush << "\n"; +void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) { + DFNodeContext_X86 *Context = + (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86)); + // int err; + // if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != + // 0) cout << "Failed to create pthread. Error code = " << err << flush << + // "\n"; rootFunc(arguments); return Context; } -void llvm_visc_x86_wait(void* graphID) { +void llvm_visc_x86_wait(void *graphID) { DEBUG(cout << "Waiting for pthread to finish ...\n"); - //DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //pthread_join(Context->threadID, NULL); + // DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; + // pthread_join(Context->threadID, NULL); free(graphID); DEBUG(cout << "\t... pthread Done!\n"); } -void* llvm_visc_ocl_initContext(enum visc::Target T) { +void *llvm_visc_ocl_initContext(enum visc::Target T) { pthread_mutex_lock(&ocl_mtx); - DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR"); + DEBUG(std::string Target = T == visc::GPU_TARGET ? 
"GPU" : "SPIR"); DEBUG(cout << "Initializing Context for " << Target << " device\n"); cl_uint numPlatforms; cl_int errcode; @@ -1246,51 +1270,51 @@ void* llvm_visc_ocl_initContext(enum visc::Target T) { checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); // now get all the platform IDs - cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms); + cl_platform_id *platforms = + (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms); errcode = clGetPlatformIDs(numPlatforms, platforms, NULL); checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs"); - - for(unsigned i=0; i < numPlatforms; i++) { + for (unsigned i = 0; i < numPlatforms; i++) { char buffer[10240]; DEBUG(cout << "Device " << i << " Info -->\n"); clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL); DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n"); clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL); - DEBUG(cout << "\tVERSION = "<< buffer << flush << "\n"); + DEBUG(cout << "\tVERSION = " << buffer << flush << "\n"); clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL); DEBUG(cout << "\tNAME = " << buffer << flush << "\n"); clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL); DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); + clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, + NULL); DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n"); } // set platform property - just pick the first one - //cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, - //(long) platforms[0], - //0}; - //globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, - //NULL, NULL, &errcode); + // cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + //(long) platforms[0], + // 0}; + // globalOCLContext = clCreateContextFromType(properties, 
CL_DEVICE_TYPE_GPU, + // NULL, NULL, &errcode); // assert(numPlatforms >= 2 && "Expecting two OpenCL platforms"); // Choose second one which is X86 AVX - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, - (long) platforms[T == visc::GPU_TARGET? 0 : 1], - 0}; - globalOCLContext = clCreateContextFromType(properties, - T == visc::GPU_TARGET? - CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, - NULL, NULL, &errcode); + cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, (long)platforms[T == visc::GPU_TARGET ? 0 : 1], 0}; + globalOCLContext = clCreateContextFromType( + properties, + T == visc::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL, + NULL, &errcode); // get the list of OCL devices associated with context size_t dataBytes; - errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0, - NULL, &dataBytes); + errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0, NULL, + &dataBytes); checkErr(errcode, CL_SUCCESS, "Failure to get context info length"); - clDevices = (cl_device_id *) malloc(dataBytes); + clDevices = (cl_device_id *)malloc(dataBytes); errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); checkErr(errcode, CL_SUCCESS, "Failure to get context info"); - if(false && T == visc::SPIR_TARGET) { + if (false && T == visc::SPIR_TARGET) { cl_device_partition_property props[4]; props[0] = CL_DEVICE_PARTITION_BY_COUNTS; props[1] = NUM_CORES; @@ -1300,12 +1324,14 @@ void* llvm_visc_ocl_initContext(enum visc::Target T) { cl_uint num_entries = 8; cl_uint numDevices; - clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, &numDevices); - //printf("Num of devices = %d\n", numDevices); - //for(unsigned i =0 ; i< numDevices; i++) - //printf("Subdevice id %d = %p\n", i, subdevice_id[i]); + clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, + &numDevices); + // printf("Num of devices = %d\n", numDevices); + // for(unsigned i =0 ; i< numDevices; i++) + // 
printf("Subdevice id %d = %p\n", i, subdevice_id[i]); clDevices[0] = subdevice_id[0]; - globalOCLContext = clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode); + globalOCLContext = + clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); } @@ -1320,120 +1346,134 @@ void* llvm_visc_ocl_initContext(enum visc::Target T) { return globalOCLContext; } -void llvm_visc_ocl_clearContext(void* graphID) { +void llvm_visc_ocl_clearContext(void *graphID) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Clear Context\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; // FIXME: Have separate function to release command queue and clear context. // Would be useful when a context has multiple command queues clReleaseKernel(Context->clKernel); - //clReleaseProgram(Context->clProgram); - //clReleaseCommandQueue(Context->clCommandQue); - //clReleaseContext(globalOCLContext); - //DEBUG(cout << "Released context at: " << globalOCLContext); + // clReleaseProgram(Context->clProgram); + // clReleaseCommandQueue(Context->clCommandQue); + // clReleaseContext(globalOCLContext); + // DEBUG(cout << "Released context at: " << globalOCLContext); free(Context); DEBUG(cout << "Done with OCL kernel\n"); cout << "Printing VISC Timer: KernelTimer\n"; visc_PrintTimerSet(&kernel_timer); pthread_mutex_unlock(&ocl_mtx); - } -void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) { +void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Shared Memory Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size + << flush << "\n"); + DFNodeContext_OCL *Context = (DFNodeContext_OCL 
*)graphID; DEBUG(cout << "Using Context: " << Context << flush << "\n"); DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); - //pthread_mutex_lock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument"); pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) { +void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index, + size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Scalar Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size + << flush << "\n"); + DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; DEBUG(cout << "Using Context: " << Context << flush << "\n"); DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); - //pthread_mutex_lock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); pthread_mutex_unlock(&ocl_mtx); } -void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) { +void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index, + size_t size, bool isInput, bool isOutput) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Pointer Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << flush << "\n"); + DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input + << 
", Size = " << size << flush << "\n"); // Size should be non-zero assert(size != 0 && "Size of data pointed to has to be non-zero!"); - DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - + DEBUG(cout << "\tInput = " << isInput << "\tOutput = " << isOutput << flush + << "\n"); + DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; + pthread_mutex_unlock(&ocl_mtx); - // Check with runtime the location of this memory - cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput); - + // Check with runtime the location of this memory + cl_mem d_input = (cl_mem)llvm_visc_ocl_request_mem(input, size, Context, + isInput, isOutput); + pthread_mutex_lock(&ocl_mtx); // Set Kernel Argument - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), + (void *)&d_input); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n"); pthread_mutex_unlock(&ocl_mtx); return d_input; } -void* llvm_visc_ocl_output_ptr(void* graphID, int arg_index, size_t size) { +void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set device memory for Output Struct:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = "<< size << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size + << flush << "\n"); + DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; cl_int errcode; - //pthread_mutex_lock(&ocl_mtx); - cl_mem d_output = clCreateBuffer(Context->clOCLContext, 
CL_MEM_WRITE_ONLY, size, NULL, &errcode); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, + size, NULL, &errcode); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device"); - //pthread_mutex_lock(&ocl_mtx); - errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), + (void *)&d_output); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n"); pthread_mutex_unlock(&ocl_mtx); return d_output; } -void llvm_visc_ocl_free(void* ptr) { - //DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n"); - //cl_mem d_ptr = (cl_mem) ptr; - //clReleaseMemObject(d_ptr); +void llvm_visc_ocl_free(void *ptr) { + // DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n"); + // cl_mem d_ptr = (cl_mem) ptr; + // clReleaseMemObject(d_ptr); } -void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) { +void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output, + size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Get Output:\n"); - DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << flush << "\n"); - if(h_output == NULL) + DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output + << ", Size = " << size << flush << "\n"); + if (h_output == NULL) h_output = malloc(size); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, - h_output, 0, NULL, NULL); - 
//pthread_mutex_unlock(&ocl_mtx); + DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; + // pthread_mutex_lock(&ocl_mtx); + cl_int errcode = + clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, + size, h_output, 0, NULL, NULL); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output"); pthread_mutex_unlock(&ocl_mtx); return h_output; } -void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t* - localWorkSize, const size_t* globalWorkSize) { +void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim, + const size_t *localWorkSize, + const size_t *globalWorkSize) { pthread_mutex_lock(&ocl_mtx); size_t GlobalWG[3]; @@ -1442,60 +1482,60 @@ void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t* // OpenCL EnqeueNDRangeKernel function results in segementation fault if we // directly use local and global work groups arguments. Hence, allocating it // on stack and copying. - for(unsigned i=0; i<workDim; i++) { + for (unsigned i = 0; i < workDim; i++) { GlobalWG[i] = globalWorkSize[i]; } // OpenCL allows local workgroup to be null. - if(localWorkSize != NULL) { - for(unsigned i=0; i<workDim; i++) { + if (localWorkSize != NULL) { + for (unsigned i = 0; i < workDim; i++) { LocalWG[i] = localWorkSize[i]; } } - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; // TODO: Would like to use event to ensure better scheduling of kernels. // Currently passing the event paratemeter results in seg fault with // clEnqueueNDRangeKernel. 
- cl_event* event; + cl_event *event; DEBUG(cout << "Enqueuing kernel:\n"); DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n"); DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n"); DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n"); DEBUG(cout << "\tGlobal Work Group: ( "); - for(unsigned i = 0; i<workDim; i++) { + for (unsigned i = 0; i < workDim; i++) { DEBUG(cout << GlobalWG[i] << " "); } DEBUG(cout << ")\n"); - if(localWorkSize != NULL) { + if (localWorkSize != NULL) { DEBUG(cout << "\tLocal Work Group: ( "); - for(unsigned i = 0; i<workDim; i++) { + for (unsigned i = 0; i < workDim; i++) { DEBUG(cout << LocalWG[i] << " "); } DEBUG(cout << ")\n"); } - //pthread_mutex_lock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); clFinish(Context->clCommandQue); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_unlock(&ocl_mtx); visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION); - //for(int i=0 ;i < NUM_TESTS; i++) { - //cout << "Iteration = " << i << flush << "\n"; - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, - Context->clKernel, workDim, NULL, GlobalWG, (localWorkSize == NULL)? NULL : LocalWG, 0, NULL, NULL); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); + // for(int i=0 ;i < NUM_TESTS; i++) { + // cout << "Iteration = " << i << flush << "\n"; + // pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clEnqueueNDRangeKernel( + Context->clCommandQue, Context->clKernel, workDim, NULL, GlobalWG, + (localWorkSize == NULL) ? 
NULL : LocalWG, 0, NULL, NULL); + // pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); //} - //pthread_mutex_lock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); clFinish(Context->clCommandQue); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_unlock(&ocl_mtx); visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); - + pthread_mutex_unlock(&ocl_mtx); return event; } - ////////////////////////////////////////////////////////////////////////////// //! Loads a Program binary file. //! @@ -1503,17 +1543,15 @@ void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t* //! @param Filename program filename //! @param szFinalLength returned length of the code string ////////////////////////////////////////////////////////////////////////////// -static char* LoadProgSource(const char* Filename, size_t* szFinalLength) -{ +static char *LoadProgSource(const char *Filename, size_t *szFinalLength) { DEBUG(cout << "Load Prog Source\n"); // locals - FILE* pFileStream = NULL; + FILE *pFileStream = NULL; size_t szSourceLength; // open the OpenCL source code file pFileStream = fopen(Filename, "rb"); - if(pFileStream == 0) - { + if (pFileStream == 0) { return NULL; } @@ -1523,32 +1561,32 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength) fseek(pFileStream, 0, SEEK_SET); // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + 1); - if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; + char *cSourceString = (char *)malloc(szSourceLength + 1); + if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) { + fclose(pFileStream); + free(cSourceString); + return 0; } - // close the file and return the total length of the combined (preamble + source) string + // close the file and return the total length of the combined (preamble + + // source) string 
fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength; + if (szFinalLength != 0) { + *szFinalLength = szSourceLength; } cSourceString[szSourceLength] = '\0'; return cSourceString; } -void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) { +void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Launch OCL Kernel\n"); // Initialize OpenCL // OpenCL specific variables - DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL)); + DFNodeContext_OCL *Context = + (DFNodeContext_OCL *)malloc(sizeof(DFNodeContext_OCL)); size_t kernelLength; cl_int errcode; @@ -1556,36 +1594,42 @@ void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) { // For a single context for all kernels Context->clOCLContext = globalOCLContext; - //Create a command-queue - //pthread_mutex_lock(&ocl_mtx); - Context->clCommandQue = clCreateCommandQueue(Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); + // Create a command-queue + // pthread_mutex_lock(&ocl_mtx); + Context->clCommandQue = clCreateCommandQueue( + Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); globalCommandQue = Context->clCommandQue; - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to create command queue"); DEBUG(cout << "Loading program binary: " << FileName << flush << "\n"); char *programSource = LoadProgSource(FileName, &kernelLength); - checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary"); + checkErr(programSource != NULL, 1 /*bool true*/, + "Failure to load Program Binary"); cl_int binaryStatus; - //pthread_mutex_lock(&ocl_mtx); - Context->clProgram = clCreateProgramWithSource(Context->clOCLContext, 1, (const char **)&programSource, NULL, &errcode); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_lock(&ocl_mtx); + 
Context->clProgram = clCreateProgramWithSource( + Context->clOCLContext, 1, (const char **)&programSource, NULL, &errcode); + // pthread_mutex_unlock(&ocl_mtx); checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); - DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName << flush << "\n"); - errcode = clBuildProgram(Context->clProgram, 1, &clDevices[0], "", NULL, NULL); + DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName + << flush << "\n"); + errcode = + clBuildProgram(Context->clProgram, 1, &clDevices[0], "", NULL, NULL); // If build fails, get build log from device - if(errcode != CL_SUCCESS) { + if (errcode != CL_SUCCESS) { cout << "ERROR: Failure to build program\n"; size_t len = 0; - errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0, - NULL, &len); + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], + CL_PROGRAM_BUILD_LOG, 0, NULL, &len); cout << "LOG LENGTH: " << len << flush << "\n"; - checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length"); - char *log = (char*) malloc(len*sizeof(char)); - errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len, - log, NULL); + checkErr(errcode, CL_SUCCESS, + "Failure to collect program build log length"); + char *log = (char *)malloc(len * sizeof(char)); + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], + CL_PROGRAM_BUILD_LOG, len, log, NULL); checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); cout << "Device Build Log:\n" << log << flush << "\n"; @@ -1598,48 +1642,44 @@ void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) { checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n"); - //free(clDevices); + // free(clDevices); free(programSource); pthread_mutex_unlock(&ocl_mtx); return Context; } - -void llvm_visc_ocl_wait(void* 
graphID) { +void llvm_visc_ocl_wait(void *graphID) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Wait\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID; - //pthread_mutex_lock(&ocl_mtx); + DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; + // pthread_mutex_lock(&ocl_mtx); clFinish(Context->clCommandQue); - //pthread_mutex_unlock(&ocl_mtx); + // pthread_mutex_unlock(&ocl_mtx); pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID timer) { - //cout << "Switching to timer " << timer << flush << "\n"; +void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID timer) { + // cout << "Switching to timer " << timer << flush << "\n"; pthread_mutex_lock(&ocl_mtx); - //visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer); + // visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer); pthread_mutex_unlock(&ocl_mtx); } -void llvm_visc_printTimerSet(void** timerSet, char* timerName) { +void llvm_visc_printTimerSet(void **timerSet, char *timerName) { pthread_mutex_lock(&ocl_mtx); cout << "Printing VISC Timer: "; - if(timerName != NULL) + if (timerName != NULL) cout << timerName << flush << "\n"; else cout << "Anonymous\n"; - visc_PrintTimerSet((visc_TimerSet*) (*timerSet)); + visc_PrintTimerSet((visc_TimerSet *)(*timerSet)); pthread_mutex_unlock(&ocl_mtx); } -void* llvm_visc_initializeTimerSet() { +void *llvm_visc_initializeTimerSet() { pthread_mutex_lock(&ocl_mtx); - visc_TimerSet* TS = (visc_TimerSet*) malloc (sizeof(visc_TimerSet)); + visc_TimerSet *TS = (visc_TimerSet *)malloc(sizeof(visc_TimerSet)); visc_InitializeTimerSet(TS); pthread_mutex_unlock(&ocl_mtx); return TS; } - - - diff --git a/hpvm/projects/visc-rt/visc-rt.h b/hpvm/projects/visc-rt/visc-rt.h index 9eab8f8b291966c021b42cb0ff0bbdf169772168..3ad315768bf90584a68c1d620ac68936e62a17f0 100644 --- a/hpvm/projects/visc-rt/visc-rt.h +++ b/hpvm/projects/visc-rt/visc-rt.h @@ -5,12 +5,12 @@ #ifndef VISC_RT_HEADER #define VISC_RT_HEADER +#include 
<ctime> #include <iostream> #include <map> -#include <ctime> -#include <vector> #include <pthread.h> #include <string> +#include <vector> //#include <condition_variable> #include "../../include/SupportVISC/VISCHint.h" @@ -19,13 +19,12 @@ #include "policy.h" #ifndef DEBUG_BUILD -#define DEBUG(s) {} +#define DEBUG(s) \ + {} #else #define DEBUG(s) s #endif - - using namespace std; extern "C" { @@ -43,263 +42,250 @@ void llvm_visc_deviceAbstraction_waitOnDeviceStatus(); /********************* DFG Depth Stack **********************************/ class DFGDepth { - private: - unsigned numDim; - unsigned dimLimit[3]; - unsigned dimInstance[3]; - public: - DFGDepth() = default; - - DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, unsigned iY = 0, - unsigned dimZ = 0, unsigned iZ = 0) { - assert(n <= 3 && "Error! More than 3 dimensions not supported"); - numDim = n; - dimLimit[0] = dimX; - dimLimit[1] = dimY; - dimLimit[2] = dimZ; - dimInstance[0] = iX; - dimInstance[1] = iY; - dimInstance[2] = iZ; - } +private: + unsigned numDim; + unsigned dimLimit[3]; + unsigned dimInstance[3]; - unsigned getDimLimit(unsigned dim) const { - assert(dim <= numDim && "Error! Requested dimension limit is not specified"); - return dimLimit[dim]; - } +public: + DFGDepth() = default; + + DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, + unsigned iY = 0, unsigned dimZ = 0, unsigned iZ = 0) { + assert(n <= 3 && "Error! More than 3 dimensions not supported"); + numDim = n; + dimLimit[0] = dimX; + dimLimit[1] = dimY; + dimLimit[2] = dimZ; + dimInstance[0] = iX; + dimInstance[1] = iY; + dimInstance[2] = iZ; + } - unsigned getDimInstance(unsigned dim) const { - assert(dim <= numDim && "Error! Requested dimension instance is not specified"); - return dimInstance[dim]; - } + unsigned getDimLimit(unsigned dim) const { + assert(dim <= numDim && + "Error! 
Requested dimension limit is not specified"); + return dimLimit[dim]; + } - unsigned getNumDim() const { - return numDim; - } + unsigned getDimInstance(unsigned dim) const { + assert(dim <= numDim && + "Error! Requested dimension instance is not specified"); + return dimInstance[dim]; + } + + unsigned getNumDim() const { return numDim; } }; void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0, - uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0); + uint64_t limitY = 0, uint64_t iY = 0, + uint64_t limitZ = 0, uint64_t iZ = 0); void llvm_visc_x86_dstack_pop(); uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim); uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim); - /********************* Memory Tracker **********************************/ class MemTrackerEntry { public: - enum Location {HOST, DEVICE}; - private: - size_t size; - Location loc; - void* addr; - void* Context; - - public: - MemTrackerEntry(size_t _size, Location _loc, void* _addr, void* _Context): - size(_size), loc(_loc), addr(_addr), Context(_Context) { - } + enum Location { HOST, DEVICE }; - size_t getSize() const { - return size; - } +private: + size_t size; + Location loc; + void *addr; + void *Context; - Location getLocation() const { - return loc; - } +public: + MemTrackerEntry(size_t _size, Location _loc, void *_addr, void *_Context) + : size(_size), loc(_loc), addr(_addr), Context(_Context) {} - void* getAddress() const { - return addr; - } + size_t getSize() const { return size; } - void* getContext() const { - return Context; - } + Location getLocation() const { return loc; } - void update(Location _loc, void* _addr, void* _Context = NULL) { - loc = _loc; - addr = _addr; - Context = _Context; - } + void *getAddress() const { return addr; } - void print() { - cout << "Size = " << size << "\tLocation = " << loc << "\tAddress = " << addr << "\tContext = " << Context; - } -}; + void *getContext() const { return 
Context; } + + void update(Location _loc, void *_addr, void *_Context = NULL) { + loc = _loc; + addr = _addr; + Context = _Context; + } + void print() { + cout << "Size = " << size << "\tLocation = " << loc + << "\tAddress = " << addr << "\tContext = " << Context; + } +}; class MemTracker { private: - std::map<void*, MemTrackerEntry*> Table; + std::map<void *, MemTrackerEntry *> Table; public: - MemTracker() { - } + MemTracker() {} - bool insert(void* ID, size_t size, MemTrackerEntry::Location loc, void* addr, void* Context = NULL) { - MemTrackerEntry* MTE = new MemTrackerEntry(size, loc, addr, Context); - Table.insert(std::pair<void*, MemTrackerEntry*>(ID, MTE)); + bool insert(void *ID, size_t size, MemTrackerEntry::Location loc, void *addr, + void *Context = NULL) { + MemTrackerEntry *MTE = new MemTrackerEntry(size, loc, addr, Context); + Table.insert(std::pair<void *, MemTrackerEntry *>(ID, MTE)); return MTE != NULL; } - MemTrackerEntry* lookup(void* ID) { - if(Table.count(ID) == 0) + MemTrackerEntry *lookup(void *ID) { + if (Table.count(ID) == 0) return NULL; return Table[ID]; } - void remove(void* ID) { - MemTrackerEntry* MTE = Table[ID]; + void remove(void *ID) { + MemTrackerEntry *MTE = Table[ID]; free(MTE); Table.erase(ID); } void print() { cout << "Printing Table ... 
Size = " << Table.size() << flush << "\n"; - for(auto& Entry: Table) { - cout << Entry.first << ":\t" ; + for (auto &Entry : Table) { + cout << Entry.first << ":\t"; Entry.second->print(); cout << flush << "\n"; } } - }; -void llvm_visc_track_mem(void*, size_t); -void llvm_visc_untrack_mem(void*); -void* llvm_visc_request_mem(void*, size_t); +void llvm_visc_track_mem(void *, size_t); +void llvm_visc_untrack_mem(void *); +void *llvm_visc_request_mem(void *, size_t); /*********************** OPENCL & PTHREAD API **************************/ -void* llvm_visc_x86_launch(void* (void*), void*); -void llvm_visc_x86_wait(void*); -void* llvm_visc_ocl_initContext(enum visc::Target); - -void* llvm_visc_x86_argument_ptr(void*, size_t); - -void llvm_visc_ocl_clearContext(void*); -void llvm_visc_ocl_argument_shared(void*, int, size_t); -void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t); -void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool); -void* llvm_visc_ocl_output_ptr(void*, int, size_t); -void llvm_visc_ocl_free(void*); -void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t); -void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*); -void* llvm_visc_ocl_launch(const char*, const char*); -void llvm_visc_ocl_wait(void*); - -void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID); -void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL); -void* llvm_visc_initializeTimerSet(); - +void *llvm_visc_x86_launch(void *(void *), void *); +void llvm_visc_x86_wait(void *); +void *llvm_visc_ocl_initContext(enum visc::Target); + +void *llvm_visc_x86_argument_ptr(void *, size_t); + +void llvm_visc_ocl_clearContext(void *); +void llvm_visc_ocl_argument_shared(void *, int, size_t); +void llvm_visc_ocl_argument_scalar(void *, void *, int, size_t); +void *llvm_visc_ocl_argument_ptr(void *, void *, int, size_t, bool, bool); +void *llvm_visc_ocl_output_ptr(void *, int, size_t); +void llvm_visc_ocl_free(void *); +void 
*llvm_visc_ocl_getOutput(void *, void *, void *, size_t); +void *llvm_visc_ocl_executeNode(void *, unsigned, const size_t *, + const size_t *); +void *llvm_visc_ocl_launch(const char *, const char *); +void llvm_visc_ocl_wait(void *); + +void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID); +void llvm_visc_printTimerSet(void **timerSet, char *timerName = NULL); +void *llvm_visc_initializeTimerSet(); } /*************************** Pipeline API ******************************/ // Circular Buffer class unsigned counter = 0; -template <class ElementType> -class CircularBuffer { +template <class ElementType> class CircularBuffer { private: - int numElements; - int bufferSize; - int Head; - int Tail; - pthread_mutex_t mtx; - pthread_cond_t cv; - vector<ElementType> buffer; - std::string name; - unsigned ID; + int numElements; + int bufferSize; + int Head; + int Tail; + pthread_mutex_t mtx; + pthread_cond_t cv; + vector<ElementType> buffer; + std::string name; + unsigned ID; public: - CircularBuffer(int maxElements, std::string _name = "ANON") { - ID = counter; - Head = 0; - Tail = 0; - numElements = 0; - name = _name; - bufferSize = maxElements+1; - buffer.reserve(bufferSize); - pthread_mutex_init(&mtx, NULL); - pthread_cond_init(&cv, NULL); - counter++; - - } - - bool push(ElementType E); - ElementType pop(); + CircularBuffer(int maxElements, std::string _name = "ANON") { + ID = counter; + Head = 0; + Tail = 0; + numElements = 0; + name = _name; + bufferSize = maxElements + 1; + buffer.reserve(bufferSize); + pthread_mutex_init(&mtx, NULL); + pthread_cond_init(&cv, NULL); + counter++; + } + bool push(ElementType E); + ElementType pop(); }; template <class ElementType> bool CircularBuffer<ElementType>::push(ElementType E) { - //DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << "\n"); - //unique_lock<mutex> lk(mtx); - pthread_mutex_lock(&mtx); - if((Head +1) % bufferSize == Tail) { - //DEBUG(cout << name << " Buffer[" << ID << "]: Push going 
to sleep ...\n"); - //cv.wait(lk); - pthread_cond_wait(&cv, &mtx); - //DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n"); - } - buffer[Head] = E; - Head = (Head+1) % bufferSize; - numElements++; - //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n"); - //lk.unlock(); - pthread_mutex_unlock(&mtx); - //cv.notify_one(); - pthread_cond_signal(&cv); - return true; + // DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << + // "\n"); unique_lock<mutex> lk(mtx); + pthread_mutex_lock(&mtx); + if ((Head + 1) % bufferSize == Tail) { + // DEBUG(cout << name << " Buffer[" << ID << "]: Push going to sleep + // ...\n"); cv.wait(lk); + pthread_cond_wait(&cv, &mtx); + // DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n"); + } + buffer[Head] = E; + Head = (Head + 1) % bufferSize; + numElements++; + // DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << + // numElements << flush << "\n"); lk.unlock(); + pthread_mutex_unlock(&mtx); + // cv.notify_one(); + pthread_cond_signal(&cv); + return true; } -template <class ElementType> -ElementType CircularBuffer<ElementType>::pop() { - //unique_lock<mutex> lk(mtx); - //DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n"); - pthread_mutex_lock(&mtx); - if(Tail == Head) { - //DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n"); - //cv.wait(lk); - pthread_cond_wait(&cv, &mtx); - //DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n"); - } - ElementType E = buffer[Tail]; - Tail = (Tail + 1) % bufferSize; - numElements--; - //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n"); - //lk.unlock(); - pthread_mutex_unlock(&mtx); - //cv.notify_one(); - pthread_cond_signal(&cv); - return E; +template <class ElementType> ElementType CircularBuffer<ElementType>::pop() { + // unique_lock<mutex> lk(mtx); + // DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n"); + 
pthread_mutex_lock(&mtx); + if (Tail == Head) { + // DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n"); + // cv.wait(lk); + pthread_cond_wait(&cv, &mtx); + // DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n"); + } + ElementType E = buffer[Tail]; + Tail = (Tail + 1) % bufferSize; + numElements--; + // DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << + // numElements << flush << "\n"); lk.unlock(); + pthread_mutex_unlock(&mtx); + // cv.notify_one(); + pthread_cond_signal(&cv); + return E; } extern "C" { // Functions to push and pop values from pipeline buffers -uint64_t llvm_visc_bufferPop(void*); -void llvm_visc_bufferPush(void*, uint64_t); +uint64_t llvm_visc_bufferPop(void *); +void llvm_visc_bufferPush(void *, uint64_t); // Functions to create and destroy buffers -void* llvm_visc_createBindInBuffer(void*, uint64_t, unsigned); -void* llvm_visc_createBindOutBuffer(void*, uint64_t); -void* llvm_visc_createEdgeBuffer(void*, uint64_t); -void* llvm_visc_createLastInputBuffer(void*, uint64_t); +void *llvm_visc_createBindInBuffer(void *, uint64_t, unsigned); +void *llvm_visc_createBindOutBuffer(void *, uint64_t); +void *llvm_visc_createEdgeBuffer(void *, uint64_t); +void *llvm_visc_createLastInputBuffer(void *, uint64_t); -void llvm_visc_freeBuffers(void*); +void llvm_visc_freeBuffers(void *); // Functions to create and destroy threads -void llvm_visc_createThread(void* graphID, void*(*Func)(void*), void*); -void llvm_visc_freeThreads(void*); +void llvm_visc_createThread(void *graphID, void *(*Func)(void *), void *); +void llvm_visc_freeThreads(void *); // Launch API for a streaming graph. 
// Arguments: // (1) Launch Function: void* (void*, void*) // (2) Push Function: void (void*, std::vector<uint64_t>**, unsgined) // (3) Pop Function: void* (std::vector<uint64_t>**, unsigned) -void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void*); -void llvm_visc_streamPush(void* graphID, void* args); -void* llvm_visc_streamPop(void* graphID); -void llvm_visc_streamWait(void* graphID); - +void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *); +void llvm_visc_streamPush(void *graphID, void *args); +void *llvm_visc_streamPop(void *graphID); +void llvm_visc_streamWait(void *graphID); } -#endif //VISC_RT_HEADER +#endif // VISC_RT_HEADER diff --git a/hpvm/test/CTestSuite/gemm.c b/hpvm/test/CTestSuite/gemm.c index 2a54b88828e26c916b011b68455ad349f5929599..d0a69ba25c27fb65ea549023deed2dfb0197b882 100644 --- a/hpvm/test/CTestSuite/gemm.c +++ b/hpvm/test/CTestSuite/gemm.c @@ -1,6 +1,6 @@ -#include <stdlib.h> -#include <stdio.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> #include <string.h> #define WA 1024 @@ -10,13 +10,11 @@ #define WC WB #define HC HA - - // Thread block size #define BLOCK_SIZE 16 // Allocates a matrix with random float entries. 
-void randomInit(float* data, int size) { +void randomInit(float *data, int size) { for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX; } @@ -30,26 +28,25 @@ void randomInit(float* data, int size) { ////////////////////////////////////////////////////////////////////////////// // Check bool -int isEqual(float a, float b) { - return (fabs(a-b) < 0.001); -} +int isEqual(float a, float b) { return (fabs(a - b) < 0.001); } // Check Results -__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) { +__attribute__((noinline)) int checkResults(float *A, float *B, float *C) { unsigned int size_A = WA * HA; unsigned int size_B = WB * HB; unsigned int size_C = WC * HC; unsigned int bytesC = sizeof(float) * size_C; - float* goldC = (float*) malloc(bytesC); - for (int i=0; i < HC; i++) { - for (int j=0; j < WC; j++) { - goldC[i*WC + j] = 0; - for (int k=0; k < HB; k++) { - goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j]; + float *goldC = (float *)malloc(bytesC); + for (int i = 0; i < HC; i++) { + for (int j = 0; j < WC; j++) { + goldC[i * WC + j] = 0; + for (int k = 0; k < HB; k++) { + goldC[i * WC + j] += A[i * WA + k] * B[k * WB + j]; } - if(!isEqual(goldC[i*WC + j], C[i*WC + j])) { - printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]); + if (!isEqual(goldC[i * WC + j], C[i * WC + j])) { + printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, + C[i * WC + j], goldC[i * WC + j]); return 0; } } @@ -58,36 +55,38 @@ __attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) { } // Dummy visc node execution call -//void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* outputs); +// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), +// int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* +// outputs); + +void matrixMul(float *A, float *B, float *C, 
unsigned k, unsigned n) { -void matrixMul(float* A, float* B, float* C, unsigned k, unsigned n) { - __visc__attributes(2, A, B, 1, C); - //printf("Entered function\n"); - int tx = get_local_id(0); //2D Global Thread ID x - int ty = get_local_id(1); //2D Global Thread ID y - //int tx = get_global_id(0); //2D Global Thread ID x - //int ty = get_global_id(1); //2D Global Thread ID y + // printf("Entered function\n"); + int tx = get_local_id(0); // 2D Global Thread ID x + int ty = get_local_id(1); // 2D Global Thread ID y + // int tx = get_global_id(0); //2D Global Thread ID x + // int ty = get_global_id(1); //2D Global Thread ID y - //printf("Computing element (%d, %d)\n", tx, ty); + // printf("Computing element (%d, %d)\n", tx, ty); // Initialize accumulator float res = 0.0f; // Perform dot-product of row-column for (int i = 0; i < k; i++) { - //printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx); - res += A[ty*k+i] * B[i*n+tx]; + // printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx); + res += A[ty * k + i] * B[i * n + tx]; } - //printf("Result computed\n"); + // printf("Result computed\n"); // Write in device memory - C[ty*n+tx] = res; + C[ty * n + tx] = res; - //printf("Result written to C\n"); + // printf("Result written to C\n"); } // Main -int main(int argc, char** argv) { +int main(int argc, char **argv) { // seed for rand() srand(2006); @@ -95,46 +94,47 @@ int main(int argc, char** argv) { // Allocate host memory for matrices A and B unsigned int size_A = WA * HA; size_t bytes_A = sizeof(float) * size_A; - float* h_A = (float*) malloc(bytes_A); + float *h_A = (float *)malloc(bytes_A); unsigned int size_B = WB * HB; size_t bytes_B = sizeof(float) * size_B; - float* h_B = (float*) malloc(bytes_B); - - // Initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); - -/* - // Print A and B - printf("\n\nMatrix A\n"); - for(int i = 0; i < size_A; i++) - { - printf("%f ", h_A[i]); - if(((i + 1) % WA) == 0) - printf("\n"); - } - 
- printf("\n\nMatrix B\n"); - for(int i = 0; i < size_B; i++) - { - printf("%f ", h_B[i]); - if(((i + 1) % WB) == 0) - printf("\n"); - } -*/ + float *h_B = (float *)malloc(bytes_B); + + // Initialize host memory + randomInit(h_A, size_A); + randomInit(h_B, size_B); + + /* + // Print A and B + printf("\n\nMatrix A\n"); + for(int i = 0; i < size_A; i++) + { + printf("%f ", h_A[i]); + if(((i + 1) % WA) == 0) + printf("\n"); + } + + printf("\n\nMatrix B\n"); + for(int i = 0; i < size_B; i++) + { + printf("%f ", h_B[i]); + if(((i + 1) % WB) == 0) + printf("\n"); + } + */ // Allocate host memory for the result matrix C unsigned int size_C = WC * HC; size_t bytes_C = sizeof(float) * size_C; - float* h_C = (float*) malloc(bytes_C); + float *h_C = (float *)malloc(bytes_C); - // Compute using OpenCL - //matrixMul(h_A, h_B, h_C, WA, WB); + // Compute using OpenCL + // matrixMul(h_A, h_B, h_C, WA, WB); //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); - unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0); + unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B, + bytes_B, h_C, bytes_C, WA, WB, 0); __visc__wait(graphMM); - if(checkResults(h_A, h_B, h_C)) + if (checkResults(h_A, h_B, h_C)) printf("\nPass!\n"); else printf("\nFailed!\n"); @@ -145,4 +145,3 @@ int main(int argc, char** argv) { free(h_B); free(h_C); } - diff --git a/hpvm/test/CTestSuite/gemm_2.c b/hpvm/test/CTestSuite/gemm_2.c index aab0168bef72b00d036e450007a50d55727e515a..bd7ab27fc0160275442d23faf507851b7c2369f7 100644 --- a/hpvm/test/CTestSuite/gemm_2.c +++ b/hpvm/test/CTestSuite/gemm_2.c @@ -1,6 +1,6 @@ -#include <stdlib.h> -#include <stdio.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> #include <string.h> #define WA 1024 @@ -10,13 +10,11 @@ #define WC WB #define HC HA - - // Thread block size #define BLOCK_SIZE 16 // Allocates a matrix with random float entries. 
-void randomInit(float* data, int size) { +void randomInit(float *data, int size) { for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX; } @@ -30,26 +28,25 @@ void randomInit(float* data, int size) { ////////////////////////////////////////////////////////////////////////////// // Check bool -int isEqual(float a, float b) { - return (fabs(a-b) < 0.001); -} +int isEqual(float a, float b) { return (fabs(a - b) < 0.001); } // Check Results -__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) { +__attribute__((noinline)) int checkResults(float *A, float *B, float *C) { unsigned int size_A = WA * HA; unsigned int size_B = WB * HB; unsigned int size_C = WC * HC; unsigned int bytesC = sizeof(float) * size_C; - float* goldC = (float*) malloc(bytesC); - for (int i=0; i < HC; i++) { - for (int j=0; j < WC; j++) { - goldC[i*WC + j] = 0; - for (int k=0; k < HB; k++) { - goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j]; + float *goldC = (float *)malloc(bytesC); + for (int i = 0; i < HC; i++) { + for (int j = 0; j < WC; j++) { + goldC[i * WC + j] = 0; + for (int k = 0; k < HB; k++) { + goldC[i * WC + j] += A[i * WA + k] * B[k * WB + j]; } - if(!isEqual(goldC[i*WC + j], C[i*WC + j])) { - printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]); + if (!isEqual(goldC[i * WC + j], C[i * WC + j])) { + printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, + C[i * WC + j], goldC[i * WC + j]); return 0; } } @@ -58,36 +55,38 @@ __attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) { } // Dummy visc node execution call -//void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* outputs); +// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), +// int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* +// outputs); -void matrixMul( float* A, float* B, float* C, 
unsigned k, unsigned n) { +void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) { __visc__attributes(2, A, B, 1, C); - //printf("Entered function\n"); - int tx = get_global_id(0); //2D Global Thread ID x - int ty = get_global_id(1); //2D Global Thread ID y - //int tx = get_global_id(0); //2D Global Thread ID x - //int ty = get_global_id(1); //2D Global Thread ID y + // printf("Entered function\n"); + int tx = get_global_id(0); // 2D Global Thread ID x + int ty = get_global_id(1); // 2D Global Thread ID y + // int tx = get_global_id(0); //2D Global Thread ID x + // int ty = get_global_id(1); //2D Global Thread ID y - //printf("Computing element (%d, %d)\n", tx, ty); + // printf("Computing element (%d, %d)\n", tx, ty); // Initialize accumulator float res = 0.0f; // Perform dot-product of row-column for (int i = 0; i < k; i++) { - //printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx); - res += A[ty*k+i] * B[i*n+tx]; + // printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx); + res += A[ty * k + i] * B[i * n + tx]; } - //printf("Result computed\n"); + // printf("Result computed\n"); // Write in device memory - C[ty*n+tx] = res; + C[ty * n + tx] = res; - //printf("Result written to C\n"); + // printf("Result written to C\n"); } // Main -int main(int argc, char** argv) { +int main(int argc, char **argv) { // seed for rand() srand(2006); @@ -95,46 +94,48 @@ int main(int argc, char** argv) { // Allocate host memory for matrices A and B unsigned int size_A = WA * HA; size_t bytes_A = sizeof(float) * size_A; - float* h_A = (float*) malloc(bytes_A); + float *h_A = (float *)malloc(bytes_A); unsigned int size_B = WB * HB; size_t bytes_B = sizeof(float) * size_B; - float* h_B = (float*) malloc(bytes_B); - - // Initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); - -/* - // Print A and B - printf("\n\nMatrix A\n"); - for(int i = 0; i < size_A; i++) - { - printf("%f ", h_A[i]); - if(((i + 1) % WA) == 0) - printf("\n"); - } 
- - printf("\n\nMatrix B\n"); - for(int i = 0; i < size_B; i++) - { - printf("%f ", h_B[i]); - if(((i + 1) % WB) == 0) - printf("\n"); - } -*/ + float *h_B = (float *)malloc(bytes_B); + + // Initialize host memory + randomInit(h_A, size_A); + randomInit(h_B, size_B); + + /* + // Print A and B + printf("\n\nMatrix A\n"); + for(int i = 0; i < size_A; i++) + { + printf("%f ", h_A[i]); + if(((i + 1) % WA) == 0) + printf("\n"); + } + + printf("\n\nMatrix B\n"); + for(int i = 0; i < size_B; i++) + { + printf("%f ", h_B[i]); + if(((i + 1) % WB) == 0) + printf("\n"); + } + */ // Allocate host memory for the result matrix C unsigned int size_C = WC * HC; size_t bytes_C = sizeof(float) * size_C; - float* h_C = (float*) malloc(bytes_C); + float *h_C = (float *)malloc(bytes_C); - // Compute using OpenCL - //matrixMul(h_A, h_B, h_C, WA, WB); + // Compute using OpenCL + // matrixMul(h_A, h_B, h_C, WA, WB); //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); - unsigned graphMM = __visc__node(matrixMul, 2, 2, 16, 16, WB/16, HA/16, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0); + unsigned graphMM = + __visc__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A, + h_B, bytes_B, h_C, bytes_C, WA, WB, 0); __visc__wait(graphMM); - if(checkResults(h_A, h_B, h_C)) + if (checkResults(h_A, h_B, h_C)) printf("\nPass!\n"); else printf("\nFailed!\n"); @@ -145,4 +146,3 @@ int main(int argc, char** argv) { free(h_B); free(h_C); } - diff --git a/hpvm/test/hpvm-cava/scripts/gamut_map.cc b/hpvm/test/hpvm-cava/scripts/gamut_map.cc index ba835f086895f58dc77a2f206590fe806b9010d2..ef5162e120aa95d0e56c6c6142770dc6503f8ce4 100644 --- a/hpvm/test/hpvm-cava/scripts/gamut_map.cc +++ b/hpvm/test/hpvm-cava/scripts/gamut_map.cc @@ -1,50 +1,41 @@ -#include <iostream> #include <cmath> +#include <iostream> #include "gamut_map.h" -void gamut_map(float* input, - int row_size, - int col_size, - int chan_size, - float* result, - float* ctrl_pts, - float* weights, - float* coefs, +void 
gamut_map(float *input, int row_size, int col_size, int chan_size, + float *result, float *ctrl_pts, float *weights, float *coefs, int num_cps) { - ARRAY_3D(float, _input, input, col_size, chan_size); - ARRAY_3D(float, _result, result, col_size, chan_size); - ARRAY_2D(float, _ctrl_pts, ctrl_pts, chan_size); - ARRAY_2D(float, _weights, weights, chan_size); - ARRAY_2D(float, _coefs, coefs, chan_size); + ARRAY_3D(float, _input, input, col_size, chan_size); + ARRAY_3D(float, _result, result, col_size, chan_size); + ARRAY_2D(float, _ctrl_pts, ctrl_pts, chan_size); + ARRAY_2D(float, _weights, weights, chan_size); + ARRAY_2D(float, _coefs, coefs, chan_size); - float* l2_dist = new float[num_cps]; - for (int row = 0; row < row_size; row++) { - for (int col = 0; col < col_size; col++) { - for (int cp = 0; cp < num_cps; cp++) { - l2_dist[cp] = - sqrt((_input[row][col][0] - _ctrl_pts[cp][0]) * - (_input[row][col][0] - _ctrl_pts[cp][0]) + - (_input[row][col][1] - _ctrl_pts[cp][1]) * - (_input[row][col][1] - _ctrl_pts[cp][1]) + - (_input[row][col][2] - _ctrl_pts[cp][2]) * - (_input[row][col][2] - _ctrl_pts[cp][2])); - } - for (int chan = 0; chan < chan_size; chan++) { - float chan_val = 0.0; - for (int cp = 0; cp < num_cps; cp++) { - chan_val += l2_dist[cp] * _weights[cp][chan]; - } - // Add on the biases for the RBF - chan_val += _coefs[0][chan] + - _coefs[1][chan] * _input[row][col][0] + - _coefs[2][chan] * _input[row][col][1] + - _coefs[3][chan] * _input[row][col][2]; - _result[row][col][chan] = (chan_val > 0) ? 
chan_val : 0; - } + float *l2_dist = new float[num_cps]; + for (int row = 0; row < row_size; row++) { + for (int col = 0; col < col_size; col++) { + for (int cp = 0; cp < num_cps; cp++) { + l2_dist[cp] = sqrt((_input[row][col][0] - _ctrl_pts[cp][0]) * + (_input[row][col][0] - _ctrl_pts[cp][0]) + + (_input[row][col][1] - _ctrl_pts[cp][1]) * + (_input[row][col][1] - _ctrl_pts[cp][1]) + + (_input[row][col][2] - _ctrl_pts[cp][2]) * + (_input[row][col][2] - _ctrl_pts[cp][2])); + } + for (int chan = 0; chan < chan_size; chan++) { + float chan_val = 0.0; + for (int cp = 0; cp < num_cps; cp++) { + chan_val += l2_dist[cp] * _weights[cp][chan]; } + // Add on the biases for the RBF + chan_val += _coefs[0][chan] + _coefs[1][chan] * _input[row][col][0] + + _coefs[2][chan] * _input[row][col][1] + + _coefs[3][chan] * _input[row][col][2]; + _result[row][col][chan] = (chan_val > 0) ? chan_val : 0; + } } - delete l2_dist; + } + delete l2_dist; } - diff --git a/hpvm/test/hpvm-cava/scripts/gamut_map.h b/hpvm/test/hpvm-cava/scripts/gamut_map.h index cd8d2ea9ae90bd71651193d7752fe97656d9617a..2742218fcb364d0eb6fe81666cca2921005b3007 100644 --- a/hpvm/test/hpvm-cava/scripts/gamut_map.h +++ b/hpvm/test/hpvm-cava/scripts/gamut_map.h @@ -2,20 +2,14 @@ #define GAMUT_MAP_H #define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1) \ - TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name + TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name #define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2) \ - TYPE(*output_array_name)[DIM_1][DIM_2] = \ - (TYPE(*)[DIM_1][DIM_2])input_array_name + TYPE(*output_array_name) \ + [DIM_1][DIM_2] = (TYPE(*)[DIM_1][DIM_2])input_array_name -void gamut_map(float* input, - int row_size, - int col_size, - int chan_size, - float* result, - float* ctrl_pts, - float* weights, - float* coefs, +void gamut_map(float *input, int row_size, int col_size, int chan_size, + float *result, float *ctrl_pts, float *weights, 
float *coefs, int num_cps); #endif diff --git a/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc b/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc index 21864f4abc4c78ba41d42a984871894fcbd17271..a9efa8ff6e7e312cc46cae3442597d4faca16130 100644 --- a/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc +++ b/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc @@ -3,11 +3,11 @@ * Version 3.0.8 * * This file is not intended to be easily readable and contains a number of - * coding conventions designed to improve portability and efficiency. Do not make - * changes to this file unless you know what you are doing--modify the SWIG + * coding conventions designed to improve portability and efficiency. Do not + * make changes to this file unless you know what you are doing--modify the SWIG * interface file instead. - * ----------------------------------------------------------------------------- */ - + * ----------------------------------------------------------------------------- + */ #ifndef SWIGPYTHON #define SWIGPYTHON @@ -18,114 +18,124 @@ /* ----------------------------------------------------------------------------- * This section contains generic SWIG labels for method/variable * declarations/attributes, and other compiler dependent labels. 
- * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ -/* template workaround for compilers that cannot correctly implement the C++ standard */ +/* template workaround for compilers that cannot correctly implement the C++ + * standard */ #ifndef SWIGTEMPLATEDISAMBIGUATOR -# if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x560) -# define SWIGTEMPLATEDISAMBIGUATOR template -# elif defined(__HP_aCC) -/* Needed even with `aCC -AA' when `aCC -V' reports HP ANSI C++ B3910B A.03.55 */ -/* If we find a maximum version that requires this, the test would be __HP_aCC <= 35500 for A.03.55 */ -# define SWIGTEMPLATEDISAMBIGUATOR template -# else -# define SWIGTEMPLATEDISAMBIGUATOR -# endif +#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x560) +#define SWIGTEMPLATEDISAMBIGUATOR template +#elif defined(__HP_aCC) +/* Needed even with `aCC -AA' when `aCC -V' reports HP ANSI C++ B3910B A.03.55 + */ +/* If we find a maximum version that requires this, the test would be __HP_aCC + * <= 35500 for A.03.55 */ +#define SWIGTEMPLATEDISAMBIGUATOR template +#else +#define SWIGTEMPLATEDISAMBIGUATOR +#endif #endif /* inline attribute */ #ifndef SWIGINLINE -# if defined(__cplusplus) || (defined(__GNUC__) && !defined(__STRICT_ANSI__)) -# define SWIGINLINE inline -# else -# define SWIGINLINE -# endif +#if defined(__cplusplus) || (defined(__GNUC__) && !defined(__STRICT_ANSI__)) +#define SWIGINLINE inline +#else +#define SWIGINLINE +#endif #endif /* attribute recognised by some compilers to avoid 'unused' warnings */ #ifndef SWIGUNUSED -# if defined(__GNUC__) -# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) -# define SWIGUNUSED __attribute__ ((__unused__)) -# else -# define SWIGUNUSED -# endif -# elif defined(__ICC) -# define SWIGUNUSED __attribute__ ((__unused__)) -# else -# define SWIGUNUSED -# endif +#if defined(__GNUC__) +#if 
!(defined(__cplusplus)) || \ + (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +#define SWIGUNUSED __attribute__((__unused__)) +#else +#define SWIGUNUSED +#endif +#elif defined(__ICC) +#define SWIGUNUSED __attribute__((__unused__)) +#else +#define SWIGUNUSED +#endif #endif #ifndef SWIG_MSC_UNSUPPRESS_4505 -# if defined(_MSC_VER) -# pragma warning(disable : 4505) /* unreferenced local function has been removed */ -# endif +#if defined(_MSC_VER) +#pragma warning( \ + disable : 4505) /* unreferenced local function has been removed */ +#endif #endif #ifndef SWIGUNUSEDPARM -# ifdef __cplusplus -# define SWIGUNUSEDPARM(p) -# else -# define SWIGUNUSEDPARM(p) p SWIGUNUSED -# endif +#ifdef __cplusplus +#define SWIGUNUSEDPARM(p) +#else +#define SWIGUNUSEDPARM(p) p SWIGUNUSED +#endif #endif /* internal SWIG method */ #ifndef SWIGINTERN -# define SWIGINTERN static SWIGUNUSED +#define SWIGINTERN static SWIGUNUSED #endif /* internal inline SWIG method */ #ifndef SWIGINTERNINLINE -# define SWIGINTERNINLINE SWIGINTERN SWIGINLINE +#define SWIGINTERNINLINE SWIGINTERN SWIGINLINE #endif /* exporting methods */ #if (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) -# ifndef GCC_HASCLASSVISIBILITY -# define GCC_HASCLASSVISIBILITY -# endif +#ifndef GCC_HASCLASSVISIBILITY +#define GCC_HASCLASSVISIBILITY +#endif #endif #ifndef SWIGEXPORT -# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) -# if defined(STATIC_LINKED) -# define SWIGEXPORT -# else -# define SWIGEXPORT __declspec(dllexport) -# endif -# else -# if defined(__GNUC__) && defined(GCC_HASCLASSVISIBILITY) -# define SWIGEXPORT __attribute__ ((visibility("default"))) -# else -# define SWIGEXPORT -# endif -# endif +#if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) +#if defined(STATIC_LINKED) +#define SWIGEXPORT +#else +#define SWIGEXPORT __declspec(dllexport) +#endif +#else +#if defined(__GNUC__) && defined(GCC_HASCLASSVISIBILITY) +#define SWIGEXPORT __attribute__((visibility("default"))) 
+#else +#define SWIGEXPORT +#endif +#endif #endif /* calling conventions for Windows */ #ifndef SWIGSTDCALL -# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) -# define SWIGSTDCALL __stdcall -# else -# define SWIGSTDCALL -# endif +#if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) +#define SWIGSTDCALL __stdcall +#else +#define SWIGSTDCALL +#endif #endif /* Deal with Microsoft's attempt at deprecating C standard runtime functions */ -#if !defined(SWIG_NO_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_CRT_SECURE_NO_DEPRECATE) -# define _CRT_SECURE_NO_DEPRECATE +#if !defined(SWIG_NO_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && \ + !defined(_CRT_SECURE_NO_DEPRECATE) +#define _CRT_SECURE_NO_DEPRECATE #endif -/* Deal with Microsoft's attempt at deprecating methods in the standard C++ library */ -#if !defined(SWIG_NO_SCL_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_SCL_SECURE_NO_DEPRECATE) -# define _SCL_SECURE_NO_DEPRECATE +/* Deal with Microsoft's attempt at deprecating methods in the standard C++ + * library */ +#if !defined(SWIG_NO_SCL_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && \ + !defined(_SCL_SECURE_NO_DEPRECATE) +#define _SCL_SECURE_NO_DEPRECATE #endif /* Deal with Apple's deprecated 'AssertMacros.h' from Carbon-framework */ -#if defined(__APPLE__) && !defined(__ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES) -# define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0 +#if defined(__APPLE__) && \ + !defined(__ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES) +#define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0 #endif /* Intel's compiler complains if a variable which was never initialised is @@ -134,17 +144,16 @@ * See: https://github.com/swig/swig/issues/192 for more discussion. 
*/ #ifdef __INTEL_COMPILER -# pragma warning disable 592 +#pragma warning disable 592 #endif - #if defined(_DEBUG) && defined(SWIG_PYTHON_INTERPRETER_NO_DEBUG) /* Use debug wrappers with the Python release dll */ -# undef _DEBUG -# include <Python.h> -# define _DEBUG +#undef _DEBUG +#include <Python.h> +#define _DEBUG #else -# include <Python.h> +#include <Python.h> #endif /* ----------------------------------------------------------------------------- @@ -152,19 +161,20 @@ * * This file contains generic C API SWIG runtime support for pointer * type checking. - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ -/* This should only be incremented when either the layout of swig_type_info changes, - or for whatever reason, the runtime changes incompatibly */ +/* This should only be incremented when either the layout of swig_type_info + changes, or for whatever reason, the runtime changes incompatibly */ #define SWIG_RUNTIME_VERSION "4" /* define SWIG_TYPE_TABLE_NAME as "SWIG_TYPE_TABLE" */ #ifdef SWIG_TYPE_TABLE -# define SWIG_QUOTE_STRING(x) #x -# define SWIG_EXPAND_AND_QUOTE_STRING(x) SWIG_QUOTE_STRING(x) -# define SWIG_TYPE_TABLE_NAME SWIG_EXPAND_AND_QUOTE_STRING(SWIG_TYPE_TABLE) +#define SWIG_QUOTE_STRING(x) #x +#define SWIG_EXPAND_AND_QUOTE_STRING(x) SWIG_QUOTE_STRING(x) +#define SWIG_TYPE_TABLE_NAME SWIG_EXPAND_AND_QUOTE_STRING(SWIG_TYPE_TABLE) #else -# define SWIG_TYPE_TABLE_NAME +#define SWIG_TYPE_TABLE_NAME #endif /* @@ -177,25 +187,24 @@ */ #ifndef SWIGRUNTIME -# define SWIGRUNTIME SWIGINTERN +#define SWIGRUNTIME SWIGINTERN #endif #ifndef SWIGRUNTIMEINLINE -# define SWIGRUNTIMEINLINE SWIGRUNTIME SWIGINLINE +#define SWIGRUNTIMEINLINE SWIGRUNTIME SWIGINLINE #endif /* Generic buffer size */ #ifndef SWIG_BUFFER_SIZE -# define SWIG_BUFFER_SIZE 1024 +#define SWIG_BUFFER_SIZE 1024 #endif /* Flags for pointer conversions */ -#define 
SWIG_POINTER_DISOWN 0x1 -#define SWIG_CAST_NEW_MEMORY 0x2 +#define SWIG_POINTER_DISOWN 0x1 +#define SWIG_CAST_NEW_MEMORY 0x2 /* Flags for new pointer objects */ -#define SWIG_POINTER_OWN 0x1 - +#define SWIG_POINTER_OWN 0x1 /* Flags/methods for returning states. @@ -232,7 +241,7 @@ // success code if (SWIG_IsNewObj(res) { ... - delete *ptr; + delete *ptr; } else { ... } @@ -258,9 +267,9 @@ } } - Of course, returning the plain '0(success)/-1(fail)' still works, but you can be - more explicit by returning SWIG_BADOBJ, SWIG_ERROR or any of the - SWIG errors code. + Of course, returning the plain '0(success)/-1(fail)' still works, but you can + be more explicit by returning SWIG_BADOBJ, SWIG_ERROR or any of the SWIG + errors code. Finally, if the SWIG_CASTRANK_MODE is enabled, the result code allows to return the 'cast rank', for example, if you have this @@ -276,52 +285,53 @@ just use the SWIG_AddCast()/SWIG_CheckState() */ -#define SWIG_OK (0) -#define SWIG_ERROR (-1) -#define SWIG_IsOK(r) (r >= 0) -#define SWIG_ArgError(r) ((r != SWIG_ERROR) ? r : SWIG_TypeError) +#define SWIG_OK (0) +#define SWIG_ERROR (-1) +#define SWIG_IsOK(r) (r >= 0) +#define SWIG_ArgError(r) ((r != SWIG_ERROR) ? 
r : SWIG_TypeError) /* The CastRankLimit says how many bits are used for the cast rank */ -#define SWIG_CASTRANKLIMIT (1 << 8) +#define SWIG_CASTRANKLIMIT (1 << 8) /* The NewMask denotes the object was created (using new/malloc) */ -#define SWIG_NEWOBJMASK (SWIG_CASTRANKLIMIT << 1) +#define SWIG_NEWOBJMASK (SWIG_CASTRANKLIMIT << 1) /* The TmpMask is for in/out typemaps that use temporal objects */ -#define SWIG_TMPOBJMASK (SWIG_NEWOBJMASK << 1) +#define SWIG_TMPOBJMASK (SWIG_NEWOBJMASK << 1) /* Simple returning values */ -#define SWIG_BADOBJ (SWIG_ERROR) -#define SWIG_OLDOBJ (SWIG_OK) -#define SWIG_NEWOBJ (SWIG_OK | SWIG_NEWOBJMASK) -#define SWIG_TMPOBJ (SWIG_OK | SWIG_TMPOBJMASK) +#define SWIG_BADOBJ (SWIG_ERROR) +#define SWIG_OLDOBJ (SWIG_OK) +#define SWIG_NEWOBJ (SWIG_OK | SWIG_NEWOBJMASK) +#define SWIG_TMPOBJ (SWIG_OK | SWIG_TMPOBJMASK) /* Check, add and del mask methods */ -#define SWIG_AddNewMask(r) (SWIG_IsOK(r) ? (r | SWIG_NEWOBJMASK) : r) -#define SWIG_DelNewMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_NEWOBJMASK) : r) -#define SWIG_IsNewObj(r) (SWIG_IsOK(r) && (r & SWIG_NEWOBJMASK)) -#define SWIG_AddTmpMask(r) (SWIG_IsOK(r) ? (r | SWIG_TMPOBJMASK) : r) -#define SWIG_DelTmpMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_TMPOBJMASK) : r) -#define SWIG_IsTmpObj(r) (SWIG_IsOK(r) && (r & SWIG_TMPOBJMASK)) +#define SWIG_AddNewMask(r) (SWIG_IsOK(r) ? (r | SWIG_NEWOBJMASK) : r) +#define SWIG_DelNewMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_NEWOBJMASK) : r) +#define SWIG_IsNewObj(r) (SWIG_IsOK(r) && (r & SWIG_NEWOBJMASK)) +#define SWIG_AddTmpMask(r) (SWIG_IsOK(r) ? (r | SWIG_TMPOBJMASK) : r) +#define SWIG_DelTmpMask(r) (SWIG_IsOK(r) ? 
(r & ~SWIG_TMPOBJMASK) : r) +#define SWIG_IsTmpObj(r) (SWIG_IsOK(r) && (r & SWIG_TMPOBJMASK)) /* Cast-Rank Mode */ #if defined(SWIG_CASTRANK_MODE) -# ifndef SWIG_TypeRank -# define SWIG_TypeRank unsigned long -# endif -# ifndef SWIG_MAXCASTRANK /* Default cast allowed */ -# define SWIG_MAXCASTRANK (2) -# endif -# define SWIG_CASTRANKMASK ((SWIG_CASTRANKLIMIT) -1) -# define SWIG_CastRank(r) (r & SWIG_CASTRANKMASK) +#ifndef SWIG_TypeRank +#define SWIG_TypeRank unsigned long +#endif +#ifndef SWIG_MAXCASTRANK /* Default cast allowed */ +#define SWIG_MAXCASTRANK (2) +#endif +#define SWIG_CASTRANKMASK ((SWIG_CASTRANKLIMIT)-1) +#define SWIG_CastRank(r) (r & SWIG_CASTRANKMASK) SWIGINTERNINLINE int SWIG_AddCast(int r) { - return SWIG_IsOK(r) ? ((SWIG_CastRank(r) < SWIG_MAXCASTRANK) ? (r + 1) : SWIG_ERROR) : r; + return SWIG_IsOK(r) + ? ((SWIG_CastRank(r) < SWIG_MAXCASTRANK) ? (r + 1) : SWIG_ERROR) + : r; } SWIGINTERNINLINE int SWIG_CheckState(int r) { return SWIG_IsOK(r) ? SWIG_CastRank(r) + 1 : 0; } #else /* no cast-rank mode */ -# define SWIG_AddCast(r) (r) -# define SWIG_CheckState(r) (SWIG_IsOK(r) ? 1 : 0) +#define SWIG_AddCast(r) (r) +#define SWIG_CheckState(r) (SWIG_IsOK(r) ? 
1 : 0) #endif - #include <string.h> #ifdef __cplusplus @@ -333,32 +343,37 @@ typedef struct swig_type_info *(*swig_dycast_func)(void **); /* Structure to store information on one type */ typedef struct swig_type_info { - const char *name; /* mangled name of this type */ - const char *str; /* human readable name of this type */ - swig_dycast_func dcast; /* dynamic cast function down a hierarchy */ - struct swig_cast_info *cast; /* linked list of types that can cast into this type */ - void *clientdata; /* language specific type data */ - int owndata; /* flag if the structure owns the clientdata */ + const char *name; /* mangled name of this type */ + const char *str; /* human readable name of this type */ + swig_dycast_func dcast; /* dynamic cast function down a hierarchy */ + struct swig_cast_info + *cast; /* linked list of types that can cast into this type */ + void *clientdata; /* language specific type data */ + int owndata; /* flag if the structure owns the clientdata */ } swig_type_info; /* Structure to store a type and conversion function used for casting */ typedef struct swig_cast_info { - swig_type_info *type; /* pointer to type that is equivalent to this type */ - swig_converter_func converter; /* function to cast the void pointers */ - struct swig_cast_info *next; /* pointer to next cast in linked list */ - struct swig_cast_info *prev; /* pointer to the previous cast */ + swig_type_info *type; /* pointer to type that is equivalent to this type */ + swig_converter_func converter; /* function to cast the void pointers */ + struct swig_cast_info *next; /* pointer to next cast in linked list */ + struct swig_cast_info *prev; /* pointer to the previous cast */ } swig_cast_info; /* Structure used to store module information * Each module generates one structure like this, and the runtime collects * all of these structures and stores them in a circularly linked list.*/ typedef struct swig_module_info { - swig_type_info **types; /* Array of pointers to 
swig_type_info structures that are in this module */ - size_t size; /* Number of types in this module */ - struct swig_module_info *next; /* Pointer to next element in circularly linked list */ - swig_type_info **type_initial; /* Array of initially generated type structures */ - swig_cast_info **cast_initial; /* Array of initially generated casting structures */ - void *clientdata; /* Language specific module data */ + swig_type_info **types; /* Array of pointers to swig_type_info structures that + are in this module */ + size_t size; /* Number of types in this module */ + struct swig_module_info + *next; /* Pointer to next element in circularly linked list */ + swig_type_info * + *type_initial; /* Array of initially generated type structures */ + swig_cast_info * + *cast_initial; /* Array of initially generated casting structures */ + void *clientdata; /* Language specific module data */ } swig_module_info; /* @@ -368,13 +383,15 @@ typedef struct swig_module_info { Return 0 when the two name types are equivalent, as in strncmp, but skipping ' '. */ -SWIGRUNTIME int -SWIG_TypeNameComp(const char *f1, const char *l1, - const char *f2, const char *l2) { - for (;(f1 != l1) && (f2 != l2); ++f1, ++f2) { - while ((*f1 == ' ') && (f1 != l1)) ++f1; - while ((*f2 == ' ') && (f2 != l2)) ++f2; - if (*f1 != *f2) return (*f1 > *f2) ? 1 : -1; +SWIGRUNTIME int SWIG_TypeNameComp(const char *f1, const char *l1, + const char *f2, const char *l2) { + for (; (f1 != l1) && (f2 != l2); ++f1, ++f2) { + while ((*f1 == ' ') && (f1 != l1)) + ++f1; + while ((*f2 == ' ') && (f2 != l2)) + ++f2; + if (*f1 != *f2) + return (*f1 > *f2) ? 1 : -1; } return (int)((l1 - f1) - (l2 - f2)); } @@ -383,17 +400,18 @@ SWIG_TypeNameComp(const char *f1, const char *l1, Check type equivalence in a name list like <name1>|<name2>|... 
Return 0 if equal, -1 if nb < tb, 1 if nb > tb */ -SWIGRUNTIME int -SWIG_TypeCmp(const char *nb, const char *tb) { +SWIGRUNTIME int SWIG_TypeCmp(const char *nb, const char *tb) { int equiv = 1; - const char* te = tb + strlen(tb); - const char* ne = nb; + const char *te = tb + strlen(tb); + const char *ne = nb; while (equiv != 0 && *ne) { for (nb = ne; *ne; ++ne) { - if (*ne == '|') break; + if (*ne == '|') + break; } equiv = SWIG_TypeNameComp(nb, ne, tb, te); - if (*ne) ++ne; + if (*ne) + ++ne; } return equiv; } @@ -402,16 +420,14 @@ SWIG_TypeCmp(const char *nb, const char *tb) { Check type equivalence in a name list like <name1>|<name2>|... Return 0 if not equal, 1 if equal */ -SWIGRUNTIME int -SWIG_TypeEquiv(const char *nb, const char *tb) { +SWIGRUNTIME int SWIG_TypeEquiv(const char *nb, const char *tb) { return SWIG_TypeCmp(nb, tb) == 0 ? 1 : 0; } /* Check the typename */ -SWIGRUNTIME swig_cast_info * -SWIG_TypeCheck(const char *c, swig_type_info *ty) { +SWIGRUNTIME swig_cast_info *SWIG_TypeCheck(const char *c, swig_type_info *ty) { if (ty) { swig_cast_info *iter = ty->cast; while (iter) { @@ -424,7 +440,8 @@ SWIG_TypeCheck(const char *c, swig_type_info *ty) { iter->next->prev = iter->prev; iter->next = ty->cast; iter->prev = 0; - if (ty->cast) ty->cast->prev = iter; + if (ty->cast) + ty->cast->prev = iter; ty->cast = iter; return iter; } @@ -435,10 +452,11 @@ SWIG_TypeCheck(const char *c, swig_type_info *ty) { } /* - Identical to SWIG_TypeCheck, except strcmp is replaced with a pointer comparison + Identical to SWIG_TypeCheck, except strcmp is replaced with a pointer + comparison */ -SWIGRUNTIME swig_cast_info * -SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) { +SWIGRUNTIME swig_cast_info *SWIG_TypeCheckStruct(swig_type_info *from, + swig_type_info *ty) { if (ty) { swig_cast_info *iter = ty->cast; while (iter) { @@ -451,7 +469,8 @@ SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) { iter->next->prev = iter->prev; iter->next = 
ty->cast; iter->prev = 0; - if (ty->cast) ty->cast->prev = iter; + if (ty->cast) + ty->cast->prev = iter; ty->cast = iter; return iter; } @@ -464,21 +483,23 @@ SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) { /* Cast a pointer up an inheritance hierarchy */ -SWIGRUNTIMEINLINE void * -SWIG_TypeCast(swig_cast_info *ty, void *ptr, int *newmemory) { +SWIGRUNTIMEINLINE void *SWIG_TypeCast(swig_cast_info *ty, void *ptr, + int *newmemory) { return ((!ty) || (!ty->converter)) ? ptr : (*ty->converter)(ptr, newmemory); } /* Dynamic pointer casting. Down an inheritance hierarchy */ -SWIGRUNTIME swig_type_info * -SWIG_TypeDynamicCast(swig_type_info *ty, void **ptr) { +SWIGRUNTIME swig_type_info *SWIG_TypeDynamicCast(swig_type_info *ty, + void **ptr) { swig_type_info *lastty = ty; - if (!ty || !ty->dcast) return ty; + if (!ty || !ty->dcast) + return ty; while (ty && (ty->dcast)) { ty = (*ty->dcast)(ptr); - if (ty) lastty = ty; + if (ty) + lastty = ty; } return lastty; } @@ -486,8 +507,7 @@ SWIG_TypeDynamicCast(swig_type_info *ty, void **ptr) { /* Return the name associated with this type */ -SWIGRUNTIMEINLINE const char * -SWIG_TypeName(const swig_type_info *ty) { +SWIGRUNTIMEINLINE const char *SWIG_TypeName(const swig_type_info *ty) { return ty->name; } @@ -495,29 +515,28 @@ SWIG_TypeName(const swig_type_info *ty) { Return the pretty name associated with this type, that is an unmangled type name in a form presentable to the user. */ -SWIGRUNTIME const char * -SWIG_TypePrettyName(const swig_type_info *type) { +SWIGRUNTIME const char *SWIG_TypePrettyName(const swig_type_info *type) { /* The "str" field contains the equivalent pretty names of the type, separated by vertical-bar characters. We choose to print the last name, as it is often (?) the most specific. 
*/ - if (!type) return NULL; + if (!type) + return NULL; if (type->str != NULL) { const char *last_name = type->str; const char *s; for (s = type->str; *s; s++) - if (*s == '|') last_name = s+1; + if (*s == '|') + last_name = s + 1; return last_name; - } - else + } else return type->name; } /* Set the clientdata field for a type */ -SWIGRUNTIME void -SWIG_TypeClientData(swig_type_info *ti, void *clientdata) { +SWIGRUNTIME void SWIG_TypeClientData(swig_type_info *ti, void *clientdata) { swig_cast_info *cast = ti->cast; /* if (ti->clientdata == clientdata) return; */ ti->clientdata = clientdata; @@ -526,14 +545,13 @@ SWIG_TypeClientData(swig_type_info *ti, void *clientdata) { if (!cast->converter) { swig_type_info *tc = cast->type; if (!tc->clientdata) { - SWIG_TypeClientData(tc, clientdata); + SWIG_TypeClientData(tc, clientdata); } } cast = cast->next; } } -SWIGRUNTIME void -SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) { +SWIGRUNTIME void SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) { SWIG_TypeClientData(ti, clientdata); ti->owndata = 1; } @@ -543,38 +561,37 @@ SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) { Search is a O(log #types) We start searching at module start, and finish searching when start == end. - Note: if start == end at the beginning of the function, we go all the way around - the circular list. + Note: if start == end at the beginning of the function, we go all the way + around the circular list. 
*/ -SWIGRUNTIME swig_type_info * -SWIG_MangledTypeQueryModule(swig_module_info *start, - swig_module_info *end, - const char *name) { +SWIGRUNTIME swig_type_info *SWIG_MangledTypeQueryModule(swig_module_info *start, + swig_module_info *end, + const char *name) { swig_module_info *iter = start; do { if (iter->size) { size_t l = 0; size_t r = iter->size - 1; do { - /* since l+r >= 0, we can (>> 1) instead (/ 2) */ - size_t i = (l + r) >> 1; - const char *iname = iter->types[i]->name; - if (iname) { - int compare = strcmp(name, iname); - if (compare == 0) { - return iter->types[i]; - } else if (compare < 0) { - if (i) { - r = i - 1; - } else { - break; - } - } else if (compare > 0) { - l = i + 1; - } - } else { - break; /* should never happen */ - } + /* since l+r >= 0, we can (>> 1) instead (/ 2) */ + size_t i = (l + r) >> 1; + const char *iname = iter->types[i]->name; + if (iname) { + int compare = strcmp(name, iname); + if (compare == 0) { + return iter->types[i]; + } else if (compare < 0) { + if (i) { + r = i - 1; + } else { + break; + } + } else if (compare > 0) { + l = i + 1; + } + } else { + break; /* should never happen */ + } } while (l <= r); } iter = iter->next; @@ -583,18 +600,18 @@ SWIG_MangledTypeQueryModule(swig_module_info *start, } /* - Search for a swig_type_info structure for either a mangled name or a human readable name. - It first searches the mangled names of the types, which is a O(log #types) - If a type is not found it then searches the human readable names, which is O(#types). + Search for a swig_type_info structure for either a mangled name or a human + readable name. It first searches the mangled names of the types, which is a + O(log #types) If a type is not found it then searches the human readable + names, which is O(#types). We start searching at module start, and finish searching when start == end. - Note: if start == end at the beginning of the function, we go all the way around - the circular list. 
+ Note: if start == end at the beginning of the function, we go all the way + around the circular list. */ -SWIGRUNTIME swig_type_info * -SWIG_TypeQueryModule(swig_module_info *start, - swig_module_info *end, - const char *name) { +SWIGRUNTIME swig_type_info *SWIG_TypeQueryModule(swig_module_info *start, + swig_module_info *end, + const char *name) { /* STEP 1: Search the name field using binary search */ swig_type_info *ret = SWIG_MangledTypeQueryModule(start, end, name); if (ret) { @@ -606,8 +623,8 @@ SWIG_TypeQueryModule(swig_module_info *start, do { size_t i = 0; for (; i < iter->size; ++i) { - if (iter->types[i]->str && (SWIG_TypeEquiv(iter->types[i]->str, name))) - return iter->types[i]; + if (iter->types[i]->str && (SWIG_TypeEquiv(iter->types[i]->str, name))) + return iter->types[i]; } iter = iter->next; } while (iter != end); @@ -620,11 +637,10 @@ SWIG_TypeQueryModule(swig_module_info *start, /* Pack binary data into a string */ -SWIGRUNTIME char * -SWIG_PackData(char *c, void *ptr, size_t sz) { +SWIGRUNTIME char *SWIG_PackData(char *c, void *ptr, size_t sz) { static const char hex[17] = "0123456789abcdef"; - const unsigned char *u = (unsigned char *) ptr; - const unsigned char *eu = u + sz; + const unsigned char *u = (unsigned char *)ptr; + const unsigned char *eu = u + sz; for (; u != eu; ++u) { unsigned char uu = *u; *(c++) = hex[(uu & 0xf0) >> 4]; @@ -636,9 +652,8 @@ SWIG_PackData(char *c, void *ptr, size_t sz) { /* Unpack binary data from a string */ -SWIGRUNTIME const char * -SWIG_UnpackData(const char *c, void *ptr, size_t sz) { - unsigned char *u = (unsigned char *) ptr; +SWIGRUNTIME const char *SWIG_UnpackData(const char *c, void *ptr, size_t sz) { + unsigned char *u = (unsigned char *)ptr; const unsigned char *eu = u + sz; for (; u != eu; ++u) { char d = *(c++); @@ -646,16 +661,16 @@ SWIG_UnpackData(const char *c, void *ptr, size_t sz) { if ((d >= '0') && (d <= '9')) uu = ((d - '0') << 4); else if ((d >= 'a') && (d <= 'f')) - uu = ((d - ('a'-10)) 
<< 4); + uu = ((d - ('a' - 10)) << 4); else - return (char *) 0; + return (char *)0; d = *(c++); if ((d >= '0') && (d <= '9')) uu |= (d - '0'); else if ((d >= 'a') && (d <= 'f')) - uu |= (d - ('a'-10)); + uu |= (d - ('a' - 10)); else - return (char *) 0; + return (char *)0; *u = uu; } return c; @@ -664,56 +679,59 @@ SWIG_UnpackData(const char *c, void *ptr, size_t sz) { /* Pack 'void *' into a string buffer. */ -SWIGRUNTIME char * -SWIG_PackVoidPtr(char *buff, void *ptr, const char *name, size_t bsz) { +SWIGRUNTIME char *SWIG_PackVoidPtr(char *buff, void *ptr, const char *name, + size_t bsz) { char *r = buff; - if ((2*sizeof(void *) + 2) > bsz) return 0; + if ((2 * sizeof(void *) + 2) > bsz) + return 0; *(r++) = '_'; - r = SWIG_PackData(r,&ptr,sizeof(void *)); - if (strlen(name) + 1 > (bsz - (r - buff))) return 0; - strcpy(r,name); + r = SWIG_PackData(r, &ptr, sizeof(void *)); + if (strlen(name) + 1 > (bsz - (r - buff))) + return 0; + strcpy(r, name); return buff; } -SWIGRUNTIME const char * -SWIG_UnpackVoidPtr(const char *c, void **ptr, const char *name) { +SWIGRUNTIME const char *SWIG_UnpackVoidPtr(const char *c, void **ptr, + const char *name) { if (*c != '_') { - if (strcmp(c,"NULL") == 0) { - *ptr = (void *) 0; + if (strcmp(c, "NULL") == 0) { + *ptr = (void *)0; return name; } else { return 0; } } - return SWIG_UnpackData(++c,ptr,sizeof(void *)); + return SWIG_UnpackData(++c, ptr, sizeof(void *)); } -SWIGRUNTIME char * -SWIG_PackDataName(char *buff, void *ptr, size_t sz, const char *name, size_t bsz) { +SWIGRUNTIME char *SWIG_PackDataName(char *buff, void *ptr, size_t sz, + const char *name, size_t bsz) { char *r = buff; size_t lname = (name ? 
strlen(name) : 0); - if ((2*sz + 2 + lname) > bsz) return 0; + if ((2 * sz + 2 + lname) > bsz) + return 0; *(r++) = '_'; - r = SWIG_PackData(r,ptr,sz); + r = SWIG_PackData(r, ptr, sz); if (lname) { - strncpy(r,name,lname+1); + strncpy(r, name, lname + 1); } else { *r = 0; } return buff; } -SWIGRUNTIME const char * -SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) { +SWIGRUNTIME const char *SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, + const char *name) { if (*c != '_') { - if (strcmp(c,"NULL") == 0) { - memset(ptr,0,sz); + if (strcmp(c, "NULL") == 0) { + memset(ptr, 0, sz); return name; } else { return 0; } } - return SWIG_UnpackData(++c,ptr,sz); + return SWIG_UnpackData(++c, ptr, sz); } #ifdef __cplusplus @@ -721,21 +739,19 @@ SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) { #endif /* Errors in SWIG */ -#define SWIG_UnknownError -1 -#define SWIG_IOError -2 -#define SWIG_RuntimeError -3 -#define SWIG_IndexError -4 -#define SWIG_TypeError -5 -#define SWIG_DivisionByZero -6 -#define SWIG_OverflowError -7 -#define SWIG_SyntaxError -8 -#define SWIG_ValueError -9 -#define SWIG_SystemError -10 -#define SWIG_AttributeError -11 -#define SWIG_MemoryError -12 -#define SWIG_NullReferenceError -13 - - +#define SWIG_UnknownError -1 +#define SWIG_IOError -2 +#define SWIG_RuntimeError -3 +#define SWIG_IndexError -4 +#define SWIG_TypeError -5 +#define SWIG_DivisionByZero -6 +#define SWIG_OverflowError -7 +#define SWIG_SyntaxError -8 +#define SWIG_ValueError -9 +#define SWIG_SystemError -10 +#define SWIG_AttributeError -11 +#define SWIG_MemoryError -12 +#define SWIG_NullReferenceError -13 /* Compatibility macros for Python 3 */ #if PY_VERSION_HEX >= 0x03000000 @@ -747,9 +763,9 @@ SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) { #define PyInt_FromSize_t(x) PyLong_FromSize_t(x) #define PyString_Check(name) PyBytes_Check(name) #define PyString_FromString(x) PyUnicode_FromString(x) -#define 
PyString_Format(fmt, args) PyUnicode_Format(fmt, args) +#define PyString_Format(fmt, args) PyUnicode_Format(fmt, args) #define PyString_AsString(str) PyBytes_AsString(str) -#define PyString_Size(str) PyBytes_Size(str) +#define PyString_Size(str) PyBytes_Size(str) #define PyString_InternFromString(key) PyUnicode_InternFromString(key) #define Py_TPFLAGS_HAVE_CLASS Py_TPFLAGS_BASETYPE #define PyString_AS_STRING(x) PyUnicode_AS_STRING(x) @@ -758,32 +774,29 @@ SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) { #endif #ifndef Py_TYPE -# define Py_TYPE(op) ((op)->ob_type) +#define Py_TYPE(op) ((op)->ob_type) #endif /* SWIG APIs for compatibility of both Python 2 & 3 */ #if PY_VERSION_HEX >= 0x03000000 -# define SWIG_Python_str_FromFormat PyUnicode_FromFormat +#define SWIG_Python_str_FromFormat PyUnicode_FromFormat #else -# define SWIG_Python_str_FromFormat PyString_FromFormat +#define SWIG_Python_str_FromFormat PyString_FromFormat #endif - /* Warning: This function will allocate a new string in Python 3, * so please call SWIG_Python_str_DelForPy3(x) to free the space. 
*/ -SWIGINTERN char* -SWIG_Python_str_AsChar(PyObject *str) -{ +SWIGINTERN char *SWIG_Python_str_AsChar(PyObject *str) { #if PY_VERSION_HEX >= 0x03000000 char *cstr; char *newstr; Py_ssize_t len; str = PyUnicode_AsUTF8String(str); PyBytes_AsStringAndSize(str, &cstr, &len); - newstr = (char *) malloc(len+1); - memcpy(newstr, cstr, len+1); + newstr = (char *)malloc(len + 1); + memcpy(newstr, cstr, len + 1); Py_XDECREF(str); return newstr; #else @@ -792,17 +805,14 @@ SWIG_Python_str_AsChar(PyObject *str) } #if PY_VERSION_HEX >= 0x03000000 -# define SWIG_Python_str_DelForPy3(x) free( (void*) (x) ) +#define SWIG_Python_str_DelForPy3(x) free((void *)(x)) #else -# define SWIG_Python_str_DelForPy3(x) +#define SWIG_Python_str_DelForPy3(x) #endif - -SWIGINTERN PyObject* -SWIG_Python_str_FromChar(const char *c) -{ +SWIGINTERN PyObject *SWIG_Python_str_FromChar(const char *c) { #if PY_VERSION_HEX >= 0x03000000 - return PyUnicode_FromString(c); + return PyUnicode_FromString(c); #else return PyString_FromString(c); #endif @@ -810,22 +820,21 @@ SWIG_Python_str_FromChar(const char *c) /* Add PyOS_snprintf for old Pythons */ #if PY_VERSION_HEX < 0x02020000 -# if defined(_MSC_VER) || defined(__BORLANDC__) || defined(_WATCOM) -# define PyOS_snprintf _snprintf -# else -# define PyOS_snprintf snprintf -# endif +#if defined(_MSC_VER) || defined(__BORLANDC__) || defined(_WATCOM) +#define PyOS_snprintf _snprintf +#else +#define PyOS_snprintf snprintf +#endif #endif /* A crude PyString_FromFormat implementation for old Pythons */ #if PY_VERSION_HEX < 0x02020000 #ifndef SWIG_PYBUFFER_SIZE -# define SWIG_PYBUFFER_SIZE 1024 +#define SWIG_PYBUFFER_SIZE 1024 #endif -static PyObject * -PyString_FromFormat(const char *fmt, ...) { +static PyObject *PyString_FromFormat(const char *fmt, ...) { va_list ap; char buf[SWIG_PYBUFFER_SIZE * 2]; int res; @@ -838,48 +847,50 @@ PyString_FromFormat(const char *fmt, ...) 
{ /* Add PyObject_Del for old Pythons */ #if PY_VERSION_HEX < 0x01060000 -# define PyObject_Del(op) PyMem_DEL((op)) +#define PyObject_Del(op) PyMem_DEL((op)) #endif #ifndef PyObject_DEL -# define PyObject_DEL PyObject_Del +#define PyObject_DEL PyObject_Del #endif /* A crude PyExc_StopIteration exception for old Pythons */ #if PY_VERSION_HEX < 0x02020000 -# ifndef PyExc_StopIteration -# define PyExc_StopIteration PyExc_RuntimeError -# endif -# ifndef PyObject_GenericGetAttr -# define PyObject_GenericGetAttr 0 -# endif +#ifndef PyExc_StopIteration +#define PyExc_StopIteration PyExc_RuntimeError +#endif +#ifndef PyObject_GenericGetAttr +#define PyObject_GenericGetAttr 0 +#endif #endif /* Py_NotImplemented is defined in 2.1 and up. */ #if PY_VERSION_HEX < 0x02010000 -# ifndef Py_NotImplemented -# define Py_NotImplemented PyExc_RuntimeError -# endif +#ifndef Py_NotImplemented +#define Py_NotImplemented PyExc_RuntimeError +#endif #endif /* A crude PyString_AsStringAndSize implementation for old Pythons */ #if PY_VERSION_HEX < 0x02010000 -# ifndef PyString_AsStringAndSize -# define PyString_AsStringAndSize(obj, s, len) {*s = PyString_AsString(obj); *len = *s ? strlen(*s) : 0;} -# endif +#ifndef PyString_AsStringAndSize +#define PyString_AsStringAndSize(obj, s, len) \ + { \ + *s = PyString_AsString(obj); \ + *len = *s ? strlen(*s) : 0; \ + } +#endif #endif /* PySequence_Size for old Pythons */ #if PY_VERSION_HEX < 0x02000000 -# ifndef PySequence_Size -# define PySequence_Size PySequence_Length -# endif +#ifndef PySequence_Size +#define PySequence_Size PySequence_Length +#endif #endif /* PyBool_FromLong for old Pythons */ #if PY_VERSION_HEX < 0x02030000 -static -PyObject *PyBool_FromLong(long ok) -{ +static PyObject *PyBool_FromLong(long ok) { PyObject *result = ok ? 
Py_True : Py_False; Py_INCREF(result); return result; @@ -891,8 +902,8 @@ PyObject *PyBool_FromLong(long ok) /* http://www.python.org/dev/peps/pep-0353/#conversion-guidelines */ #if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) typedef int Py_ssize_t; -# define PY_SSIZE_T_MAX INT_MAX -# define PY_SSIZE_T_MIN INT_MIN +#define PY_SSIZE_T_MAX INT_MAX +#define PY_SSIZE_T_MIN INT_MIN typedef inquiry lenfunc; typedef intargfunc ssizeargfunc; typedef intintargfunc ssizessizeargfunc; @@ -902,8 +913,7 @@ typedef getreadbufferproc readbufferproc; typedef getwritebufferproc writebufferproc; typedef getsegcountproc segcountproc; typedef getcharbufferproc charbufferproc; -static long PyNumber_AsSsize_t (PyObject *x, void *SWIGUNUSEDPARM(exc)) -{ +static long PyNumber_AsSsize_t(PyObject *x, void *SWIGUNUSEDPARM(exc)) { long result = 0; PyObject *i = PyNumber_Int(x); if (i) { @@ -919,13 +929,13 @@ static long PyNumber_AsSsize_t (PyObject *x, void *SWIGUNUSEDPARM(exc)) #endif #if PY_VERSION_HEX < 0x02040000 -#define Py_VISIT(op) \ - do { \ - if (op) { \ - int vret = visit((op), arg); \ - if (vret) \ - return vret; \ - } \ +#define Py_VISIT(op) \ + do { \ + if (op) { \ + int vret = visit((op), arg); \ + if (vret) \ + return vret; \ + } \ } while (0) #endif @@ -944,11 +954,13 @@ typedef struct { typedef destructor freefunc; #endif -#if ((PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 6) || \ - (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 0) || \ +#if ((PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 6) || \ + (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 0) || \ (PY_MAJOR_VERSION > 3)) -# define SWIGPY_USE_CAPSULE -# define SWIGPY_CAPSULE_NAME ((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION ".type_pointer_capsule" SWIG_TYPE_TABLE_NAME) +#define SWIGPY_USE_CAPSULE +#define SWIGPY_CAPSULE_NAME \ + ((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION \ + ".type_pointer_capsule" SWIG_TYPE_TABLE_NAME) #endif #if PY_VERSION_HEX < 0x03020000 @@ -958,12 +970,12 @@ typedef destructor 
freefunc; /* ----------------------------------------------------------------------------- * error manipulation - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ -SWIGRUNTIME PyObject* -SWIG_Python_ErrorType(int code) { - PyObject* type = 0; - switch(code) { +SWIGRUNTIME PyObject *SWIG_Python_ErrorType(int code) { + PyObject *type = 0; + switch (code) { case SWIG_MemoryError: type = PyExc_MemoryError; break; @@ -1003,15 +1015,13 @@ SWIG_Python_ErrorType(int code) { return type; } - -SWIGRUNTIME void -SWIG_Python_AddErrorMsg(const char* mesg) -{ +SWIGRUNTIME void SWIG_Python_AddErrorMsg(const char *mesg) { PyObject *type = 0; PyObject *value = 0; PyObject *traceback = 0; - if (PyErr_Occurred()) PyErr_Fetch(&type, &value, &traceback); + if (PyErr_Occurred()) + PyErr_Fetch(&type, &value, &traceback); if (value) { char *tmp; PyObject *old_str = PyObject_Str(value); @@ -1028,75 +1038,93 @@ SWIG_Python_AddErrorMsg(const char* mesg) } #if defined(SWIG_PYTHON_NO_THREADS) -# if defined(SWIG_PYTHON_THREADS) -# undef SWIG_PYTHON_THREADS -# endif +#if defined(SWIG_PYTHON_THREADS) +#undef SWIG_PYTHON_THREADS +#endif #endif #if defined(SWIG_PYTHON_THREADS) /* Threading support is enabled */ -# if !defined(SWIG_PYTHON_USE_GIL) && !defined(SWIG_PYTHON_NO_USE_GIL) -# if (PY_VERSION_HEX >= 0x02030000) /* For 2.3 or later, use the PyGILState calls */ -# define SWIG_PYTHON_USE_GIL -# endif -# endif -# if defined(SWIG_PYTHON_USE_GIL) /* Use PyGILState threads calls */ -# ifndef SWIG_PYTHON_INITIALIZE_THREADS -# define SWIG_PYTHON_INITIALIZE_THREADS PyEval_InitThreads() -# endif -# ifdef __cplusplus /* C++ code */ - class SWIG_Python_Thread_Block { - bool status; - PyGILState_STATE state; - public: - void end() { if (status) { PyGILState_Release(state); status = false;} } - SWIG_Python_Thread_Block() : status(true), state(PyGILState_Ensure()) {} - 
~SWIG_Python_Thread_Block() { end(); } - }; - class SWIG_Python_Thread_Allow { - bool status; - PyThreadState *save; - public: - void end() { if (status) { PyEval_RestoreThread(save); status = false; }} - SWIG_Python_Thread_Allow() : status(true), save(PyEval_SaveThread()) {} - ~SWIG_Python_Thread_Allow() { end(); } - }; -# define SWIG_PYTHON_THREAD_BEGIN_BLOCK SWIG_Python_Thread_Block _swig_thread_block -# define SWIG_PYTHON_THREAD_END_BLOCK _swig_thread_block.end() -# define SWIG_PYTHON_THREAD_BEGIN_ALLOW SWIG_Python_Thread_Allow _swig_thread_allow -# define SWIG_PYTHON_THREAD_END_ALLOW _swig_thread_allow.end() -# else /* C code */ -# define SWIG_PYTHON_THREAD_BEGIN_BLOCK PyGILState_STATE _swig_thread_block = PyGILState_Ensure() -# define SWIG_PYTHON_THREAD_END_BLOCK PyGILState_Release(_swig_thread_block) -# define SWIG_PYTHON_THREAD_BEGIN_ALLOW PyThreadState *_swig_thread_allow = PyEval_SaveThread() -# define SWIG_PYTHON_THREAD_END_ALLOW PyEval_RestoreThread(_swig_thread_allow) -# endif -# else /* Old thread way, not implemented, user must provide it */ -# if !defined(SWIG_PYTHON_INITIALIZE_THREADS) -# define SWIG_PYTHON_INITIALIZE_THREADS -# endif -# if !defined(SWIG_PYTHON_THREAD_BEGIN_BLOCK) -# define SWIG_PYTHON_THREAD_BEGIN_BLOCK -# endif -# if !defined(SWIG_PYTHON_THREAD_END_BLOCK) -# define SWIG_PYTHON_THREAD_END_BLOCK -# endif -# if !defined(SWIG_PYTHON_THREAD_BEGIN_ALLOW) -# define SWIG_PYTHON_THREAD_BEGIN_ALLOW -# endif -# if !defined(SWIG_PYTHON_THREAD_END_ALLOW) -# define SWIG_PYTHON_THREAD_END_ALLOW -# endif -# endif +#if !defined(SWIG_PYTHON_USE_GIL) && !defined(SWIG_PYTHON_NO_USE_GIL) +#if (PY_VERSION_HEX >= \ + 0x02030000) /* For 2.3 or later, use the PyGILState calls */ +#define SWIG_PYTHON_USE_GIL +#endif +#endif +#if defined(SWIG_PYTHON_USE_GIL) /* Use PyGILState threads calls */ +#ifndef SWIG_PYTHON_INITIALIZE_THREADS +#define SWIG_PYTHON_INITIALIZE_THREADS PyEval_InitThreads() +#endif +#ifdef __cplusplus /* C++ code */ +class 
SWIG_Python_Thread_Block { + bool status; + PyGILState_STATE state; + +public: + void end() { + if (status) { + PyGILState_Release(state); + status = false; + } + } + SWIG_Python_Thread_Block() : status(true), state(PyGILState_Ensure()) {} + ~SWIG_Python_Thread_Block() { end(); } +}; +class SWIG_Python_Thread_Allow { + bool status; + PyThreadState *save; + +public: + void end() { + if (status) { + PyEval_RestoreThread(save); + status = false; + } + } + SWIG_Python_Thread_Allow() : status(true), save(PyEval_SaveThread()) {} + ~SWIG_Python_Thread_Allow() { end(); } +}; +#define SWIG_PYTHON_THREAD_BEGIN_BLOCK \ + SWIG_Python_Thread_Block _swig_thread_block +#define SWIG_PYTHON_THREAD_END_BLOCK _swig_thread_block.end() +#define SWIG_PYTHON_THREAD_BEGIN_ALLOW \ + SWIG_Python_Thread_Allow _swig_thread_allow +#define SWIG_PYTHON_THREAD_END_ALLOW _swig_thread_allow.end() +#else /* C code */ +#define SWIG_PYTHON_THREAD_BEGIN_BLOCK \ + PyGILState_STATE _swig_thread_block = PyGILState_Ensure() +#define SWIG_PYTHON_THREAD_END_BLOCK PyGILState_Release(_swig_thread_block) +#define SWIG_PYTHON_THREAD_BEGIN_ALLOW \ + PyThreadState *_swig_thread_allow = PyEval_SaveThread() +#define SWIG_PYTHON_THREAD_END_ALLOW PyEval_RestoreThread(_swig_thread_allow) +#endif +#else /* Old thread way, not implemented, user must provide it */ +#if !defined(SWIG_PYTHON_INITIALIZE_THREADS) +#define SWIG_PYTHON_INITIALIZE_THREADS +#endif +#if !defined(SWIG_PYTHON_THREAD_BEGIN_BLOCK) +#define SWIG_PYTHON_THREAD_BEGIN_BLOCK +#endif +#if !defined(SWIG_PYTHON_THREAD_END_BLOCK) +#define SWIG_PYTHON_THREAD_END_BLOCK +#endif +#if !defined(SWIG_PYTHON_THREAD_BEGIN_ALLOW) +#define SWIG_PYTHON_THREAD_BEGIN_ALLOW +#endif +#if !defined(SWIG_PYTHON_THREAD_END_ALLOW) +#define SWIG_PYTHON_THREAD_END_ALLOW +#endif +#endif #else /* No thread support */ -# define SWIG_PYTHON_INITIALIZE_THREADS -# define SWIG_PYTHON_THREAD_BEGIN_BLOCK -# define SWIG_PYTHON_THREAD_END_BLOCK -# define SWIG_PYTHON_THREAD_BEGIN_ALLOW -# 
define SWIG_PYTHON_THREAD_END_ALLOW +#define SWIG_PYTHON_INITIALIZE_THREADS +#define SWIG_PYTHON_THREAD_BEGIN_BLOCK +#define SWIG_PYTHON_THREAD_END_BLOCK +#define SWIG_PYTHON_THREAD_BEGIN_ALLOW +#define SWIG_PYTHON_THREAD_END_ALLOW #endif /* ----------------------------------------------------------------------------- * Python API portion that goes into the runtime - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ #ifdef __cplusplus extern "C" { @@ -1104,11 +1132,12 @@ extern "C" { /* ----------------------------------------------------------------------------- * Constant declarations - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ /* Constant Types */ #define SWIG_PY_POINTER 4 -#define SWIG_PY_BINARY 5 +#define SWIG_PY_BINARY 5 /* Constant information structure */ typedef struct swig_const_info { @@ -1116,23 +1145,24 @@ typedef struct swig_const_info { char *name; long lvalue; double dvalue; - void *pvalue; + void *pvalue; swig_type_info **ptype; } swig_const_info; - /* ----------------------------------------------------------------------------- * Wrapper of PyInstanceMethod_New() used in Python 3 * It is exported to the generated module, used for -fastproxy - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ #if PY_VERSION_HEX >= 0x03000000 -SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), PyObject *func) -{ +SWIGRUNTIME PyObject *SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), + PyObject *func) { return PyInstanceMethod_New(func); } #else -SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), PyObject *SWIGUNUSEDPARM(func)) -{ 
+SWIGRUNTIME PyObject * +SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), + PyObject *SWIGUNUSEDPARM(func)) { return NULL; } #endif @@ -1141,7 +1171,6 @@ SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), } #endif - /* ----------------------------------------------------------------------------- * pyrun.swg * @@ -1149,91 +1178,100 @@ SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), * and includes code for managing global variables and pointer * type checking. * - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ /* Common SWIG API */ /* for raw pointers */ -#define SWIG_Python_ConvertPtr(obj, pptr, type, flags) SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, 0) -#define SWIG_ConvertPtr(obj, pptr, type, flags) SWIG_Python_ConvertPtr(obj, pptr, type, flags) -#define SWIG_ConvertPtrAndOwn(obj,pptr,type,flags,own) SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, own) +#define SWIG_Python_ConvertPtr(obj, pptr, type, flags) \ + SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, 0) +#define SWIG_ConvertPtr(obj, pptr, type, flags) \ + SWIG_Python_ConvertPtr(obj, pptr, type, flags) +#define SWIG_ConvertPtrAndOwn(obj, pptr, type, flags, own) \ + SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, own) #ifdef SWIGPYTHON_BUILTIN -#define SWIG_NewPointerObj(ptr, type, flags) SWIG_Python_NewPointerObj(self, ptr, type, flags) +#define SWIG_NewPointerObj(ptr, type, flags) \ + SWIG_Python_NewPointerObj(self, ptr, type, flags) #else -#define SWIG_NewPointerObj(ptr, type, flags) SWIG_Python_NewPointerObj(NULL, ptr, type, flags) +#define SWIG_NewPointerObj(ptr, type, flags) \ + SWIG_Python_NewPointerObj(NULL, ptr, type, flags) #endif -#define SWIG_InternalNewPointerObj(ptr, type, flags) SWIG_Python_NewPointerObj(NULL, ptr, type, flags) +#define SWIG_InternalNewPointerObj(ptr, type, 
flags) \ + SWIG_Python_NewPointerObj(NULL, ptr, type, flags) -#define SWIG_CheckImplicit(ty) SWIG_Python_CheckImplicit(ty) -#define SWIG_AcquirePtr(ptr, src) SWIG_Python_AcquirePtr(ptr, src) -#define swig_owntype int +#define SWIG_CheckImplicit(ty) SWIG_Python_CheckImplicit(ty) +#define SWIG_AcquirePtr(ptr, src) SWIG_Python_AcquirePtr(ptr, src) +#define swig_owntype int /* for raw packed data */ -#define SWIG_ConvertPacked(obj, ptr, sz, ty) SWIG_Python_ConvertPacked(obj, ptr, sz, ty) -#define SWIG_NewPackedObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type) +#define SWIG_ConvertPacked(obj, ptr, sz, ty) \ + SWIG_Python_ConvertPacked(obj, ptr, sz, ty) +#define SWIG_NewPackedObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type) /* for class or struct pointers */ -#define SWIG_ConvertInstance(obj, pptr, type, flags) SWIG_ConvertPtr(obj, pptr, type, flags) -#define SWIG_NewInstanceObj(ptr, type, flags) SWIG_NewPointerObj(ptr, type, flags) +#define SWIG_ConvertInstance(obj, pptr, type, flags) \ + SWIG_ConvertPtr(obj, pptr, type, flags) +#define SWIG_NewInstanceObj(ptr, type, flags) \ + SWIG_NewPointerObj(ptr, type, flags) /* for C or C++ function pointers */ -#define SWIG_ConvertFunctionPtr(obj, pptr, type) SWIG_Python_ConvertFunctionPtr(obj, pptr, type) -#define SWIG_NewFunctionPtrObj(ptr, type) SWIG_Python_NewPointerObj(NULL, ptr, type, 0) +#define SWIG_ConvertFunctionPtr(obj, pptr, type) \ + SWIG_Python_ConvertFunctionPtr(obj, pptr, type) +#define SWIG_NewFunctionPtrObj(ptr, type) \ + SWIG_Python_NewPointerObj(NULL, ptr, type, 0) /* for C++ member pointers, ie, member methods */ -#define SWIG_ConvertMember(obj, ptr, sz, ty) SWIG_Python_ConvertPacked(obj, ptr, sz, ty) -#define SWIG_NewMemberObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type) - +#define SWIG_ConvertMember(obj, ptr, sz, ty) \ + SWIG_Python_ConvertPacked(obj, ptr, sz, ty) +#define SWIG_NewMemberObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type) /* Runtime API */ -#define 
SWIG_GetModule(clientdata) SWIG_Python_GetModule(clientdata) -#define SWIG_SetModule(clientdata, pointer) SWIG_Python_SetModule(pointer) -#define SWIG_NewClientData(obj) SwigPyClientData_New(obj) - -#define SWIG_SetErrorObj SWIG_Python_SetErrorObj -#define SWIG_SetErrorMsg SWIG_Python_SetErrorMsg -#define SWIG_ErrorType(code) SWIG_Python_ErrorType(code) -#define SWIG_Error(code, msg) SWIG_Python_SetErrorMsg(SWIG_ErrorType(code), msg) -#define SWIG_fail goto fail +#define SWIG_GetModule(clientdata) SWIG_Python_GetModule(clientdata) +#define SWIG_SetModule(clientdata, pointer) SWIG_Python_SetModule(pointer) +#define SWIG_NewClientData(obj) SwigPyClientData_New(obj) +#define SWIG_SetErrorObj SWIG_Python_SetErrorObj +#define SWIG_SetErrorMsg SWIG_Python_SetErrorMsg +#define SWIG_ErrorType(code) SWIG_Python_ErrorType(code) +#define SWIG_Error(code, msg) SWIG_Python_SetErrorMsg(SWIG_ErrorType(code), msg) +#define SWIG_fail goto fail /* Runtime API implementation */ /* Error manipulation */ -SWIGINTERN void -SWIG_Python_SetErrorObj(PyObject *errtype, PyObject *obj) { - SWIG_PYTHON_THREAD_BEGIN_BLOCK; +SWIGINTERN void SWIG_Python_SetErrorObj(PyObject *errtype, PyObject *obj) { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; PyErr_SetObject(errtype, obj); Py_DECREF(obj); SWIG_PYTHON_THREAD_END_BLOCK; } -SWIGINTERN void -SWIG_Python_SetErrorMsg(PyObject *errtype, const char *msg) { +SWIGINTERN void SWIG_Python_SetErrorMsg(PyObject *errtype, const char *msg) { SWIG_PYTHON_THREAD_BEGIN_BLOCK; PyErr_SetString(errtype, msg); SWIG_PYTHON_THREAD_END_BLOCK; } -#define SWIG_Python_Raise(obj, type, desc) SWIG_Python_SetErrorObj(SWIG_Python_ExceptionType(desc), obj) +#define SWIG_Python_Raise(obj, type, desc) \ + SWIG_Python_SetErrorObj(SWIG_Python_ExceptionType(desc), obj) /* Set a constant value */ #if defined(SWIGPYTHON_BUILTIN) -SWIGINTERN void -SwigPyBuiltin_AddPublicSymbol(PyObject *seq, const char *key) { +SWIGINTERN void SwigPyBuiltin_AddPublicSymbol(PyObject *seq, const char *key) { 
PyObject *s = PyString_InternFromString(key); PyList_Append(seq, s); Py_DECREF(s); } -SWIGINTERN void -SWIG_Python_SetConstant(PyObject *d, PyObject *public_interface, const char *name, PyObject *obj) { +SWIGINTERN void SWIG_Python_SetConstant(PyObject *d, PyObject *public_interface, + const char *name, PyObject *obj) { #if PY_VERSION_HEX < 0x02030000 PyDict_SetItemString(d, (char *)name, obj); #else @@ -1246,22 +1284,21 @@ SWIG_Python_SetConstant(PyObject *d, PyObject *public_interface, const char *nam #else -SWIGINTERN void -SWIG_Python_SetConstant(PyObject *d, const char *name, PyObject *obj) { +SWIGINTERN void SWIG_Python_SetConstant(PyObject *d, const char *name, + PyObject *obj) { #if PY_VERSION_HEX < 0x02030000 PyDict_SetItemString(d, (char *)name, obj); #else PyDict_SetItemString(d, name, obj); #endif - Py_DECREF(obj); + Py_DECREF(obj); } #endif /* Append a value to the result obj */ -SWIGINTERN PyObject* -SWIG_Python_AppendOutput(PyObject* result, PyObject* obj) { +SWIGINTERN PyObject *SWIG_Python_AppendOutput(PyObject *result, PyObject *obj) { #if !defined(SWIG_PYTHON_OUTPUT_TUPLE) if (!result) { result = obj; @@ -1274,13 +1311,13 @@ SWIG_Python_AppendOutput(PyObject* result, PyObject* obj) { result = PyList_New(1); PyList_SetItem(result, 0, o2); } - PyList_Append(result,obj); + PyList_Append(result, obj); Py_DECREF(obj); } return result; #else - PyObject* o2; - PyObject* o3; + PyObject *o2; + PyObject *o3; if (!result) { result = obj; } else if (result == Py_None) { @@ -1305,57 +1342,60 @@ SWIG_Python_AppendOutput(PyObject* result, PyObject* obj) { /* Unpack the argument tuple */ -SWIGINTERN Py_ssize_t -SWIG_Python_UnpackTuple(PyObject *args, const char *name, Py_ssize_t min, Py_ssize_t max, PyObject **objs) -{ +SWIGINTERN Py_ssize_t SWIG_Python_UnpackTuple(PyObject *args, const char *name, + Py_ssize_t min, Py_ssize_t max, + PyObject **objs) { if (!args) { if (!min && !max) { return 1; } else { - PyErr_Format(PyExc_TypeError, "%s expected %s%d 
arguments, got none", - name, (min == max ? "" : "at least "), (int)min); + PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got none", + name, (min == max ? "" : "at least "), (int)min); return 0; } - } + } if (!PyTuple_Check(args)) { if (min <= 1 && max >= 1) { Py_ssize_t i; objs[0] = args; for (i = 1; i < max; ++i) { - objs[i] = 0; + objs[i] = 0; } return 2; } - PyErr_SetString(PyExc_SystemError, "UnpackTuple() argument list is not a tuple"); + PyErr_SetString(PyExc_SystemError, + "UnpackTuple() argument list is not a tuple"); return 0; } else { Py_ssize_t l = PyTuple_GET_SIZE(args); if (l < min) { - PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", - name, (min == max ? "" : "at least "), (int)min, (int)l); + PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", name, + (min == max ? "" : "at least "), (int)min, (int)l); return 0; } else if (l > max) { - PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", - name, (min == max ? "" : "at most "), (int)max, (int)l); + PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", name, + (min == max ? 
"" : "at most "), (int)max, (int)l); return 0; } else { Py_ssize_t i; for (i = 0; i < l; ++i) { - objs[i] = PyTuple_GET_ITEM(args, i); + objs[i] = PyTuple_GET_ITEM(args, i); } for (; l < max; ++l) { - objs[l] = 0; + objs[l] = 0; } return i + 1; - } + } } } /* A functor is a function object with one single object argument */ #if PY_VERSION_HEX >= 0x02020000 -#define SWIG_Python_CallFunctor(functor, obj) PyObject_CallFunctionObjArgs(functor, obj, NULL); +#define SWIG_Python_CallFunctor(functor, obj) \ + PyObject_CallFunctionObjArgs(functor, obj, NULL); #else -#define SWIG_Python_CallFunctor(functor, obj) PyObject_CallFunction(functor, "O", obj); +#define SWIG_Python_CallFunctor(functor, obj) \ + PyObject_CallFunction(functor, "O", obj); #endif /* @@ -1363,23 +1403,27 @@ SWIG_Python_UnpackTuple(PyObject *args, const char *name, Py_ssize_t min, Py_ssi static PyObject *SWIG_STATIC_POINTER(MyVar) = NewSomething(...); */ #ifdef __cplusplus -#define SWIG_STATIC_POINTER(var) var +#define SWIG_STATIC_POINTER(var) var #else -#define SWIG_STATIC_POINTER(var) var = 0; if (!var) var +#define SWIG_STATIC_POINTER(var) \ + var = 0; \ + if (!var) \ + var #endif /* ----------------------------------------------------------------------------- * Pointer declarations - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ /* Flags for new pointer objects */ -#define SWIG_POINTER_NOSHADOW (SWIG_POINTER_OWN << 1) -#define SWIG_POINTER_NEW (SWIG_POINTER_NOSHADOW | SWIG_POINTER_OWN) +#define SWIG_POINTER_NOSHADOW (SWIG_POINTER_OWN << 1) +#define SWIG_POINTER_NEW (SWIG_POINTER_NOSHADOW | SWIG_POINTER_OWN) -#define SWIG_POINTER_IMPLICIT_CONV (SWIG_POINTER_DISOWN << 1) +#define SWIG_POINTER_IMPLICIT_CONV (SWIG_POINTER_DISOWN << 1) -#define SWIG_BUILTIN_TP_INIT (SWIG_POINTER_OWN << 2) -#define SWIG_BUILTIN_INIT (SWIG_BUILTIN_TP_INIT | SWIG_POINTER_OWN) +#define 
SWIG_BUILTIN_TP_INIT (SWIG_POINTER_OWN << 2) +#define SWIG_BUILTIN_INIT (SWIG_BUILTIN_TP_INIT | SWIG_POINTER_OWN) #ifdef __cplusplus extern "C" { @@ -1387,28 +1431,24 @@ extern "C" { /* How to access Py_None */ #if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) -# ifndef SWIG_PYTHON_NO_BUILD_NONE -# ifndef SWIG_PYTHON_BUILD_NONE -# define SWIG_PYTHON_BUILD_NONE -# endif -# endif +#ifndef SWIG_PYTHON_NO_BUILD_NONE +#ifndef SWIG_PYTHON_BUILD_NONE +#define SWIG_PYTHON_BUILD_NONE +#endif +#endif #endif #ifdef SWIG_PYTHON_BUILD_NONE -# ifdef Py_None -# undef Py_None -# define Py_None SWIG_Py_None() -# endif -SWIGRUNTIMEINLINE PyObject * -_SWIG_Py_None(void) -{ - PyObject *none = Py_BuildValue((char*)""); +#ifdef Py_None +#undef Py_None +#define Py_None SWIG_Py_None() +#endif +SWIGRUNTIMEINLINE PyObject *_SWIG_Py_None(void) { + PyObject *none = Py_BuildValue((char *)""); Py_DECREF(none); return none; } -SWIGRUNTIME PyObject * -SWIG_Py_None(void) -{ +SWIGRUNTIME PyObject *SWIG_Py_None(void) { static PyObject *SWIG_STATIC_POINTER(none) = _SWIG_Py_None(); return none; } @@ -1416,9 +1456,7 @@ SWIG_Py_None(void) /* The python void return value */ -SWIGRUNTIMEINLINE PyObject * -SWIG_Py_Void(void) -{ +SWIGRUNTIMEINLINE PyObject *SWIG_Py_Void(void) { PyObject *none = Py_None; Py_INCREF(none); return none; @@ -1436,32 +1474,28 @@ typedef struct { PyTypeObject *pytype; } SwigPyClientData; -SWIGRUNTIMEINLINE int -SWIG_Python_CheckImplicit(swig_type_info *ty) -{ +SWIGRUNTIMEINLINE int SWIG_Python_CheckImplicit(swig_type_info *ty) { SwigPyClientData *data = (SwigPyClientData *)ty->clientdata; return data ? data->implicitconv : 0; } -SWIGRUNTIMEINLINE PyObject * -SWIG_Python_ExceptionType(swig_type_info *desc) { - SwigPyClientData *data = desc ? (SwigPyClientData *) desc->clientdata : 0; +SWIGRUNTIMEINLINE PyObject *SWIG_Python_ExceptionType(swig_type_info *desc) { + SwigPyClientData *data = desc ? (SwigPyClientData *)desc->clientdata : 0; PyObject *klass = data ? 
data->klass : 0; return (klass ? klass : PyExc_RuntimeError); } - -SWIGRUNTIME SwigPyClientData * -SwigPyClientData_New(PyObject* obj) -{ +SWIGRUNTIME SwigPyClientData *SwigPyClientData_New(PyObject *obj) { if (!obj) { return 0; } else { - SwigPyClientData *data = (SwigPyClientData *)malloc(sizeof(SwigPyClientData)); + SwigPyClientData *data = + (SwigPyClientData *)malloc(sizeof(SwigPyClientData)); /* the klass element */ data->klass = obj; Py_INCREF(data->klass); - /* the newraw method and newargs arguments used to create a new raw instance */ + /* the newraw method and newargs arguments used to create a new raw instance + */ if (PyClass_Check(obj)) { data->newraw = 0; data->newargs = obj; @@ -1473,16 +1507,17 @@ SwigPyClientData_New(PyObject* obj) data->newraw = PyObject_GetAttrString(data->klass, (char *)"__new__"); #endif if (data->newraw) { - Py_INCREF(data->newraw); - data->newargs = PyTuple_New(1); - PyTuple_SetItem(data->newargs, 0, obj); + Py_INCREF(data->newraw); + data->newargs = PyTuple_New(1); + PyTuple_SetItem(data->newargs, 0, obj); } else { - data->newargs = obj; + data->newargs = obj; } Py_INCREF(data->newargs); } /* the destroy method, aka as the C++ delete method */ - data->destroy = PyObject_GetAttrString(data->klass, (char *)"__swig_destroy__"); + data->destroy = + PyObject_GetAttrString(data->klass, (char *)"__swig_destroy__"); if (PyErr_Occurred()) { PyErr_Clear(); data->destroy = 0; @@ -1505,8 +1540,7 @@ SwigPyClientData_New(PyObject* obj) } } -SWIGRUNTIME void -SwigPyClientData_Del(SwigPyClientData *data) { +SWIGRUNTIME void SwigPyClientData_Del(SwigPyClientData *data) { Py_XDECREF(data->newraw); Py_XDECREF(data->newargs); Py_XDECREF(data->destroy); @@ -1515,8 +1549,7 @@ SwigPyClientData_Del(SwigPyClientData *data) { /* =============== SwigPyObject =====================*/ typedef struct { - PyObject_HEAD - void *ptr; + PyObject_HEAD void *ptr; swig_type_info *ty; int own; PyObject *next; @@ -1525,12 +1558,10 @@ typedef struct { #endif } 
SwigPyObject; - #ifdef SWIGPYTHON_BUILTIN SWIGRUNTIME PyObject * -SwigPyObject_get___dict__(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) -{ +SwigPyObject_get___dict__(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) { SwigPyObject *sobj = (SwigPyObject *)v; if (!sobj->dict) @@ -1542,15 +1573,11 @@ SwigPyObject_get___dict__(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) #endif -SWIGRUNTIME PyObject * -SwigPyObject_long(SwigPyObject *v) -{ +SWIGRUNTIME PyObject *SwigPyObject_long(SwigPyObject *v) { return PyLong_FromVoidPtr(v->ptr); } -SWIGRUNTIME PyObject * -SwigPyObject_format(const char* fmt, SwigPyObject *v) -{ +SWIGRUNTIME PyObject *SwigPyObject_format(const char *fmt, SwigPyObject *v) { PyObject *res = NULL; PyObject *args = PyTuple_New(1); if (args) { @@ -1558,11 +1585,11 @@ SwigPyObject_format(const char* fmt, SwigPyObject *v) PyObject *ofmt = SWIG_Python_str_FromChar(fmt); if (ofmt) { #if PY_VERSION_HEX >= 0x03000000 - res = PyUnicode_Format(ofmt,args); + res = PyUnicode_Format(ofmt, args); #else - res = PyString_Format(ofmt,args); + res = PyString_Format(ofmt, args); #endif - Py_DECREF(ofmt); + Py_DECREF(ofmt); } Py_DECREF(args); } @@ -1570,16 +1597,12 @@ SwigPyObject_format(const char* fmt, SwigPyObject *v) return res; } -SWIGRUNTIME PyObject * -SwigPyObject_oct(SwigPyObject *v) -{ - return SwigPyObject_format("%o",v); +SWIGRUNTIME PyObject *SwigPyObject_oct(SwigPyObject *v) { + return SwigPyObject_format("%o", v); } -SWIGRUNTIME PyObject * -SwigPyObject_hex(SwigPyObject *v) -{ - return SwigPyObject_format("%x",v); +SWIGRUNTIME PyObject *SwigPyObject_hex(SwigPyObject *v) { + return SwigPyObject_format("%x", v); } SWIGRUNTIME PyObject * @@ -1590,92 +1613,84 @@ SwigPyObject_repr(SwigPyObject *v, PyObject *args) #endif { const char *name = SWIG_TypePrettyName(v->ty); - PyObject *repr = SWIG_Python_str_FromFormat("<Swig Object of type '%s' at %p>", (name ? 
name : "unknown"), (void *)v); + PyObject *repr = SWIG_Python_str_FromFormat( + "<Swig Object of type '%s' at %p>", (name ? name : "unknown"), (void *)v); if (v->next) { -# ifdef METH_NOARGS +#ifdef METH_NOARGS PyObject *nrep = SwigPyObject_repr((SwigPyObject *)v->next); -# else +#else PyObject *nrep = SwigPyObject_repr((SwigPyObject *)v->next, args); -# endif -# if PY_VERSION_HEX >= 0x03000000 +#endif +#if PY_VERSION_HEX >= 0x03000000 PyObject *joined = PyUnicode_Concat(repr, nrep); Py_DecRef(repr); Py_DecRef(nrep); repr = joined; -# else - PyString_ConcatAndDel(&repr,nrep); -# endif +#else + PyString_ConcatAndDel(&repr, nrep); +#endif } - return repr; + return repr; } -SWIGRUNTIME int -SwigPyObject_compare(SwigPyObject *v, SwigPyObject *w) -{ +SWIGRUNTIME int SwigPyObject_compare(SwigPyObject *v, SwigPyObject *w) { void *i = v->ptr; void *j = w->ptr; return (i < j) ? -1 : ((i > j) ? 1 : 0); } /* Added for Python 3.x, would it also be useful for Python 2.x? */ -SWIGRUNTIME PyObject* -SwigPyObject_richcompare(SwigPyObject *v, SwigPyObject *w, int op) -{ - PyObject* res; - if( op != Py_EQ && op != Py_NE ) { +SWIGRUNTIME PyObject *SwigPyObject_richcompare(SwigPyObject *v, SwigPyObject *w, + int op) { + PyObject *res; + if (op != Py_EQ && op != Py_NE) { Py_INCREF(Py_NotImplemented); return Py_NotImplemented; } - res = PyBool_FromLong( (SwigPyObject_compare(v, w)==0) == (op == Py_EQ) ? 1 : 0); - return res; + res = PyBool_FromLong((SwigPyObject_compare(v, w) == 0) == (op == Py_EQ) ? 
1 + : 0); + return res; } - -SWIGRUNTIME PyTypeObject* SwigPyObject_TypeOnce(void); +SWIGRUNTIME PyTypeObject *SwigPyObject_TypeOnce(void); #ifdef SWIGPYTHON_BUILTIN static swig_type_info *SwigPyObject_stype = 0; -SWIGRUNTIME PyTypeObject* -SwigPyObject_type(void) { - SwigPyClientData *cd; - assert(SwigPyObject_stype); - cd = (SwigPyClientData*) SwigPyObject_stype->clientdata; - assert(cd); - assert(cd->pytype); - return cd->pytype; +SWIGRUNTIME PyTypeObject *SwigPyObject_type(void) { + SwigPyClientData *cd; + assert(SwigPyObject_stype); + cd = (SwigPyClientData *)SwigPyObject_stype->clientdata; + assert(cd); + assert(cd->pytype); + return cd->pytype; } #else -SWIGRUNTIME PyTypeObject* -SwigPyObject_type(void) { +SWIGRUNTIME PyTypeObject *SwigPyObject_type(void) { static PyTypeObject *SWIG_STATIC_POINTER(type) = SwigPyObject_TypeOnce(); return type; } #endif -SWIGRUNTIMEINLINE int -SwigPyObject_Check(PyObject *op) { +SWIGRUNTIMEINLINE int SwigPyObject_Check(PyObject *op) { #ifdef SWIGPYTHON_BUILTIN PyTypeObject *target_tp = SwigPyObject_type(); if (PyType_IsSubtype(op->ob_type, target_tp)) return 1; return (strcmp(op->ob_type->tp_name, "SwigPyObject") == 0); #else - return (Py_TYPE(op) == SwigPyObject_type()) - || (strcmp(Py_TYPE(op)->tp_name,"SwigPyObject") == 0); + return (Py_TYPE(op) == SwigPyObject_type()) || + (strcmp(Py_TYPE(op)->tp_name, "SwigPyObject") == 0); #endif } -SWIGRUNTIME PyObject * -SwigPyObject_New(void *ptr, swig_type_info *ty, int own); +SWIGRUNTIME PyObject *SwigPyObject_New(void *ptr, swig_type_info *ty, int own); -SWIGRUNTIME void -SwigPyObject_dealloc(PyObject *v) -{ - SwigPyObject *sobj = (SwigPyObject *) v; +SWIGRUNTIME void SwigPyObject_dealloc(PyObject *v) { + SwigPyObject *sobj = (SwigPyObject *)v; PyObject *next = sobj->next; if (sobj->own == SWIG_POINTER_OWN) { swig_type_info *ty = sobj->ty; - SwigPyClientData *data = ty ? (SwigPyClientData *) ty->clientdata : 0; + SwigPyClientData *data = ty ? 
(SwigPyClientData *)ty->clientdata : 0; PyObject *destroy = data ? data->destroy : 0; if (destroy) { /* destroy is always a VARARGS method */ @@ -1687,12 +1702,13 @@ SwigPyObject_dealloc(PyObject *v) StopIteration will be active right now, and this needs to remain true upon return from SwigPyObject_dealloc. So save and restore. */ - + PyObject *val = NULL, *type = NULL, *tb = NULL; PyErr_Fetch(&val, &type, &tb); if (data->delargs) { - /* we need to create a temporary object to carry the destroy operation */ + /* we need to create a temporary object to carry the destroy operation + */ PyObject *tmp = SwigPyObject_New(sobj->ptr, ty, 0); res = SWIG_Python_CallFunctor(destroy, tmp); Py_DECREF(tmp); @@ -1707,25 +1723,26 @@ SwigPyObject_dealloc(PyObject *v) PyErr_Restore(val, type, tb); Py_XDECREF(res); - } + } #if !defined(SWIG_PYTHON_SILENT_MEMLEAK) else { const char *name = SWIG_TypePrettyName(ty); - printf("swig/python detected a memory leak of type '%s', no destructor found.\n", (name ? name : "unknown")); + printf("swig/python detected a memory leak of type '%s', no destructor " + "found.\n", + (name ? 
name : "unknown")); } #endif - } + } Py_XDECREF(next); PyObject_DEL(v); } -SWIGRUNTIME PyObject* -SwigPyObject_append(PyObject* v, PyObject* next) -{ - SwigPyObject *sobj = (SwigPyObject *) v; +SWIGRUNTIME PyObject *SwigPyObject_append(PyObject *v, PyObject *next) { + SwigPyObject *sobj = (SwigPyObject *)v; #ifndef METH_O PyObject *tmp = 0; - if (!PyArg_ParseTuple(next,(char *)"O:append", &tmp)) return NULL; + if (!PyArg_ParseTuple(next, (char *)"O:append", &tmp)) + return NULL; next = tmp; #endif if (!SwigPyObject_Check(next)) { @@ -1737,15 +1754,15 @@ SwigPyObject_append(PyObject* v, PyObject* next) return SWIG_Py_Void(); } -SWIGRUNTIME PyObject* +SWIGRUNTIME PyObject * #ifdef METH_NOARGS -SwigPyObject_next(PyObject* v) +SwigPyObject_next(PyObject *v) #else -SwigPyObject_next(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) +SwigPyObject_next(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) #endif { - SwigPyObject *sobj = (SwigPyObject *) v; - if (sobj->next) { + SwigPyObject *sobj = (SwigPyObject *)v; + if (sobj->next) { Py_INCREF(sobj->next); return sobj->next; } else { @@ -1753,11 +1770,11 @@ SwigPyObject_next(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) } } -SWIGINTERN PyObject* +SWIGINTERN PyObject * #ifdef METH_NOARGS SwigPyObject_disown(PyObject *v) #else -SwigPyObject_disown(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) +SwigPyObject_disown(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) #endif { SwigPyObject *sobj = (SwigPyObject *)v; @@ -1765,11 +1782,11 @@ SwigPyObject_disown(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) return SWIG_Py_Void(); } -SWIGINTERN PyObject* +SWIGINTERN PyObject * #ifdef METH_NOARGS SwigPyObject_acquire(PyObject *v) #else -SwigPyObject_acquire(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) +SwigPyObject_acquire(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) #endif { SwigPyObject *sobj = (SwigPyObject *)v; @@ -1777,102 +1794,103 @@ SwigPyObject_acquire(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) return SWIG_Py_Void(); } -SWIGINTERN PyObject* 
-SwigPyObject_own(PyObject *v, PyObject *args) -{ +SWIGINTERN PyObject *SwigPyObject_own(PyObject *v, PyObject *args) { PyObject *val = 0; #if (PY_VERSION_HEX < 0x02020000) - if (!PyArg_ParseTuple(args,(char *)"|O:own",&val)) + if (!PyArg_ParseTuple(args, (char *)"|O:own", &val)) #elif (PY_VERSION_HEX < 0x02050000) - if (!PyArg_UnpackTuple(args, (char *)"own", 0, 1, &val)) + if (!PyArg_UnpackTuple(args, (char *)"own", 0, 1, &val)) #else - if (!PyArg_UnpackTuple(args, "own", 0, 1, &val)) + if (!PyArg_UnpackTuple(args, "own", 0, 1, &val)) #endif - { - return NULL; - } - else - { - SwigPyObject *sobj = (SwigPyObject *)v; - PyObject *obj = PyBool_FromLong(sobj->own); - if (val) { + { + return NULL; + } else { + SwigPyObject *sobj = (SwigPyObject *)v; + PyObject *obj = PyBool_FromLong(sobj->own); + if (val) { #ifdef METH_NOARGS - if (PyObject_IsTrue(val)) { - SwigPyObject_acquire(v); - } else { - SwigPyObject_disown(v); - } + if (PyObject_IsTrue(val)) { + SwigPyObject_acquire(v); + } else { + SwigPyObject_disown(v); + } #else - if (PyObject_IsTrue(val)) { - SwigPyObject_acquire(v,args); - } else { - SwigPyObject_disown(v,args); - } -#endif - } - return obj; + if (PyObject_IsTrue(val)) { + SwigPyObject_acquire(v, args); + } else { + SwigPyObject_disown(v, args); + } +#endif } + return obj; + } } #ifdef METH_O -static PyMethodDef -swigobject_methods[] = { - {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_NOARGS, (char *)"releases ownership of the pointer"}, - {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_NOARGS, (char *)"acquires ownership of the pointer"}, - {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS, (char *)"returns/sets ownership of the pointer"}, - {(char *)"append", (PyCFunction)SwigPyObject_append, METH_O, (char *)"appends another 'this' object"}, - {(char *)"next", (PyCFunction)SwigPyObject_next, METH_NOARGS, (char *)"returns the next 'this' object"}, - {(char *)"__repr__",(PyCFunction)SwigPyObject_repr, METH_NOARGS, (char 
*)"returns object representation"}, - {0, 0, 0, 0} -}; +static PyMethodDef swigobject_methods[] = { + {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_NOARGS, + (char *)"releases ownership of the pointer"}, + {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_NOARGS, + (char *)"acquires ownership of the pointer"}, + {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS, + (char *)"returns/sets ownership of the pointer"}, + {(char *)"append", (PyCFunction)SwigPyObject_append, METH_O, + (char *)"appends another 'this' object"}, + {(char *)"next", (PyCFunction)SwigPyObject_next, METH_NOARGS, + (char *)"returns the next 'this' object"}, + {(char *)"__repr__", (PyCFunction)SwigPyObject_repr, METH_NOARGS, + (char *)"returns object representation"}, + {0, 0, 0, 0}}; #else -static PyMethodDef -swigobject_methods[] = { - {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_VARARGS, (char *)"releases ownership of the pointer"}, - {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_VARARGS, (char *)"acquires ownership of the pointer"}, - {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS, (char *)"returns/sets ownership of the pointer"}, - {(char *)"append", (PyCFunction)SwigPyObject_append, METH_VARARGS, (char *)"appends another 'this' object"}, - {(char *)"next", (PyCFunction)SwigPyObject_next, METH_VARARGS, (char *)"returns the next 'this' object"}, - {(char *)"__repr__",(PyCFunction)SwigPyObject_repr, METH_VARARGS, (char *)"returns object representation"}, - {0, 0, 0, 0} -}; +static PyMethodDef swigobject_methods[] = { + {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_VARARGS, + (char *)"releases ownership of the pointer"}, + {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_VARARGS, + (char *)"acquires ownership of the pointer"}, + {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS, + (char *)"returns/sets ownership of the pointer"}, + {(char *)"append", (PyCFunction)SwigPyObject_append, 
METH_VARARGS, + (char *)"appends another 'this' object"}, + {(char *)"next", (PyCFunction)SwigPyObject_next, METH_VARARGS, + (char *)"returns the next 'this' object"}, + {(char *)"__repr__", (PyCFunction)SwigPyObject_repr, METH_VARARGS, + (char *)"returns object representation"}, + {0, 0, 0, 0}}; #endif #if PY_VERSION_HEX < 0x02020000 -SWIGINTERN PyObject * -SwigPyObject_getattr(SwigPyObject *sobj,char *name) -{ +SWIGINTERN PyObject *SwigPyObject_getattr(SwigPyObject *sobj, char *name) { return Py_FindMethod(swigobject_methods, (PyObject *)sobj, name); } #endif -SWIGRUNTIME PyTypeObject* -SwigPyObject_TypeOnce(void) { +SWIGRUNTIME PyTypeObject *SwigPyObject_TypeOnce(void) { static char swigobject_doc[] = "Swig object carries a C/C++ instance pointer"; static PyNumberMethods SwigPyObject_as_number = { (binaryfunc)0, /*nb_add*/ (binaryfunc)0, /*nb_subtract*/ (binaryfunc)0, /*nb_multiply*/ - /* nb_divide removed in Python 3 */ + /* nb_divide removed in Python 3 */ #if PY_VERSION_HEX < 0x03000000 (binaryfunc)0, /*nb_divide*/ #endif - (binaryfunc)0, /*nb_remainder*/ - (binaryfunc)0, /*nb_divmod*/ - (ternaryfunc)0,/*nb_power*/ - (unaryfunc)0, /*nb_negative*/ - (unaryfunc)0, /*nb_positive*/ - (unaryfunc)0, /*nb_absolute*/ - (inquiry)0, /*nb_nonzero*/ - 0, /*nb_invert*/ - 0, /*nb_lshift*/ - 0, /*nb_rshift*/ - 0, /*nb_and*/ - 0, /*nb_xor*/ - 0, /*nb_or*/ + (binaryfunc)0, /*nb_remainder*/ + (binaryfunc)0, /*nb_divmod*/ + (ternaryfunc)0, /*nb_power*/ + (unaryfunc)0, /*nb_negative*/ + (unaryfunc)0, /*nb_positive*/ + (unaryfunc)0, /*nb_absolute*/ + (inquiry)0, /*nb_nonzero*/ + 0, /*nb_invert*/ + 0, /*nb_lshift*/ + 0, /*nb_rshift*/ + 0, /*nb_and*/ + 0, /*nb_xor*/ + 0, /*nb_or*/ #if PY_VERSION_HEX < 0x03000000 - 0, /*nb_coerce*/ + 0, /*nb_coerce*/ #endif (unaryfunc)SwigPyObject_long, /*nb_int*/ #if PY_VERSION_HEX < 0x03000000 @@ -1880,21 +1898,90 @@ SwigPyObject_TypeOnce(void) { #else 0, /*nb_reserved*/ #endif - (unaryfunc)0, /*nb_float*/ + (unaryfunc)0, /*nb_float*/ #if 
PY_VERSION_HEX < 0x03000000 - (unaryfunc)SwigPyObject_oct, /*nb_oct*/ - (unaryfunc)SwigPyObject_hex, /*nb_hex*/ + (unaryfunc)SwigPyObject_oct, /*nb_oct*/ + (unaryfunc)SwigPyObject_hex, /*nb_hex*/ #endif #if PY_VERSION_HEX >= 0x03050000 /* 3.5 */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_matrix_multiply */ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 /* nb_inplace_add -> nb_inplace_matrix_multiply */ #elif PY_VERSION_HEX >= 0x03000000 /* 3.0 */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_index, nb_inplace_divide removed */ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 /* nb_inplace_add -> nb_index, nb_inplace_divide removed */ #elif PY_VERSION_HEX >= 0x02050000 /* 2.5.0 */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_index */ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 /* nb_inplace_add -> nb_index */ #elif PY_VERSION_HEX >= 0x02020000 /* 2.2.0 */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_true_divide */ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 /* nb_inplace_add -> nb_inplace_true_divide */ #elif PY_VERSION_HEX >= 0x02000000 /* 2.0.0 */ - 0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_or */ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 /* nb_inplace_add -> nb_inplace_or */ #endif }; @@ -1902,84 +1989,83 @@ SwigPyObject_TypeOnce(void) { static int type_init = 0; if (!type_init) { const PyTypeObject tmp = { - /* PyObject header changed in Python 3 */ + /* PyObject header changed in Python 3 */ #if PY_VERSION_HEX >= 0x03000000 PyVarObject_HEAD_INIT(NULL, 0) #else - PyObject_HEAD_INIT(NULL) - 0, /* ob_size */ -#endif - (char *)"SwigPyObject", /* tp_name */ - sizeof(SwigPyObject), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)SwigPyObject_dealloc, /* tp_dealloc */ - 0, /* tp_print */ + PyObject_HEAD_INIT(NULL) 0, /* ob_size */ +#endif + (char 
*) "SwigPyObject", /* tp_name */ + sizeof(SwigPyObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)SwigPyObject_dealloc, /* tp_dealloc */ + 0, /* tp_print */ #if PY_VERSION_HEX < 0x02020000 - (getattrfunc)SwigPyObject_getattr, /* tp_getattr */ + (getattrfunc)SwigPyObject_getattr, /* tp_getattr */ #else - (getattrfunc)0, /* tp_getattr */ + (getattrfunc)0, /* tp_getattr */ #endif - (setattrfunc)0, /* tp_setattr */ + (setattrfunc)0, /* tp_setattr */ #if PY_VERSION_HEX >= 0x03000000 - 0, /* tp_reserved in 3.0.1, tp_compare in 3.0.0 but not used */ + 0, /* tp_reserved in 3.0.1, tp_compare in 3.0.0 but not used */ #else - (cmpfunc)SwigPyObject_compare, /* tp_compare */ -#endif - (reprfunc)SwigPyObject_repr, /* tp_repr */ - &SwigPyObject_as_number, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - (hashfunc)0, /* tp_hash */ - (ternaryfunc)0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - swigobject_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - (richcmpfunc)SwigPyObject_richcompare,/* tp_richcompare */ - 0, /* tp_weaklistoffset */ + (cmpfunc)SwigPyObject_compare, /* tp_compare */ +#endif + (reprfunc)SwigPyObject_repr, /* tp_repr */ + &SwigPyObject_as_number, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + (hashfunc)0, /* tp_hash */ + (ternaryfunc)0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + swigobject_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + (richcmpfunc)SwigPyObject_richcompare, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ #if PY_VERSION_HEX >= 0x02020000 - 0, /* tp_iter */ - 0, /* tp_iternext */ - swigobject_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ 
- 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ - 0, /* tp_free */ - 0, /* tp_is_gc */ - 0, /* tp_bases */ - 0, /* tp_mro */ - 0, /* tp_cache */ - 0, /* tp_subclasses */ - 0, /* tp_weaklist */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + swigobject_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ #endif #if PY_VERSION_HEX >= 0x02030000 - 0, /* tp_del */ + 0, /* tp_del */ #endif #if PY_VERSION_HEX >= 0x02060000 - 0, /* tp_version_tag */ + 0, /* tp_version_tag */ #endif #if PY_VERSION_HEX >= 0x03040000 - 0, /* tp_finalize */ + 0, /* tp_finalize */ #endif #ifdef COUNT_ALLOCS - 0, /* tp_allocs */ - 0, /* tp_frees */ - 0, /* tp_maxalloc */ + 0, /* tp_allocs */ + 0, /* tp_frees */ + 0, /* tp_maxalloc */ #if PY_VERSION_HEX >= 0x02050000 - 0, /* tp_prev */ + 0, /* tp_prev */ #endif - 0 /* tp_next */ + 0 /* tp_next */ #endif }; swigpyobject_type = tmp; @@ -1994,14 +2080,12 @@ SwigPyObject_TypeOnce(void) { return &swigpyobject_type; } -SWIGRUNTIME PyObject * -SwigPyObject_New(void *ptr, swig_type_info *ty, int own) -{ +SWIGRUNTIME PyObject *SwigPyObject_New(void *ptr, swig_type_info *ty, int own) { SwigPyObject *sobj = PyObject_NEW(SwigPyObject, SwigPyObject_type()); if (sobj) { - sobj->ptr = ptr; - sobj->ty = ty; - sobj->own = own; + sobj->ptr = ptr; + sobj->ty = ty; + sobj->own = own; sobj->next = 0; } return (PyObject *)sobj; @@ -2009,165 +2093,153 @@ SwigPyObject_New(void *ptr, swig_type_info *ty, int own) /* ----------------------------------------------------------------------------- * Implements a simple Swig Packed type, and use it instead of string - * 
----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ typedef struct { - PyObject_HEAD - void *pack; + PyObject_HEAD void *pack; swig_type_info *ty; size_t size; } SwigPyPacked; -SWIGRUNTIME int -SwigPyPacked_print(SwigPyPacked *v, FILE *fp, int SWIGUNUSEDPARM(flags)) -{ +SWIGRUNTIME int SwigPyPacked_print(SwigPyPacked *v, FILE *fp, + int SWIGUNUSEDPARM(flags)) { char result[SWIG_BUFFER_SIZE]; - fputs("<Swig Packed ", fp); + fputs("<Swig Packed ", fp); if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) { - fputs("at ", fp); - fputs(result, fp); + fputs("at ", fp); + fputs(result, fp); } - fputs(v->ty->name,fp); + fputs(v->ty->name, fp); fputs(">", fp); - return 0; + return 0; } - -SWIGRUNTIME PyObject * -SwigPyPacked_repr(SwigPyPacked *v) -{ + +SWIGRUNTIME PyObject *SwigPyPacked_repr(SwigPyPacked *v) { char result[SWIG_BUFFER_SIZE]; if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) { - return SWIG_Python_str_FromFormat("<Swig Packed at %s%s>", result, v->ty->name); + return SWIG_Python_str_FromFormat("<Swig Packed at %s%s>", result, + v->ty->name); } else { return SWIG_Python_str_FromFormat("<Swig Packed %s>", v->ty->name); - } + } } -SWIGRUNTIME PyObject * -SwigPyPacked_str(SwigPyPacked *v) -{ +SWIGRUNTIME PyObject *SwigPyPacked_str(SwigPyPacked *v) { char result[SWIG_BUFFER_SIZE]; - if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))){ + if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) { return SWIG_Python_str_FromFormat("%s%s", result, v->ty->name); } else { return SWIG_Python_str_FromChar(v->ty->name); - } + } } -SWIGRUNTIME int -SwigPyPacked_compare(SwigPyPacked *v, SwigPyPacked *w) -{ +SWIGRUNTIME int SwigPyPacked_compare(SwigPyPacked *v, SwigPyPacked *w) { size_t i = v->size; size_t j = w->size; int s = (i < j) ? -1 : ((i > j) ? 1 : 0); - return s ? 
s : strncmp((char *)v->pack, (char *)w->pack, 2*v->size); + return s ? s : strncmp((char *)v->pack, (char *)w->pack, 2 * v->size); } -SWIGRUNTIME PyTypeObject* SwigPyPacked_TypeOnce(void); +SWIGRUNTIME PyTypeObject *SwigPyPacked_TypeOnce(void); -SWIGRUNTIME PyTypeObject* -SwigPyPacked_type(void) { +SWIGRUNTIME PyTypeObject *SwigPyPacked_type(void) { static PyTypeObject *SWIG_STATIC_POINTER(type) = SwigPyPacked_TypeOnce(); return type; } -SWIGRUNTIMEINLINE int -SwigPyPacked_Check(PyObject *op) { - return ((op)->ob_type == SwigPyPacked_TypeOnce()) - || (strcmp((op)->ob_type->tp_name,"SwigPyPacked") == 0); +SWIGRUNTIMEINLINE int SwigPyPacked_Check(PyObject *op) { + return ((op)->ob_type == SwigPyPacked_TypeOnce()) || + (strcmp((op)->ob_type->tp_name, "SwigPyPacked") == 0); } -SWIGRUNTIME void -SwigPyPacked_dealloc(PyObject *v) -{ +SWIGRUNTIME void SwigPyPacked_dealloc(PyObject *v) { if (SwigPyPacked_Check(v)) { - SwigPyPacked *sobj = (SwigPyPacked *) v; + SwigPyPacked *sobj = (SwigPyPacked *)v; free(sobj->pack); } PyObject_DEL(v); } -SWIGRUNTIME PyTypeObject* -SwigPyPacked_TypeOnce(void) { +SWIGRUNTIME PyTypeObject *SwigPyPacked_TypeOnce(void) { static char swigpacked_doc[] = "Swig object carries a C/C++ instance pointer"; static PyTypeObject swigpypacked_type; static int type_init = 0; if (!type_init) { const PyTypeObject tmp = { - /* PyObject header changed in Python 3 */ -#if PY_VERSION_HEX>=0x03000000 + /* PyObject header changed in Python 3 */ +#if PY_VERSION_HEX >= 0x03000000 PyVarObject_HEAD_INIT(NULL, 0) #else - PyObject_HEAD_INIT(NULL) - 0, /* ob_size */ -#endif - (char *)"SwigPyPacked", /* tp_name */ - sizeof(SwigPyPacked), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)SwigPyPacked_dealloc, /* tp_dealloc */ - (printfunc)SwigPyPacked_print, /* tp_print */ - (getattrfunc)0, /* tp_getattr */ - (setattrfunc)0, /* tp_setattr */ -#if PY_VERSION_HEX>=0x03000000 + PyObject_HEAD_INIT(NULL) 0, /* ob_size */ +#endif + (char *) "SwigPyPacked", /* tp_name */ + 
sizeof(SwigPyPacked), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)SwigPyPacked_dealloc, /* tp_dealloc */ + (printfunc)SwigPyPacked_print, /* tp_print */ + (getattrfunc)0, /* tp_getattr */ + (setattrfunc)0, /* tp_setattr */ +#if PY_VERSION_HEX >= 0x03000000 0, /* tp_reserved in 3.0.1 */ #else - (cmpfunc)SwigPyPacked_compare, /* tp_compare */ -#endif - (reprfunc)SwigPyPacked_repr, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - (hashfunc)0, /* tp_hash */ - (ternaryfunc)0, /* tp_call */ - (reprfunc)SwigPyPacked_str, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - swigpacked_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ + (cmpfunc)SwigPyPacked_compare, /* tp_compare */ +#endif + (reprfunc)SwigPyPacked_repr, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + (hashfunc)0, /* tp_hash */ + (ternaryfunc)0, /* tp_call */ + (reprfunc)SwigPyPacked_str, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + swigpacked_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ #if PY_VERSION_HEX >= 0x02020000 - 0, /* tp_iter */ - 0, /* tp_iternext */ - 0, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ - 0, /* tp_free */ - 0, /* tp_is_gc */ - 0, /* tp_bases */ - 0, /* tp_mro */ - 0, /* tp_cache */ - 0, /* tp_subclasses */ - 0, /* tp_weaklist */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* 
tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ #endif #if PY_VERSION_HEX >= 0x02030000 - 0, /* tp_del */ + 0, /* tp_del */ #endif #if PY_VERSION_HEX >= 0x02060000 - 0, /* tp_version_tag */ + 0, /* tp_version_tag */ #endif #if PY_VERSION_HEX >= 0x03040000 - 0, /* tp_finalize */ + 0, /* tp_finalize */ #endif #ifdef COUNT_ALLOCS - 0, /* tp_allocs */ - 0, /* tp_frees */ - 0, /* tp_maxalloc */ + 0, /* tp_allocs */ + 0, /* tp_frees */ + 0, /* tp_maxalloc */ #if PY_VERSION_HEX >= 0x02050000 - 0, /* tp_prev */ + 0, /* tp_prev */ #endif - 0 /* tp_next */ + 0 /* tp_next */ #endif }; swigpypacked_type = tmp; @@ -2182,31 +2254,30 @@ SwigPyPacked_TypeOnce(void) { return &swigpypacked_type; } -SWIGRUNTIME PyObject * -SwigPyPacked_New(void *ptr, size_t size, swig_type_info *ty) -{ +SWIGRUNTIME PyObject *SwigPyPacked_New(void *ptr, size_t size, + swig_type_info *ty) { SwigPyPacked *sobj = PyObject_NEW(SwigPyPacked, SwigPyPacked_type()); if (sobj) { void *pack = malloc(size); if (pack) { memcpy(pack, ptr, size); sobj->pack = pack; - sobj->ty = ty; + sobj->ty = ty; sobj->size = size; } else { - PyObject_DEL((PyObject *) sobj); + PyObject_DEL((PyObject *)sobj); sobj = 0; } } - return (PyObject *) sobj; + return (PyObject *)sobj; } -SWIGRUNTIME swig_type_info * -SwigPyPacked_UnpackData(PyObject *obj, void *ptr, size_t size) -{ +SWIGRUNTIME swig_type_info *SwigPyPacked_UnpackData(PyObject *obj, void *ptr, + size_t size) { if (SwigPyPacked_Check(obj)) { SwigPyPacked *sobj = (SwigPyPacked *)obj; - if (sobj->size != size) return 0; + if (sobj->size != size) + return 0; memcpy(ptr, sobj->pack, size); return sobj->ty; } else { @@ -2216,19 +2287,16 @@ SwigPyPacked_UnpackData(PyObject *obj, void *ptr, size_t size) /* 
----------------------------------------------------------------------------- * pointers/data manipulation - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ -SWIGRUNTIMEINLINE PyObject * -_SWIG_This(void) -{ - return SWIG_Python_str_FromChar("this"); +SWIGRUNTIMEINLINE PyObject *_SWIG_This(void) { + return SWIG_Python_str_FromChar("this"); } static PyObject *swig_this = NULL; -SWIGRUNTIME PyObject * -SWIG_This(void) -{ +SWIGRUNTIME PyObject *SWIG_This(void) { if (swig_this == NULL) swig_this = _SWIG_This(); return swig_this; @@ -2237,27 +2305,25 @@ SWIG_This(void) /* #define SWIG_PYTHON_SLOW_GETSET_THIS */ /* TODO: I don't know how to implement the fast getset in Python 3 right now */ -#if PY_VERSION_HEX>=0x03000000 -#define SWIG_PYTHON_SLOW_GETSET_THIS +#if PY_VERSION_HEX >= 0x03000000 +#define SWIG_PYTHON_SLOW_GETSET_THIS #endif -SWIGRUNTIME SwigPyObject * -SWIG_Python_GetSwigThis(PyObject *pyobj) -{ +SWIGRUNTIME SwigPyObject *SWIG_Python_GetSwigThis(PyObject *pyobj) { PyObject *obj; if (SwigPyObject_Check(pyobj)) - return (SwigPyObject *) pyobj; + return (SwigPyObject *)pyobj; #ifdef SWIGPYTHON_BUILTIN (void)obj; -# ifdef PyWeakref_CheckProxy +#ifdef PyWeakref_CheckProxy if (PyWeakref_CheckProxy(pyobj)) { pyobj = PyWeakref_GET_OBJECT(pyobj); if (pyobj && SwigPyObject_Check(pyobj)) - return (SwigPyObject*) pyobj; + return (SwigPyObject *)pyobj; } -# endif +#endif return NULL; #else @@ -2265,7 +2331,7 @@ SWIG_Python_GetSwigThis(PyObject *pyobj) #if (!defined(SWIG_PYTHON_SLOW_GETSET_THIS) && (PY_VERSION_HEX >= 0x02030000)) if (PyInstance_Check(pyobj)) { - obj = _PyInstance_Lookup(pyobj, SWIG_This()); + obj = _PyInstance_Lookup(pyobj, SWIG_This()); } else { PyObject **dictptr = _PyObject_GetDictPtr(pyobj); if (dictptr != NULL) { @@ -2274,31 +2340,33 @@ SWIG_Python_GetSwigThis(PyObject *pyobj) } else { #ifdef PyWeakref_CheckProxy if 
(PyWeakref_CheckProxy(pyobj)) { - PyObject *wobj = PyWeakref_GET_OBJECT(pyobj); - return wobj ? SWIG_Python_GetSwigThis(wobj) : 0; + PyObject *wobj = PyWeakref_GET_OBJECT(pyobj); + return wobj ? SWIG_Python_GetSwigThis(wobj) : 0; } #endif - obj = PyObject_GetAttr(pyobj,SWIG_This()); + obj = PyObject_GetAttr(pyobj, SWIG_This()); if (obj) { - Py_DECREF(obj); + Py_DECREF(obj); } else { - if (PyErr_Occurred()) PyErr_Clear(); - return 0; + if (PyErr_Occurred()) + PyErr_Clear(); + return 0; } } } #else - obj = PyObject_GetAttr(pyobj,SWIG_This()); + obj = PyObject_GetAttr(pyobj, SWIG_This()); if (obj) { Py_DECREF(obj); } else { - if (PyErr_Occurred()) PyErr_Clear(); + if (PyErr_Occurred()) + PyErr_Clear(); return 0; } #endif if (obj && !SwigPyObject_Check(obj)) { /* a PyObject is called 'this', try to get the 'real this' - SwigPyObject from it */ + SwigPyObject from it */ return SWIG_Python_GetSwigThis(obj); } return (SwigPyObject *)obj; @@ -2307,8 +2375,7 @@ SWIG_Python_GetSwigThis(PyObject *pyobj) /* Acquire a pointer value */ -SWIGRUNTIME int -SWIG_Python_AcquirePtr(PyObject *obj, int own) { +SWIGRUNTIME int SWIG_Python_AcquirePtr(PyObject *obj, int own) { if (own == SWIG_POINTER_OWN) { SwigPyObject *sobj = SWIG_Python_GetSwigThis(obj); if (sobj) { @@ -2322,8 +2389,9 @@ SWIG_Python_AcquirePtr(PyObject *obj, int own) { /* Convert a pointer value */ -SWIGRUNTIME int -SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int flags, int *own) { +SWIGRUNTIME int SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, + swig_type_info *ty, int flags, + int *own) { int res; SwigPyObject *sobj; int implicit_conv = (flags & SWIG_POINTER_IMPLICIT_CONV) != 0; @@ -2347,18 +2415,20 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int swig_type_info *to = sobj->ty; if (to == ty) { /* no type cast needed */ - if (ptr) *ptr = vptr; + if (ptr) + *ptr = vptr; break; } else { - swig_cast_info *tc = SWIG_TypeCheck(to->name,ty); + 
swig_cast_info *tc = SWIG_TypeCheck(to->name, ty); if (!tc) { sobj = (SwigPyObject *)sobj->next; } else { if (ptr) { int newmemory = 0; - *ptr = SWIG_TypeCast(tc,vptr,&newmemory); + *ptr = SWIG_TypeCast(tc, vptr, &newmemory); if (newmemory == SWIG_CAST_NEW_MEMORY) { - assert(own); /* badly formed typemap which will lead to a memory leak - it must set and use own to delete *ptr */ + assert(own); /* badly formed typemap which will lead to a memory + leak - it must set and use own to delete *ptr */ if (own) *own = *own | SWIG_CAST_NEW_MEMORY; } @@ -2367,7 +2437,8 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int } } } else { - if (ptr) *ptr = vptr; + if (ptr) + *ptr = vptr; break; } } @@ -2380,12 +2451,13 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int res = SWIG_OK; } else { if (implicit_conv) { - SwigPyClientData *data = ty ? (SwigPyClientData *) ty->clientdata : 0; + SwigPyClientData *data = ty ? (SwigPyClientData *)ty->clientdata : 0; if (data && !data->implicitconv) { PyObject *klass = data->klass; if (klass) { PyObject *impconv; - data->implicitconv = 1; /* avoid recursion and call 'explicit' constructors*/ + data->implicitconv = + 1; /* avoid recursion and call 'explicit' constructors*/ impconv = SWIG_Python_CallFunctor(klass, obj); data->implicitconv = 0; if (PyErr_Occurred()) { @@ -2396,7 +2468,8 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int SwigPyObject *iobj = SWIG_Python_GetSwigThis(impconv); if (iobj) { void *vptr; - res = SWIG_Python_ConvertPtrAndOwn((PyObject*)iobj, &vptr, ty, 0, 0); + res = SWIG_Python_ConvertPtrAndOwn((PyObject *)iobj, &vptr, ty, 0, + 0); if (SWIG_IsOK(res)) { if (ptr) { *ptr = vptr; @@ -2405,7 +2478,7 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int res = SWIG_AddCast(res); res = SWIG_AddNewMask(res); } else { - res = SWIG_AddCast(res); + res = SWIG_AddCast(res); } } } @@ -2427,25 +2500,25 @@ 
SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int /* Convert a function ptr value */ -SWIGRUNTIME int -SWIG_Python_ConvertFunctionPtr(PyObject *obj, void **ptr, swig_type_info *ty) { +SWIGRUNTIME int SWIG_Python_ConvertFunctionPtr(PyObject *obj, void **ptr, + swig_type_info *ty) { if (!PyCFunction_Check(obj)) { return SWIG_ConvertPtr(obj, ptr, ty, 0); } else { void *vptr = 0; - + /* here we get the method pointer for callbacks */ - const char *doc = (((PyCFunctionObject *)obj) -> m_ml -> ml_doc); + const char *doc = (((PyCFunctionObject *)obj)->m_ml->ml_doc); const char *desc = doc ? strstr(doc, "swig_ptr: ") : 0; if (desc) desc = ty ? SWIG_UnpackVoidPtr(desc + 10, &vptr, ty->name) : 0; - if (!desc) + if (!desc) return SWIG_ERROR; if (ty) { - swig_cast_info *tc = SWIG_TypeCheck(desc,ty); + swig_cast_info *tc = SWIG_TypeCheck(desc, ty); if (tc) { int newmemory = 0; - *ptr = SWIG_TypeCast(tc,vptr,&newmemory); + *ptr = SWIG_TypeCast(tc, vptr, &newmemory); assert(!newmemory); /* newmemory handling not yet implemented */ } else { return SWIG_ERROR; @@ -2459,32 +2532,34 @@ SWIG_Python_ConvertFunctionPtr(PyObject *obj, void **ptr, swig_type_info *ty) { /* Convert a packed value value */ -SWIGRUNTIME int -SWIG_Python_ConvertPacked(PyObject *obj, void *ptr, size_t sz, swig_type_info *ty) { +SWIGRUNTIME int SWIG_Python_ConvertPacked(PyObject *obj, void *ptr, size_t sz, + swig_type_info *ty) { swig_type_info *to = SwigPyPacked_UnpackData(obj, ptr, sz); - if (!to) return SWIG_ERROR; + if (!to) + return SWIG_ERROR; if (ty) { if (to != ty) { /* check type cast? 
*/ - swig_cast_info *tc = SWIG_TypeCheck(to->name,ty); - if (!tc) return SWIG_ERROR; + swig_cast_info *tc = SWIG_TypeCheck(to->name, ty); + if (!tc) + return SWIG_ERROR; } } return SWIG_OK; -} +} /* ----------------------------------------------------------------------------- * Create a new pointer object - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ /* Create a new instance object, without calling __init__, and set the 'this' attribute. */ -SWIGRUNTIME PyObject* -SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this) -{ +SWIGRUNTIME PyObject *SWIG_Python_NewShadowInstance(SwigPyClientData *data, + PyObject *swig_this) { #if (PY_VERSION_HEX >= 0x02020000) PyObject *inst = 0; PyObject *newraw = data->newraw; @@ -2494,12 +2569,12 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this) #if !defined(SWIG_PYTHON_SLOW_GETSET_THIS) PyObject **dictptr = _PyObject_GetDictPtr(inst); if (dictptr != NULL) { - PyObject *dict = *dictptr; - if (dict == NULL) { - dict = PyDict_New(); - *dictptr = dict; - PyDict_SetItem(dict, SWIG_This(), swig_this); - } + PyObject *dict = *dictptr; + if (dict == NULL) { + dict = PyDict_New(); + *dictptr = dict; + PyDict_SetItem(dict, SWIG_This(), swig_this); + } } #else PyObject *key = SWIG_This(); @@ -2508,7 +2583,8 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this) } } else { #if PY_VERSION_HEX >= 0x03000000 - inst = ((PyTypeObject*) data->newargs)->tp_new((PyTypeObject*) data->newargs, Py_None, Py_None); + inst = ((PyTypeObject *)data->newargs) + ->tp_new((PyTypeObject *)data->newargs, Py_None, Py_None); if (inst) { PyObject_SetAttr(inst, SWIG_This(), swig_this); Py_TYPE(inst)->tp_flags &= ~Py_TPFLAGS_VALID_VERSION_TAG; @@ -2532,7 +2608,7 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this) inst = 
PyInstance_NewRaw(data->newargs, dict); Py_DECREF(dict); } - return (PyObject *) inst; + return (PyObject *)inst; #else PyInstanceObject *inst = PyObject_NEW(PyInstanceObject, &PyInstance_Type); if (inst == NULL) { @@ -2552,42 +2628,38 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this) PyObject_GC_Init(inst); #endif PyDict_SetItem(inst->in_dict, SWIG_This(), swig_this); - return (PyObject *) inst; + return (PyObject *)inst; #endif #endif } -SWIGRUNTIME void -SWIG_Python_SetSwigThis(PyObject *inst, PyObject *swig_this) -{ - PyObject *dict; +SWIGRUNTIME void SWIG_Python_SetSwigThis(PyObject *inst, PyObject *swig_this) { + PyObject *dict; #if (PY_VERSION_HEX >= 0x02020000) && !defined(SWIG_PYTHON_SLOW_GETSET_THIS) - PyObject **dictptr = _PyObject_GetDictPtr(inst); - if (dictptr != NULL) { - dict = *dictptr; - if (dict == NULL) { - dict = PyDict_New(); - *dictptr = dict; - } - PyDict_SetItem(dict, SWIG_This(), swig_this); - return; - } -#endif - dict = PyObject_GetAttrString(inst, (char*)"__dict__"); - PyDict_SetItem(dict, SWIG_This(), swig_this); - Py_DECREF(dict); -} - + PyObject **dictptr = _PyObject_GetDictPtr(inst); + if (dictptr != NULL) { + dict = *dictptr; + if (dict == NULL) { + dict = PyDict_New(); + *dictptr = dict; + } + PyDict_SetItem(dict, SWIG_This(), swig_this); + return; + } +#endif + dict = PyObject_GetAttrString(inst, (char *)"__dict__"); + PyDict_SetItem(dict, SWIG_This(), swig_this); + Py_DECREF(dict); +} -SWIGINTERN PyObject * -SWIG_Python_InitShadowInstance(PyObject *args) { +SWIGINTERN PyObject *SWIG_Python_InitShadowInstance(PyObject *args) { PyObject *obj[2]; if (!SWIG_Python_UnpackTuple(args, "swiginit", 2, 2, obj)) { return NULL; } else { SwigPyObject *sthis = SWIG_Python_GetSwigThis(obj[0]); if (sthis) { - SwigPyObject_append((PyObject*) sthis, obj[1]); + SwigPyObject_append((PyObject *)sthis, obj[1]); } else { SWIG_Python_SetSwigThis(obj[0], obj[1]); } @@ -2597,10 +2669,11 @@ 
SWIG_Python_InitShadowInstance(PyObject *args) { /* Create a new pointer object */ -SWIGRUNTIME PyObject * -SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int flags) { +SWIGRUNTIME PyObject *SWIG_Python_NewPointerObj(PyObject *self, void *ptr, + swig_type_info *type, + int flags) { SwigPyClientData *clientdata; - PyObject * robj; + PyObject *robj; int own; if (!ptr) @@ -2611,11 +2684,12 @@ SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int f if (clientdata && clientdata->pytype) { SwigPyObject *newobj; if (flags & SWIG_BUILTIN_TP_INIT) { - newobj = (SwigPyObject*) self; + newobj = (SwigPyObject *)self; if (newobj->ptr) { - PyObject *next_self = clientdata->pytype->tp_alloc(clientdata->pytype, 0); + PyObject *next_self = + clientdata->pytype->tp_alloc(clientdata->pytype, 0); while (newobj->next) - newobj = (SwigPyObject *) newobj->next; + newobj = (SwigPyObject *)newobj->next; newobj->next = next_self; newobj = (SwigPyObject *)next_self; #ifdef SWIGPYTHON_BUILTIN @@ -2633,7 +2707,7 @@ SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int f newobj->ty = type; newobj->own = own; newobj->next = 0; - return (PyObject*) newobj; + return (PyObject *)newobj; } return SWIG_Py_Void(); } @@ -2651,13 +2725,13 @@ SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int f /* Create a new packed object */ -SWIGRUNTIMEINLINE PyObject * -SWIG_Python_NewPackedObj(void *ptr, size_t sz, swig_type_info *type) { - return ptr ? SwigPyPacked_New((void *) ptr, sz, type) : SWIG_Py_Void(); +SWIGRUNTIMEINLINE PyObject *SWIG_Python_NewPackedObj(void *ptr, size_t sz, + swig_type_info *type) { + return ptr ? 
SwigPyPacked_New((void *)ptr, sz, type) : SWIG_Py_Void(); } /* -----------------------------------------------------------------------------* - * Get type list + * Get type list * -----------------------------------------------------------------------------*/ #ifdef SWIG_LINK_RUNTIME @@ -2672,42 +2746,43 @@ SWIG_Python_GetModule(void *SWIGUNUSEDPARM(clientdata)) { #ifdef SWIG_LINK_RUNTIME type_pointer = SWIG_ReturnGlobalTypeList((void *)0); #else -# ifdef SWIGPY_USE_CAPSULE +#ifdef SWIGPY_USE_CAPSULE type_pointer = PyCapsule_Import(SWIGPY_CAPSULE_NAME, 0); -# else - type_pointer = PyCObject_Import((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION, - (char*)"type_pointer" SWIG_TYPE_TABLE_NAME); -# endif +#else + type_pointer = + PyCObject_Import((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION, + (char *)"type_pointer" SWIG_TYPE_TABLE_NAME); +#endif if (PyErr_Occurred()) { PyErr_Clear(); type_pointer = (void *)0; } #endif } - return (swig_module_info *) type_pointer; + return (swig_module_info *)type_pointer; } #if PY_MAJOR_VERSION < 2 -/* PyModule_AddObject function was introduced in Python 2.0. The following function - is copied out of Python/modsupport.c in python version 2.3.4 */ -SWIGINTERN int -PyModule_AddObject(PyObject *m, char *name, PyObject *o) -{ +/* PyModule_AddObject function was introduced in Python 2.0. 
The following + function is copied out of Python/modsupport.c in python version 2.3.4 */ +SWIGINTERN int PyModule_AddObject(PyObject *m, char *name, PyObject *o) { PyObject *dict; if (!PyModule_Check(m)) { - PyErr_SetString(PyExc_TypeError, "PyModule_AddObject() needs module as first arg"); + PyErr_SetString(PyExc_TypeError, + "PyModule_AddObject() needs module as first arg"); return SWIG_ERROR; } if (!o) { - PyErr_SetString(PyExc_TypeError, "PyModule_AddObject() needs non-NULL value"); + PyErr_SetString(PyExc_TypeError, + "PyModule_AddObject() needs non-NULL value"); return SWIG_ERROR; } - + dict = PyModule_GetDict(m); if (dict == NULL) { /* Internal error -- modules must have a dict! */ PyErr_Format(PyExc_SystemError, "module '%s' has no __dict__", - PyModule_GetName(m)); + PyModule_GetName(m)); return SWIG_ERROR; } if (PyDict_SetItemString(dict, name, o)) @@ -2725,43 +2800,52 @@ SWIG_Python_DestroyModule(void *vptr) #endif { #ifdef SWIGPY_USE_CAPSULE - swig_module_info *swig_module = (swig_module_info *) PyCapsule_GetPointer(obj, SWIGPY_CAPSULE_NAME); + swig_module_info *swig_module = + (swig_module_info *)PyCapsule_GetPointer(obj, SWIGPY_CAPSULE_NAME); #else - swig_module_info *swig_module = (swig_module_info *) vptr; + swig_module_info *swig_module = (swig_module_info *)vptr; #endif swig_type_info **types = swig_module->types; size_t i; - for (i =0; i < swig_module->size; ++i) { + for (i = 0; i < swig_module->size; ++i) { swig_type_info *ty = types[i]; if (ty->owndata) { - SwigPyClientData *data = (SwigPyClientData *) ty->clientdata; - if (data) SwigPyClientData_Del(data); + SwigPyClientData *data = (SwigPyClientData *)ty->clientdata; + if (data) + SwigPyClientData_Del(data); } } Py_DECREF(SWIG_This()); swig_this = NULL; } -SWIGRUNTIME void -SWIG_Python_SetModule(swig_module_info *swig_module) { +SWIGRUNTIME void SWIG_Python_SetModule(swig_module_info *swig_module) { #if PY_VERSION_HEX >= 0x03000000 - /* Add a dummy module object into sys.modules */ - PyObject 
*module = PyImport_AddModule((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION); + /* Add a dummy module object into sys.modules */ + PyObject *module = + PyImport_AddModule((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION); #else - static PyMethodDef swig_empty_runtime_method_table[] = { {NULL, NULL, 0, NULL} }; /* Sentinel */ - PyObject *module = Py_InitModule((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION, swig_empty_runtime_method_table); + static PyMethodDef swig_empty_runtime_method_table[] = { + {NULL, NULL, 0, NULL}}; /* Sentinel */ + PyObject *module = + Py_InitModule((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION, + swig_empty_runtime_method_table); #endif #ifdef SWIGPY_USE_CAPSULE - PyObject *pointer = PyCapsule_New((void *) swig_module, SWIGPY_CAPSULE_NAME, SWIG_Python_DestroyModule); + PyObject *pointer = PyCapsule_New((void *)swig_module, SWIGPY_CAPSULE_NAME, + SWIG_Python_DestroyModule); if (pointer && module) { - PyModule_AddObject(module, (char*)"type_pointer_capsule" SWIG_TYPE_TABLE_NAME, pointer); + PyModule_AddObject( + module, (char *)"type_pointer_capsule" SWIG_TYPE_TABLE_NAME, pointer); } else { Py_XDECREF(pointer); } #else - PyObject *pointer = PyCObject_FromVoidPtr((void *) swig_module, SWIG_Python_DestroyModule); + PyObject *pointer = + PyCObject_FromVoidPtr((void *)swig_module, SWIG_Python_DestroyModule); if (pointer && module) { - PyModule_AddObject(module, (char*)"type_pointer" SWIG_TYPE_TABLE_NAME, pointer); + PyModule_AddObject(module, (char *)"type_pointer" SWIG_TYPE_TABLE_NAME, + pointer); } else { Py_XDECREF(pointer); } @@ -2769,31 +2853,28 @@ SWIG_Python_SetModule(swig_module_info *swig_module) { } /* The python cached type query */ -SWIGRUNTIME PyObject * -SWIG_Python_TypeCache(void) { +SWIGRUNTIME PyObject *SWIG_Python_TypeCache(void) { static PyObject *SWIG_STATIC_POINTER(cache) = PyDict_New(); return cache; } -SWIGRUNTIME swig_type_info * -SWIG_Python_TypeQuery(const char *type) -{ +SWIGRUNTIME swig_type_info 
*SWIG_Python_TypeQuery(const char *type) { PyObject *cache = SWIG_Python_TypeCache(); - PyObject *key = SWIG_Python_str_FromChar(type); + PyObject *key = SWIG_Python_str_FromChar(type); PyObject *obj = PyDict_GetItem(cache, key); swig_type_info *descriptor; if (obj) { #ifdef SWIGPY_USE_CAPSULE - descriptor = (swig_type_info *) PyCapsule_GetPointer(obj, NULL); + descriptor = (swig_type_info *)PyCapsule_GetPointer(obj, NULL); #else - descriptor = (swig_type_info *) PyCObject_AsVoidPtr(obj); + descriptor = (swig_type_info *)PyCObject_AsVoidPtr(obj); #endif } else { swig_module_info *swig_module = SWIG_GetModule(0); descriptor = SWIG_TypeQueryModule(swig_module, swig_module, type); if (descriptor) { #ifdef SWIGPY_USE_CAPSULE - obj = PyCapsule_New((void*) descriptor, NULL, NULL); + obj = PyCapsule_New((void *)descriptor, NULL, NULL); #else obj = PyCObject_FromVoidPtr(descriptor, NULL); #endif @@ -2805,16 +2886,15 @@ SWIG_Python_TypeQuery(const char *type) return descriptor; } -/* +/* For backward compatibility only */ -#define SWIG_POINTER_EXCEPTION 0 -#define SWIG_arg_fail(arg) SWIG_Python_ArgFail(arg) -#define SWIG_MustGetPtr(p, type, argnum, flags) SWIG_Python_MustGetPtr(p, type, argnum, flags) +#define SWIG_POINTER_EXCEPTION 0 +#define SWIG_arg_fail(arg) SWIG_Python_ArgFail(arg) +#define SWIG_MustGetPtr(p, type, argnum, flags) \ + SWIG_Python_MustGetPtr(p, type, argnum, flags) -SWIGRUNTIME int -SWIG_Python_AddErrMesg(const char* mesg, int infront) -{ +SWIGRUNTIME int SWIG_Python_AddErrMesg(const char *mesg, int infront) { if (PyErr_Occurred()) { PyObject *type = 0; PyObject *value = 0; @@ -2826,9 +2906,11 @@ SWIG_Python_AddErrMesg(const char* mesg, int infront) Py_XINCREF(type); PyErr_Clear(); if (infront) { - PyErr_Format(type, "%s %s", mesg, tmp = SWIG_Python_str_AsChar(old_str)); + PyErr_Format(type, "%s %s", mesg, + tmp = SWIG_Python_str_AsChar(old_str)); } else { - PyErr_Format(type, "%s %s", tmp = SWIG_Python_str_AsChar(old_str), mesg); + PyErr_Format(type, 
"%s %s", tmp = SWIG_Python_str_AsChar(old_str), + mesg); } SWIG_Python_str_DelForPy3(tmp); Py_DECREF(old_str); @@ -2838,10 +2920,8 @@ SWIG_Python_AddErrMesg(const char* mesg, int infront) return 0; } } - -SWIGRUNTIME int -SWIG_Python_ArgFail(int argnum) -{ + +SWIGRUNTIME int SWIG_Python_ArgFail(int argnum) { if (PyErr_Occurred()) { /* add information about failing argument */ char mesg[256]; @@ -2852,55 +2932,53 @@ SWIG_Python_ArgFail(int argnum) } } -SWIGRUNTIMEINLINE const char * -SwigPyObject_GetDesc(PyObject *self) -{ +SWIGRUNTIMEINLINE const char *SwigPyObject_GetDesc(PyObject *self) { SwigPyObject *v = (SwigPyObject *)self; swig_type_info *ty = v ? v->ty : 0; return ty ? ty->str : ""; } -SWIGRUNTIME void -SWIG_Python_TypeError(const char *type, PyObject *obj) -{ +SWIGRUNTIME void SWIG_Python_TypeError(const char *type, PyObject *obj) { if (type) { #if defined(SWIG_COBJECT_TYPES) if (obj && SwigPyObject_Check(obj)) { - const char *otype = (const char *) SwigPyObject_GetDesc(obj); + const char *otype = (const char *)SwigPyObject_GetDesc(obj); if (otype) { - PyErr_Format(PyExc_TypeError, "a '%s' is expected, 'SwigPyObject(%s)' is received", - type, otype); - return; + PyErr_Format(PyExc_TypeError, + "a '%s' is expected, 'SwigPyObject(%s)' is received", type, + otype); + return; } - } else -#endif + } else +#endif { - const char *otype = (obj ? obj->ob_type->tp_name : 0); + const char *otype = (obj ? obj->ob_type->tp_name : 0); if (otype) { - PyObject *str = PyObject_Str(obj); - const char *cstr = str ? SWIG_Python_str_AsChar(str) : 0; - if (cstr) { - PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s(%s)' is received", - type, otype, cstr); + PyObject *str = PyObject_Str(obj); + const char *cstr = str ? 
SWIG_Python_str_AsChar(str) : 0; + if (cstr) { + PyErr_Format(PyExc_TypeError, + "a '%s' is expected, '%s(%s)' is received", type, otype, + cstr); SWIG_Python_str_DelForPy3(cstr); - } else { - PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s' is received", - type, otype); - } - Py_XDECREF(str); - return; + } else { + PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s' is received", + type, otype); + } + Py_XDECREF(str); + return; } - } + } PyErr_Format(PyExc_TypeError, "a '%s' is expected", type); } else { PyErr_Format(PyExc_TypeError, "unexpected type is received"); } } - /* Convert a pointer value, signal an exception on a type mismatch */ -SWIGRUNTIME void * -SWIG_Python_MustGetPtr(PyObject *obj, swig_type_info *ty, int SWIGUNUSEDPARM(argnum), int flags) { +SWIGRUNTIME void *SWIG_Python_MustGetPtr(PyObject *obj, swig_type_info *ty, + int SWIGUNUSEDPARM(argnum), + int flags) { void *result; if (SWIG_Python_ConvertPtr(obj, &result, ty, flags) == -1) { PyErr_Clear(); @@ -2915,25 +2993,27 @@ SWIG_Python_MustGetPtr(PyObject *obj, swig_type_info *ty, int SWIGUNUSEDPARM(arg } #ifdef SWIGPYTHON_BUILTIN -SWIGRUNTIME int -SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, PyObject *value) { +SWIGRUNTIME int SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, + PyObject *value) { PyTypeObject *tp = obj->ob_type; PyObject *descr; PyObject *encoded_name; descrsetfunc f; int res = -1; -# ifdef Py_USING_UNICODE +#ifdef Py_USING_UNICODE if (PyString_Check(name)) { - name = PyUnicode_Decode(PyString_AsString(name), PyString_Size(name), NULL, NULL); + name = PyUnicode_Decode(PyString_AsString(name), PyString_Size(name), NULL, + NULL); if (!name) return -1; } else if (!PyUnicode_Check(name)) -# else +#else if (!PyString_Check(name)) -# endif +#endif { - PyErr_Format(PyExc_TypeError, "attribute name must be string, not '%.200s'", name->ob_type->tp_name); + PyErr_Format(PyExc_TypeError, "attribute name must be string, not '%.200s'", + 
name->ob_type->tp_name); return -1; } else { Py_INCREF(name); @@ -2955,30 +3035,35 @@ SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, PyObject *value) { } else { encoded_name = PyUnicode_AsUTF8String(name); } - PyErr_Format(PyExc_AttributeError, "'%.100s' object has no attribute '%.200s'", tp->tp_name, PyString_AsString(encoded_name)); + PyErr_Format(PyExc_AttributeError, + "'%.100s' object has no attribute '%.200s'", tp->tp_name, + PyString_AsString(encoded_name)); Py_DECREF(encoded_name); } else { res = f(descr, obj, value); } - - done: + +done: Py_DECREF(name); return res; } #endif - #ifdef __cplusplus } #endif +#define SWIG_exception_fail(code, msg) \ + do { \ + SWIG_Error(code, msg); \ + SWIG_fail; \ + } while (0) - -#define SWIG_exception_fail(code, msg) do { SWIG_Error(code, msg); SWIG_fail; } while(0) - -#define SWIG_contract_assert(expr, msg) if (!(expr)) { SWIG_Error(SWIG_RuntimeError, msg); SWIG_fail; } else - - +#define SWIG_contract_assert(expr, msg) \ + if (!(expr)) { \ + SWIG_Error(SWIG_RuntimeError, msg); \ + SWIG_fail; \ + } else /* -------- TYPES TABLE (BEGIN) -------- */ @@ -2986,41 +3071,40 @@ SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, PyObject *value) { #define SWIGTYPE_p_float swig_types[1] static swig_type_info *swig_types[3]; static swig_module_info swig_module = {swig_types, 2, 0, 0, 0, 0}; -#define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) -#define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) +#define SWIG_TypeQuery(name) \ + SWIG_TypeQueryModule(&swig_module, &swig_module, name) +#define SWIG_MangledTypeQuery(name) \ + SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) /* -------- TYPES TABLE (END) -------- */ #if (PY_VERSION_HEX <= 0x02000000) -# if !defined(SWIG_PYTHON_CLASSIC) -# error "This python version requires swig to be run with the '-classic' option" -# endif +#if !defined(SWIG_PYTHON_CLASSIC) +#error "This 
python version requires swig to be run with the '-classic' option" +#endif #endif /*----------------------------------------------- @(target):= _cmodule.so ------------------------------------------------*/ #if PY_VERSION_HEX >= 0x03000000 -# define SWIG_init PyInit__cmodule +#define SWIG_init PyInit__cmodule #else -# define SWIG_init init_cmodule +#define SWIG_init init_cmodule #endif -#define SWIG_name "_cmodule" +#define SWIG_name "_cmodule" -#define SWIGVERSION 0x030008 +#define SWIGVERSION 0x030008 #define SWIG_VERSION SWIGVERSION - -#define SWIG_as_voidptr(a) (void *)((const void *)(a)) -#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),(void**)(a)) - +#define SWIG_as_voidptr(a) (void *)((const void *)(a)) +#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a), (void **)(a)) #define SWIG_FILE_WITH_INIT #include "gamut_map.h" - #ifndef SWIG_FILE_WITH_INIT #define NO_IMPORT_ARRAY #endif @@ -3028,520 +3112,457 @@ static swig_module_info swig_module = {swig_types, 2, 0, 0, 0, 0}; #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #include <numpy/arrayobject.h> - -void gamut_map_full(float* input, int in_dim1, int in_dim2, int in_dim3, - float* result, int out_dim1, int out_dim2, int out_dim3, - float* ctrl_pts, int cp_dim1, int cp_dim2, - float* weights, int weight_dim1, int weight_dim2, - float* coefs, int coef_dim1, int coef_dim2) { - gamut_map(input, in_dim1, in_dim2, in_dim3, result, ctrl_pts, weights, coefs, cp_dim1); +void gamut_map_full(float *input, int in_dim1, int in_dim2, int in_dim3, + float *result, int out_dim1, int out_dim2, int out_dim3, + float *ctrl_pts, int cp_dim1, int cp_dim2, float *weights, + int weight_dim1, int weight_dim2, float *coefs, + int coef_dim1, int coef_dim2) { + gamut_map(input, in_dim1, in_dim2, in_dim3, result, ctrl_pts, weights, coefs, + cp_dim1); } - #if NPY_API_VERSION < 0x00000007 #define NPY_ARRAY_DEFAULT NPY_DEFAULT -#define NPY_ARRAY_FARRAY NPY_FARRAY -#define NPY_FORTRANORDER NPY_FORTRAN +#define 
NPY_ARRAY_FARRAY NPY_FARRAY +#define NPY_FORTRANORDER NPY_FORTRAN #endif - /* Macros to extract array attributes. */ #if NPY_API_VERSION < 0x00000007 -#define is_array(a) ((a) && PyArray_Check((PyArrayObject*)a)) -#define array_type(a) (int)(PyArray_TYPE((PyArrayObject*)a)) -#define array_numdims(a) (((PyArrayObject*)a)->nd) -#define array_dimensions(a) (((PyArrayObject*)a)->dimensions) -#define array_size(a,i) (((PyArrayObject*)a)->dimensions[i]) -#define array_strides(a) (((PyArrayObject*)a)->strides) -#define array_stride(a,i) (((PyArrayObject*)a)->strides[i]) -#define array_data(a) (((PyArrayObject*)a)->data) -#define array_descr(a) (((PyArrayObject*)a)->descr) -#define array_flags(a) (((PyArrayObject*)a)->flags) -#define array_clearflags(a,f) (((PyArrayObject*)a)->flags) &= ~f -#define array_enableflags(a,f) (((PyArrayObject*)a)->flags) = f -#define array_is_fortran(a) (PyArray_ISFORTRAN((PyArrayObject*)a)) +#define is_array(a) ((a) && PyArray_Check((PyArrayObject *)a)) +#define array_type(a) (int)(PyArray_TYPE((PyArrayObject *)a)) +#define array_numdims(a) (((PyArrayObject *)a)->nd) +#define array_dimensions(a) (((PyArrayObject *)a)->dimensions) +#define array_size(a, i) (((PyArrayObject *)a)->dimensions[i]) +#define array_strides(a) (((PyArrayObject *)a)->strides) +#define array_stride(a, i) (((PyArrayObject *)a)->strides[i]) +#define array_data(a) (((PyArrayObject *)a)->data) +#define array_descr(a) (((PyArrayObject *)a)->descr) +#define array_flags(a) (((PyArrayObject *)a)->flags) +#define array_clearflags(a, f) (((PyArrayObject *)a)->flags) &= ~f +#define array_enableflags(a, f) (((PyArrayObject *)a)->flags) = f +#define array_is_fortran(a) (PyArray_ISFORTRAN((PyArrayObject *)a)) #else -#define is_array(a) ((a) && PyArray_Check(a)) -#define array_type(a) PyArray_TYPE((PyArrayObject*)a) -#define array_numdims(a) PyArray_NDIM((PyArrayObject*)a) -#define array_dimensions(a) PyArray_DIMS((PyArrayObject*)a) -#define array_strides(a) 
PyArray_STRIDES((PyArrayObject*)a) -#define array_stride(a,i) PyArray_STRIDE((PyArrayObject*)a,i) -#define array_size(a,i) PyArray_DIM((PyArrayObject*)a,i) -#define array_data(a) PyArray_DATA((PyArrayObject*)a) -#define array_descr(a) PyArray_DESCR((PyArrayObject*)a) -#define array_flags(a) PyArray_FLAGS((PyArrayObject*)a) -#define array_enableflags(a,f) PyArray_ENABLEFLAGS((PyArrayObject*)a,f) -#define array_clearflags(a,f) PyArray_CLEARFLAGS((PyArrayObject*)a,f) -#define array_is_fortran(a) (PyArray_IS_F_CONTIGUOUS((PyArrayObject*)a)) -#endif -#define array_is_contiguous(a) (PyArray_ISCONTIGUOUS((PyArrayObject*)a)) -#define array_is_native(a) (PyArray_ISNOTSWAPPED((PyArrayObject*)a)) - - - /* Given a PyObject, return a string describing its type. - */ - const char* pytype_string(PyObject* py_obj) - { - if (py_obj == NULL ) return "C NULL value"; - if (py_obj == Py_None ) return "Python None" ; - if (PyCallable_Check(py_obj)) return "callable" ; - if (PyString_Check( py_obj)) return "string" ; - if (PyInt_Check( py_obj)) return "int" ; - if (PyFloat_Check( py_obj)) return "float" ; - if (PyDict_Check( py_obj)) return "dict" ; - if (PyList_Check( py_obj)) return "list" ; - if (PyTuple_Check( py_obj)) return "tuple" ; +#define is_array(a) ((a) && PyArray_Check(a)) +#define array_type(a) PyArray_TYPE((PyArrayObject *)a) +#define array_numdims(a) PyArray_NDIM((PyArrayObject *)a) +#define array_dimensions(a) PyArray_DIMS((PyArrayObject *)a) +#define array_strides(a) PyArray_STRIDES((PyArrayObject *)a) +#define array_stride(a, i) PyArray_STRIDE((PyArrayObject *)a, i) +#define array_size(a, i) PyArray_DIM((PyArrayObject *)a, i) +#define array_data(a) PyArray_DATA((PyArrayObject *)a) +#define array_descr(a) PyArray_DESCR((PyArrayObject *)a) +#define array_flags(a) PyArray_FLAGS((PyArrayObject *)a) +#define array_enableflags(a, f) PyArray_ENABLEFLAGS((PyArrayObject *)a, f) +#define array_clearflags(a, f) PyArray_CLEARFLAGS((PyArrayObject *)a, f) +#define 
array_is_fortran(a) (PyArray_IS_F_CONTIGUOUS((PyArrayObject *)a)) +#endif +#define array_is_contiguous(a) (PyArray_ISCONTIGUOUS((PyArrayObject *)a)) +#define array_is_native(a) (PyArray_ISNOTSWAPPED((PyArrayObject *)a)) + +/* Given a PyObject, return a string describing its type. + */ +const char *pytype_string(PyObject *py_obj) { + if (py_obj == NULL) + return "C NULL value"; + if (py_obj == Py_None) + return "Python None"; + if (PyCallable_Check(py_obj)) + return "callable"; + if (PyString_Check(py_obj)) + return "string"; + if (PyInt_Check(py_obj)) + return "int"; + if (PyFloat_Check(py_obj)) + return "float"; + if (PyDict_Check(py_obj)) + return "dict"; + if (PyList_Check(py_obj)) + return "list"; + if (PyTuple_Check(py_obj)) + return "tuple"; #if PY_MAJOR_VERSION < 3 - if (PyFile_Check( py_obj)) return "file" ; - if (PyModule_Check( py_obj)) return "module" ; - if (PyInstance_Check(py_obj)) return "instance" ; + if (PyFile_Check(py_obj)) + return "file"; + if (PyModule_Check(py_obj)) + return "module"; + if (PyInstance_Check(py_obj)) + return "instance"; #endif - return "unknown type"; - } + return "unknown type"; +} - /* Given a NumPy typecode, return a string describing the type. - */ - const char* typecode_string(int typecode) - { - static const char* type_names[25] = {"bool", - "byte", - "unsigned byte", - "short", - "unsigned short", - "int", - "unsigned int", - "long", - "unsigned long", - "long long", - "unsigned long long", - "float", - "double", - "long double", - "complex float", - "complex double", - "complex long double", - "object", - "string", - "unicode", - "void", - "ntypes", - "notype", - "char", - "unknown"}; - return typecode < 24 ? type_names[typecode] : type_names[24]; - } - - /* Make sure input has correct numpy type. This now just calls - PyArray_EquivTypenums(). 
- */ - int type_match(int actual_type, - int desired_type) - { - return PyArray_EquivTypenums(actual_type, desired_type); - } +/* Given a NumPy typecode, return a string describing the type. + */ +const char *typecode_string(int typecode) { + static const char *type_names[25] = {"bool", + "byte", + "unsigned byte", + "short", + "unsigned short", + "int", + "unsigned int", + "long", + "unsigned long", + "long long", + "unsigned long long", + "float", + "double", + "long double", + "complex float", + "complex double", + "complex long double", + "object", + "string", + "unicode", + "void", + "ntypes", + "notype", + "char", + "unknown"}; + return typecode < 24 ? type_names[typecode] : type_names[24]; +} + +/* Make sure input has correct numpy type. This now just calls + PyArray_EquivTypenums(). + */ +int type_match(int actual_type, int desired_type) { + return PyArray_EquivTypenums(actual_type, desired_type); +} #ifdef SWIGPY_USE_CAPSULE - void free_cap(PyObject * cap) - { - void* array = (void*) PyCapsule_GetPointer(cap,SWIGPY_CAPSULE_NAME); - if (array != NULL) free(array); - } +void free_cap(PyObject *cap) { + void *array = (void *)PyCapsule_GetPointer(cap, SWIGPY_CAPSULE_NAME); + if (array != NULL) + free(array); +} #endif - - - - /* Given a PyObject pointer, cast it to a PyArrayObject pointer if - * legal. If not, set the python error string appropriately and - * return NULL. - */ - PyArrayObject* obj_to_array_no_conversion(PyObject* input, - int typecode) - { - PyArrayObject* ary = NULL; - if (is_array(input) && (typecode == NPY_NOTYPE || - PyArray_EquivTypenums(array_type(input), typecode))) - { - ary = (PyArrayObject*) input; - } - else if is_array(input) - { - const char* desired_type = typecode_string(typecode); - const char* actual_type = typecode_string(array_type(input)); +/* Given a PyObject pointer, cast it to a PyArrayObject pointer if + * legal. If not, set the python error string appropriately and + * return NULL. 
+ */ +PyArrayObject *obj_to_array_no_conversion(PyObject *input, int typecode) { + PyArrayObject *ary = NULL; + if (is_array(input) && (typecode == NPY_NOTYPE || + PyArray_EquivTypenums(array_type(input), typecode))) { + ary = (PyArrayObject *)input; + } else if + is_array(input) { + const char *desired_type = typecode_string(typecode); + const char *actual_type = typecode_string(array_type(input)); PyErr_Format(PyExc_TypeError, "Array of type '%s' required. Array of type '%s' given", desired_type, actual_type); ary = NULL; } - else - { - const char* desired_type = typecode_string(typecode); - const char* actual_type = pytype_string(input); - PyErr_Format(PyExc_TypeError, - "Array of type '%s' required. A '%s' was given", - desired_type, - actual_type); - ary = NULL; - } - return ary; + else { + const char *desired_type = typecode_string(typecode); + const char *actual_type = pytype_string(input); + PyErr_Format(PyExc_TypeError, + "Array of type '%s' required. A '%s' was given", desired_type, + actual_type); + ary = NULL; } + return ary; +} - /* Convert the given PyObject to a NumPy array with the given - * typecode. On success, return a valid PyArrayObject* with the - * correct type. On failure, the python error string will be set and - * the routine returns NULL. - */ - PyArrayObject* obj_to_array_allow_conversion(PyObject* input, - int typecode, - int* is_new_object) - { - PyArrayObject* ary = NULL; - PyObject* py_obj; - if (is_array(input) && (typecode == NPY_NOTYPE || - PyArray_EquivTypenums(array_type(input),typecode))) - { - ary = (PyArrayObject*) input; - *is_new_object = 0; - } - else - { - py_obj = PyArray_FROMANY(input, typecode, 0, 0, NPY_ARRAY_DEFAULT); - /* If NULL, PyArray_FromObject will have set python error value.*/ - ary = (PyArrayObject*) py_obj; - *is_new_object = 1; - } - return ary; - } - - /* Given a PyArrayObject, check to see if it is contiguous. If so, - * return the input pointer and flag it as not a new object. 
If it is - * not contiguous, create a new PyArrayObject using the original data, - * flag it as a new object and return the pointer. - */ - PyArrayObject* make_contiguous(PyArrayObject* ary, - int* is_new_object, - int min_dims, - int max_dims) - { - PyArrayObject* result; - if (array_is_contiguous(ary)) - { - result = ary; - *is_new_object = 0; - } - else - { - result = (PyArrayObject*) PyArray_ContiguousFromObject((PyObject*)ary, - array_type(ary), - min_dims, - max_dims); - *is_new_object = 1; - } - return result; +/* Convert the given PyObject to a NumPy array with the given + * typecode. On success, return a valid PyArrayObject* with the + * correct type. On failure, the python error string will be set and + * the routine returns NULL. + */ +PyArrayObject *obj_to_array_allow_conversion(PyObject *input, int typecode, + int *is_new_object) { + PyArrayObject *ary = NULL; + PyObject *py_obj; + if (is_array(input) && (typecode == NPY_NOTYPE || + PyArray_EquivTypenums(array_type(input), typecode))) { + ary = (PyArrayObject *)input; + *is_new_object = 0; + } else { + py_obj = PyArray_FROMANY(input, typecode, 0, 0, NPY_ARRAY_DEFAULT); + /* If NULL, PyArray_FromObject will have set python error value.*/ + ary = (PyArrayObject *)py_obj; + *is_new_object = 1; } + return ary; +} - /* Given a PyArrayObject, check to see if it is Fortran-contiguous. - * If so, return the input pointer, but do not flag it as not a new - * object. If it is not Fortran-contiguous, create a new - * PyArrayObject using the original data, flag it as a new object - * and return the pointer. - */ - PyArrayObject* make_fortran(PyArrayObject* ary, - int* is_new_object) - { - PyArrayObject* result; - if (array_is_fortran(ary)) - { - result = ary; - *is_new_object = 0; - } - else - { - Py_INCREF(array_descr(ary)); - result = (PyArrayObject*) PyArray_FromArray(ary, - array_descr(ary), +/* Given a PyArrayObject, check to see if it is contiguous. 
If so, + * return the input pointer and flag it as not a new object. If it is + * not contiguous, create a new PyArrayObject using the original data, + * flag it as a new object and return the pointer. + */ +PyArrayObject *make_contiguous(PyArrayObject *ary, int *is_new_object, + int min_dims, int max_dims) { + PyArrayObject *result; + if (array_is_contiguous(ary)) { + result = ary; + *is_new_object = 0; + } else { + result = (PyArrayObject *)PyArray_ContiguousFromObject( + (PyObject *)ary, array_type(ary), min_dims, max_dims); + *is_new_object = 1; + } + return result; +} + +/* Given a PyArrayObject, check to see if it is Fortran-contiguous. + * If so, return the input pointer, but do not flag it as not a new + * object. If it is not Fortran-contiguous, create a new + * PyArrayObject using the original data, flag it as a new object + * and return the pointer. + */ +PyArrayObject *make_fortran(PyArrayObject *ary, int *is_new_object) { + PyArrayObject *result; + if (array_is_fortran(ary)) { + result = ary; + *is_new_object = 0; + } else { + Py_INCREF(array_descr(ary)); + result = (PyArrayObject *)PyArray_FromArray(ary, array_descr(ary), #if NPY_API_VERSION < 0x00000007 - NPY_FORTRANORDER); + NPY_FORTRANORDER); #else - NPY_ARRAY_F_CONTIGUOUS); + NPY_ARRAY_F_CONTIGUOUS); #endif - *is_new_object = 1; - } - return result; + *is_new_object = 1; } + return result; +} - /* Convert a given PyObject to a contiguous PyArrayObject of the - * specified type. If the input object is not a contiguous - * PyArrayObject, a new one will be created and the new object flag - * will be set. 
- */ - PyArrayObject* obj_to_array_contiguous_allow_conversion(PyObject* input, - int typecode, - int* is_new_object) - { - int is_new1 = 0; - int is_new2 = 0; - PyArrayObject* ary2; - PyArrayObject* ary1 = obj_to_array_allow_conversion(input, - typecode, - &is_new1); - if (ary1) - { - ary2 = make_contiguous(ary1, &is_new2, 0, 0); - if ( is_new1 && is_new2) - { - Py_DECREF(ary1); - } - ary1 = ary2; - } - *is_new_object = is_new1 || is_new2; - return ary1; - } - - /* Convert a given PyObject to a Fortran-ordered PyArrayObject of the - * specified type. If the input object is not a Fortran-ordered - * PyArrayObject, a new one will be created and the new object flag - * will be set. - */ - PyArrayObject* obj_to_array_fortran_allow_conversion(PyObject* input, - int typecode, - int* is_new_object) - { - int is_new1 = 0; - int is_new2 = 0; - PyArrayObject* ary2; - PyArrayObject* ary1 = obj_to_array_allow_conversion(input, - typecode, - &is_new1); - if (ary1) - { - ary2 = make_fortran(ary1, &is_new2); - if (is_new1 && is_new2) - { - Py_DECREF(ary1); - } - ary1 = ary2; +/* Convert a given PyObject to a contiguous PyArrayObject of the + * specified type. If the input object is not a contiguous + * PyArrayObject, a new one will be created and the new object flag + * will be set. + */ +PyArrayObject *obj_to_array_contiguous_allow_conversion(PyObject *input, + int typecode, + int *is_new_object) { + int is_new1 = 0; + int is_new2 = 0; + PyArrayObject *ary2; + PyArrayObject *ary1 = + obj_to_array_allow_conversion(input, typecode, &is_new1); + if (ary1) { + ary2 = make_contiguous(ary1, &is_new2, 0, 0); + if (is_new1 && is_new2) { + Py_DECREF(ary1); } - *is_new_object = is_new1 || is_new2; - return ary1; + ary1 = ary2; } + *is_new_object = is_new1 || is_new2; + return ary1; +} - - /* Test whether a python object is contiguous. If array is - * contiguous, return 1. Otherwise, set the python error string and - * return 0. 
- */ - int require_contiguous(PyArrayObject* ary) - { - int contiguous = 1; - if (!array_is_contiguous(ary)) - { - PyErr_SetString(PyExc_TypeError, - "Array must be contiguous. A non-contiguous array was given"); - contiguous = 0; +/* Convert a given PyObject to a Fortran-ordered PyArrayObject of the + * specified type. If the input object is not a Fortran-ordered + * PyArrayObject, a new one will be created and the new object flag + * will be set. + */ +PyArrayObject *obj_to_array_fortran_allow_conversion(PyObject *input, + int typecode, + int *is_new_object) { + int is_new1 = 0; + int is_new2 = 0; + PyArrayObject *ary2; + PyArrayObject *ary1 = + obj_to_array_allow_conversion(input, typecode, &is_new1); + if (ary1) { + ary2 = make_fortran(ary1, &is_new2); + if (is_new1 && is_new2) { + Py_DECREF(ary1); } - return contiguous; + ary1 = ary2; } + *is_new_object = is_new1 || is_new2; + return ary1; +} - /* Test whether a python object is (C_ or F_) contiguous. If array is - * contiguous, return 1. Otherwise, set the python error string and - * return 0. - */ - int require_c_or_f_contiguous(PyArrayObject* ary) - { - int contiguous = 1; - if (!(array_is_contiguous(ary) || array_is_fortran(ary))) - { - PyErr_SetString(PyExc_TypeError, - "Array must be contiguous (C_ or F_). A non-contiguous array was given"); - contiguous = 0; - } - return contiguous; +/* Test whether a python object is contiguous. If array is + * contiguous, return 1. Otherwise, set the python error string and + * return 0. + */ +int require_contiguous(PyArrayObject *ary) { + int contiguous = 1; + if (!array_is_contiguous(ary)) { + PyErr_SetString( + PyExc_TypeError, + "Array must be contiguous. A non-contiguous array was given"); + contiguous = 0; } + return contiguous; +} - /* Require that a numpy array is not byte-swapped. If the array is - * not byte-swapped, return 1. Otherwise, set the python error string - * and return 0. 
- */ - int require_native(PyArrayObject* ary) - { - int native = 1; - if (!array_is_native(ary)) - { - PyErr_SetString(PyExc_TypeError, - "Array must have native byteorder. " - "A byte-swapped array was given"); - native = 0; - } - return native; +/* Test whether a python object is (C_ or F_) contiguous. If array is + * contiguous, return 1. Otherwise, set the python error string and + * return 0. + */ +int require_c_or_f_contiguous(PyArrayObject *ary) { + int contiguous = 1; + if (!(array_is_contiguous(ary) || array_is_fortran(ary))) { + PyErr_SetString(PyExc_TypeError, "Array must be contiguous (C_ or F_). A " + "non-contiguous array was given"); + contiguous = 0; } + return contiguous; +} - /* Require the given PyArrayObject to have a specified number of - * dimensions. If the array has the specified number of dimensions, - * return 1. Otherwise, set the python error string and return 0. - */ - int require_dimensions(PyArrayObject* ary, - int exact_dimensions) - { - int success = 1; - if (array_numdims(ary) != exact_dimensions) - { - PyErr_Format(PyExc_TypeError, - "Array must have %d dimensions. Given array has %d dimensions", - exact_dimensions, - array_numdims(ary)); - success = 0; - } - return success; +/* Require that a numpy array is not byte-swapped. If the array is + * not byte-swapped, return 1. Otherwise, set the python error string + * and return 0. + */ +int require_native(PyArrayObject *ary) { + int native = 1; + if (!array_is_native(ary)) { + PyErr_SetString(PyExc_TypeError, "Array must have native byteorder. " + "A byte-swapped array was given"); + native = 0; } + return native; +} - /* Require the given PyArrayObject to have one of a list of specified - * number of dimensions. If the array has one of the specified number - * of dimensions, return 1. Otherwise, set the python error string - * and return 0. 
- */ - int require_dimensions_n(PyArrayObject* ary, - int* exact_dimensions, - int n) - { - int success = 0; - int i; - char dims_str[255] = ""; - char s[255]; - for (i = 0; i < n && !success; i++) - { - if (array_numdims(ary) == exact_dimensions[i]) - { - success = 1; - } +/* Require the given PyArrayObject to have a specified number of + * dimensions. If the array has the specified number of dimensions, + * return 1. Otherwise, set the python error string and return 0. + */ +int require_dimensions(PyArrayObject *ary, int exact_dimensions) { + int success = 1; + if (array_numdims(ary) != exact_dimensions) { + PyErr_Format( + PyExc_TypeError, + "Array must have %d dimensions. Given array has %d dimensions", + exact_dimensions, array_numdims(ary)); + success = 0; + } + return success; +} + +/* Require the given PyArrayObject to have one of a list of specified + * number of dimensions. If the array has one of the specified number + * of dimensions, return 1. Otherwise, set the python error string + * and return 0. + */ +int require_dimensions_n(PyArrayObject *ary, int *exact_dimensions, int n) { + int success = 0; + int i; + char dims_str[255] = ""; + char s[255]; + for (i = 0; i < n && !success; i++) { + if (array_numdims(ary) == exact_dimensions[i]) { + success = 1; } - if (!success) - { - for (i = 0; i < n-1; i++) - { - sprintf(s, "%d, ", exact_dimensions[i]); - strcat(dims_str,s); - } - sprintf(s, " or %d", exact_dimensions[n-1]); - strcat(dims_str,s); - PyErr_Format(PyExc_TypeError, - "Array must have %s dimensions. Given array has %d dimensions", - dims_str, - array_numdims(ary)); + } + if (!success) { + for (i = 0; i < n - 1; i++) { + sprintf(s, "%d, ", exact_dimensions[i]); + strcat(dims_str, s); } - return success; + sprintf(s, " or %d", exact_dimensions[n - 1]); + strcat(dims_str, s); + PyErr_Format( + PyExc_TypeError, + "Array must have %s dimensions. 
Given array has %d dimensions", + dims_str, array_numdims(ary)); } + return success; +} - /* Require the given PyArrayObject to have a specified shape. If the - * array has the specified shape, return 1. Otherwise, set the python - * error string and return 0. - */ - int require_size(PyArrayObject* ary, - npy_intp* size, - int n) - { - int i; - int success = 1; - size_t len; - char desired_dims[255] = "["; - char s[255]; - char actual_dims[255] = "["; - for(i=0; i < n;i++) - { - if (size[i] != -1 && size[i] != array_size(ary,i)) - { - success = 0; - } +/* Require the given PyArrayObject to have a specified shape. If the + * array has the specified shape, return 1. Otherwise, set the python + * error string and return 0. + */ +int require_size(PyArrayObject *ary, npy_intp *size, int n) { + int i; + int success = 1; + size_t len; + char desired_dims[255] = "["; + char s[255]; + char actual_dims[255] = "["; + for (i = 0; i < n; i++) { + if (size[i] != -1 && size[i] != array_size(ary, i)) { + success = 0; } - if (!success) - { - for (i = 0; i < n; i++) - { - if (size[i] == -1) - { - sprintf(s, "*,"); - } - else - { - sprintf(s, "%ld,", (long int)size[i]); - } - strcat(desired_dims,s); - } - len = strlen(desired_dims); - desired_dims[len-1] = ']'; - for (i = 0; i < n; i++) - { - sprintf(s, "%ld,", (long int)array_size(ary,i)); - strcat(actual_dims,s); + } + if (!success) { + for (i = 0; i < n; i++) { + if (size[i] == -1) { + sprintf(s, "*,"); + } else { + sprintf(s, "%ld,", (long int)size[i]); } - len = strlen(actual_dims); - actual_dims[len-1] = ']'; - PyErr_Format(PyExc_TypeError, - "Array must have shape of %s. 
Given array has shape of %s", - desired_dims, - actual_dims); + strcat(desired_dims, s); } - return success; + len = strlen(desired_dims); + desired_dims[len - 1] = ']'; + for (i = 0; i < n; i++) { + sprintf(s, "%ld,", (long int)array_size(ary, i)); + strcat(actual_dims, s); + } + len = strlen(actual_dims); + actual_dims[len - 1] = ']'; + PyErr_Format(PyExc_TypeError, + "Array must have shape of %s. Given array has shape of %s", + desired_dims, actual_dims); } + return success; +} - /* Require the given PyArrayObject to to be Fortran ordered. If the - * the PyArrayObject is already Fortran ordered, do nothing. Else, - * set the Fortran ordering flag and recompute the strides. - */ - int require_fortran(PyArrayObject* ary) - { - int success = 1; - int nd = array_numdims(ary); - int i; - npy_intp * strides = array_strides(ary); - if (array_is_fortran(ary)) return success; - int n_non_one = 0; - /* Set the Fortran ordered flag */ - const npy_intp *dims = array_dimensions(ary); - for (i=0; i < nd; ++i) - n_non_one += (dims[i] != 1) ? 1 : 0; - if (n_non_one > 1) - array_clearflags(ary,NPY_ARRAY_CARRAY); - array_enableflags(ary,NPY_ARRAY_FARRAY); - /* Recompute the strides */ - strides[0] = strides[nd-1]; - for (i=1; i < nd; ++i) - strides[i] = strides[i-1] * array_size(ary,i-1); +/* Require the given PyArrayObject to to be Fortran ordered. If the + * the PyArrayObject is already Fortran ordered, do nothing. Else, + * set the Fortran ordering flag and recompute the strides. + */ +int require_fortran(PyArrayObject *ary) { + int success = 1; + int nd = array_numdims(ary); + int i; + npy_intp *strides = array_strides(ary); + if (array_is_fortran(ary)) return success; - } - - - + int n_non_one = 0; + /* Set the Fortran ordered flag */ + const npy_intp *dims = array_dimensions(ary); + for (i = 0; i < nd; ++i) + n_non_one += (dims[i] != 1) ? 
1 : 0; + if (n_non_one > 1) + array_clearflags(ary, NPY_ARRAY_CARRAY); + array_enableflags(ary, NPY_ARRAY_FARRAY); + /* Recompute the strides */ + strides[0] = strides[nd - 1]; + for (i = 1; i < nd; ++i) + strides[i] = strides[i - 1] * array_size(ary, i - 1); + return success; +} #include <limits.h> #if !defined(SWIG_NO_LLONG_MAX) -# if !defined(LLONG_MAX) && defined(__GNUC__) && defined (__LONG_LONG_MAX__) -# define LLONG_MAX __LONG_LONG_MAX__ -# define LLONG_MIN (-LLONG_MAX - 1LL) -# define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL) -# endif +#if !defined(LLONG_MAX) && defined(__GNUC__) && defined(__LONG_LONG_MAX__) +#define LLONG_MAX __LONG_LONG_MAX__ +#define LLONG_MIN (-LLONG_MAX - 1LL) +#define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL) +#endif #endif - -SWIGINTERN int -SWIG_AsVal_double (PyObject *obj, double *val) -{ +SWIGINTERN int SWIG_AsVal_double(PyObject *obj, double *val) { int res = SWIG_TypeError; if (PyFloat_Check(obj)) { - if (val) *val = PyFloat_AsDouble(obj); + if (val) + *val = PyFloat_AsDouble(obj); return SWIG_OK; #if PY_VERSION_HEX < 0x03000000 } else if (PyInt_Check(obj)) { - if (val) *val = PyInt_AsLong(obj); + if (val) + *val = PyInt_AsLong(obj); return SWIG_OK; #endif } else if (PyLong_Check(obj)) { double v = PyLong_AsDouble(obj); if (!PyErr_Occurred()) { - if (val) *val = v; + if (val) + *val = v; return SWIG_OK; } else { PyErr_Clear(); @@ -3552,7 +3573,8 @@ SWIG_AsVal_double (PyObject *obj, double *val) int dispatch = 0; double d = PyFloat_AsDouble(obj); if (!PyErr_Occurred()) { - if (val) *val = d; + if (val) + *val = d; return SWIG_AddCast(SWIG_OK); } else { PyErr_Clear(); @@ -3560,10 +3582,11 @@ SWIG_AsVal_double (PyObject *obj, double *val) if (!dispatch) { long v = PyLong_AsLong(obj); if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_AddCast(SWIG_AddCast(SWIG_OK)); + if (val) + *val = v; + return SWIG_AddCast(SWIG_AddCast(SWIG_OK)); } else { - PyErr_Clear(); + PyErr_Clear(); } } } @@ -3571,56 +3594,51 @@ SWIG_AsVal_double (PyObject 
*obj, double *val) return res; } - #include <float.h> - #include <math.h> - -SWIGINTERNINLINE int -SWIG_CanCastAsInteger(double *d, double min, double max) { +SWIGINTERNINLINE int SWIG_CanCastAsInteger(double *d, double min, double max) { double x = *d; if ((min <= x && x <= max)) { - double fx = floor(x); - double cx = ceil(x); - double rd = ((x - fx) < 0.5) ? fx : cx; /* simple rint */ - if ((errno == EDOM) || (errno == ERANGE)) { - errno = 0; - } else { - double summ, reps, diff; - if (rd < x) { - diff = x - rd; - } else if (rd > x) { - diff = rd - x; - } else { - return 1; - } - summ = rd + x; - reps = diff/summ; - if (reps < 8*DBL_EPSILON) { - *d = rd; - return 1; - } - } + double fx = floor(x); + double cx = ceil(x); + double rd = ((x - fx) < 0.5) ? fx : cx; /* simple rint */ + if ((errno == EDOM) || (errno == ERANGE)) { + errno = 0; + } else { + double summ, reps, diff; + if (rd < x) { + diff = x - rd; + } else if (rd > x) { + diff = rd - x; + } else { + return 1; + } + summ = rd + x; + reps = diff / summ; + if (reps < 8 * DBL_EPSILON) { + *d = rd; + return 1; + } + } } return 0; } - -SWIGINTERN int -SWIG_AsVal_long (PyObject *obj, long* val) -{ +SWIGINTERN int SWIG_AsVal_long(PyObject *obj, long *val) { #if PY_VERSION_HEX < 0x03000000 if (PyInt_Check(obj)) { - if (val) *val = PyInt_AsLong(obj); + if (val) + *val = PyInt_AsLong(obj); return SWIG_OK; } else #endif - if (PyLong_Check(obj)) { + if (PyLong_Check(obj)) { long v = PyLong_AsLong(obj); if (!PyErr_Occurred()) { - if (val) *val = v; + if (val) + *val = v; return SWIG_OK; } else { PyErr_Clear(); @@ -3632,17 +3650,19 @@ SWIG_AsVal_long (PyObject *obj, long* val) int dispatch = 0; long v = PyInt_AsLong(obj); if (!PyErr_Occurred()) { - if (val) *val = v; + if (val) + *val = v; return SWIG_AddCast(SWIG_OK); } else { PyErr_Clear(); } if (!dispatch) { double d; - int res = SWIG_AddCast(SWIG_AsVal_double (obj,&d)); + int res = SWIG_AddCast(SWIG_AsVal_double(obj, &d)); if (SWIG_IsOK(res) && 
SWIG_CanCastAsInteger(&d, LONG_MIN, LONG_MAX)) { - if (val) *val = (long)(d); - return res; + if (val) + *val = (long)(d); + return res; } } } @@ -3650,291 +3670,334 @@ SWIG_AsVal_long (PyObject *obj, long* val) return SWIG_TypeError; } - -SWIGINTERN int -SWIG_AsVal_int (PyObject * obj, int *val) -{ +SWIGINTERN int SWIG_AsVal_int(PyObject *obj, int *val) { long v; - int res = SWIG_AsVal_long (obj, &v); + int res = SWIG_AsVal_long(obj, &v); if (SWIG_IsOK(res)) { if ((v < INT_MIN || v > INT_MAX)) { return SWIG_OverflowError; } else { - if (val) *val = (int)(v); + if (val) + *val = (int)(v); } - } + } return res; } #ifdef __cplusplus extern "C" { #endif -SWIGINTERN PyObject *_wrap_gamut_map__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_gamut_map__SWIG_0(PyObject *SWIGUNUSEDPARM(self), + PyObject *args) { PyObject *resultobj = 0; - float *arg1 = (float *) 0 ; - int arg2 ; - int arg3 ; - int arg4 ; - float *arg5 = (float *) 0 ; - int arg6 ; - int arg7 ; - int arg8 ; - float *arg9 = (float *) 0 ; - int arg10 ; - int arg11 ; - float *arg12 = (float *) 0 ; - int arg13 ; - int arg14 ; - float *arg15 = (float *) 0 ; - int arg16 ; - int arg17 ; - PyArrayObject *array1 = NULL ; - int is_new_object1 = 0 ; - PyArrayObject *array5 = NULL ; - PyArrayObject *array9 = NULL ; - int is_new_object9 = 0 ; - PyArrayObject *array12 = NULL ; - int is_new_object12 = 0 ; - PyArrayObject *array15 = NULL ; - int is_new_object15 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOOO:gamut_map",&obj0,&obj1,&obj2,&obj3,&obj4)) SWIG_fail; + float *arg1 = (float *)0; + int arg2; + int arg3; + int arg4; + float *arg5 = (float *)0; + int arg6; + int arg7; + int arg8; + float *arg9 = (float *)0; + int arg10; + int arg11; + float *arg12 = (float *)0; + int arg13; + int arg14; + float *arg15 = (float *)0; + int arg16; + int arg17; + PyArrayObject 
*array1 = NULL; + int is_new_object1 = 0; + PyArrayObject *array5 = NULL; + PyArrayObject *array9 = NULL; + int is_new_object9 = 0; + PyArrayObject *array12 = NULL; + int is_new_object12 = 0; + PyArrayObject *array15 = NULL; + int is_new_object15 = 0; + PyObject *obj0 = 0; + PyObject *obj1 = 0; + PyObject *obj2 = 0; + PyObject *obj3 = 0; + PyObject *obj4 = 0; + + if (!PyArg_ParseTuple(args, (char *)"OOOOO:gamut_map", &obj0, &obj1, &obj2, + &obj3, &obj4)) + SWIG_fail; { - npy_intp size[3] = { - -1, -1, -1 - }; + npy_intp size[3] = {-1, -1, -1}; array1 = obj_to_array_contiguous_allow_conversion(obj0, NPY_FLOAT, - &is_new_object1); + &is_new_object1); if (!array1 || !require_dimensions(array1, 3) || - !require_size(array1, size, 3)) SWIG_fail; - arg1 = (float*) array_data(array1); - arg2 = (int) array_size(array1,0); - arg3 = (int) array_size(array1,1); - arg4 = (int) array_size(array1,2); + !require_size(array1, size, 3)) + SWIG_fail; + arg1 = (float *)array_data(array1); + arg2 = (int)array_size(array1, 0); + arg3 = (int)array_size(array1, 1); + arg4 = (int)array_size(array1, 2); } { array5 = obj_to_array_no_conversion(obj1, NPY_FLOAT); - if (!array5 || !require_dimensions(array5,3) || !require_contiguous(array5) || - !require_native(array5)) SWIG_fail; - arg5 = (float*) array_data(array5); - arg6 = (int) array_size(array5,0); - arg7 = (int) array_size(array5,1); - arg8 = (int) array_size(array5,2); + if (!array5 || !require_dimensions(array5, 3) || + !require_contiguous(array5) || !require_native(array5)) + SWIG_fail; + arg5 = (float *)array_data(array5); + arg6 = (int)array_size(array5, 0); + arg7 = (int)array_size(array5, 1); + arg8 = (int)array_size(array5, 2); } { - npy_intp size[2] = { - -1, -1 - }; + npy_intp size[2] = {-1, -1}; array9 = obj_to_array_contiguous_allow_conversion(obj2, NPY_FLOAT, - &is_new_object9); + &is_new_object9); if (!array9 || !require_dimensions(array9, 2) || - !require_size(array9, size, 2)) SWIG_fail; - arg9 = (float*) 
array_data(array9); - arg10 = (int) array_size(array9,0); - arg11 = (int) array_size(array9,1); + !require_size(array9, size, 2)) + SWIG_fail; + arg9 = (float *)array_data(array9); + arg10 = (int)array_size(array9, 0); + arg11 = (int)array_size(array9, 1); } { - npy_intp size[2] = { - -1, -1 - }; + npy_intp size[2] = {-1, -1}; array12 = obj_to_array_contiguous_allow_conversion(obj3, NPY_FLOAT, - &is_new_object12); + &is_new_object12); if (!array12 || !require_dimensions(array12, 2) || - !require_size(array12, size, 2)) SWIG_fail; - arg12 = (float*) array_data(array12); - arg13 = (int) array_size(array12,0); - arg14 = (int) array_size(array12,1); + !require_size(array12, size, 2)) + SWIG_fail; + arg12 = (float *)array_data(array12); + arg13 = (int)array_size(array12, 0); + arg14 = (int)array_size(array12, 1); } { - npy_intp size[2] = { - -1, -1 - }; + npy_intp size[2] = {-1, -1}; array15 = obj_to_array_contiguous_allow_conversion(obj4, NPY_FLOAT, - &is_new_object15); + &is_new_object15); if (!array15 || !require_dimensions(array15, 2) || - !require_size(array15, size, 2)) SWIG_fail; - arg15 = (float*) array_data(array15); - arg16 = (int) array_size(array15,0); - arg17 = (int) array_size(array15,1); + !require_size(array15, size, 2)) + SWIG_fail; + arg15 = (float *)array_data(array15); + arg16 = (int)array_size(array15, 0); + arg17 = (int)array_size(array15, 1); } { - gamut_map_full(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17); - if (PyErr_Occurred()) SWIG_fail; + gamut_map_full(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, + arg11, arg12, arg13, arg14, arg15, arg16, arg17); + if (PyErr_Occurred()) + SWIG_fail; } resultobj = SWIG_Py_Void(); { - if (is_new_object1 && array1) - { - Py_DECREF(array1); + if (is_new_object1 && array1) { + Py_DECREF(array1); } } { - if (is_new_object9 && array9) - { - Py_DECREF(array9); + if (is_new_object9 && array9) { + Py_DECREF(array9); } } { - if (is_new_object12 && 
array12) - { - Py_DECREF(array12); + if (is_new_object12 && array12) { + Py_DECREF(array12); } } { - if (is_new_object15 && array15) - { - Py_DECREF(array15); + if (is_new_object15 && array15) { + Py_DECREF(array15); } } return resultobj; -fail: - { - if (is_new_object1 && array1) - { - Py_DECREF(array1); - } +fail : { + if (is_new_object1 && array1) { + Py_DECREF(array1); } +} { - if (is_new_object9 && array9) - { - Py_DECREF(array9); + if (is_new_object9 && array9) { + Py_DECREF(array9); } } { - if (is_new_object12 && array12) - { - Py_DECREF(array12); + if (is_new_object12 && array12) { + Py_DECREF(array12); } } { - if (is_new_object15 && array15) - { - Py_DECREF(array15); + if (is_new_object15 && array15) { + Py_DECREF(array15); } } return NULL; } - -SWIGINTERN PyObject *_wrap_gamut_map__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_gamut_map__SWIG_1(PyObject *SWIGUNUSEDPARM(self), + PyObject *args) { PyObject *resultobj = 0; - float *arg1 = (float *) 0 ; - int arg2 ; - int arg3 ; - int arg4 ; - float *arg5 = (float *) 0 ; - float *arg6 = (float *) 0 ; - float *arg7 = (float *) 0 ; - float *arg8 = (float *) 0 ; - int arg9 ; - void *argp1 = 0 ; - int res1 = 0 ; - int val2 ; - int ecode2 = 0 ; - int val3 ; - int ecode3 = 0 ; - int val4 ; - int ecode4 = 0 ; - void *argp5 = 0 ; - int res5 = 0 ; - void *argp6 = 0 ; - int res6 = 0 ; - void *argp7 = 0 ; - int res7 = 0 ; - void *argp8 = 0 ; - int res8 = 0 ; - int val9 ; - int ecode9 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - PyObject * obj7 = 0 ; - PyObject * obj8 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOOOO:gamut_map",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6,&obj7,&obj8)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_float, 0 | 0 ); + float *arg1 = (float *)0; + int arg2; + int arg3; + int arg4; + float *arg5 = (float *)0; 
+ float *arg6 = (float *)0; + float *arg7 = (float *)0; + float *arg8 = (float *)0; + int arg9; + void *argp1 = 0; + int res1 = 0; + int val2; + int ecode2 = 0; + int val3; + int ecode3 = 0; + int val4; + int ecode4 = 0; + void *argp5 = 0; + int res5 = 0; + void *argp6 = 0; + int res6 = 0; + void *argp7 = 0; + int res7 = 0; + void *argp8 = 0; + int res8 = 0; + int val9; + int ecode9 = 0; + PyObject *obj0 = 0; + PyObject *obj1 = 0; + PyObject *obj2 = 0; + PyObject *obj3 = 0; + PyObject *obj4 = 0; + PyObject *obj5 = 0; + PyObject *obj6 = 0; + PyObject *obj7 = 0; + PyObject *obj8 = 0; + + if (!PyArg_ParseTuple(args, (char *)"OOOOOOOOO:gamut_map", &obj0, &obj1, + &obj2, &obj3, &obj4, &obj5, &obj6, &obj7, &obj8)) + SWIG_fail; + res1 = SWIG_ConvertPtr(obj0, &argp1, SWIGTYPE_p_float, 0 | 0); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "gamut_map" "', argument " "1"" of type '" "float *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" + "gamut_map" + "', argument " + "1" + " of type '" + "float *" + "'"); } arg1 = (float *)(argp1); ecode2 = SWIG_AsVal_int(obj1, &val2); if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "gamut_map" "', argument " "2"" of type '" "int""'"); - } + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" + "gamut_map" + "', argument " + "2" + " of type '" + "int" + "'"); + } arg2 = (int)(val2); ecode3 = SWIG_AsVal_int(obj2, &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "gamut_map" "', argument " "3"" of type '" "int""'"); - } + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" + "gamut_map" + "', argument " + "3" + " of type '" + "int" + "'"); + } arg3 = (int)(val3); ecode4 = SWIG_AsVal_int(obj3, &val4); if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "gamut_map" "', argument " "4"" of type '" "int""'"); - } + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" + 
"gamut_map" + "', argument " + "4" + " of type '" + "int" + "'"); + } arg4 = (int)(val4); - res5 = SWIG_ConvertPtr(obj4, &argp5,SWIGTYPE_p_float, 0 | 0 ); + res5 = SWIG_ConvertPtr(obj4, &argp5, SWIGTYPE_p_float, 0 | 0); if (!SWIG_IsOK(res5)) { - SWIG_exception_fail(SWIG_ArgError(res5), "in method '" "gamut_map" "', argument " "5"" of type '" "float *""'"); + SWIG_exception_fail(SWIG_ArgError(res5), "in method '" + "gamut_map" + "', argument " + "5" + " of type '" + "float *" + "'"); } arg5 = (float *)(argp5); - res6 = SWIG_ConvertPtr(obj5, &argp6,SWIGTYPE_p_float, 0 | 0 ); + res6 = SWIG_ConvertPtr(obj5, &argp6, SWIGTYPE_p_float, 0 | 0); if (!SWIG_IsOK(res6)) { - SWIG_exception_fail(SWIG_ArgError(res6), "in method '" "gamut_map" "', argument " "6"" of type '" "float *""'"); + SWIG_exception_fail(SWIG_ArgError(res6), "in method '" + "gamut_map" + "', argument " + "6" + " of type '" + "float *" + "'"); } arg6 = (float *)(argp6); - res7 = SWIG_ConvertPtr(obj6, &argp7,SWIGTYPE_p_float, 0 | 0 ); + res7 = SWIG_ConvertPtr(obj6, &argp7, SWIGTYPE_p_float, 0 | 0); if (!SWIG_IsOK(res7)) { - SWIG_exception_fail(SWIG_ArgError(res7), "in method '" "gamut_map" "', argument " "7"" of type '" "float *""'"); + SWIG_exception_fail(SWIG_ArgError(res7), "in method '" + "gamut_map" + "', argument " + "7" + " of type '" + "float *" + "'"); } arg7 = (float *)(argp7); - res8 = SWIG_ConvertPtr(obj7, &argp8,SWIGTYPE_p_float, 0 | 0 ); + res8 = SWIG_ConvertPtr(obj7, &argp8, SWIGTYPE_p_float, 0 | 0); if (!SWIG_IsOK(res8)) { - SWIG_exception_fail(SWIG_ArgError(res8), "in method '" "gamut_map" "', argument " "8"" of type '" "float *""'"); + SWIG_exception_fail(SWIG_ArgError(res8), "in method '" + "gamut_map" + "', argument " + "8" + " of type '" + "float *" + "'"); } arg8 = (float *)(argp8); ecode9 = SWIG_AsVal_int(obj8, &val9); if (!SWIG_IsOK(ecode9)) { - SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "gamut_map" "', argument " "9"" of type '" "int""'"); - } + 
SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" + "gamut_map" + "', argument " + "9" + " of type '" + "int" + "'"); + } arg9 = (int)(val9); - gamut_map(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); + gamut_map(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); resultobj = SWIG_Py_Void(); return resultobj; fail: return NULL; } - SWIGINTERN PyObject *_wrap_gamut_map(PyObject *self, PyObject *args) { Py_ssize_t argc; - PyObject *argv[10] = { - 0 - }; + PyObject *argv[10] = {0}; Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; + + if (!PyTuple_Check(args)) + SWIG_fail; argc = args ? PyObject_Length(args) : 0; for (ii = 0; (ii < 9) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); + argv[ii] = PyTuple_GET_ITEM(args, ii); } if (argc == 5) { int _v; - { - _v = is_array(argv[0]) || PySequence_Check(argv[0]); - } + { _v = is_array(argv[0]) || PySequence_Check(argv[0]); } if (_v) { { - _v = is_array(argv[1]) && PyArray_EquivTypenums(array_type(argv[1]), - NPY_FLOAT); + _v = is_array(argv[1]) && + PyArray_EquivTypenums(array_type(argv[1]), NPY_FLOAT); } if (_v) { { @@ -3996,7 +4059,8 @@ SWIGINTERN PyObject *_wrap_gamut_map(PyObject *self, PyObject *args) { _v = SWIG_CheckState(res); if (_v) { void *vptr = 0; - int res = SWIG_ConvertPtr(argv[7], &vptr, SWIGTYPE_p_float, 0); + int res = + SWIG_ConvertPtr(argv[7], &vptr, SWIGTYPE_p_float, 0); _v = SWIG_CheckState(res); if (_v) { { @@ -4015,46 +4079,50 @@ SWIGINTERN PyObject *_wrap_gamut_map(PyObject *self, PyObject *args) { } } } - + fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'gamut_map'.\n" - " Possible C/C++ prototypes are:\n" - " gamut_map_full(float *,int,int,int,float *,int,int,int,float *,int,int,float *,int,int,float *,int,int)\n" - " gamut_map(float *,int,int,int,float *,float *,float *,float *,int)\n"); + SWIG_SetErrorMsg( + PyExc_NotImplementedError, + "Wrong number or type of arguments for overloaded function 
'gamut_map'.\n" + " Possible C/C++ prototypes are:\n" + " gamut_map_full(float *,int,int,int,float *,int,int,int,float " + "*,int,int,float *,int,int,float *,int,int)\n" + " gamut_map(float *,int,int,int,float *,float *,float *,float " + "*,int)\n"); return 0; } - static PyMethodDef SwigMethods[] = { - { (char *)"SWIG_PyInstanceMethod_New", (PyCFunction)SWIG_PyInstanceMethod_New, METH_O, NULL}, - { (char *)"gamut_map", _wrap_gamut_map, METH_VARARGS, NULL}, - { NULL, NULL, 0, NULL } -}; - + {(char *)"SWIG_PyInstanceMethod_New", + (PyCFunction)SWIG_PyInstanceMethod_New, METH_O, NULL}, + {(char *)"gamut_map", _wrap_gamut_map, METH_VARARGS, NULL}, + {NULL, NULL, 0, NULL}}; /* -------- TYPE CONVERSION AND EQUIVALENCE RULES (BEGIN) -------- */ -static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_float = {"_p_float", "float *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, + 0, (void *)0, 0}; +static swig_type_info _swigt__p_float = {"_p_float", "float *", 0, + 0, (void *)0, 0}; static swig_type_info *swig_type_initial[] = { - &_swigt__p_char, - &_swigt__p_float, + &_swigt__p_char, + &_swigt__p_float, }; -static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_float[] = { {&_swigt__p_float, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_char[] = {{&_swigt__p_char, 0, 0, 0}, + {0, 0, 0, 0}}; +static swig_cast_info _swigc__p_float[] = {{&_swigt__p_float, 0, 0, 0}, + {0, 0, 0, 0}}; static swig_cast_info *swig_cast_initial[] = { - _swigc__p_char, - _swigc__p_float, + _swigc__p_char, + _swigc__p_float, }; - /* -------- TYPE CONVERSION AND EQUIVALENCE RULES (END) -------- */ -static swig_const_info swig_const_table[] = { -{0, 0, 0, 0.0, 0, 0}}; +static swig_const_info swig_const_table[] = {{0, 0, 0, 0.0, 0, 0}}; #ifdef __cplusplus } @@ -4098,7 +4166,8 @@ static swig_const_info swig_const_table[] = { * 3) 
Finally, if cast->type has not already been loaded, then we add that * swig_cast_info to the linked list (because the cast->type) pointer will * be correct. - * ----------------------------------------------------------------------------- */ + * ----------------------------------------------------------------------------- + */ #ifdef __cplusplus extern "C" { @@ -4111,15 +4180,13 @@ extern "C" { #define SWIGRUNTIME_DEBUG #endif - -SWIGRUNTIME void -SWIG_InitializeModule(void *clientdata) { +SWIGRUNTIME void SWIG_InitializeModule(void *clientdata) { size_t i; swig_module_info *module_head, *iter; int init; - + /* check to see if the circular list has been setup, if not, set it up */ - if (swig_module.next==0) { + if (swig_module.next == 0) { /* Initialize the swig_module */ swig_module.type_initial = swig_type_initial; swig_module.cast_initial = swig_cast_initial; @@ -4128,7 +4195,7 @@ SWIG_InitializeModule(void *clientdata) { } else { init = 0; } - + /* Try and load any already created modules */ module_head = SWIG_GetModule(clientdata); if (!module_head) { @@ -4137,27 +4204,28 @@ SWIG_InitializeModule(void *clientdata) { SWIG_SetModule(clientdata, &swig_module); } else { /* the interpreter has loaded a SWIG module, but has it loaded this one? */ - iter=module_head; + iter = module_head; do { - if (iter==&swig_module) { + if (iter == &swig_module) { /* Our module is already in the list, so there's nothing more to do. */ return; } - iter=iter->next; - } while (iter!= module_head); - + iter = iter->next; + } while (iter != module_head); + /* otherwise we must add our module into the list */ swig_module.next = module_head->next; module_head->next = &swig_module; } - - /* When multiple interpreters are used, a module could have already been initialized in - a different interpreter, but not yet have a pointer in this interpreter. - In this case, we do not want to continue adding types... 
everything should be - set up already */ - if (init == 0) return; - - /* Now work on filling in swig_module.types */ + + /* When multiple interpreters are used, a module could have already been + initialized in a different interpreter, but not yet have a pointer in this + interpreter. In this case, we do not want to continue adding types... + everything should be set up already */ + if (init == 0) + return; + + /* Now work on filling in swig_module.types */ #ifdef SWIGRUNTIME_DEBUG printf("SWIG_InitializeModule: size %d\n", swig_module.size); #endif @@ -4165,14 +4233,16 @@ SWIG_InitializeModule(void *clientdata) { swig_type_info *type = 0; swig_type_info *ret; swig_cast_info *cast; - + #ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name); + printf("SWIG_InitializeModule: type %d %s\n", i, + swig_module.type_initial[i]->name); #endif - + /* if there is another module already loaded */ if (swig_module.next != &swig_module) { - type = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, swig_module.type_initial[i]->name); + type = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, + swig_module.type_initial[i]->name); } if (type) { /* Overwrite clientdata field */ @@ -4182,13 +4252,14 @@ SWIG_InitializeModule(void *clientdata) { if (swig_module.type_initial[i]->clientdata) { type->clientdata = swig_module.type_initial[i]->clientdata; #ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: found and overwrite type %s \n", type->name); + printf("SWIG_InitializeModule: found and overwrite type %s \n", + type->name); #endif } } else { type = swig_module.type_initial[i]; } - + /* Insert casting types */ cast = swig_module.cast_initial[i]; while (cast->type) { @@ -4198,9 +4269,11 @@ SWIG_InitializeModule(void *clientdata) { printf("SWIG_InitializeModule: look cast %s\n", cast->type->name); #endif if (swig_module.next != &swig_module) { - ret = SWIG_MangledTypeQueryModule(swig_module.next, 
&swig_module, cast->type->name); + ret = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, + cast->type->name); #ifdef SWIGRUNTIME_DEBUG - if (ret) printf("SWIG_InitializeModule: found cast %s\n", ret->name); + if (ret) + printf("SWIG_InitializeModule: found cast %s\n", ret->name); #endif } if (ret) { @@ -4214,12 +4287,14 @@ SWIG_InitializeModule(void *clientdata) { /* Check for casting already in the list */ swig_cast_info *ocast = SWIG_TypeCheck(ret->name, type); #ifdef SWIGRUNTIME_DEBUG - if (ocast) printf("SWIG_InitializeModule: skip old cast %s\n", ret->name); + if (ocast) + printf("SWIG_InitializeModule: skip old cast %s\n", ret->name); #endif - if (!ocast) ret = 0; + if (!ocast) + ret = 0; } } - + if (!ret) { #ifdef SWIGRUNTIME_DEBUG printf("SWIG_InitializeModule: adding cast %s\n", cast->type->name); @@ -4236,45 +4311,46 @@ SWIG_InitializeModule(void *clientdata) { swig_module.types[i] = type; } swig_module.types[i] = 0; - + #ifdef SWIGRUNTIME_DEBUG printf("**** SWIG_InitializeModule: Cast List ******\n"); for (i = 0; i < swig_module.size; ++i) { int j = 0; swig_cast_info *cast = swig_module.cast_initial[i]; - printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name); + printf("SWIG_InitializeModule: type %d %s\n", i, + swig_module.type_initial[i]->name); while (cast->type) { printf("SWIG_InitializeModule: cast type %s\n", cast->type->name); cast++; ++j; } - printf("---- Total casts: %d\n",j); + printf("---- Total casts: %d\n", j); } printf("**** SWIG_InitializeModule: Cast List ******\n"); #endif } /* This function will propagate the clientdata field of type to -* any new swig_type_info structures that have been added into the list -* of equivalent types. It is like calling -* SWIG_TypeClientData(type, clientdata) a second time. -*/ -SWIGRUNTIME void -SWIG_PropagateClientData(void) { + * any new swig_type_info structures that have been added into the list + * of equivalent types. 
It is like calling + * SWIG_TypeClientData(type, clientdata) a second time. + */ +SWIGRUNTIME void SWIG_PropagateClientData(void) { size_t i; swig_cast_info *equiv; static int init_run = 0; - - if (init_run) return; + + if (init_run) + return; init_run = 1; - + for (i = 0; i < swig_module.size; i++) { if (swig_module.types[i]->clientdata) { equiv = swig_module.types[i]->cast; while (equiv) { if (!equiv->converter) { if (equiv->type && !equiv->type->clientdata) - SWIG_TypeClientData(equiv->type, swig_module.types[i]->clientdata); + SWIG_TypeClientData(equiv->type, swig_module.types[i]->clientdata); } equiv = equiv->next; } @@ -4290,318 +4366,336 @@ SWIG_PropagateClientData(void) { } #endif - - #ifdef __cplusplus extern "C" { #endif - - /* Python-specific SWIG API */ -#define SWIG_newvarlink() SWIG_Python_newvarlink() -#define SWIG_addvarlink(p, name, get_attr, set_attr) SWIG_Python_addvarlink(p, name, get_attr, set_attr) -#define SWIG_InstallConstants(d, constants) SWIG_Python_InstallConstants(d, constants) - - /* ----------------------------------------------------------------------------- - * global variable support code. 
- * ----------------------------------------------------------------------------- */ - - typedef struct swig_globalvar { - char *name; /* Name of global variable */ - PyObject *(*get_attr)(void); /* Return the current value */ - int (*set_attr)(PyObject *); /* Set the value */ - struct swig_globalvar *next; - } swig_globalvar; - - typedef struct swig_varlinkobject { - PyObject_HEAD - swig_globalvar *vars; - } swig_varlinkobject; - - SWIGINTERN PyObject * - swig_varlink_repr(swig_varlinkobject *SWIGUNUSEDPARM(v)) { + +/* Python-specific SWIG API */ +#define SWIG_newvarlink() SWIG_Python_newvarlink() +#define SWIG_addvarlink(p, name, get_attr, set_attr) \ + SWIG_Python_addvarlink(p, name, get_attr, set_attr) +#define SWIG_InstallConstants(d, constants) \ + SWIG_Python_InstallConstants(d, constants) + +/* ----------------------------------------------------------------------------- + * global variable support code. + * ----------------------------------------------------------------------------- + */ + +typedef struct swig_globalvar { + char *name; /* Name of global variable */ + PyObject *(*get_attr)(void); /* Return the current value */ + int (*set_attr)(PyObject *); /* Set the value */ + struct swig_globalvar *next; +} swig_globalvar; + +typedef struct swig_varlinkobject { + PyObject_HEAD swig_globalvar *vars; +} swig_varlinkobject; + +SWIGINTERN PyObject *swig_varlink_repr(swig_varlinkobject *SWIGUNUSEDPARM(v)) { #if PY_VERSION_HEX >= 0x03000000 - return PyUnicode_InternFromString("<Swig global variables>"); + return PyUnicode_InternFromString("<Swig global variables>"); #else - return PyString_FromString("<Swig global variables>"); + return PyString_FromString("<Swig global variables>"); #endif - } - - SWIGINTERN PyObject * - swig_varlink_str(swig_varlinkobject *v) { +} + +SWIGINTERN PyObject *swig_varlink_str(swig_varlinkobject *v) { #if PY_VERSION_HEX >= 0x03000000 - PyObject *str = PyUnicode_InternFromString("("); - PyObject *tail; - PyObject *joined; - 
swig_globalvar *var; - for (var = v->vars; var; var=var->next) { - tail = PyUnicode_FromString(var->name); + PyObject *str = PyUnicode_InternFromString("("); + PyObject *tail; + PyObject *joined; + swig_globalvar *var; + for (var = v->vars; var; var = var->next) { + tail = PyUnicode_FromString(var->name); + joined = PyUnicode_Concat(str, tail); + Py_DecRef(str); + Py_DecRef(tail); + str = joined; + if (var->next) { + tail = PyUnicode_InternFromString(", "); joined = PyUnicode_Concat(str, tail); Py_DecRef(str); Py_DecRef(tail); str = joined; - if (var->next) { - tail = PyUnicode_InternFromString(", "); - joined = PyUnicode_Concat(str, tail); - Py_DecRef(str); - Py_DecRef(tail); - str = joined; - } } - tail = PyUnicode_InternFromString(")"); - joined = PyUnicode_Concat(str, tail); - Py_DecRef(str); - Py_DecRef(tail); - str = joined; + } + tail = PyUnicode_InternFromString(")"); + joined = PyUnicode_Concat(str, tail); + Py_DecRef(str); + Py_DecRef(tail); + str = joined; #else - PyObject *str = PyString_FromString("("); - swig_globalvar *var; - for (var = v->vars; var; var=var->next) { - PyString_ConcatAndDel(&str,PyString_FromString(var->name)); - if (var->next) PyString_ConcatAndDel(&str,PyString_FromString(", ")); - } - PyString_ConcatAndDel(&str,PyString_FromString(")")); -#endif - return str; + PyObject *str = PyString_FromString("("); + swig_globalvar *var; + for (var = v->vars; var; var = var->next) { + PyString_ConcatAndDel(&str, PyString_FromString(var->name)); + if (var->next) + PyString_ConcatAndDel(&str, PyString_FromString(", ")); } - - SWIGINTERN int - swig_varlink_print(swig_varlinkobject *v, FILE *fp, int SWIGUNUSEDPARM(flags)) { - char *tmp; - PyObject *str = swig_varlink_str(v); - fprintf(fp,"Swig global variables "); - fprintf(fp,"%s\n", tmp = SWIG_Python_str_AsChar(str)); - SWIG_Python_str_DelForPy3(tmp); - Py_DECREF(str); - return 0; + PyString_ConcatAndDel(&str, PyString_FromString(")")); +#endif + return str; +} + +SWIGINTERN int 
swig_varlink_print(swig_varlinkobject *v, FILE *fp, + int SWIGUNUSEDPARM(flags)) { + char *tmp; + PyObject *str = swig_varlink_str(v); + fprintf(fp, "Swig global variables "); + fprintf(fp, "%s\n", tmp = SWIG_Python_str_AsChar(str)); + SWIG_Python_str_DelForPy3(tmp); + Py_DECREF(str); + return 0; +} + +SWIGINTERN void swig_varlink_dealloc(swig_varlinkobject *v) { + swig_globalvar *var = v->vars; + while (var) { + swig_globalvar *n = var->next; + free(var->name); + free(var); + var = n; } - - SWIGINTERN void - swig_varlink_dealloc(swig_varlinkobject *v) { - swig_globalvar *var = v->vars; - while (var) { - swig_globalvar *n = var->next; - free(var->name); - free(var); - var = n; +} + +SWIGINTERN PyObject *swig_varlink_getattr(swig_varlinkobject *v, char *n) { + PyObject *res = NULL; + swig_globalvar *var = v->vars; + while (var) { + if (strcmp(var->name, n) == 0) { + res = (*var->get_attr)(); + break; } + var = var->next; } - - SWIGINTERN PyObject * - swig_varlink_getattr(swig_varlinkobject *v, char *n) { - PyObject *res = NULL; - swig_globalvar *var = v->vars; - while (var) { - if (strcmp(var->name,n) == 0) { - res = (*var->get_attr)(); - break; - } - var = var->next; - } - if (res == NULL && !PyErr_Occurred()) { - PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n); - } - return res; - } - - SWIGINTERN int - swig_varlink_setattr(swig_varlinkobject *v, char *n, PyObject *p) { - int res = 1; - swig_globalvar *var = v->vars; - while (var) { - if (strcmp(var->name,n) == 0) { - res = (*var->set_attr)(p); - break; - } - var = var->next; - } - if (res == 1 && !PyErr_Occurred()) { - PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n); + if (res == NULL && !PyErr_Occurred()) { + PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n); + } + return res; +} + +SWIGINTERN int swig_varlink_setattr(swig_varlinkobject *v, char *n, + PyObject *p) { + int res = 1; + swig_globalvar *var = v->vars; + while (var) { + if 
(strcmp(var->name, n) == 0) { + res = (*var->set_attr)(p); + break; } - return res; - } - - SWIGINTERN PyTypeObject* - swig_varlink_type(void) { - static char varlink__doc__[] = "Swig var link object"; - static PyTypeObject varlink_type; - static int type_init = 0; - if (!type_init) { - const PyTypeObject tmp = { - /* PyObject header changed in Python 3 */ + var = var->next; + } + if (res == 1 && !PyErr_Occurred()) { + PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n); + } + return res; +} + +SWIGINTERN PyTypeObject *swig_varlink_type(void) { + static char varlink__doc__[] = "Swig var link object"; + static PyTypeObject varlink_type; + static int type_init = 0; + if (!type_init) { + const PyTypeObject tmp = { + /* PyObject header changed in Python 3 */ #if PY_VERSION_HEX >= 0x03000000 - PyVarObject_HEAD_INIT(NULL, 0) + PyVarObject_HEAD_INIT(NULL, 0) #else - PyObject_HEAD_INIT(NULL) - 0, /* ob_size */ -#endif - (char *)"swigvarlink", /* tp_name */ - sizeof(swig_varlinkobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor) swig_varlink_dealloc, /* tp_dealloc */ - (printfunc) swig_varlink_print, /* tp_print */ - (getattrfunc) swig_varlink_getattr, /* tp_getattr */ - (setattrfunc) swig_varlink_setattr, /* tp_setattr */ - 0, /* tp_compare */ - (reprfunc) swig_varlink_repr, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - (reprfunc) swig_varlink_str, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - 0, /* tp_flags */ - varlink__doc__, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ + PyObject_HEAD_INIT(NULL) 0, /* ob_size */ +#endif + (char *) "swigvarlink", /* tp_name */ + sizeof(swig_varlinkobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)swig_varlink_dealloc, /* tp_dealloc */ + (printfunc)swig_varlink_print, /* tp_print */ + 
(getattrfunc)swig_varlink_getattr, /* tp_getattr */ + (setattrfunc)swig_varlink_setattr, /* tp_setattr */ + 0, /* tp_compare */ + (reprfunc)swig_varlink_repr, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + (reprfunc)swig_varlink_str, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + 0, /* tp_flags */ + varlink__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ #if PY_VERSION_HEX >= 0x02020000 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* tp_iter -> tp_weaklist */ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, /* tp_iter -> tp_weaklist */ #endif #if PY_VERSION_HEX >= 0x02030000 - 0, /* tp_del */ + 0, /* tp_del */ #endif #if PY_VERSION_HEX >= 0x02060000 - 0, /* tp_version_tag */ + 0, /* tp_version_tag */ #endif #if PY_VERSION_HEX >= 0x03040000 - 0, /* tp_finalize */ + 0, /* tp_finalize */ #endif #ifdef COUNT_ALLOCS - 0, /* tp_allocs */ - 0, /* tp_frees */ - 0, /* tp_maxalloc */ + 0, /* tp_allocs */ + 0, /* tp_frees */ + 0, /* tp_maxalloc */ #if PY_VERSION_HEX >= 0x02050000 - 0, /* tp_prev */ + 0, /* tp_prev */ #endif - 0 /* tp_next */ + 0 /* tp_next */ #endif - }; - varlink_type = tmp; - type_init = 1; + }; + varlink_type = tmp; + type_init = 1; #if PY_VERSION_HEX < 0x02020000 - varlink_type.ob_type = &PyType_Type; + varlink_type.ob_type = &PyType_Type; #else - if (PyType_Ready(&varlink_type) < 0) + if (PyType_Ready(&varlink_type) < 0) return NULL; #endif + } + return &varlink_type; +} + +/* Create a variable linking object for use later */ +SWIGINTERN PyObject *SWIG_Python_newvarlink(void) { + swig_varlinkobject *result = + PyObject_NEW(swig_varlinkobject, swig_varlink_type()); + if (result) { + result->vars = 0; + } + return ((PyObject *)result); +} + +SWIGINTERN void SWIG_Python_addvarlink(PyObject *p, char *name, + PyObject 
*(*get_attr)(void), + int (*set_attr)(PyObject *p)) { + swig_varlinkobject *v = (swig_varlinkobject *)p; + swig_globalvar *gv = (swig_globalvar *)malloc(sizeof(swig_globalvar)); + if (gv) { + size_t size = strlen(name) + 1; + gv->name = (char *)malloc(size); + if (gv->name) { + strncpy(gv->name, name, size); + gv->get_attr = get_attr; + gv->set_attr = set_attr; + gv->next = v->vars; } - return &varlink_type; - } - - /* Create a variable linking object for use later */ - SWIGINTERN PyObject * - SWIG_Python_newvarlink(void) { - swig_varlinkobject *result = PyObject_NEW(swig_varlinkobject, swig_varlink_type()); - if (result) { - result->vars = 0; - } - return ((PyObject*) result); - } - - SWIGINTERN void - SWIG_Python_addvarlink(PyObject *p, char *name, PyObject *(*get_attr)(void), int (*set_attr)(PyObject *p)) { - swig_varlinkobject *v = (swig_varlinkobject *) p; - swig_globalvar *gv = (swig_globalvar *) malloc(sizeof(swig_globalvar)); - if (gv) { - size_t size = strlen(name)+1; - gv->name = (char *)malloc(size); - if (gv->name) { - strncpy(gv->name,name,size); - gv->get_attr = get_attr; - gv->set_attr = set_attr; - gv->next = v->vars; - } + } + v->vars = gv; +} + +SWIGINTERN PyObject *SWIG_globals(void) { + static PyObject *_SWIG_globals = 0; + if (!_SWIG_globals) + _SWIG_globals = SWIG_newvarlink(); + return _SWIG_globals; +} + +/* ----------------------------------------------------------------------------- + * constants/methods manipulation + * ----------------------------------------------------------------------------- + */ + +/* Install Constants */ +SWIGINTERN void SWIG_Python_InstallConstants(PyObject *d, + swig_const_info constants[]) { + PyObject *obj = 0; + size_t i; + for (i = 0; constants[i].type; ++i) { + switch (constants[i].type) { + case SWIG_PY_POINTER: + obj = SWIG_InternalNewPointerObj(constants[i].pvalue, + *(constants[i]).ptype, 0); + break; + case SWIG_PY_BINARY: + obj = SWIG_NewPackedObj(constants[i].pvalue, constants[i].lvalue, + 
*(constants[i].ptype)); + break; + default: + obj = 0; + break; } - v->vars = gv; - } - - SWIGINTERN PyObject * - SWIG_globals(void) { - static PyObject *_SWIG_globals = 0; - if (!_SWIG_globals) _SWIG_globals = SWIG_newvarlink(); - return _SWIG_globals; - } - - /* ----------------------------------------------------------------------------- - * constants/methods manipulation - * ----------------------------------------------------------------------------- */ - - /* Install Constants */ - SWIGINTERN void - SWIG_Python_InstallConstants(PyObject *d, swig_const_info constants[]) { - PyObject *obj = 0; - size_t i; - for (i = 0; constants[i].type; ++i) { - switch(constants[i].type) { - case SWIG_PY_POINTER: - obj = SWIG_InternalNewPointerObj(constants[i].pvalue, *(constants[i]).ptype,0); - break; - case SWIG_PY_BINARY: - obj = SWIG_NewPackedObj(constants[i].pvalue, constants[i].lvalue, *(constants[i].ptype)); - break; - default: - obj = 0; - break; - } - if (obj) { - PyDict_SetItemString(d, constants[i].name, obj); - Py_DECREF(obj); - } + if (obj) { + PyDict_SetItemString(d, constants[i].name, obj); + Py_DECREF(obj); } } - - /* -----------------------------------------------------------------------------*/ - /* Fix SwigMethods to carry the callback ptrs when needed */ - /* -----------------------------------------------------------------------------*/ - - SWIGINTERN void - SWIG_Python_FixMethods(PyMethodDef *methods, - swig_const_info *const_table, - swig_type_info **types, - swig_type_info **types_initial) { - size_t i; - for (i = 0; methods[i].ml_name; ++i) { - const char *c = methods[i].ml_doc; - if (!c) continue; - c = strstr(c, "swig_ptr: "); - if (c) { - int j; - swig_const_info *ci = 0; - const char *name = c + 10; - for (j = 0; const_table[j].type; ++j) { - if (strncmp(const_table[j].name, name, - strlen(const_table[j].name)) == 0) { - ci = &(const_table[j]); - break; - } +} + +/* -----------------------------------------------------------------------------*/ +/* 
Fix SwigMethods to carry the callback ptrs when needed */ +/* -----------------------------------------------------------------------------*/ + +SWIGINTERN void SWIG_Python_FixMethods(PyMethodDef *methods, + swig_const_info *const_table, + swig_type_info **types, + swig_type_info **types_initial) { + size_t i; + for (i = 0; methods[i].ml_name; ++i) { + const char *c = methods[i].ml_doc; + if (!c) + continue; + c = strstr(c, "swig_ptr: "); + if (c) { + int j; + swig_const_info *ci = 0; + const char *name = c + 10; + for (j = 0; const_table[j].type; ++j) { + if (strncmp(const_table[j].name, name, strlen(const_table[j].name)) == + 0) { + ci = &(const_table[j]); + break; } - if (ci) { - void *ptr = (ci->type == SWIG_PY_POINTER) ? ci->pvalue : 0; - if (ptr) { - size_t shift = (ci->ptype) - types; - swig_type_info *ty = types_initial[shift]; - size_t ldoc = (c - methods[i].ml_doc); - size_t lptr = strlen(ty->name)+2*sizeof(void*)+2; - char *ndoc = (char*)malloc(ldoc + lptr + 10); - if (ndoc) { - char *buff = ndoc; - strncpy(buff, methods[i].ml_doc, ldoc); - buff += ldoc; - strncpy(buff, "swig_ptr: ", 10); - buff += 10; - SWIG_PackVoidPtr(buff, ptr, ty->name, lptr); - methods[i].ml_doc = ndoc; - } + } + if (ci) { + void *ptr = (ci->type == SWIG_PY_POINTER) ? 
ci->pvalue : 0; + if (ptr) { + size_t shift = (ci->ptype) - types; + swig_type_info *ty = types_initial[shift]; + size_t ldoc = (c - methods[i].ml_doc); + size_t lptr = strlen(ty->name) + 2 * sizeof(void *) + 2; + char *ndoc = (char *)malloc(ldoc + lptr + 10); + if (ndoc) { + char *buff = ndoc; + strncpy(buff, methods[i].ml_doc, ldoc); + buff += ldoc; + strncpy(buff, "swig_ptr: ", 10); + buff += 10; + SWIG_PackVoidPtr(buff, ptr, ty->name, lptr); + methods[i].ml_doc = ndoc; } } } } - } - + } +} + #ifdef __cplusplus } #endif @@ -4614,27 +4708,26 @@ extern "C" { extern "C" #endif -SWIGEXPORT + SWIGEXPORT #if PY_VERSION_HEX >= 0x03000000 -PyObject* + PyObject * #else void #endif -SWIG_init(void) { + SWIG_init(void) { PyObject *m, *d, *md; #if PY_VERSION_HEX >= 0x03000000 static struct PyModuleDef SWIG_module = { -# if PY_VERSION_HEX >= 0x03020000 +#if PY_VERSION_HEX >= 0x03020000 PyModuleDef_HEAD_INIT, -# else +#else { - PyObject_HEAD_INIT(NULL) - NULL, /* m_init */ - 0, /* m_index */ - NULL, /* m_copy */ + PyObject_HEAD_INIT(NULL) NULL, /* m_init */ + 0, /* m_index */ + NULL, /* m_copy */ }, -# endif - (char *) SWIG_name, +#endif + (char *)SWIG_name, NULL, -1, SwigMethods, @@ -4644,21 +4737,16 @@ SWIG_init(void) { NULL }; #endif - + #if defined(SWIGPYTHON_BUILTIN) - static SwigPyClientData SwigPyObject_clientdata = { - 0, 0, 0, 0, 0, 0, 0 - }; + static SwigPyClientData SwigPyObject_clientdata = {0, 0, 0, 0, 0, 0, 0}; static PyGetSetDef this_getset_def = { - (char *)"this", &SwigPyBuiltin_ThisClosure, NULL, NULL, NULL - }; - static SwigPyGetSet thisown_getset_closure = { - (PyCFunction) SwigPyObject_own, - (PyCFunction) SwigPyObject_own - }; + (char *)"this", &SwigPyBuiltin_ThisClosure, NULL, NULL, NULL}; + static SwigPyGetSet thisown_getset_closure = {(PyCFunction)SwigPyObject_own, + (PyCFunction)SwigPyObject_own}; static PyGetSetDef thisown_getset_def = { - (char *)"thisown", SwigPyBuiltin_GetterClosure, SwigPyBuiltin_SetterClosure, NULL, &thisown_getset_closure - }; 
+ (char *)"thisown", SwigPyBuiltin_GetterClosure, + SwigPyBuiltin_SetterClosure, NULL, &thisown_getset_closure}; PyObject *metatype_args; PyTypeObject *builtin_pytype; int builtin_base_count; @@ -4672,83 +4760,85 @@ SWIG_init(void) { PyObject *thisown_descr; PyObject *self = 0; int i; - + (void)builtin_pytype; (void)builtin_base_count; (void)builtin_basetype; (void)tuple; (void)static_getset; (void)self; - + /* metatype is used to implement static member variables. */ metatype_args = Py_BuildValue("(s(O){})", "SwigPyObjectType", &PyType_Type); assert(metatype_args); - metatype = (PyTypeObject *) PyType_Type.tp_call((PyObject *) &PyType_Type, metatype_args, NULL); + metatype = (PyTypeObject *)PyType_Type.tp_call((PyObject *)&PyType_Type, + metatype_args, NULL); assert(metatype); Py_DECREF(metatype_args); - metatype->tp_setattro = (setattrofunc) &SwigPyObjectType_setattro; + metatype->tp_setattro = (setattrofunc)&SwigPyObjectType_setattro; assert(PyType_Ready(metatype) >= 0); #endif - + /* Fix SwigMethods to carry the callback ptrs when needed */ - SWIG_Python_FixMethods(SwigMethods, swig_const_table, swig_types, swig_type_initial); - + SWIG_Python_FixMethods(SwigMethods, swig_const_table, swig_types, + swig_type_initial); + #if PY_VERSION_HEX >= 0x03000000 m = PyModule_Create(&SWIG_module); #else - m = Py_InitModule((char *) SWIG_name, SwigMethods); + m = Py_InitModule((char *)SWIG_name, SwigMethods); #endif - + md = d = PyModule_GetDict(m); (void)md; - + SWIG_InitializeModule(0); - + #ifdef SWIGPYTHON_BUILTIN SwigPyObject_stype = SWIG_MangledTypeQuery("_p_SwigPyObject"); assert(SwigPyObject_stype); - cd = (SwigPyClientData*) SwigPyObject_stype->clientdata; + cd = (SwigPyClientData *)SwigPyObject_stype->clientdata; if (!cd) { SwigPyObject_stype->clientdata = &SwigPyObject_clientdata; SwigPyObject_clientdata.pytype = SwigPyObject_TypeOnce(); - } else if (SwigPyObject_TypeOnce()->tp_basicsize != cd->pytype->tp_basicsize) { - PyErr_SetString(PyExc_RuntimeError, "Import 
error: attempted to load two incompatible swig-generated modules."); -# if PY_VERSION_HEX >= 0x03000000 + } else if (SwigPyObject_TypeOnce()->tp_basicsize != + cd->pytype->tp_basicsize) { + PyErr_SetString(PyExc_RuntimeError, "Import error: attempted to load two " + "incompatible swig-generated modules."); +#if PY_VERSION_HEX >= 0x03000000 return NULL; -# else +#else return; -# endif +#endif } - + /* All objects have a 'this' attribute */ this_descr = PyDescr_NewGetSet(SwigPyObject_type(), &this_getset_def); (void)this_descr; - + /* All objects have a 'thisown' attribute */ thisown_descr = PyDescr_NewGetSet(SwigPyObject_type(), &thisown_getset_def); (void)thisown_descr; - + public_interface = PyList_New(0); public_symbol = 0; (void)public_symbol; - + PyDict_SetItemString(md, "__all__", public_interface); Py_DECREF(public_interface); for (i = 0; SwigMethods[i].ml_name != NULL; ++i) - SwigPyBuiltin_AddPublicSymbol(public_interface, SwigMethods[i].ml_name); + SwigPyBuiltin_AddPublicSymbol(public_interface, SwigMethods[i].ml_name); for (i = 0; swig_const_table[i].name != 0; ++i) - SwigPyBuiltin_AddPublicSymbol(public_interface, swig_const_table[i].name); + SwigPyBuiltin_AddPublicSymbol(public_interface, swig_const_table[i].name); #endif - - SWIG_InstallConstants(d,swig_const_table); - - + + SWIG_InstallConstants(d, swig_const_table); + import_array(); - + #if PY_VERSION_HEX >= 0x03000000 return m; #else return; #endif } - diff --git a/hpvm/test/hpvm-cava/src/cam_pipe.c b/hpvm/test/hpvm-cava/src/cam_pipe.c index 7874ff9d529afebc40d1660637e85b3a1e00f23e..cdeaf393320121706d13d423212896e2551142c8 100644 --- a/hpvm/test/hpvm-cava/src/cam_pipe.c +++ b/hpvm/test/hpvm-cava/src/cam_pipe.c @@ -1,11 +1,11 @@ +#include "cam_pipe_utility.h" +#include "dma_interface.h" +#include "load_cam_model.h" +#include "pipe_stages.h" +#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <assert.h> -#include "pipe_stages.h" -#include "load_cam_model.h" 
-#include "cam_pipe_utility.h" -#include "dma_interface.h" #ifdef DMA_MODE #include "gem5_harness.h" #endif @@ -13,7 +13,7 @@ // FIXME: Include gem5/dma_interface.cc/h separately #ifndef DMA_INTERFACE_V3 #define DMA_INTERFACE_V3 -#endif//DMA_INTERFACE_V3 +#endif // DMA_INTERFACE_V3 /////////////////////////////////////////////////////////////// // Camera Model Parameters @@ -71,7 +71,8 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size, uint8_t *acc_input, *acc_result; float *acc_input_scaled, *acc_result_scaled; float *host_TsTw, *host_ctrl_pts, *host_weights, *host_coefs, *host_tone_map; - float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map, *acc_l2_dist; + float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map, + *acc_l2_dist; strcat(cam_model_path, "cam_models/NikonD7000/"); @@ -84,20 +85,25 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size, host_coefs = get_coefs(cam_model_path, num_ctrl_pts); host_tone_map = get_tone_map(cam_model_path); - acc_input = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE); - acc_result = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE); - acc_input_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE); - acc_result_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE); - acc_TsTw = (float*) malloc_aligned(sizeof(float) * 9); - acc_ctrl_pts = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE); - acc_weights = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE); - acc_coefs = (float*) malloc_aligned(sizeof(float) * 12); - acc_tone_map = (float*) malloc_aligned(sizeof(float) * 256 * CHAN_SIZE); - acc_l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts); + acc_input = (uint8_t *)malloc_aligned(sizeof(uint8_t) * row_size * col_size * + CHAN_SIZE); + acc_result = (uint8_t *)malloc_aligned(sizeof(uint8_t) * 
row_size * col_size * + CHAN_SIZE); + acc_input_scaled = + (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE); + acc_result_scaled = + (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE); + acc_TsTw = (float *)malloc_aligned(sizeof(float) * 9); + acc_ctrl_pts = + (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE); + acc_weights = + (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE); + acc_coefs = (float *)malloc_aligned(sizeof(float) * 12); + acc_tone_map = (float *)malloc_aligned(sizeof(float) * 256 * CHAN_SIZE); + acc_l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts); // Load camera model parameters for the ISP - MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw, - sizeof(float) * 9); + MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw, sizeof(float) * 9); MAP_ARRAY_TO_ACCEL(ISP, "host_ctrl_pts", host_ctrl_pts, sizeof(float) * num_ctrl_pts * CHAN_SIZE); MAP_ARRAY_TO_ACCEL(ISP, "host_weights", host_weights, @@ -136,4 +142,3 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size, free(acc_tone_map); free(acc_l2_dist); } - diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c index f806e9ee1a2e288fabcb8ad658a47c3919fbb661..864f02d5b28f2c4738279cf66cba5f4312c2a3de 100644 --- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c +++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c @@ -1,6 +1,6 @@ +#include <assert.h> #include <stdio.h> #include <stdlib.h> -#include <assert.h> #include "cam_pipe_utility.h" //#include "pipe_stages.h" @@ -26,10 +26,11 @@ uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size) { return image; } -void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int col_size) { +void write_image_to_binary(char *file_path, uint8_t *image, int row_size, + int col_size) { FILE *fp = fopen(file_path, "w"); - int shape[3] = { row_size, col_size, CHAN_SIZE }; + int shape[3] = 
{row_size, col_size, CHAN_SIZE}; fwrite(shape, sizeof(int), 3, fp); int size = row_size * col_size * CHAN_SIZE; @@ -40,8 +41,8 @@ void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int co float *transpose_mat(float *inmat, int width, int height) { // Define vectors float *outmat; - int err = - posix_memalign((void **)&outmat, CACHELINE_SIZE, sizeof(float) * height * width); + int err = posix_memalign((void **)&outmat, CACHELINE_SIZE, + sizeof(float) * height * width); assert(err == 0 && "Failed to allocate memory!"); // Transpose the matrix @@ -71,7 +72,7 @@ void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size, uint8_t **result) { if (*result == NULL) { *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE * - sizeof(uint8_t)); + sizeof(uint8_t)); } ARRAY_3D(uint8_t, _input, input, row_size, col_size); ARRAY_3D(uint8_t, _result, *result, col_size, CHAN_SIZE); diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h index b4fb6cde0c438b23c2b596cf0418953aaedca501..b61b7cc9b52aa59522f93661895fca960b947f17 100644 --- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h +++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h @@ -1,8 +1,8 @@ #ifndef _CAM_PIPE_UTILITY_H_ #define _CAM_PIPE_UTILITY_H_ -#include "utility.h" #include "pipe_stages.h" +#include "utility.h" uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size); void write_image_to_binary(char *file_path, uint8_t *image, int row_size, diff --git a/hpvm/test/hpvm-cava/src/defs.h b/hpvm/test/hpvm-cava/src/defs.h index ccc8acc857c36fd13115670932a38dc3a406dc29..0fa95ef3d2ea55c67a921e0bc5fc8a6ec6ba949f 100644 --- a/hpvm/test/hpvm-cava/src/defs.h +++ b/hpvm/test/hpvm-cava/src/defs.h @@ -10,46 +10,46 @@ typedef unsigned long uint64_t; // Debugging message macros. #if DEBUG_LEVEL >= 1 - #define INFO_MSG(args...) printf(args) - - #if DEBUG_LEVEL >= 2 - #define PRINT_MSG(args...) 
printf(args) - #define PRINT_DEBUG(hid, rows, cols, num_cols) \ - print_debug(hid, rows, cols, num_cols) - #define PRINT_DEBUG4D(hid, rows, cols, height) \ - print_debug4d(hid, rows, cols, height) - #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) \ - print_debug4d_fp16(hid, num, height, rows, cols) - - #if DEBUG_LEVEL >= 3 - #define PRINT_DEBUG_V(hid, rows, cols, num_cols) \ - print_debug(hid, rows, cols, num_cols) - #define PRINT_DEBUG4D_V(hid, rows, cols, height) \ - print_debug4d(hid, rows, cols, height) - #define PRINT_MSG_V(args...) printf(args) - #else - #define PRINT_DEBUG_V(hid, rows, cols, num_cols) - #define PRINT_DEBUG4D_V(hid, rows, cols, height) - #define PRINT_MSG_V(args...) - #endif - #else - #define PRINT_MSG(args...) - #define PRINT_DEBUG(hid, rows, cols, num_cols) - #define PRINT_DEBUG4D(hid, rows, cols, height) - #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) - #define PRINT_DEBUG_V(hid, rows, cols, height) - #define PRINT_DEBUG4D_V(hid, rows, cols, height) - #define PRINT_MSG_V(args...) - #endif +#define INFO_MSG(args...) printf(args) + +#if DEBUG_LEVEL >= 2 +#define PRINT_MSG(args...) printf(args) +#define PRINT_DEBUG(hid, rows, cols, num_cols) \ + print_debug(hid, rows, cols, num_cols) +#define PRINT_DEBUG4D(hid, rows, cols, height) \ + print_debug4d(hid, rows, cols, height) +#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) \ + print_debug4d_fp16(hid, num, height, rows, cols) + +#if DEBUG_LEVEL >= 3 +#define PRINT_DEBUG_V(hid, rows, cols, num_cols) \ + print_debug(hid, rows, cols, num_cols) +#define PRINT_DEBUG4D_V(hid, rows, cols, height) \ + print_debug4d(hid, rows, cols, height) +#define PRINT_MSG_V(args...) printf(args) #else - #define INFO_MSG(args...) - #define PRINT_DEBUG(hid, rows, cols, num_cols) - #define PRINT_DEBUG4D(hid, rows, cols, height) - #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) - #define PRINT_MSG(args...) 
- #define PRINT_DEBUG_V(hid, rows, cols, height) - #define PRINT_DEBUG4D_V(hid, rows, cols, height) - #define PRINT_MSG_V(args...) +#define PRINT_DEBUG_V(hid, rows, cols, num_cols) +#define PRINT_DEBUG4D_V(hid, rows, cols, height) +#define PRINT_MSG_V(args...) +#endif +#else +#define PRINT_MSG(args...) +#define PRINT_DEBUG(hid, rows, cols, num_cols) +#define PRINT_DEBUG4D(hid, rows, cols, height) +#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) +#define PRINT_DEBUG_V(hid, rows, cols, height) +#define PRINT_DEBUG4D_V(hid, rows, cols, height) +#define PRINT_MSG_V(args...) +#endif +#else +#define INFO_MSG(args...) +#define PRINT_DEBUG(hid, rows, cols, num_cols) +#define PRINT_DEBUG4D(hid, rows, cols, height) +#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) +#define PRINT_MSG(args...) +#define PRINT_DEBUG_V(hid, rows, cols, height) +#define PRINT_DEBUG4D_V(hid, rows, cols, height) +#define PRINT_MSG_V(args...) #endif #define STRING(arg) #arg @@ -72,9 +72,9 @@ typedef unsigned long uint64_t; #define max3(e0, e1, e2) max2(max2(e0, e1), e2) #define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3)) #define max8(e0, e1, e2, e3, e4, e5, e6, e7) \ - max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7)) + max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7)) #define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8) \ - max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8) + max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8) #define min2(A, B) (((A) < (B)) ? (A) : (B)) @@ -92,7 +92,8 @@ typedef unsigned long uint64_t; // If GEM5_HARNESS is defined: // // MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize) -// ===> mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize) +// ===> mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, +// mySize) // // INVOKE_KERNEL(myReqCode, kernelFuncName, args...) 
// ===> invokeAcceleratorAndBlock(myReqCode) @@ -107,69 +108,69 @@ typedef unsigned long uint64_t; #ifdef GEM5_HARNESS #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size) \ - mapArrayToAccelerator(req_code, name, base_addr, size) + mapArrayToAccelerator(req_code, name, base_addr, size) #define INVOKE_KERNEL(req_code, kernel_ptr, args...) \ - do { \ - UNUSED(kernel_ptr); \ - invokeAcceleratorAndBlock(req_code); \ - } while (0) + do { \ + UNUSED(kernel_ptr); \ + invokeAcceleratorAndBlock(req_code); \ + } while (0) #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...) \ - do { \ - UNUSED(kernel_ptr); \ - invokeAcceleratorAndReturn2(req_code, finish_flag); \ - } while (0) + do { \ + UNUSED(kernel_ptr); \ + invokeAcceleratorAndReturn2(req_code, finish_flag); \ + } while (0) #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size) \ - do { \ - invokeAladdinTrafficGenAndBlock(start_addr, size, false, false); \ - } while (0) + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, false, false); \ + } while (0) #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size) \ - do { \ - invokeAladdinTrafficGenAndBlock(start_addr, size, true, false); \ - } while (0) + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, true, false); \ + } while (0) #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size) \ - do { \ - invokeAladdinTrafficGenAndBlock(start_addr, size, false, true); \ - } while (0) + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, false, true); \ + } while (0) #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size) \ - do { \ - invokeAladdinTrafficGenAndBlock(start_addr, size, true, true); \ - } while (0) + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, true, true); \ + } while (0) #else #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size) \ - do { \ - INFO_MSG("Mapping array %s @ %p, size %d.\n", \ - name, (void*)base_addr, (int)(size)); \ - UNUSED(req_code); \ - UNUSED(name); \ - UNUSED(base_addr); \ - 
UNUSED(size); \ - } while (0) + do { \ + INFO_MSG("Mapping array %s @ %p, size %d.\n", name, (void *)base_addr, \ + (int)(size)); \ + UNUSED(req_code); \ + UNUSED(name); \ + UNUSED(base_addr); \ + UNUSED(size); \ + } while (0) #define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args) #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...) \ - kernel_ptr(args) + kernel_ptr(args) #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size) \ - do { \ - UNUSED(start_addr); \ - UNUSED(size); \ - } while (0) + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size) \ - do { \ - UNUSED(start_addr); \ - UNUSED(size); \ - } while (0) + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size) \ - do { \ - UNUSED(start_addr); \ - UNUSED(size); \ - } while (0) + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size) \ - do { \ - UNUSED(start_addr); \ - UNUSED(size); \ - } while (0) + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) #endif @@ -177,14 +178,14 @@ typedef unsigned long uint64_t; // // This assumes that the current name of the base pointer is also the name of // the array in the top level function of the dynamic trace. THIS IS VERY -// IMPORTANT - if the argument passed to a top level function has been renamed in -// the function, then this WILL NOT WORK! +// IMPORTANT - if the argument passed to a top level function has been renamed +// in the function, then this WILL NOT WORK! 
// // MAP_ARRAY(myReqCode, myArray, mySize) // ===> MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize) #define MAP_ARRAY(req_code, name_and_base_addr, size) \ - MAP_ARRAY_TO_ACCEL( \ - req_code, STRING(name_and_base_addr), name_and_base_addr, size) + MAP_ARRAY_TO_ACCEL(req_code, STRING(name_and_base_addr), name_and_base_addr, \ + size) // Use these convenience macros to cast a raw pointer into a multidimensional // variable-length array, which lets us use [] notation inside of the ugly @@ -202,23 +203,24 @@ typedef unsigned long uint64_t; // // And so on... #define ARRAY_1D(TYPE, output_array_name, input_array_name) \ - TYPE* output_array_name = (TYPE*)input_array_name + TYPE *output_array_name = (TYPE *)input_array_name #define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1) \ - TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name + TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name #define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2) \ - TYPE(*output_array_name)[DIM_1][DIM_2] = \ - (TYPE(*)[DIM_1][DIM_2])input_array_name - -#define ARRAY_4D( \ - TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3) \ - TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] = \ - (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name - -#define ARRAY_5D( \ - TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4) \ - TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] = \ - (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name + TYPE(*output_array_name) \ + [DIM_1][DIM_2] = (TYPE(*)[DIM_1][DIM_2])input_array_name + +#define ARRAY_4D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2, \ + DIM_3) \ + TYPE(*output_array_name) \ + [DIM_1][DIM_2][DIM_3] = (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name + +#define ARRAY_5D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2, \ + DIM_3, DIM_4) \ + TYPE(*output_array_name) \ + [DIM_1][DIM_2][DIM_3][DIM_4] = \ + 
(TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name #endif diff --git a/hpvm/test/hpvm-cava/src/dma_interface.c b/hpvm/test/hpvm-cava/src/dma_interface.c index 81bce54469886153170f994a77250a784cc9b7d7..68698635a4fceb4fe67e323bd0f354bd70bca99d 100644 --- a/hpvm/test/hpvm-cava/src/dma_interface.c +++ b/hpvm/test/hpvm-cava/src/dma_interface.c @@ -1,6 +1,6 @@ +#include "dma_interface.h" #include <assert.h> #include <string.h> -#include "dma_interface.h" // All _dmaImplN functions must be always inlined or we'll get extra functions // in the trace. @@ -10,22 +10,22 @@ // Starting with version 3, all versioning will be distinguished by the return // value of the DMA functions. -__attribute__((__always_inline__)) -int _dmaImpl3(void* dst_addr, void* src_addr, size_t size) { +__attribute__((__always_inline__)) int _dmaImpl3(void *dst_addr, void *src_addr, + size_t size) { assert(size > 0); memmove(dst_addr, src_addr, size); return 3; } -int dmaLoad(void* dst_addr, void* src_host_addr, size_t size) { +int dmaLoad(void *dst_addr, void *src_host_addr, size_t size) { return _dmaImpl3(dst_addr, src_host_addr, size); } -int dmaStore(void* dst_host_addr, void* src_addr, size_t size) { +int dmaStore(void *dst_host_addr, void *src_addr, size_t size) { return _dmaImpl3(dst_host_addr, src_addr, size); } -int setReadyBits(void* start_addr, size_t size, unsigned value) { +int setReadyBits(void *start_addr, size_t size, unsigned value) { asm(""); return 0; } @@ -35,39 +35,37 @@ int setReadyBits(void* start_addr, size_t size, unsigned value) { // With version 2 and earlier, we return (void*)NULL and use the number of // function arguments to distinguish the DMA functions. 
-__attribute__((__always_inline__)) -void* _dmaImpl2(void* base_addr, size_t src_off, size_t dst_off, size_t size) { +__attribute__((__always_inline__)) void * +_dmaImpl2(void *base_addr, size_t src_off, size_t dst_off, size_t size) { assert(size > 0); memmove(base_addr + dst_off, base_addr + src_off, size); return NULL; } -void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size) { +void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size) { return _dmaImpl2(base_addr, src_off, dst_off, size); } -void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size) { +void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size) { return _dmaImpl2(base_addr, src_off, dst_off, size); } #else -__attribute__((__always_inline__)) -void* _dmaImpl1(void* base_addr, size_t offset, size_t size) { +__attribute__((__always_inline__)) void *_dmaImpl1(void *base_addr, + size_t offset, size_t size) { assert(size > 0); asm(""); return NULL; } -void* dmaLoad(void* addr, size_t offset, size_t size) { +void *dmaLoad(void *addr, size_t offset, size_t size) { return _dmaImpl1(addr, offset, size); } -void* dmaStore(void* addr, size_t offset, size_t size) { +void *dmaStore(void *addr, size_t offset, size_t size) { return _dmaImpl1(addr, offset, size); } #endif -void dmaFence() { - asm(""); -} +void dmaFence() { asm(""); } diff --git a/hpvm/test/hpvm-cava/src/dma_interface.h b/hpvm/test/hpvm-cava/src/dma_interface.h index f23234eede4df99db84b144646530dfe240c6e62..771ece523824cff5923581aca671ab7d26fae706 100644 --- a/hpvm/test/hpvm-cava/src/dma_interface.h +++ b/hpvm/test/hpvm-cava/src/dma_interface.h @@ -10,12 +10,12 @@ // Version 3 of the DMA interface enables memcpy operations from arbitrary // source and destination addresses. 
-int dmaLoad(void* dst_addr, void* src_host_addr, size_t size); -int dmaStore(void* dst_host_addr, void* src_addr, size_t size); +int dmaLoad(void *dst_addr, void *src_host_addr, size_t size); +int dmaStore(void *dst_host_addr, void *src_addr, size_t size); // The user can explicitly toggle the state of ready bits, if ready mode is // enabled. This requires support from DMA v3. -int setReadyBits(void* start_addr, size_t size, unsigned value); +int setReadyBits(void *start_addr, size_t size, unsigned value); #elif defined(DMA_INTERFACE_V2) @@ -26,17 +26,18 @@ int setReadyBits(void* start_addr, size_t size, unsigned value); // actually copied from source to destination (the memory copy will not show up // in the trace). -void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size); -void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size); +void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size); +void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size); #else #warning "DMA interface v1 is deprecated!" -// Version 1 of the DMA interface is now deprecated and will be removed entirely. +// Version 1 of the DMA interface is now deprecated and will be removed +// entirely. 
-void* dmaLoad(void* addr, size_t offset, size_t size); -void* dmaStore(void* addr, size_t offset, size_t size); +void *dmaLoad(void *addr, size_t offset, size_t size); +void *dmaStore(void *addr, size_t offset, size_t size); #endif void dmaFence(); diff --git a/hpvm/test/hpvm-cava/src/load_cam_model.c b/hpvm/test/hpvm-cava/src/load_cam_model.c index 124fe0b7d175c2655feac562ecd6e2a5b73cc96a..3ef24cf429e31ac8f35744005d82b57ac0200611 100644 --- a/hpvm/test/hpvm-cava/src/load_cam_model.c +++ b/hpvm/test/hpvm-cava/src/load_cam_model.c @@ -1,13 +1,13 @@ +#include "load_cam_model.h" +#include "pipe_stages.h" +#include "utility.h" +#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <assert.h> -#include "utility.h" -#include "pipe_stages.h" -#include "load_cam_model.h" // Get color space transform -float* get_Ts(char* cam_model_path) { +float *get_Ts(char *cam_model_path) { float *Ts; int err = posix_memalign((void **)&Ts, CACHELINE_SIZE, sizeof(float) * 9); assert(err == 0 && "Failed to allocate memory!"); @@ -32,7 +32,7 @@ float* get_Ts(char* cam_model_path) { str = strtok(line, " \n"); int i = 0; while (str != NULL) { - line_data[i] = atof(str); + line_data[i] = atof(str); str = strtok(NULL, " \n"); i++; } @@ -50,7 +50,7 @@ float* get_Ts(char* cam_model_path) { } // Get white balance transform -float* get_Tw(char* cam_model_path, int wb_index) { +float *get_Tw(char *cam_model_path, int wb_index) { float *Tw; int err = posix_memalign((void **)&Tw, CACHELINE_SIZE, sizeof(float) * 9); assert(err == 0 && "Failed to allocate memory!"); @@ -62,7 +62,7 @@ float* get_Tw(char* cam_model_path, int wb_index) { // Calculate base for the white balance transform selected // For more details see the camera model readme - int wb_base = 8 + 5*(wb_index-1); + int wb_base = 8 + 5 * (wb_index - 1); // Open file for reading // Open file for reading @@ -81,15 +81,15 @@ float* get_Tw(char* cam_model_path, int wb_index) { str = strtok(line, " \n"); int i = 
0; while (str != NULL) { - line_data[i] = atof(str); + line_data[i] = atof(str); str = strtok(NULL, " \n"); i++; } if (line_idx == wb_base) { // Convert the white balance vector into a diagaonal matrix - for (int i=0; i<3; i++) { - for (int j=0; j<3; j++) { + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { if (i == j) { Tw[i * 3 + j] = line_data[i]; } else { @@ -105,9 +105,8 @@ float* get_Tw(char* cam_model_path, int wb_index) { return Tw; } - // Get combined transforms for checking -float* get_TsTw(char* cam_model_path, int wb_index) { +float *get_TsTw(char *cam_model_path, int wb_index) { float *TsTw; int err = posix_memalign((void **)&TsTw, CACHELINE_SIZE, sizeof(float) * 9); assert(err == 0 && "Failed to allocate memory!"); @@ -119,7 +118,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) { // Calculate base for the white balance transform selected // For more details see the camera model readme - int wb_base = 5 + 5*(wb_index-1); + int wb_base = 5 + 5 * (wb_index - 1); // Open file for reading char file_name[] = "raw2jpg_transform.txt"; @@ -137,7 +136,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) { str = strtok(line, " \n"); int i = 0; while (str != NULL) { - line_data[i] = atof(str); + line_data[i] = atof(str); str = strtok(NULL, " \n"); i++; } @@ -155,7 +154,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) { } // Get control points -float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) { +float *get_ctrl_pts(char *cam_model_path, int num_cntrl_pts) { float *ctrl_pnts; int err = posix_memalign((void **)&ctrl_pnts, CACHELINE_SIZE, sizeof(float) * num_cntrl_pts * 3); @@ -200,7 +199,7 @@ float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) { } // Get weights -float* get_weights(char* cam_model_path, int num_cntrl_pts) { +float *get_weights(char *cam_model_path, int num_cntrl_pts) { float *weights; int err = posix_memalign((void **)&weights, CACHELINE_SIZE, sizeof(float) * num_cntrl_pts * 3); @@ -245,7 +244,7 
@@ float* get_weights(char* cam_model_path, int num_cntrl_pts) { } // Get coeficients -float* get_coefs(char* cam_model_path, int num_cntrl_pts) { +float *get_coefs(char *cam_model_path, int num_cntrl_pts) { float *coefs; int err = posix_memalign((void **)&coefs, CACHELINE_SIZE, sizeof(float) * 12); assert(err == 0 && "Failed to allocate memory!"); @@ -288,9 +287,8 @@ float* get_coefs(char* cam_model_path, int num_cntrl_pts) { return coefs; } - // Get tone mapping table -float* get_tone_map(char* cam_model_path) { +float *get_tone_map(char *cam_model_path) { float *tone_map; int err = posix_memalign((void **)&tone_map, CACHELINE_SIZE, sizeof(float) * 256 * CHAN_SIZE); diff --git a/hpvm/test/hpvm-cava/src/main.c b/hpvm/test/hpvm-cava/src/main.c index c1c0130b4c2c0ec6ec7e792c72323b03a4d508a5..8e7bd197d026773b47fd0e954b56821cd151c60a 100644 --- a/hpvm/test/hpvm-cava/src/main.c +++ b/hpvm/test/hpvm-cava/src/main.c @@ -1,14 +1,14 @@ +#include "utility.h" #include <argp.h> +#include <assert.h> +#include <math.h> #include <stdio.h> #include <stdlib.h> -#include <assert.h> #include <string.h> -#include <math.h> -#include "utility.h" #include "cam_pipe_utility.h" -#include "pipe_stages.h" #include "load_cam_model.h" +#include "pipe_stages.h" #include "visc.h" @@ -20,123 +20,135 @@ int NUM_WORKER_THREADS; // Type of struct holding the return value from the last node. 
struct RetStruct { size_t bytesRet; -}; +}; // Type of struct that is used to pass arguments to the HPVM dataflow graph // using the hpvm launch operation typedef struct __attribute__((__packed__)) { - uint8_t *input; size_t bytes_input; - uint8_t *result; size_t bytes_result; - float *input_scaled; size_t bytes_input_scaled; - float *result_scaled; size_t bytes_result_scaled; - float *demosaic_out; size_t bytes_demosaic_out; - float *denoise_out; size_t bytes_denoise_out; - float *transform_out; size_t bytes_transform_out; - float *gamut_out;size_t bytes_gamut_out; - float *TsTw; size_t bytes_TsTw; - float *ctrl_pts; size_t bytes_ctrl_pts; - float *weights; size_t bytes_weights; - float*coefs; size_t bytes_coefs; - float *l2_dist; size_t bytes_l2_dist; - float *tone_map; size_t bytes_tone_map; - int row_size; int col_size; - struct RetStruct ret; // Instance of RetStruct holding the return value. -} -RootIn; + uint8_t *input; + size_t bytes_input; + uint8_t *result; + size_t bytes_result; + float *input_scaled; + size_t bytes_input_scaled; + float *result_scaled; + size_t bytes_result_scaled; + float *demosaic_out; + size_t bytes_demosaic_out; + float *denoise_out; + size_t bytes_denoise_out; + float *transform_out; + size_t bytes_transform_out; + float *gamut_out; + size_t bytes_gamut_out; + float *TsTw; + size_t bytes_TsTw; + float *ctrl_pts; + size_t bytes_ctrl_pts; + float *weights; + size_t bytes_weights; + float *coefs; + size_t bytes_coefs; + float *l2_dist; + size_t bytes_l2_dist; + float *tone_map; + size_t bytes_tone_map; + int row_size; + int col_size; + struct RetStruct ret; // Instance of RetStruct holding the return value. 
+} RootIn; typedef enum _argnum { - RAW_IMAGE_BIN, - OUTPUT_IMAGE_BIN, - NUM_REQUIRED_ARGS, - DATA_FILE = NUM_REQUIRED_ARGS, - NUM_ARGS, + RAW_IMAGE_BIN, + OUTPUT_IMAGE_BIN, + NUM_REQUIRED_ARGS, + DATA_FILE = NUM_REQUIRED_ARGS, + NUM_ARGS, } argnum; typedef struct _arguments { - char* args[NUM_ARGS]; - int num_inputs; - int num_threads; + char *args[NUM_ARGS]; + int num_inputs; + int num_threads; } arguments; static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n"; static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary"; static struct argp_option options[] = { - { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 }, - { "data-file", 'f', "F", 0, - "File to read data and weights from (if data-init-mode == READ_FILE or " - "save-params is true). *.txt files are decoded as text files, while " - "*.bin files are decoded as binary files." }, + {"num-inputs", 'n', "N", 0, "Number of input images"}, + {0}, + {"data-file", 'f', "F", 0, + "File to read data and weights from (if data-init-mode == READ_FILE or " + "save-params is true). *.txt files are decoded as text files, while " + "*.bin files are decoded as binary files."}, }; -static error_t parse_opt(int key, char* arg, struct argp_state* state) { - arguments* args = (arguments*)(state->input); - switch (key) { - case 'n': { - args->num_inputs = strtol(arg, NULL, 10); - break; - } - case 'f': { - args->args[DATA_FILE] = arg; - break; - } - case 't': { - args->num_threads = strtol(arg, NULL, 10); - break; - } - case ARGP_KEY_ARG: { - if (state->arg_num >= NUM_REQUIRED_ARGS) - argp_usage(state); - args->args[state->arg_num] = arg; - break; - } - case ARGP_KEY_END: { - if (state->arg_num < NUM_REQUIRED_ARGS) { - fprintf(stderr, - "Not enough arguments! 
Got %d, require %d.\n", - state->arg_num, - NUM_REQUIRED_ARGS); - argp_usage(state); - } - break; - } - default: - return ARGP_ERR_UNKNOWN; +static error_t parse_opt(int key, char *arg, struct argp_state *state) { + arguments *args = (arguments *)(state->input); + switch (key) { + case 'n': { + args->num_inputs = strtol(arg, NULL, 10); + break; + } + case 'f': { + args->args[DATA_FILE] = arg; + break; + } + case 't': { + args->num_threads = strtol(arg, NULL, 10); + break; + } + case ARGP_KEY_ARG: { + if (state->arg_num >= NUM_REQUIRED_ARGS) + argp_usage(state); + args->args[state->arg_num] = arg; + break; + } + case ARGP_KEY_END: { + if (state->arg_num < NUM_REQUIRED_ARGS) { + fprintf(stderr, "Not enough arguments! Got %d, require %d.\n", + state->arg_num, NUM_REQUIRED_ARGS); + argp_usage(state); } - return 0; + break; + } + default: + return ARGP_ERR_UNKNOWN; + } + return 0; } -void set_default_args(arguments* args) { - args->num_inputs = 1; - args->num_threads = 0; - for (int i = 0; i < NUM_ARGS; i++) { - args->args[i] = NULL; - } +void set_default_args(arguments *args) { + args->num_inputs = 1; + args->num_threads = 0; + for (int i = 0; i < NUM_ARGS; i++) { + args->args[i] = NULL; + } } -static struct argp parser = { options, parse_opt, args_doc, prog_doc }; +static struct argp parser = {options, parse_opt, args_doc, prog_doc}; // Helper function for printing intermediate results -void descale_cpu(float *input, size_t bytes_input, - uint8_t *output, size_t bytes_result, - size_t row_size, size_t col_size) { - +void descale_cpu(float *input, size_t bytes_input, uint8_t *output, + size_t bytes_result, size_t row_size, size_t col_size) { + for (int chan = 0; chan < CHAN_SIZE; chan++) for (int row = 0; row < row_size; row++) for (int col = 0; col < col_size; col++) { - int index = (chan*row_size + row) * col_size + col; + int index = (chan * row_size + row) * col_size + col; output[index] = min(max(input[index] * 255, 0), 255); } } static void sort(float arr[], int 
n) { - int i, j; - for (i = 0; i < n - 1; i++) - for (j = 0; j < n - i - 1; j++) - if (arr[j] > arr[j + 1]) { - float temp = arr[j]; - arr[j] = arr[j + 1]; - arr[j + 1] = temp; - } + int i, j; + for (i = 0; i < n - 1; i++) + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } } /**************************************************************/ @@ -146,255 +158,258 @@ static void sort(float arr[], int n) { // In this benchmark, no use of HPVM query intrinsics in the leaf node functions // Leaf HPVM node function for scale -void scale_fxp(uint8_t *input, size_t bytes_input, - float *output, size_t bytes_output, - size_t row_size, size_t col_size) { +void scale_fxp(uint8_t *input, size_t bytes_input, float *output, + size_t bytes_output, size_t row_size, size_t col_size) { - //Specifies compilation target for current node + // Specifies compilation target for current node __visc__hint(CPU_TARGET); // Specifies pointer arguments that will be used as "in" and "out" arguments // - count of "in" arguments // - list of "in" argument , and similar for "out" __visc__attributes(2, input, output, 1, output); - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); + void *thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++){ - int index = (chan*row_size + row) * col_size + col; - output[index] = input[index] * 1.0 / 255; - } + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + output[index] = input[index] * 1.0 / 255; + } __visc__return(1, bytes_output); } // Leaf HPVM node function for descale -void descale_fxp(float *input, size_t bytes_input, - uint8_t *output, size_t bytes_result, - size_t row_size, size_t 
col_size) { +void descale_fxp(float *input, size_t bytes_input, uint8_t *output, + size_t bytes_result, size_t row_size, size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(2, input, output, 1, output); - + for (int chan = 0; chan < CHAN_SIZE; chan++) for (int row = 0; row < row_size; row++) for (int col = 0; col < col_size; col++) { - int index = (chan*row_size + row) * col_size + col; + int index = (chan * row_size + row) * col_size + col; output[index] = min(max(input[index] * 255, 0), 255); } __visc__return(1, bytes_result); } // Leaf HPVM node function for demosaicing -void demosaic_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { +void demosaic_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size) { __visc__hint(DEVICE); __visc__attributes(2, input, result, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); -// for (int row = 1; row < row_size - 1; row++) - for (int col = 1; col < col_size - 1; col++) { - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - if (row % 2 == 0 && col % 2 == 0) { - // Green pixel - // Getting the R values - float R1 = input[index_0 - 1]; - float R2 = input[index_0 + 1]; - // Getting the B values - float B1 = input[index_2 - col_size]; - float B2 = input[index_2 + col_size]; - // R - result[index_0] = (R1 + R2) / 2; - // G - result[index_1] = input[index_1] * 2; - // B - result[index_2] = (B1 + B2) / 2; - } else if (row % 2 == 0 && col % 2 == 1) { - // Red pixel - // Getting the G values - float G1 = input[index_1 - col_size]; - float G2 = input[index_1 + col_size]; - float G3 = input[index_1 - 1]; - float G4 = input[index_1 + 1]; - // Getting the B values - float B1 = input[index_2 - col_size - 1]; - float B2 = input[index_2 - col_size + 
1]; - float B3 = input[index_2 + col_size - 1]; - float B4 = input[index_2 + col_size + 1]; - // R - result[index_0] = input[index_0]; - // G - result[index_1] = (G1 + G2 + G3 + G4) / 2; - // B (center pixel) - result[index_2] = (B1 + B2 + B3 + B4) / 4; - } else if (row % 2 == 1 && col % 2 == 0) { - // Blue pixel - // Getting the R values - float R1 = input[index_0 - col_size - 1]; - float R2 = input[index_0 + col_size - 1]; - float R3 = input[index_0 - col_size + 1]; - float R4 = input[index_0 + col_size + 1]; - // Getting the G values - float G1 = input[index_1 - col_size]; - float G2 = input[index_1 + col_size]; - float G3 = input[index_1 - 1]; - float G4 = input[index_1 + 1]; - // R - result[index_0] = (R1 + R2 + R3 + R4) / 4; - // G - result[index_1] = (G1 + G2 + G3 + G4) / 2; - // B - result[index_2] = input[index_2]; - } else { - // Bottom Green pixel - // Getting the R values - float R1 = input[index_0 - col_size]; - float R2 = input[index_0 + col_size]; - // Getting the B values - float B1 = input[index_2 - 1]; - float B2 = input[index_2 + 1]; - // R - result[index_0] = (R1 + R2) / 2; - // G - result[index_1] = input[index_1] * 2; - // B - result[index_2] = (B1 + B2) / 2; - } - } + + void *thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); + // for (int row = 1; row < row_size - 1; row++) + for (int col = 1; col < col_size - 1; col++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + float R1 = input[index_0 - 1]; + float R2 = input[index_0 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size]; + float B2 = input[index_2 + col_size]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 2 == 1) { + // 
Red pixel + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size - 1]; + float B2 = input[index_2 - col_size + 1]; + float B3 = input[index_2 + col_size - 1]; + float B4 = input[index_2 + col_size + 1]; + // R + result[index_0] = input[index_0]; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + result[index_2] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = input[index_0 - col_size - 1]; + float R2 = input[index_0 + col_size - 1]; + float R3 = input[index_0 - col_size + 1]; + float R4 = input[index_0 + col_size + 1]; + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // R + result[index_0] = (R1 + R2 + R3 + R4) / 4; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B + result[index_2] = input[index_2]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = input[index_0 - col_size]; + float R2 = input[index_0 + col_size]; + // Getting the B values + float B1 = input[index_2 - 1]; + float B2 = input[index_2 + 1]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } + } __visc__return(1, bytes_result); } // Leaf HPVM node function for denoise -void denoise_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { +void denoise_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(2, input, result, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); + + 
void *thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) - if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { - float filter[9]; - for (int i = -1; i < 2; i++) - for (int j = -1; j < 2; j++) { - int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1; - filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)]; - } - sort(filter, 9); - result[(chan * row_size + row) * col_size + col] = filter[4]; - } else { - result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col]; - } + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) + if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { + float filter[9]; + for (int i = -1; i < 2; i++) + for (int j = -1; j < 2; j++) { + int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1; + filter[index] = + input[(chan * row_size + (i + row)) * col_size + (j + col)]; + } + sort(filter, 9); + result[(chan * row_size + row) * col_size + col] = filter[4]; + } else { + result[(chan * row_size + row) * col_size + col] = + input[(chan * row_size + row) * col_size + col]; + } __visc__return(1, bytes_result); } // Leaf HPVM node function, for color map and white balance transform -void transform_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *TsTw_tran, size_t bytes_TsTw, +void transform_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw, size_t row_size, size_t col_size) { __visc__hint(DEVICE); __visc__attributes(3, input, result, TsTw_tran, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); + + void *thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; 
chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - int index_2d_0 = 0 * CHAN_SIZE + chan; - int index_2d_1 = 1 * CHAN_SIZE + chan; - int index_2d_2 = 2 * CHAN_SIZE + chan; - result[index] = - max(input[index_0] * TsTw_tran[index_2d_0] + - input[index_1] * TsTw_tran[index_2d_1] + - input[index_2] * TsTw_tran[index_2d_2], - 0); - } + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + int index_2d_0 = 0 * CHAN_SIZE + chan; + int index_2d_1 = 1 * CHAN_SIZE + chan; + int index_2d_2 = 2 * CHAN_SIZE + chan; + result[index] = max(input[index_0] * TsTw_tran[index_2d_0] + + input[index_1] * TsTw_tran[index_2d_1] + + input[index_2] * TsTw_tran[index_2d_2], + 0); + } __visc__return(1, bytes_result); } // Leaf HPVM node function, for gamut mapping -void gamut_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, - float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, +void gamut_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, float *coefs, + size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist, size_t row_size, size_t col_size) { __visc__hint(CPU_TARGET); - __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist); - - // First, get the L2 norm from every pixel to 
the control points, - // Then, sum it and weight it. Finally, add the bias. - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - float chan_val_0 = 0.0; - float chan_val_1 = 0.0; - float chan_val_2 = 0.0; - for (int cp = 0; cp < 3702; cp++) { - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); - float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]); - float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); - float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); - float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); - float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]); - float val = val1 * val2 + val3 * val4 + val5 * val6; - float sqrt_val = sqrt(val); - chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0]; - chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1]; - chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2]; - } - chan_val_0 += coefs[0 * CHAN_SIZE + 0] + - coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col]; - chan_val_1 += coefs[0 * CHAN_SIZE + 1] + - coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col]; - chan_val_2 += coefs[0 * CHAN_SIZE + 2] + - coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col]; - result[(0 * row_size + row) * col_size + col] = 
max(chan_val_0, 0); - result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0); - result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, + result, l2_dist); + + // First, get the L2 norm from every pixel to the control points, + // Then, sum it and weight it. Finally, add the bias. + void *thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + float chan_val_0 = 0.0; + float chan_val_1 = 0.0; + float chan_val_2 = 0.0; + for (int cp = 0; cp < 3702; cp++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val = val1 * val2 + val3 * val4 + val5 * val6; + float sqrt_val = sqrt(val); + chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0]; + chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1]; + chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2]; } + chan_val_0 += + coefs[0 * CHAN_SIZE + 0] + + coefs[1 * CHAN_SIZE + 0] * + input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 0] * + input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col]; + chan_val_1 += + coefs[0 * CHAN_SIZE + 1] + + coefs[1 * CHAN_SIZE + 1] * + input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 1] * + input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col]; + chan_val_2 
+= + coefs[0 * CHAN_SIZE + 2] + + coefs[1 * CHAN_SIZE + 2] * + input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 2] * + input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col]; + result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0); + result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0); + result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0); + } __visc__return(1, bytes_result); } // HPVM leaf node function, for tone mapping -void tone_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *tone_map, size_t bytes_tone_map, +void tone_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *tone_map, size_t bytes_tone_map, size_t row_size, size_t col_size) { __visc__hint(DEVICE); __visc__attributes(3, input, result, tone_map, 1, result); - - void* thisNode = __visc__getNode(); - int row = __visc__getNodeInstanceID_x(thisNode); + + void *thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) -// for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; - uint8_t x = input[index] * 255; - result[index] = tone_map[x * CHAN_SIZE + chan]; - } + // for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + uint8_t x = input[index] * 255; + result[index] = tone_map[x * CHAN_SIZE + chan]; + } __visc__return(1, bytes_result); } @@ -406,9 +421,8 @@ void tone_map_fxp(float *input, size_t bytes_input, // requirement for the FPGA backend . The CPU backend also supports this, // so it does not cause a portability issue. 
-void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { +void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(2, input, result, 1, result); @@ -433,9 +447,9 @@ void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, __visc__bindOut(ScaleNode, 0, 0, 0); } -void descale_fxp_wrapper(float *input, size_t bytes_input, - uint8_t *result, size_t bytes_result, - size_t row_size, size_t col_size) { +void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result, + size_t bytes_result, size_t row_size, + size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(2, input, result, 1, result); void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size); @@ -445,13 +459,13 @@ void descale_fxp_wrapper(float *input, size_t bytes_input, __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size - + __visc__bindOut(DescaleNode, 0, 0, 0); } -void demosaic_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size) { +void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, + size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(2, input, result, 1, result); void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size); @@ -461,13 +475,13 @@ void demosaic_fxp_wrapper(float *input, size_t bytes_input, __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size - + __visc__bindOut(DemosaicNode, 0, 0, 0); } -void denoise_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - 
size_t row_size, size_t col_size) { +void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, + size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(2, input, result, 1, result); void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size); @@ -477,14 +491,14 @@ void denoise_fxp_wrapper(float *input, size_t bytes_input, __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size - + __visc__bindOut(DenoiseNode, 0, 0, 0); } -void transform_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *TsTw_tran, size_t bytes_TsTw, - size_t row_size, size_t col_size) { +void transform_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *TsTw_tran, + size_t bytes_TsTw, size_t row_size, + size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(3, input, result, TsTw_tran, 1, result); void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size); @@ -496,41 +510,41 @@ void transform_fxp_wrapper(float *input, size_t bytes_input, __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size - + __visc__bindOut(TransformNode, 0, 0, 0); } -void gamut_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, - float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, - size_t row_size, size_t col_size) { +void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *ctrl_pts, + size_t bytes_ctrl_pts, float *weights, + size_t bytes_weights, float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, size_t row_size, + size_t 
col_size) { __visc__hint(CPU_TARGET); - __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, + result); void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size); - __visc__bindIn(GamutNode, 0, 0, 0); // bind input - __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input - __visc__bindIn(GamutNode, 2, 2, 0); // bind result - __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts - __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts - __visc__bindIn(GamutNode, 6, 6, 0); // bind weights - __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights - __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs - __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs + __visc__bindIn(GamutNode, 0, 0, 0); // bind input + __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(GamutNode, 2, 2, 0); // bind result + __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts + __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts + __visc__bindIn(GamutNode, 6, 6, 0); // bind weights + __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights + __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs + __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size - + __visc__bindOut(GamutNode, 0, 0, 0); } -void tone_map_fxp_wrapper(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *tone_map, size_t bytes_tone_map, - size_t row_size, size_t col_size) { +void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *tone_map, + size_t bytes_tone_map, size_t 
row_size, + size_t col_size) { __visc__hint(CPU_TARGET); __visc__attributes(3, input, result, tone_map, 1, result); @@ -539,52 +553,52 @@ void tone_map_fxp_wrapper(float *input, size_t bytes_input, __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result - __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map + __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size - + __visc__bindOut(ToneMapNode, 0, 0, 0); } - /*** ROOT Node - Top Level of the Graph Hierarchy ***/ -void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, - /*2*/ uint8_t *result, /*3*/ size_t bytes_result, - /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled, - /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled, - /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out, - /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out, - /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out, - /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out, - /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw, - /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts, - /*20*/ float *weights, /*21*/ size_t bytes_weights, - /*22*/ float*coefs, /*23*/ size_t bytes_coefs, - /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist, - /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map, - /*28*/ size_t row_size, /*29*/ size_t col_size) { - - //Specifies compilation target for current node - __visc__hint(CPU_TARGET); +void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, + /*2*/ uint8_t *result, /*3*/ size_t bytes_result, + /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled, + /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled, + /*8*/ float *demosaic_out, /*9*/ size_t 
bytes_demosaic_out, + /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out, + /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out, + /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out, + /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw, + /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts, + /*20*/ float *weights, /*21*/ size_t bytes_weights, + /*22*/ float *coefs, /*23*/ size_t bytes_coefs, + /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist, + /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map, + /*28*/ size_t row_size, /*29*/ size_t col_size) { + + // Specifies compilation target for current node + __visc__hint(CPU_TARGET); // Specifies pointer arguments that will be used as "in" and "out" arguments // - count of "in" arguments // - list of "in" argument , and similar for "out" - __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, - transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, - 5, result, demosaic_out, denoise_out, transform_out, gamut_out); + __visc__attributes(14, input, result, input_scaled, result_scaled, + demosaic_out, denoise_out, transform_out, gamut_out, TsTw, + ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result, + demosaic_out, denoise_out, transform_out, gamut_out); // Create an 0D (specified by 1st argument) HPVM node - so a single node // associated with node function ---_fxp_wrapper - void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper); - void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper); - void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper); - void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper); - void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper); - void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper); - void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper); - + void *ScNode = __visc__createNodeND(0, scale_fxp_wrapper); + void *DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper); + void 
*DnNode = __visc__createNodeND(0, denoise_fxp_wrapper); + void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper); + void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper); + void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper); + void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper); + // BindIn binds inputs of current node with specified node // - destination node // - argument position in argument list of function of source node @@ -598,272 +612,283 @@ void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, // - destination position (in argument list of destination node) // - streaming (1) or non-streaming (0) - // scale_fxp inputs - __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input - __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input - __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result - __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result - __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size - __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size - - // demosaic_fxp inputs - __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input - __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input - __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result - __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result - __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size - __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size - - // denoise_fxp inputs - __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input - __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input - __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result - __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result - __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size - __visc__bindIn(DnNode, 29, 5, 0); // col_size 
-> DnNode:col_size - - // transform_fxp inputs - __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input - __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input - __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result - __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result - __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann - __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw - __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size - __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size - - // gamut_fxp inputs - __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input - __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input - __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result - __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result - __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts - __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts - __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights - __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights - __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs - __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs - __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist - __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist - __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size - __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size - - // tone_map_fxp inputs - __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input - __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input - __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result - __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result - 
__visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map - __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map - __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size - __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size - - // descale_fxp inputs - __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input - __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input - __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result - __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result - __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size - __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size + // scale_fxp inputs + __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input + __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input + __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result + __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result + __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size + __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size + + // demosaic_fxp inputs + __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input + __visc__edge(ScNode, DmNode, 1, 0, 1, + 0); // SCNode:bytes_result -> DmNode:bytes_input + __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result + __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result + __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size + __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size + + // denoise_fxp inputs + __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input + __visc__edge(DmNode, DnNode, 1, 0, 1, + 0); // DMNode:bytes_result -> DnNode:bytes_input + __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result + __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result + __visc__bindIn(DnNode, 28, 4, 
0); // row_size -> DnNode:row_size + __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size + + // transform_fxp inputs + __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input + __visc__edge(DnNode, TrNode, 1, 0, 1, + 0); // DnNode:bytes_result -> TrNode:bytes_input + __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result + __visc__bindIn(TrNode, 13, 3, + 0); // bytes_result_scaled -> TrNode:bytes_result + __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann + __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw + __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size + __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size + + // gamut_fxp inputs + __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input + __visc__edge(TrNode, GmNode, 1, 0, 1, + 0); // TrNode:bytes_result -> GmNode:bytes_input + __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result + __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result + __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts + __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts + __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights + __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights + __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs + __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs + __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist + __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist + __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size + __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size + + // tone_map_fxp inputs + __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input + __visc__edge(GmNode, TnNode, 1, 0, 1, + 0); // GmNode:bytes_result -> TnNode:bytes_input + __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result + 
__visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result + __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map + __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map + __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size + __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size + + // descale_fxp inputs + __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input + __visc__edge(TnNode, DsNode, 1, 0, 1, + 0); // TnNode:bytes_result -> DsNode:bytes_input + __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result + __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result + __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size + __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size // Similar to bindIn, but for the output. Output of a node is a struct, and // we consider the fields in increasing ordering. - __visc__bindOut(DsNode, 0, 0, 0); - + __visc__bindOut(DsNode, 0, 0, 0); } -int main(int argc, char* argv[]) { - // Parse the arguments. - arguments args; - set_default_args(&args); - argp_parse(&parser, argc, argv, 0, 0, &args); - - // Read a raw image. - // NOTE: We deliberately perform this file I/O outside of the kernel. - printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]); - size_t row_size, col_size; - uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size); - - printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE); - - // Allocate a buffer for storing the output image data. - // (This is currently the same size as the input image data.) 
- size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE; - size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE; - uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image); - uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image); - - __visc__init(); - - /////////////////////////////////////////////////////////////// - // Camera Model Parameters - /////////////////////////////////////////////////////////////// - // Path to the camera model to be used -// char cam_model_path[100]; -// char cam_model_path = "cam_models/NikonD7000/"; - // White balance index (select white balance from transform file) - // The first white balance in the file has a wb_index of 1 - // For more information on model format see the readme - int wb_index = 6; - - // Number of control points - int num_ctrl_pts = 3702; - uint8_t *input, *result; - float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out; - float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist; - - TsTw = get_TsTw("cam_models/NikonD7000/", wb_index); - float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE); - free(TsTw); - TsTw = trans; - ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts); - weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts); - coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts); - tone_map = get_tone_map("cam_models/NikonD7000/"); - - input_scaled = (float*) malloc_aligned(bytes_fimage); - result_scaled = (float*) malloc_aligned(bytes_fimage); - demosaic_out = (float*) malloc_aligned(bytes_fimage); - denoise_out = (float*) malloc_aligned(bytes_fimage); - transform_out = (float*) malloc_aligned(bytes_fimage); - gamut_out = (float*) malloc_aligned(bytes_fimage); - l2_dist = 
(float*) malloc_aligned(sizeof(float) * num_ctrl_pts); - - // This is host_input in cam_pipe() - input = (uint8_t*) malloc_aligned(bytes_image); - convert_hwc_to_chw(image_in, row_size, col_size, &input); - - // This is host_result in cam_pipe() - result = (uint8_t*) malloc_aligned(bytes_image); - - // Allocate struct to pass DFG inputs - RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn)); - - // Set up HPVM DFG inputs in the rootArgs struct. - rootArgs->input = input; - rootArgs->bytes_input = bytes_image; - - rootArgs->result = result; - rootArgs->bytes_result = bytes_image; - - rootArgs->input_scaled = input_scaled; - rootArgs->bytes_input_scaled = bytes_fimage; - - rootArgs->result_scaled = result_scaled; - rootArgs->bytes_result_scaled = bytes_fimage; - - rootArgs->demosaic_out = demosaic_out; - rootArgs->bytes_demosaic_out = bytes_fimage; - - rootArgs->denoise_out = denoise_out; - rootArgs->bytes_denoise_out = bytes_fimage; - - rootArgs->transform_out = transform_out; - rootArgs->bytes_transform_out = bytes_fimage; - - rootArgs->gamut_out = gamut_out; - rootArgs->bytes_gamut_out = bytes_fimage; - - rootArgs->TsTw = TsTw; - rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float); - - rootArgs->ctrl_pts = ctrl_pts; - rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float); - - rootArgs->weights = weights; - rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float); - - rootArgs->coefs = coefs; - rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float); - - rootArgs->tone_map = tone_map; - rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float); - - rootArgs->l2_dist = l2_dist; - rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float); - - rootArgs->row_size = row_size; - rootArgs->col_size = col_size; - - // Memory tracking is required for pointer arguments. - // Nodes can be scheduled on different targets, and - // dataflow edge implementation needs to request data. 
- // The pair (pointer, size) is inserted in memory tracker using this call - llvm_visc_track_mem(input, bytes_image); - llvm_visc_track_mem(result, bytes_image); - llvm_visc_track_mem(input_scaled, bytes_fimage); - llvm_visc_track_mem(result_scaled, bytes_fimage); - llvm_visc_track_mem(demosaic_out, bytes_fimage); - llvm_visc_track_mem(denoise_out, bytes_fimage); - llvm_visc_track_mem(transform_out, bytes_fimage); - llvm_visc_track_mem(gamut_out, bytes_fimage); - llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float)); - llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float)); - llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float)); - - printf("\n\nLaunching CAVA pipeline!\n"); - - void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs); - __visc__wait(camPipeDFG); - - printf("\n\nPipeline execution completed!\n"); - printf( - "Pipeline final stage returned %lu; should be %lu\n", - rootArgs->ret.bytesRet, bytes_image - ); - printf("\n\nRequesting memory!\n"); - - // Request data from graph. 
- llvm_visc_request_mem(result, bytes_image); - llvm_visc_request_mem(demosaic_out, bytes_fimage); - llvm_visc_request_mem(denoise_out, bytes_fimage); - llvm_visc_request_mem(transform_out, bytes_fimage); - llvm_visc_request_mem(gamut_out, bytes_fimage); - printf("\n\nDone requesting memory!\n"); - - - uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image); - - descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size); - descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size); - descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size); - descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size); - - convert_chw_to_hwc(result, row_size, col_size, &image_out); - convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut); - convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic); - convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise); - convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform); - - - // Remove tracked pointers. - llvm_visc_untrack_mem(input); - llvm_visc_untrack_mem(result); - llvm_visc_untrack_mem(input_scaled); - llvm_visc_untrack_mem(result_scaled); - llvm_visc_untrack_mem(demosaic_out); - llvm_visc_untrack_mem(denoise_out); - llvm_visc_untrack_mem(transform_out); - llvm_visc_untrack_mem(gamut_out); - - llvm_visc_untrack_mem(TsTw); - llvm_visc_untrack_mem(ctrl_pts); - llvm_visc_untrack_mem(weights); - llvm_visc_untrack_mem(coefs); - llvm_visc_untrack_mem(tone_map); - llvm_visc_untrack_mem(l2_dist); - - // Output the image. 
- // NOTE: We deliberately perform this file I/O outside of the kernel. +int main(int argc, char *argv[]) { + // Parse the arguments. + arguments args; + set_default_args(&args); + argp_parse(&parser, argc, argv, 0, 0, &args); + + // Read a raw image. + // NOTE: We deliberately perform this file I/O outside of the kernel. + printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]); + size_t row_size, col_size; + uint8_t *image_in = + read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size); + + printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE); + + // Allocate a buffer for storing the output image data. + // (This is currently the same size as the input image data.) + size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE; + size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE; + uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image); + + __visc__init(); + + /////////////////////////////////////////////////////////////// + // Camera Model Parameters + /////////////////////////////////////////////////////////////// + // Path to the camera model to be used + // char cam_model_path[100]; + // char cam_model_path = "cam_models/NikonD7000/"; + // White balance index (select white balance from transform file) + // The first white balance in the file has a wb_index of 1 + // For more information on model format see the readme + int wb_index = 6; + + // Number of control points + int num_ctrl_pts = 3702; + uint8_t *input, *result; + float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, + *transform_out, *gamut_out; + float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist; + + TsTw = 
get_TsTw("cam_models/NikonD7000/", wb_index); + float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE); + free(TsTw); + TsTw = trans; + ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts); + weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts); + coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts); + tone_map = get_tone_map("cam_models/NikonD7000/"); + + input_scaled = (float *)malloc_aligned(bytes_fimage); + result_scaled = (float *)malloc_aligned(bytes_fimage); + demosaic_out = (float *)malloc_aligned(bytes_fimage); + denoise_out = (float *)malloc_aligned(bytes_fimage); + transform_out = (float *)malloc_aligned(bytes_fimage); + gamut_out = (float *)malloc_aligned(bytes_fimage); + l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts); + + // This is host_input in cam_pipe() + input = (uint8_t *)malloc_aligned(bytes_image); + convert_hwc_to_chw(image_in, row_size, col_size, &input); + + // This is host_result in cam_pipe() + result = (uint8_t *)malloc_aligned(bytes_image); + + // Allocate struct to pass DFG inputs + RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn)); + + // Set up HPVM DFG inputs in the rootArgs struct. 
+ rootArgs->input = input; + rootArgs->bytes_input = bytes_image; + + rootArgs->result = result; + rootArgs->bytes_result = bytes_image; + + rootArgs->input_scaled = input_scaled; + rootArgs->bytes_input_scaled = bytes_fimage; + + rootArgs->result_scaled = result_scaled; + rootArgs->bytes_result_scaled = bytes_fimage; + + rootArgs->demosaic_out = demosaic_out; + rootArgs->bytes_demosaic_out = bytes_fimage; + + rootArgs->denoise_out = denoise_out; + rootArgs->bytes_denoise_out = bytes_fimage; + + rootArgs->transform_out = transform_out; + rootArgs->bytes_transform_out = bytes_fimage; + + rootArgs->gamut_out = gamut_out; + rootArgs->bytes_gamut_out = bytes_fimage; + + rootArgs->TsTw = TsTw; + rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float); + + rootArgs->ctrl_pts = ctrl_pts; + rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->weights = weights; + rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->coefs = coefs; + rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float); + + rootArgs->tone_map = tone_map; + rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float); + + rootArgs->l2_dist = l2_dist; + rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float); + + rootArgs->row_size = row_size; + rootArgs->col_size = col_size; + + // Memory tracking is required for pointer arguments. + // Nodes can be scheduled on different targets, and + // dataflow edge implementation needs to request data. 
+ // The pair (pointer, size) is inserted in memory tracker using this call + llvm_visc_track_mem(input, bytes_image); + llvm_visc_track_mem(result, bytes_image); + llvm_visc_track_mem(input_scaled, bytes_fimage); + llvm_visc_track_mem(result_scaled, bytes_fimage); + llvm_visc_track_mem(demosaic_out, bytes_fimage); + llvm_visc_track_mem(denoise_out, bytes_fimage); + llvm_visc_track_mem(transform_out, bytes_fimage); + llvm_visc_track_mem(gamut_out, bytes_fimage); + llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float)); + + printf("\n\nLaunching CAVA pipeline!\n"); + + void *camPipeDFG = __visc__launch(0, CamPipeRoot, (void *)rootArgs); + __visc__wait(camPipeDFG); + + printf("\n\nPipeline execution completed!\n"); + printf("Pipeline final stage returned %lu; should be %lu\n", + rootArgs->ret.bytesRet, bytes_image); + printf("\n\nRequesting memory!\n"); + + // Request data from graph. 
+ llvm_visc_request_mem(result, bytes_image); + llvm_visc_request_mem(demosaic_out, bytes_fimage); + llvm_visc_request_mem(denoise_out, bytes_fimage); + llvm_visc_request_mem(transform_out, bytes_fimage); + llvm_visc_request_mem(gamut_out, bytes_fimage); + printf("\n\nDone requesting memory!\n"); + + uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image); + + descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, + row_size, col_size); + descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, + row_size, col_size); + descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, + row_size, col_size); + descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, + row_size, col_size); + + convert_chw_to_hwc(result, row_size, col_size, &image_out); + convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut); + convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, + &image_out_demosaic); + convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, + &image_out_denoise); + convert_chw_to_hwc(transform_out_descaled, row_size, col_size, + &image_out_transform); + + // Remove tracked pointers. + llvm_visc_untrack_mem(input); + llvm_visc_untrack_mem(result); + llvm_visc_untrack_mem(input_scaled); + llvm_visc_untrack_mem(result_scaled); + llvm_visc_untrack_mem(demosaic_out); + llvm_visc_untrack_mem(denoise_out); + llvm_visc_untrack_mem(transform_out); + llvm_visc_untrack_mem(gamut_out); + + llvm_visc_untrack_mem(TsTw); + llvm_visc_untrack_mem(ctrl_pts); + llvm_visc_untrack_mem(weights); + llvm_visc_untrack_mem(coefs); + llvm_visc_untrack_mem(tone_map); + llvm_visc_untrack_mem(l2_dist); + + // Output the image. 
+ // NOTE: We deliberately perform this file I/O outside of the kernel. char str[50], base_str[50]; strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]); strcpy(str, base_str); @@ -887,8 +912,7 @@ int main(int argc, char* argv[]) { printf("Writing output image to %s\n", str); write_image_to_binary(str, image_out_transform, row_size, col_size); - __visc__cleanup(); + __visc__cleanup(); - return 0; + return 0; } - diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.c b/hpvm/test/hpvm-cava/src/pipe_stages.c index 2ebedec936915b5e7f11881c5001c84b6db26474..253052af872838f6ed363e3497ef64dd288db84e 100644 --- a/hpvm/test/hpvm-cava/src/pipe_stages.c +++ b/hpvm/test/hpvm-cava/src/pipe_stages.c @@ -1,44 +1,43 @@ -#include <stdio.h> -#include <math.h> #include "pipe_stages.h" #include "cam_pipe_utility.h" +#include <math.h> +#include <stdio.h> -//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) { -void scale_fxp(uint8_t *input, size_t bytes_input, - float *output, size_t bytes_output, - int row_size, int col_size) { +// void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) { +void scale_fxp(uint8_t *input, size_t bytes_input, float *output, + size_t bytes_output, int row_size, int col_size) { __visc__hint(DEVICE); __visc__attributes(2, input, output, 1, output); - + ARRAY_3D(uint8_t, _input, input, row_size, col_size); ARRAY_3D(float, _output, output, row_size, col_size); - sl_chan: +sl_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - sl_row: + sl_row: for (int row = 0; row < row_size; row++) - sl_col: + sl_col: for (int col = 0; col < col_size; col++) _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255; __visc__return(1, bytes_output); } -//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) { -void descale_fxp(float *input, size_t bytes_input, - uint8_t *output, size_t bytes_result, - int row_size, int col_size) { +// void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) { +void 
descale_fxp(float *input, size_t bytes_input, uint8_t *output, + size_t bytes_result, int row_size, int col_size) { __visc__hint(DEVICE); __visc__attributes(2, input, output, 1, output); - + ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(uint8_t, _output, output, row_size, col_size); - dsl_chan: +dsl_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - dsl_row: + dsl_row: for (int row = 0; row < row_size; row++) - dsl_col: + dsl_col: for (int col = 0; col < col_size; col++) - _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255); + _output[chan][row][col] = + min(max(_input[chan][row][col] * 255, 0), 255); __visc__return(1, bytes_output); } @@ -46,127 +45,125 @@ void descale_fxp(float *input, size_t bytes_input, // Demosaicing stage // G R // B G -//void demosaic_fxp(float *input, int row_size, int col_size, float *result) { -void demosaic_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - int row_size, int col_size) { +// void demosaic_fxp(float *input, int row_size, int col_size, float *result) { +void demosaic_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, int row_size, int col_size) { __visc__hint(DEVICE); __visc__attributes(2, input, result, 1, result); - + printf("Demosaicing.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); - dm_row: +dm_row: for (int row = 1; row < row_size - 1; row++) - dm_col: + dm_col: for (int col = 1; col < col_size - 1; col++) - if (row % 2 == 0 && col % 2 == 0) { - // Green pixel - // Getting the R values - float R1 = _input[0][row][col - 1]; - float R2 = _input[0][row][col + 1]; - // Getting the B values - float B1 = _input[2][row - 1][col]; - float B2 = _input[2][row + 1][col]; - // R - _result[0][row][col] = (R1 + R2) / 2; - // G - _result[1][row][col] = _input[1][row][col] * 2; - // B - _result[2][row][col] = (B1 + B2) / 2; - } else if (row % 2 == 0 && col % 2 == 1) { - // Red 
pixel - // Getting the G values - float G1 = _input[1][row - 1][col]; - float G2 = _input[1][row + 1][col]; - float G3 = _input[1][row][col - 1]; - float G4 = _input[1][row][col + 1]; - // Getting the B values - float B1 = _input[2][row - 1][col - 1]; - float B2 = _input[2][row - 1][col + 1]; - float B3 = _input[2][row + 1][col - 1]; - float B4 = _input[2][row + 1][col + 1]; - // R - _result[0][row][col] = _input[0][row][col]; - // G - _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; - // B (center pixel) - _result[2][row][col] = (B1 + B2 + B3 + B4) / 4; - } else if (row % 2 == 1 && col % 2 == 0) { - // Blue pixel - // Getting the R values - float R1 = _input[0][row - 1][col - 1]; - float R2 = _input[0][row + 1][col - 1]; - float R3 = _input[0][row - 1][col + 1]; - float R4 = _input[0][row + 1][col + 1]; - // Getting the G values - float G1 = _input[1][row - 1][col]; - float G2 = _input[1][row + 1][col]; - float G3 = _input[1][row][col - 1]; - float G4 = _input[1][row][col + 1]; - // R - _result[0][row][col] = (R1 + R2 + R3 + R4) / 4; - // G - _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; - // B - _result[2][row][col] = _input[2][row][col]; - } else { - // Bottom Green pixel - // Getting the R values - float R1 = _input[0][row - 1][col]; - float R2 = _input[0][row + 1][col]; - // Getting the B values - float B1 = _input[2][row][col - 1]; - float B2 = _input[2][row][col + 1]; - // R - _result[0][row][col] = (R1 + R2) / 2; - // G - _result[1][row][col] = _input[1][row][col] * 2; - // B - _result[2][row][col] = (B1 + B2) / 2; - } + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + float R1 = _input[0][row][col - 1]; + float R2 = _input[0][row][col + 1]; + // Getting the B values + float B1 = _input[2][row - 1][col]; + float B2 = _input[2][row + 1][col]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 
2 == 1) { + // Red pixel + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // Getting the B values + float B1 = _input[2][row - 1][col - 1]; + float B2 = _input[2][row - 1][col + 1]; + float B3 = _input[2][row + 1][col - 1]; + float B4 = _input[2][row + 1][col + 1]; + // R + _result[0][row][col] = _input[0][row][col]; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + _result[2][row][col] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = _input[0][row - 1][col - 1]; + float R2 = _input[0][row + 1][col - 1]; + float R3 = _input[0][row - 1][col + 1]; + float R4 = _input[0][row + 1][col + 1]; + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2 + R3 + R4) / 4; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B + _result[2][row][col] = _input[2][row][col]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = _input[0][row - 1][col]; + float R2 = _input[0][row + 1][col]; + // Getting the B values + float B1 = _input[2][row][col - 1]; + float B2 = _input[2][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } __visc__return(1, bytes_result); } static void sort(float arr[], int n) { - int i, j; - dn_sort_i: - for (i = 0; i < n - 1; i++) - dn_sort_j: - for (j = 0; j < n - i - 1; j++) - if (arr[j] > arr[j + 1]) { - float temp = arr[j]; - arr[j] = arr[j + 1]; - arr[j + 1] = temp; - } + int i, j; +dn_sort_i: + for (i = 0; i < n - 1; i++) + dn_sort_j: + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = 
arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } } // Simple denoise -//void denoise_fxp(float *input, int row_size, int col_size, float *result) { -void denoise_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - int row_size, int col_size) { +// void denoise_fxp(float *input, int row_size, int col_size, float *result) { +void denoise_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, int row_size, int col_size) { __visc__hint(DEVICE); __visc__attributes(2, input, result, 1, result); - + printf("Denoising.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); - dn_chan: +dn_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - dn_row: + dn_row: for (int row = 0; row < row_size; row++) - dn_col: + dn_col: for (int col = 0; col < col_size; col++) if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { float filter[9]; - dn_slide_row: - for (int i = row-1; i < row+2; i++) - dn_slide_col: - for (int j = col-1; j < col+2; j++) { + dn_slide_row: + for (int i = row - 1; i < row + 2; i++) + dn_slide_col: + for (int j = col - 1; j < col + 2; j++) { int index = (i - row + 1) * 3 + j - col + 1; filter[index] = _input[chan][i][j]; } @@ -179,25 +176,24 @@ void denoise_fxp(float *input, size_t bytes_input, } // Color map and white balance transform -//void transform_fxp(float *input, int row_size, int col_size, float *result, +// void transform_fxp(float *input, int row_size, int col_size, float *result, // float *TsTw_tran) { -void transform_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *TsTw_tran, size_t bytes_TsTw, +void transform_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw, int row_size, int col_size) { __visc__hint(DEVICE); __visc__attributes(3, input, result, TsTw_tran, 1, result); - + printf("Color mapping.\n"); ARRAY_3D(float, _input, 
input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3); - tr_chan: +tr_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - tr_row: + tr_row: for (int row = 0; row < row_size; row++) - tr_col: + tr_col: for (int col = 0; col < col_size; col++) _result[chan][row][col] = max(_input[0][row][col] * _TsTw_tran[0][chan] + @@ -210,18 +206,18 @@ void transform_fxp(float *input, size_t bytes_input, // // Weighted radial basis function for gamut mapping // -//void gamut_map_fxp(float *input, int row_size, int col_size, float *result, -// float *ctrl_pts, float *weights, float *coefs, float *l2_dist) { -void gamut_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, - float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, +// void gamut_map_fxp(float *input, int row_size, int col_size, float *result, +// float *ctrl_pts, float *weights, float *coefs, float +// *l2_dist) { +void gamut_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, float *coefs, + size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist, int row_size, int col_size) { __visc__hint(DEVICE); - __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); - + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, + result); + printf("Gamut mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); @@ -229,26 +225,25 @@ void gamut_map_fxp(float *input, size_t bytes_input, ARRAY_2D(float, _weights, weights, 3); ARRAY_2D(float, _coefs, coefs, 3); - // First, get the L2 norm from every pixel to the control points, - // Then, sum it and weight it. Finally, add the bias. 
- gm_rbf_row: +// First, get the L2 norm from every pixel to the control points, +// Then, sum it and weight it. Finally, add the bias. +gm_rbf_row: for (int row = 0; row < row_size; row++) - gm_rbf_col: + gm_rbf_col: for (int col = 0; col < col_size; col++) { - gm_rbf_cp0: + gm_rbf_cp0: for (int cp = 0; cp < num_ctrl_pts; cp++) { - l2_dist[cp] = - sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) * - (_input[0][row][col] - _ctrl_pts[cp][0]) + - (_input[1][row][col] - _ctrl_pts[cp][1]) * - (_input[1][row][col] - _ctrl_pts[cp][1]) + - (_input[2][row][col] - _ctrl_pts[cp][2]) * - (_input[2][row][col] - _ctrl_pts[cp][2])); + l2_dist[cp] = sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) * + (_input[0][row][col] - _ctrl_pts[cp][0]) + + (_input[1][row][col] - _ctrl_pts[cp][1]) * + (_input[1][row][col] - _ctrl_pts[cp][1]) + + (_input[2][row][col] - _ctrl_pts[cp][2]) * + (_input[2][row][col] - _ctrl_pts[cp][2])); } - gm_rbf_chan: + gm_rbf_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) { float chan_val = 0.0; - gm_rbf_cp1: + gm_rbf_cp1: for (int cp = 0; cp < num_ctrl_pts; cp++) { chan_val += l2_dist[cp] * _weights[cp][chan]; } @@ -263,25 +258,24 @@ void gamut_map_fxp(float *input, size_t bytes_input, } // Tone mapping -//void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map, +// void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map, // float *result) { -void tone_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *tone_map, size_t bytes_tone_map, +void tone_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *tone_map, size_t bytes_tone_map, int row_size, int col_size) { __visc__hint(DEVICE); __visc__attributes(3, input, result, tone_map, 1, result); - + printf("Tone mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); ARRAY_2D(float, _tone_map, tone_map, 3); - tm_chan: +tm_chan: for (int chan = 
0; chan < CHAN_SIZE; chan++) - tm_row: + tm_row: for (int row = 0; row < row_size; row++) - tm_col: + tm_col: for (int col = 0; col < col_size; col++) { uint8_t x = _input[chan][row][col] * 255; _result[chan][row][col] = _tone_map[x][chan]; diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.h b/hpvm/test/hpvm-cava/src/pipe_stages.h index 8d98cb65cc8af7353cc1faf08988f3b1a6758046..f960822a03326638189c8d294938452ba2670b41 100644 --- a/hpvm/test/hpvm-cava/src/pipe_stages.h +++ b/hpvm/test/hpvm-cava/src/pipe_stages.h @@ -7,54 +7,52 @@ #define ISP 0x4 -#define max(a,b) \ - ({ __typeof__ (a) _a = (a); \ - __typeof__ (b) _b = (b); \ - _a > _b ? _a : _b; }) - -#define min(a,b) \ - ({ __typeof__ (a) _a = (a); \ - __typeof__ (b) _b = (b); \ - _a < _b ? _a : _b; }) - -#define abs(a) \ - ({ __typeof__ (a) _a = (a); \ - _a < 0 ? -_a : _a; }) +#define max(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +#define min(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) + +#define abs(a) \ + ({ \ + __typeof__(a) _a = (a); \ + _a < 0 ? 
-_a : _a; \ + }) extern int num_ctrl_pts; -void scale_fxp(uint8_t *input, size_t bytes_input, - float *output, size_t bytes_output, - size_t row_size, size_t col_size); +void scale_fxp(uint8_t *input, size_t bytes_input, float *output, + size_t bytes_output, size_t row_size, size_t col_size); -void descale_fxp(float *input, size_t bytes_input, - uint8_t *output, size_t bytes_result, - size_t row_size, size_t col_size); +void descale_fxp(float *input, size_t bytes_input, uint8_t *output, + size_t bytes_result, size_t row_size, size_t col_size); -void demosaic_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size); +void demosaic_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size); -void denoise_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - size_t row_size, size_t col_size); +void denoise_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, size_t row_size, size_t col_size); -void transform_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *TsTw_tran, size_t bytes_TsTw, +void transform_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw, size_t row_size, size_t col_size); -void gamut_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, - float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, +void gamut_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, float *coefs, + size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist, size_t row_size, size_t col_size); -void tone_map_fxp(float *input, size_t bytes_input, - float *result, size_t bytes_result, - float *tone_map, size_t 
bytes_tone_map, +void tone_map_fxp(float *input, size_t bytes_input, float *result, + size_t bytes_result, float *tone_map, size_t bytes_tone_map, size_t row_size, size_t col_size); void tone_map_approx_fxp(float *input, size_t row_size, size_t col_size, diff --git a/hpvm/test/hpvm-cava/src/utility.c b/hpvm/test/hpvm-cava/src/utility.c index c1eaee3333c2afffdcae827f956efa4e25705352..86bd018183403f637ca8fb7cfb634a09c3ceace8 100644 --- a/hpvm/test/hpvm-cava/src/utility.c +++ b/hpvm/test/hpvm-cava/src/utility.c @@ -1,7 +1,7 @@ -#include <stdlib.h> -#include <assert.h> -#include "defs.h" #include "utility.h" +#include "defs.h" +#include <assert.h> +#include <stdlib.h> void *malloc_aligned(size_t size) { void *ptr = NULL; diff --git a/hpvm/test/hpvm-cava/src/visc.h b/hpvm/test/hpvm-cava/src/visc.h index 3a05f49e299a0a63a2251db65762561c25ed3981..917aec5a3773657e63655191b7897b9035b6d378 100644 --- a/hpvm/test/hpvm-cava/src/visc.h +++ b/hpvm/test/hpvm-cava/src/visc.h @@ -15,62 +15,62 @@ #ifdef __cplusplus extern "C" { void __visc__hint(visc::Target); -//void __visc__wait(void*); +// void __visc__wait(void*); #else void __visc__hint(enum Target); -//void __visc__wait(unsigned); +// void __visc__wait(unsigned); #endif #ifdef __cplusplus -//void* __visc__node(...); -//void* __visc__createNode(...); -//void* __visc__createNode1D(...); -//void* __visc__createNode2D(...); -//void* __visc__createNode3D(...); -//void __visc__return(...); +// void* __visc__node(...); +// void* __visc__createNode(...); +// void* __visc__createNode1D(...); +// void* __visc__createNode2D(...); +// void* __visc__createNode3D(...); +// void __visc__return(...); #endif -void* __visc__createNodeND(unsigned,...); +void *__visc__createNodeND(unsigned, ...); void __visc__return(unsigned, ...); void __visc__attributes(unsigned, ...); void __visc__init(); void __visc__cleanup(); -void __visc__bindIn(void*, unsigned, unsigned, unsigned); -void __visc__bindOut(void*, unsigned, unsigned, unsigned); -void* 
__visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned); -void __visc__push(void*, void*); -void* __visc__pop(void*); -void* __visc__launch(unsigned, ...); -void __visc__wait(void*); +void __visc__bindIn(void *, unsigned, unsigned, unsigned); +void __visc__bindOut(void *, unsigned, unsigned, unsigned); +void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, unsigned); +void __visc__push(void *, void *); +void *__visc__pop(void *); +void *__visc__launch(unsigned, ...); +void __visc__wait(void *); -void* __visc__getNode(); -void* __visc__getParentNode(void*); +void *__visc__getNode(); +void *__visc__getParentNode(void *); void __visc__barrier(); -void* __visc__malloc(long); -long __visc__getNodeInstanceID_x(void*); -long __visc__getNodeInstanceID_y(void*); -long __visc__getNodeInstanceID_z(void*); -long __visc__getNumNodeInstances_x(void*); -long __visc__getNumNodeInstances_y(void*); -long __visc__getNumNodeInstances_z(void*); +void *__visc__malloc(long); +long __visc__getNodeInstanceID_x(void *); +long __visc__getNodeInstanceID_y(void *); +long __visc__getNodeInstanceID_z(void *); +long __visc__getNumNodeInstances_x(void *); +long __visc__getNumNodeInstances_y(void *); +long __visc__getNumNodeInstances_z(void *); // Atomic // signed int -int __visc__atomic_cmpxchg(int*, int, int); -int __visc__atomic_add(int*, int); -int __visc__atomic_sub(int*, int); -int __visc__atomic_xchg(int*, int); -int __visc__atomic_inc(int*); -int __visc__atomic_dec(int*); -int __visc__atomic_min(int*, int); -int __visc__atomic_max(int*, int); -int __visc__atomic_umax(int*, int); -int __visc__atomic_umin(int*, int); -int __visc__atomic_and(int*, int); -int __visc__atomic_or(int*, int); -int __visc__atomic_xor(int*, int); +int __visc__atomic_cmpxchg(int *, int, int); +int __visc__atomic_add(int *, int); +int __visc__atomic_sub(int *, int); +int __visc__atomic_xchg(int *, int); +int __visc__atomic_inc(int *); +int __visc__atomic_dec(int *); +int __visc__atomic_min(int *, 
int); +int __visc__atomic_max(int *, int); +int __visc__atomic_umax(int *, int); +int __visc__atomic_umin(int *, int); +int __visc__atomic_and(int *, int); +int __visc__atomic_or(int *, int); +int __visc__atomic_xor(int *, int); // Special Func float __visc__floor(float); @@ -79,18 +79,17 @@ float __visc__sqrt(float); float __visc__sin(float); float __visc__cos(float); // unsigned int -//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned); -//unsigned __visc__atomic_add(unsigned*, unsigned); -//unsigned __visc__atomic_sub(unsigned*, unsigned); -//unsigned __visc__atomic_xchg(unsigned*, unsigned); -//unsigned __visc__atomic_inc(unsigned*); -//unsigned __visc__atomic_dec(unsigned*); -//unsigned __visc__atomic_min(unsigned*, unsigned); -//unsigned __visc__atomic_max(unsigned*, unsigned); -//unsigned __visc__atomic_and(unsigned*, unsigned); -//unsigned __visc__atomic_or(unsigned*, unsigned); -//unsigned __visc__atomic_xor(unsigned*, unsigned); - +// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned); +// unsigned __visc__atomic_add(unsigned*, unsigned); +// unsigned __visc__atomic_sub(unsigned*, unsigned); +// unsigned __visc__atomic_xchg(unsigned*, unsigned); +// unsigned __visc__atomic_inc(unsigned*); +// unsigned __visc__atomic_dec(unsigned*); +// unsigned __visc__atomic_min(unsigned*, unsigned); +// unsigned __visc__atomic_max(unsigned*, unsigned); +// unsigned __visc__atomic_and(unsigned*, unsigned); +// unsigned __visc__atomic_or(unsigned*, unsigned); +// unsigned __visc__atomic_xor(unsigned*, unsigned); #include <unistd.h> @@ -99,12 +98,10 @@ long get_group_id(int); long get_local_id(int); long get_local_size(int); - -void llvm_visc_track_mem(void*, size_t); -void llvm_visc_untrack_mem(void*); -void llvm_visc_request_mem(void*, size_t); +void llvm_visc_track_mem(void *, size_t); +void llvm_visc_untrack_mem(void *); +void llvm_visc_request_mem(void *, size_t); #ifdef __cplusplus } #endif - diff --git 
a/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc b/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc index 24aa24bf8b645ec0669662ec16c16b2b09d7936c..ba55abc2697a854a0eccb269ffd8301a79343b3b 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc +++ b/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc @@ -9,204 +9,197 @@ Implementing Breadth first search on CUDA using algorithm given in DAC'10 paper "An Effective GPU Implementation of Breadth-First Search" - Copyright (c) 2010 University of Illinois at Urbana-Champaign. + Copyright (c) 2010 University of Illinois at Urbana-Champaign. All rights reserved. - Permission to use, copy, modify and distribute this software and its documentation for - educational purpose is hereby granted without fee, provided that the above copyright - notice and this permission notice appear in all copies of this software and that you do - not sell the software. + Permission to use, copy, modify and distribute this software and its + documentation for educational purpose is hereby granted without fee, provided + that the above copyright notice and this permission notice appear in all + copies of this software and that you do not sell the software. - THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR - OTHERWISE. + THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, + IMPLIED OR OTHERWISE. 
Author: Lijiuan Luo (lluo3@uiuc.edu) */ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <math.h> -#include <parboil.h> #include <deque> #include <iostream> +#include <math.h> +#include <parboil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #define MAX_THREADS_PER_BLOCK 512 -#define NUM_SM 30//the number of Streaming Multiprocessors; may change in the future archs -#define NUM_SP 16//8//the number of Streaming processors within each SM; may change in the future - //architectures -#define EXP 4//3// EXP = log(NUM_SP), assuming NUM_SP is still power of 2 in the future architecture - //using EXP and shifting can speed up division operation -#define MOD_OP 8//7 // This variable is also related with NUM_SP; may change in the future architecture; - //using MOD_OP and "bitwise and" can speed up mod operation -#define INF 2147483647//2^31-1 - -#define UP_LIMIT 16677216//2^24 +#define NUM_SM \ + 30 // the number of Streaming Multiprocessors; may change in the future archs +#define NUM_SP \ + 16 // 8//the number of Streaming processors within each SM; may change in the + // future +// architectures +#define EXP \ + 4 // 3// EXP = log(NUM_SP), assuming NUM_SP is still power of 2 in the future + // architecture +// using EXP and shifting can speed up division operation +#define MOD_OP \ + 8 // 7 // This variable is also related with NUM_SP; may change in the future + // architecture; +// using MOD_OP and "bitwise and" can speed up mod operation +#define INF 2147483647 // 2^31-1 + +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 #define GRAY1 16677220 #define BLACK 16677221 -int no_of_nodes; //the number of nodes in the graph -int edge_list_size;//the number of edges in the graph +int no_of_nodes; // the number of nodes in the graph +int edge_list_size; // the number of edges in the graph FILE *fp; -//typedef int2 Node; -//typedef int2 Edge; +// typedef int2 Node; +// typedef int2 
Edge; -struct Node{ - int x; - int y; +struct Node { + int x; + int y; }; -struct Edge{ - int x; - int y; +struct Edge { + int x; + int y; }; -//Somehow "cudaMemset" does not work. So I use cudaMemcpy of constant variables for initialization +// Somehow "cudaMemset" does not work. So I use cudaMemcpy of constant variables +// for initialization const int h_top = 1; const int zero = 0; -void runCPU(int argc, char** argv); -void runGPU(int argc, char** argv); +void runCPU(int argc, char **argv); +void runGPU(int argc, char **argv); //////////////////////////////////////////////////////////////////// -//the cpu version of bfs for speed comparison -//the text book version ("Introduction to Algorithms") +// the cpu version of bfs for speed comparison +// the text book version ("Introduction to Algorithms") //////////////////////////////////////////////////////////////////// -void BFS_CPU( Node * h_graph_nodes,Edge * h_graph_edges, - int * color, int * h_cost, int source){ - std::deque<int> wavefront; - wavefront.push_back(source); - color[source] = GRAY; - int index; - while(!wavefront.empty()){ - index = wavefront.front(); - wavefront.pop_front(); - for(int i=h_graph_nodes[index].x; - i<(h_graph_nodes[index].y + - h_graph_nodes[index].x); i++) - { - int id = h_graph_edges[i].x; - if(color[id] == WHITE){ - h_cost[id]=h_cost[index]+1; - wavefront.push_back(id); - color[id] = GRAY; - } - } - color[index] = BLACK; - - - } - +void BFS_CPU(Node *h_graph_nodes, Edge *h_graph_edges, int *color, int *h_cost, + int source) { + std::deque<int> wavefront; + wavefront.push_back(source); + color[source] = GRAY; + int index; + while (!wavefront.empty()) { + index = wavefront.front(); + wavefront.pop_front(); + for (int i = h_graph_nodes[index].x; + i < (h_graph_nodes[index].y + h_graph_nodes[index].x); i++) { + int id = h_graph_edges[i].x; + if (color[id] == WHITE) { + h_cost[id] = h_cost[index] + 1; + wavefront.push_back(id); + color[id] = GRAY; + } + } + color[index] = BLACK; + } } 
//////////////////////////////////////////////////////////////////////////////// // Main Program //////////////////////////////////////////////////////////////////////////////// -int main( int argc, char** argv) -{ - no_of_nodes=0; - edge_list_size=0; - runCPU(argc,argv); -// if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) -// cutilDeviceInit(argc, argv); -// else - //cudaSetDevice( cutGetMaxGflopsDeviceId() ); -// cudaSetDevice( 1); - - - //CUT_EXIT(argc, argv); +int main(int argc, char **argv) { + no_of_nodes = 0; + edge_list_size = 0; + runCPU(argc, argv); + // if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) + // cutilDeviceInit(argc, argv); + // else + // cudaSetDevice( cutGetMaxGflopsDeviceId() ); + // cudaSetDevice( 1); + + // CUT_EXIT(argc, argv); } /////////////////////////////// -//FUNCTION: only run CPU version +// FUNCTION: only run CPU version //////////////////////////////////////////// -void runCPU( int argc, char** argv) -{ - - struct pb_Parameters *params; - struct pb_TimerSet timers; - - pb_InitializeTimerSet(&timers); - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //printf("Reading File\n"); - //Read in Graph from a file - fp = fopen(params->inpFiles[0],"r"); - if(!fp) - { - printf("Error Reading graph file\n"); - return; - } - - int source; - - fscanf(fp,"%d",&no_of_nodes); - // allocate host memory - Node* h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes); - int *color = (int*) malloc(sizeof(int)*no_of_nodes); - int start, edgeno; - // initalize the memory - for( unsigned int i = 0; i < no_of_nodes; i++) - { - fscanf(fp,"%d %d",&start,&edgeno); - h_graph_nodes[i].x = start; - h_graph_nodes[i].y = edgeno; - color[i]=WHITE; - } - //read the source node from the file - fscanf(fp,"%d",&source); - fscanf(fp,"%d",&edge_list_size); - 
int id,cost; - Edge* h_graph_edges = (Edge*) malloc(sizeof(Edge)*edge_list_size); - for(int i=0; i < edge_list_size ; i++) - { - fscanf(fp,"%d",&id); - fscanf(fp,"%d",&cost); - h_graph_edges[i].x = id; - h_graph_edges[i].y = cost; - } - if(fp) - fclose(fp); - - //printf("Read File\n"); - - // allocate mem for the result on host side - int* h_cost = (int*) malloc( sizeof(int)*no_of_nodes); - for(int i = 0; i < no_of_nodes; i++){ - h_cost[i] = INF; - } - h_cost[source] = 0; - //printf("start cpu version\n"); - unsigned int cpu_timer = 0; - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - BFS_CPU( h_graph_nodes, h_graph_edges, color, h_cost, source - ); - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if(params->outFile!=NULL) - { - //printf("Result stored in %s\n", params->outFile); - FILE *fp = fopen(params->outFile,"w"); - fprintf(fp,"%d\n", no_of_nodes); - for(int i=0;i<no_of_nodes;i++) - fprintf(fp,"%d %d\n",i,h_cost[i]); - fclose(fp); - } - - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - // cleanup memory - free( h_graph_nodes); - free( h_graph_edges); - free( color); - free( h_cost); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); +void runCPU(int argc, char **argv) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + pb_InitializeTimerSet(&timers); + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // printf("Reading File\n"); + // Read in Graph from a file + fp = fopen(params->inpFiles[0], "r"); + if (!fp) { + printf("Error Reading graph file\n"); + return; + } + + int source; + + fscanf(fp, "%d", &no_of_nodes); + // allocate host memory + Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes); + int *color = (int *)malloc(sizeof(int) * no_of_nodes); + int start, edgeno; + // initalize the memory + 
for (unsigned int i = 0; i < no_of_nodes; i++) { + fscanf(fp, "%d %d", &start, &edgeno); + h_graph_nodes[i].x = start; + h_graph_nodes[i].y = edgeno; + color[i] = WHITE; + } + // read the source node from the file + fscanf(fp, "%d", &source); + fscanf(fp, "%d", &edge_list_size); + int id, cost; + Edge *h_graph_edges = (Edge *)malloc(sizeof(Edge) * edge_list_size); + for (int i = 0; i < edge_list_size; i++) { + fscanf(fp, "%d", &id); + fscanf(fp, "%d", &cost); + h_graph_edges[i].x = id; + h_graph_edges[i].y = cost; + } + if (fp) + fclose(fp); + + // printf("Read File\n"); + + // allocate mem for the result on host side + int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes); + for (int i = 0; i < no_of_nodes; i++) { + h_cost[i] = INF; + } + h_cost[source] = 0; + // printf("start cpu version\n"); + unsigned int cpu_timer = 0; + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + BFS_CPU(h_graph_nodes, h_graph_edges, color, h_cost, source); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (params->outFile != NULL) { + // printf("Result stored in %s\n", params->outFile); + FILE *fp = fopen(params->outFile, "w"); + fprintf(fp, "%d\n", no_of_nodes); + for (int i = 0; i < no_of_nodes; i++) + fprintf(fp, "%d %d\n", i, h_cost[i]); + fclose(fp); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // cleanup memory + free(h_graph_nodes); + free(h_graph_edges); + free(color); + free(h_cost); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); } /////////////////////////////// -//FUNCTION:only run GPU version +// FUNCTION:only run GPU version //////////////////////////////////////////// diff --git a/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h b/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h index 18039547e2244b33f30f02ea4df1edc445debcf8..e5e2420355ef67bcd9f638c5d4e48ee47e657942 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h @@ -1,12 +1,19 
@@ #define MAX_THREADS_PER_BLOCK 512 -#define NUM_SM 14 //the number of Streaming Multiprocessors; 15 for Fermi architecture 30 for G280 at the moment of this document -#define NUM_BIN 8 //the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU -#define EXP 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future architecture - //using EXP and shifting can speed up division operation -#define MOD_OP 7 // This variable is also related with NUM_BIN; may change in the future architecture; - //using MOD_OP and "bitwise and" can speed up mod operation -#define INF 2147483647//2^31-1 -#define UP_LIMIT 16677216//2^24 +#define NUM_SM \ + 14 // the number of Streaming Multiprocessors; 15 for Fermi architecture 30 + // for G280 at the moment of this document +#define NUM_BIN \ + 8 // the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU +#define EXP \ + 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future + // architecture + // using EXP and shifting can speed up division operation +#define MOD_OP \ + 7 // This variable is also related with NUM_BIN; may change in the future + // architecture; + // using MOD_OP and "bitwise and" can speed up mod operation +#define INF 2147483647 // 2^31-1 +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h b/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h index 18039547e2244b33f30f02ea4df1edc445debcf8..e5e2420355ef67bcd9f638c5d4e48ee47e657942 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h @@ -1,12 +1,19 @@ #define MAX_THREADS_PER_BLOCK 512 -#define NUM_SM 14 //the number of Streaming Multiprocessors; 15 for Fermi architecture 30 for G280 at the moment of this document -#define NUM_BIN 8 //the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU 
-#define EXP 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future architecture - //using EXP and shifting can speed up division operation -#define MOD_OP 7 // This variable is also related with NUM_BIN; may change in the future architecture; - //using MOD_OP and "bitwise and" can speed up mod operation -#define INF 2147483647//2^31-1 -#define UP_LIMIT 16677216//2^24 +#define NUM_SM \ + 14 // the number of Streaming Multiprocessors; 15 for Fermi architecture 30 + // for G280 at the moment of this document +#define NUM_BIN \ + 8 // the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU +#define EXP \ + 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future + // architecture + // using EXP and shifting can speed up division operation +#define MOD_OP \ + 7 // This variable is also related with NUM_BIN; may change in the future + // architecture; + // using MOD_OP and "bitwise and" can speed up mod operation +#define INF 2147483647 // 2^31-1 +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc b/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc index d5d91ea4ccef7f03b788d41b06f5d7f12a57f4ac..01664c78345db542b37530b77eed54eb3c1fd1cd 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc +++ b/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc @@ -9,200 +9,195 @@ Implementing Breadth first search on CUDA using algorithm given in DAC'10 paper "An Effective GPU Implementation of Breadth-First Search" - Copyright (c) 2010 University of Illinois at Urbana-Champaign. + Copyright (c) 2010 University of Illinois at Urbana-Champaign. All rights reserved. 
- Permission to use, copy, modify and distribute this software and its documentation for - educational purpose is hereby granted without fee, provided that the above copyright - notice and this permission notice appear in all copies of this software and that you do - not sell the software. + Permission to use, copy, modify and distribute this software and its + documentation for educational purpose is hereby granted without fee, provided + that the above copyright notice and this permission notice appear in all + copies of this software and that you do not sell the software. - THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR - OTHERWISE. + THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, + IMPLIED OR OTHERWISE. Author: Lijiuan Luo (lluo3@uiuc.edu) */ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <math.h> -#include <parboil.h> #include <deque> #include <iostream> +#include <math.h> +#include <parboil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #define MAX_THREADS_PER_BLOCK 512 -#define NUM_SM 30//the number of Streaming Multiprocessors; may change in the future archs -#define NUM_SP 16//8//the number of Streaming processors within each SM; may change in the future - //architectures -#define EXP 4//3// EXP = log(NUM_SP), assuming NUM_SP is still power of 2 in the future architecture - //using EXP and shifting can speed up division operation -#define MOD_OP 8//7 // This variable is also related with NUM_SP; may change in the future architecture; - //using MOD_OP and "bitwise and" can speed up mod operation -#define INF 2147483647//2^31-1 - -#define UP_LIMIT 16677216//2^24 +#define NUM_SM \ + 30 // the number of Streaming Multiprocessors; may change in the future archs +#define NUM_SP \ + 16 // 8//the number of Streaming processors within each SM; may change in the + // future +// architectures +#define EXP \ + 4 // 3// EXP = log(NUM_SP), assuming NUM_SP is 
still power of 2 in the future + // architecture +// using EXP and shifting can speed up division operation +#define MOD_OP \ + 8 // 7 // This variable is also related with NUM_SP; may change in the future + // architecture; +// using MOD_OP and "bitwise and" can speed up mod operation +#define INF 2147483647 // 2^31-1 + +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 #define GRAY1 16677220 #define BLACK 16677221 -int no_of_nodes; //the number of nodes in the graph -int edge_list_size;//the number of edges in the graph +int no_of_nodes; // the number of nodes in the graph +int edge_list_size; // the number of edges in the graph FILE *fp; -//typedef int2 Node; -//typedef int2 Edge; +// typedef int2 Node; +// typedef int2 Edge; -struct Node{ - int x; - int y; +struct Node { + int x; + int y; }; -struct Edge{ - int x; - int y; +struct Edge { + int x; + int y; }; const int h_top = 1; const int zero = 0; -void runCPU(int argc, char** argv); -void runGPU(int argc, char** argv); +void runCPU(int argc, char **argv); +void runGPU(int argc, char **argv); //////////////////////////////////////////////////////////////////// -//the cpu version of bfs for speed comparison -//the text book version ("Introduction to Algorithms") +// the cpu version of bfs for speed comparison +// the text book version ("Introduction to Algorithms") //////////////////////////////////////////////////////////////////// -void BFS_CPU( Node * h_graph_nodes,Edge * h_graph_edges, - int * color, int * h_cost, int source){ - std::deque<int> wavefront; - wavefront.push_back(source); - color[source] = GRAY; - int index; - while(!wavefront.empty()){ - index = wavefront.front(); - wavefront.pop_front(); +void BFS_CPU(Node *h_graph_nodes, Edge *h_graph_edges, int *color, int *h_cost, + int source) { + std::deque<int> wavefront; + wavefront.push_back(source); + color[source] = GRAY; + int index; + while (!wavefront.empty()) { + index = wavefront.front(); + 
wavefront.pop_front(); #pragma omp parallel for - for(int i=h_graph_nodes[index].x; - i<(h_graph_nodes[index].y + - h_graph_nodes[index].x); i++) - { - int id = h_graph_edges[i].x; - if(color[id] == WHITE){ - h_cost[id]=h_cost[index]+1; + for (int i = h_graph_nodes[index].x; + i < (h_graph_nodes[index].y + h_graph_nodes[index].x); i++) { + int id = h_graph_edges[i].x; + if (color[id] == WHITE) { + h_cost[id] = h_cost[index] + 1; #pragma omp critical - wavefront.push_back(id); - - color[id] = GRAY; - } - } - color[index] = BLACK; - } - + wavefront.push_back(id); + + color[id] = GRAY; + } + } + color[index] = BLACK; + } } //////////////////////////////////////////////////////////////////////////////// // Main Program //////////////////////////////////////////////////////////////////////////////// -int main( int argc, char** argv) -{ - no_of_nodes=0; - edge_list_size=0; - runCPU(argc,argv); +int main(int argc, char **argv) { + no_of_nodes = 0; + edge_list_size = 0; + runCPU(argc, argv); } /////////////////////////////// -//FUNCTION: only run CPU version +// FUNCTION: only run CPU version //////////////////////////////////////////// -void runCPU( int argc, char** argv) -{ - - struct pb_Parameters *params; - struct pb_TimerSet timers; - - pb_InitializeTimerSet(&timers); - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //printf("Reading File\n"); - //Read in Graph from a file - fp = fopen(params->inpFiles[0],"r"); - if(!fp) - { - printf("Error Reading graph file\n"); - return; - } - - int source; - - fscanf(fp,"%d",&no_of_nodes); - // allocate host memory - Node* h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes); - int *color = (int*) malloc(sizeof(int)*no_of_nodes); - int start, edgeno; - // initalize the memory - for( unsigned int i = 0; i < no_of_nodes; i++) - { - fscanf(fp,"%d 
%d",&start,&edgeno); - h_graph_nodes[i].x = start; - h_graph_nodes[i].y = edgeno; - color[i]=WHITE; - } - //read the source node from the file - fscanf(fp,"%d",&source); - fscanf(fp,"%d",&edge_list_size); - int id,cost; - Edge* h_graph_edges = (Edge*) malloc(sizeof(Edge)*edge_list_size); - for(int i=0; i < edge_list_size ; i++) - { - fscanf(fp,"%d",&id); - fscanf(fp,"%d",&cost); - h_graph_edges[i].x = id; - h_graph_edges[i].y = cost; - } - if(fp) - fclose(fp); - - //printf("Read File\n"); - - // allocate mem for the result on host side - int* h_cost = (int*) malloc( sizeof(int)*no_of_nodes); - for(int i = 0; i < no_of_nodes; i++){ - h_cost[i] = INF; - } - h_cost[source] = 0; - //printf("start cpu version\n"); - unsigned int cpu_timer = 0; - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - BFS_CPU( h_graph_nodes, h_graph_edges, color, h_cost, source - ); - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if(params->outFile!=NULL) - { - //printf("Result stored in %s\n", params->outFile); - FILE *fp = fopen(params->outFile,"w"); - fprintf(fp,"%d\n", no_of_nodes); - for(int i=0;i<no_of_nodes;i++) - fprintf(fp,"%d %d\n",i,h_cost[i]); - fclose(fp); - } - - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - // cleanup memory - free( h_graph_nodes); - free( h_graph_edges); - free( color); - free( h_cost); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); +void runCPU(int argc, char **argv) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + pb_InitializeTimerSet(&timers); + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // printf("Reading File\n"); + // Read in Graph from a file + fp = fopen(params->inpFiles[0], "r"); + if (!fp) { + printf("Error Reading graph file\n"); + return; + } + + int source; + + fscanf(fp, "%d", 
&no_of_nodes); + // allocate host memory + Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes); + int *color = (int *)malloc(sizeof(int) * no_of_nodes); + int start, edgeno; + // initalize the memory + for (unsigned int i = 0; i < no_of_nodes; i++) { + fscanf(fp, "%d %d", &start, &edgeno); + h_graph_nodes[i].x = start; + h_graph_nodes[i].y = edgeno; + color[i] = WHITE; + } + // read the source node from the file + fscanf(fp, "%d", &source); + fscanf(fp, "%d", &edge_list_size); + int id, cost; + Edge *h_graph_edges = (Edge *)malloc(sizeof(Edge) * edge_list_size); + for (int i = 0; i < edge_list_size; i++) { + fscanf(fp, "%d", &id); + fscanf(fp, "%d", &cost); + h_graph_edges[i].x = id; + h_graph_edges[i].y = cost; + } + if (fp) + fclose(fp); + + // printf("Read File\n"); + + // allocate mem for the result on host side + int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes); + for (int i = 0; i < no_of_nodes; i++) { + h_cost[i] = INF; + } + h_cost[source] = 0; + // printf("start cpu version\n"); + unsigned int cpu_timer = 0; + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + BFS_CPU(h_graph_nodes, h_graph_edges, color, h_cost, source); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (params->outFile != NULL) { + // printf("Result stored in %s\n", params->outFile); + FILE *fp = fopen(params->outFile, "w"); + fprintf(fp, "%d\n", no_of_nodes); + for (int i = 0; i < no_of_nodes; i++) + fprintf(fp, "%d %d\n", i, h_cost[i]); + fclose(fp); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // cleanup memory + free(h_graph_nodes); + free(h_graph_edges); + free(color); + free(h_cost); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); } /////////////////////////////// -//FUNCTION:only run GPU version +// FUNCTION:only run GPU version //////////////////////////////////////////// diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp 
b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp index 57368eda9ada364e6edf6e1eccd35758fa349b62..38e60a1cbff3d9e4ce8d56204e9213943ea4fd55 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp @@ -4,41 +4,47 @@ #include <string.h> // -1 for NO suitable device found, 0 if an appropriate device was found -int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) { - - // Supported Device Requests (anything that returns cl_bool) - // CL_DEVICE_IMAGE_SUPPORT - // CL_DEVICE_HOST_UNIFIED_MEMORY - // CL_DEVICE_ERROR_CORRECTION_SUPPORT - // CL_DEVICE_AVAILABLE - // CL_DEVICE_COMPILER_AVAILABLE - +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, + cl_device_type *reqDeviceType, int numRequests, ...) { + + // Supported Device Requests (anything that returns cl_bool) + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_HOST_UNIFIED_MEMORY + // CL_DEVICE_ERROR_CORRECTION_SUPPORT + // CL_DEVICE_AVAILABLE + // CL_DEVICE_COMPILER_AVAILABLE + cl_uint numEntries = 16; cl_platform_id clPlatforms[numEntries]; cl_uint numPlatforms; - + cl_device_id clDevices[numEntries]; cl_uint numDevices; - OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) ); + OCL_SIMPLE_ERRCK_RETVAL( + clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms)); fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms); bool needDevice = true; - + for (int ip = 0; ip < numPlatforms && needDevice; ++ip) { cl_platform_id clPlatform = clPlatforms[ip]; - - OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) ); - fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, numDevices); - - for (int id = 0; (id < numDevices) && needDevice ; ++id) { + + OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, + 
numEntries, clDevices, &numDevices)); + fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, + numDevices); + + for (int id = 0; (id < numDevices) && needDevice; ++id) { cl_device_id clDevice = clDevices[id]; cl_device_type clDeviceType; bool canSatisfy = true; - + if (reqDeviceType != NULL) { - OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL)); + OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, + sizeof(cl_device_type), + &clDeviceType, NULL)); if (*reqDeviceType != CL_DEVICE_TYPE_ALL) { if (*reqDeviceType != clDeviceType) { canSatisfy = false; @@ -48,32 +54,34 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty va_list paramList; va_start(paramList, numRequests); - for (int i = 0; (i < numRequests) && canSatisfy ; ++i) { - - cl_device_info devReq = va_arg( paramList, cl_device_info ); + for (int i = 0; (i < numRequests) && canSatisfy; ++i) { + + cl_device_info devReq = va_arg(paramList, cl_device_info); cl_bool clInfoBool; size_t infoRetSize = sizeof(cl_bool); - - OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); + + OCL_SIMPLE_ERRCK_RETVAL( + clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); if (clInfoBool != true) { canSatisfy = false; } } - + va_end(paramList); if (canSatisfy) { *device = clDevice; *platform = clPlatform; needDevice = false; fprintf(stderr, "Chose Device Type: %s\n", - (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other" - ); + (clDeviceType == CL_DEVICE_TYPE_CPU) + ? "CPU" + : (clDeviceType == CL_DEVICE_TYPE_GPU) ? 
"GPU" : "other"); if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) { *reqDeviceType = clDeviceType; } } } // End checking all devices for a platform - } // End checking all platforms + } // End checking all platforms int retVal = -1; if (needDevice) { @@ -81,214 +89,213 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty } else { retVal = 0; } - + return retVal; } -const char* oclErrorString(cl_int error) -{ -// From NVIDIA SDK - static const char* errorString[] = { - "CL_SUCCESS", - "CL_DEVICE_NOT_FOUND", - "CL_DEVICE_NOT_AVAILABLE", - "CL_COMPILER_NOT_AVAILABLE", - "CL_MEM_OBJECT_ALLOCATION_FAILURE", - "CL_OUT_OF_RESOURCES", - "CL_OUT_OF_HOST_MEMORY", - "CL_PROFILING_INFO_NOT_AVAILABLE", - "CL_MEM_COPY_OVERLAP", - "CL_IMAGE_FORMAT_MISMATCH", - "CL_IMAGE_FORMAT_NOT_SUPPORTED", - "CL_BUILD_PROGRAM_FAILURE", - "CL_MAP_FAILURE", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "CL_INVALID_VALUE", - "CL_INVALID_DEVICE_TYPE", - "CL_INVALID_PLATFORM", - "CL_INVALID_DEVICE", - "CL_INVALID_CONTEXT", - "CL_INVALID_QUEUE_PROPERTIES", - "CL_INVALID_COMMAND_QUEUE", - "CL_INVALID_HOST_PTR", - "CL_INVALID_MEM_OBJECT", - "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", - "CL_INVALID_IMAGE_SIZE", - "CL_INVALID_SAMPLER", - "CL_INVALID_BINARY", - "CL_INVALID_BUILD_OPTIONS", - "CL_INVALID_PROGRAM", - "CL_INVALID_PROGRAM_EXECUTABLE", - "CL_INVALID_KERNEL_NAME", - "CL_INVALID_KERNEL_DEFINITION", - "CL_INVALID_KERNEL", - "CL_INVALID_ARG_INDEX", - "CL_INVALID_ARG_VALUE", - "CL_INVALID_ARG_SIZE", - "CL_INVALID_KERNEL_ARGS", - "CL_INVALID_WORK_DIMENSION", - "CL_INVALID_WORK_GROUP_SIZE", - "CL_INVALID_WORK_ITEM_SIZE", - "CL_INVALID_GLOBAL_OFFSET", - "CL_INVALID_EVENT_WAIT_LIST", - "CL_INVALID_EVENT", - "CL_INVALID_OPERATION", - "CL_INVALID_GL_OBJECT", - "CL_INVALID_BUFFER_SIZE", - "CL_INVALID_MIP_LEVEL", - "CL_INVALID_GLOBAL_WORK_SIZE", - }; - - const int errorCount = sizeof(errorString) / 
sizeof(errorString[0]); - - const int index = -error; - - return (index >= 0 && index < errorCount) ? errorString[index] : ""; +const char *oclErrorString(cl_int error) { + // From NVIDIA SDK + static const char *errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; } -const char* oclDebugErrString(cl_int error, cl_device_id device) -{ -// From NVIDIA SDK - static const char* errorString[] = { - "CL_SUCCESS", - "CL_DEVICE_NOT_FOUND", - "CL_DEVICE_NOT_AVAILABLE", - "CL_COMPILER_NOT_AVAILABLE", - "CL_MEM_OBJECT_ALLOCATION_FAILURE", - "CL_OUT_OF_RESOURCES", - "CL_OUT_OF_HOST_MEMORY", - "CL_PROFILING_INFO_NOT_AVAILABLE", - "CL_MEM_COPY_OVERLAP", - "CL_IMAGE_FORMAT_MISMATCH", - "CL_IMAGE_FORMAT_NOT_SUPPORTED", - "CL_BUILD_PROGRAM_FAILURE", - "CL_MAP_FAILURE", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "CL_INVALID_VALUE", - "CL_INVALID_DEVICE_TYPE", - "CL_INVALID_PLATFORM", - "CL_INVALID_DEVICE", - "CL_INVALID_CONTEXT", - "CL_INVALID_QUEUE_PROPERTIES", - "CL_INVALID_COMMAND_QUEUE", - "CL_INVALID_HOST_PTR", - "CL_INVALID_MEM_OBJECT", - "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", - "CL_INVALID_IMAGE_SIZE", - "CL_INVALID_SAMPLER", - "CL_INVALID_BINARY", - "CL_INVALID_BUILD_OPTIONS", - "CL_INVALID_PROGRAM", - "CL_INVALID_PROGRAM_EXECUTABLE", - "CL_INVALID_KERNEL_NAME", - "CL_INVALID_KERNEL_DEFINITION", - "CL_INVALID_KERNEL", - "CL_INVALID_ARG_INDEX", - "CL_INVALID_ARG_VALUE", - "CL_INVALID_ARG_SIZE", - "CL_INVALID_KERNEL_ARGS", - "CL_INVALID_WORK_DIMENSION", - "CL_INVALID_WORK_GROUP_SIZE", - "CL_INVALID_WORK_ITEM_SIZE", - "CL_INVALID_GLOBAL_OFFSET", - "CL_INVALID_EVENT_WAIT_LIST", - "CL_INVALID_EVENT", - "CL_INVALID_OPERATION", - "CL_INVALID_GL_OBJECT", - "CL_INVALID_BUFFER_SIZE", - "CL_INVALID_MIP_LEVEL", - "CL_INVALID_GLOBAL_WORK_SIZE", - }; - - const int errorCount = sizeof(errorString) / sizeof(errorString[0]); - - const int index = -error; - - if (index == 4) { - cl_uint maxMemAlloc = 0; - OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) ); - fprintf(stderr, " Device Maximum block allocation size: %lu\n", maxMemAlloc); - } - - return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; +const char *oclDebugErrString(cl_int error, cl_device_id device) { + // From NVIDIA SDK + static const char *errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + if (index == 4) { + cl_uint maxMemAlloc = 0; + OCL_SIMPLE_ERRCK_RETVAL( + clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), + &maxMemAlloc, NULL)); + fprintf(stderr, " Device Maximum block allocation size: %lu\n", + maxMemAlloc); + } + + return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; } -char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) -{ - // locals - FILE* pFileStream = NULL; - size_t szSourceLength; - - // open the OpenCL source code file - #ifdef _WIN32 // Windows version - if(fopen_s(&pFileStream, cFilename, "rb") != 0) - { - return NULL; - } - #else // Linux version - pFileStream = fopen(cFilename, "rb"); - if(pFileStream == 0) - { - return NULL; - } - #endif - - size_t szPreambleLength = strlen(cPreamble); - - // get the length of the source code - fseek(pFileStream, 0, SEEK_END); - szSourceLength = ftell(pFileStream); - fseek(pFileStream, 0, SEEK_SET); - - // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); - memcpy(cSourceString, cPreamble, szPreambleLength); - if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; - } - - // close the file and return the total length of the combined (preamble + source) string +char *oclLoadProgSource(const char *cFilename, const char *cPreamble, + size_t *szFinalLength) { + // locals + FILE *pFileStream = NULL; + size_t szSourceLength; + +// open the OpenCL source code file +#ifdef _WIN32 // Windows version + if (fopen_s(&pFileStream, cFilename, "rb") != 0) { + return NULL; + } +#else // Linux version + pFileStream = fopen(cFilename, "rb"); + if (pFileStream == 0) { + return NULL; + } +#endif + + size_t szPreambleLength = strlen(cPreamble); + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); + memcpy(cSourceString, cPreamble, szPreambleLength); + if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, + 
pFileStream) != 1) { fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength + szPreambleLength; - } - cSourceString[szSourceLength + szPreambleLength] = '\0'; + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + + // source) string + fclose(pFileStream); + if (szFinalLength != 0) { + *szFinalLength = szSourceLength + szPreambleLength; + } + cSourceString[szSourceLength + szPreambleLength] = '\0'; - return cSourceString; + return cSourceString; } diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h index 976c692055501532d65a1ac25e74630732fd2a86..27b084487c6289196337ca064b94f1353f8bbbad 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h @@ -2,26 +2,40 @@ #ifndef __OPENCL_COMMON_H_ #define __OPENCL_COMMON_H_ -#include <stdio.h> +#include <CL/cl.h> #include <stdarg.h> +#include <stdio.h> #include <string.h> -#include <CL/cl.h> -int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...); -const char* oclErrorString(cl_int error); -const char* oclDebugErrString(cl_int error, cl_device_id device); - -#define OCL_ERRCK_VAR(var) \ - { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); } - -#define OCL_ERRCK_RETVAL(s) \ - { cl_int clerr = (s);\ - if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); } - -#define OCL_SIMPLE_ERRCK_RETVAL(s) \ - { cl_int clerr = (s);\ - if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); } - -char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength); +int 
getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, + cl_device_type *reqDeviceType, int numRequests, ...); +const char *oclErrorString(cl_int error); +const char *oclDebugErrString(cl_int error, cl_device_id device); + +#define OCL_ERRCK_VAR(var) \ + { \ + if (var != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclErrorString(var)); \ + } + +#define OCL_ERRCK_RETVAL(s) \ + { \ + cl_int clerr = (s); \ + if (clerr != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclDebugErrString(clerr, clDevice)); \ + } + +#define OCL_SIMPLE_ERRCK_RETVAL(s) \ + { \ + cl_int clerr = (s); \ + if (clerr != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclErrorString(clerr)); \ + } + +char *oclLoadProgSource(const char *cFilename, const char *cPreamble, + size_t *szFinalLength); #endif diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h index 1a00ef98e054e50e654b0a52ccbb05ce136bab27..f9cdb59e9cd6cc39364fd9389ee39216646aedb2 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h @@ -1,7 +1,8 @@ #define MAX_THREADS_PER_BLOCK 256 -#define LOCAL_MEM_SIZE 1600 //This needs to be adjusted for certain graphs with high degrees -#define INF 2147483647//2^31-1 -#define UP_LIMIT 16677216//2^24 +#define LOCAL_MEM_SIZE \ + 1600 // This needs to be adjusted for certain graphs with high degrees +#define INF 2147483647 // 2^31-1 +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp index c38fcec8b895eb2559d7c4f1fc974ad4b2cf97e3..278c1bf085c9a5f0ea4809b61806c1a647e9afe5 100644 --- 
a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp @@ -12,61 +12,56 @@ Copyright (c) 2010 University of Illinois at Urbana-Champaign. All rights reserved. - Permission to use, copy, modify and distribute this software and its documentation for - educational purpose is hereby granted without fee, provided that the above copyright - notice and this permission notice appear in all copies of this software and that you do - not sell the software. + Permission to use, copy, modify and distribute this software and its + documentation for educational purpose is hereby granted without fee, provided + that the above copyright notice and this permission notice appear in all + copies of this software and that you do not sell the software. - THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR - OTHERWISE. + THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, + IMPLIED OR OTHERWISE. 
Author: Lijiuan Luo (lluo3@uiuc.edu) - Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu) + Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu + (gengliu2@illinois.edu) */ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <math.h> -#include <CL/cl.h> -#include "parboil.h" #include "OpenCL_common.h" #include "config.h" +#include "parboil.h" +#include <CL/cl.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> - -#define CHECK_ERROR(errorMessage) \ -if(clStatus != CL_SUCCESS) \ -{ \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ -} +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ + } FILE *fp; -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { printf("Error 1!\n"); exit(1); } - fseek(fp,0,SEEK_END); + fseek(fp, 0, SEEK_END); long size = ftell(fp); rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { printf("Error 2!\n"); fclose(fp); exit(1); } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { + size_t res = fread(buffer, 1, size, fp); + if (res != size) { printf("Error 3!\n"); fclose(fp); exit(1); @@ -77,70 +72,67 @@ char* readFile(const char* fileName) } const int h_top = 1; const int zero = 0; -void runGPU(int argc, char** argv); +void runGPU(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Main Program //////////////////////////////////////////////////////////////////////////////// -int main( int argc, char** argv) -{ +int main(int argc, char **argv) { - //the number of nodes in the graph 
+ // the number of nodes in the graph int num_of_nodes = 0; - //the number of edges in the graph + // the number of edges in the graph int num_of_edges = 0; struct pb_Parameters *params; struct pb_TimerSet timers; params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) - { + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { fprintf(stderr, "Expecting one input filename\n"); exit(-1); } - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - //Read in Graph from a file - fp = fopen(params->inpFiles[0],"r"); - if(!fp) - { + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + // Read in Graph from a file + fp = fopen(params->inpFiles[0], "r"); + if (!fp) { printf("Error Reading graph file\n"); return 0; } int source; - fscanf(fp,"%d",&num_of_nodes); + fscanf(fp, "%d", &num_of_nodes); // allocate host memory - struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes); - int *color = (int*) malloc(sizeof(int)*num_of_nodes); + struct Node *h_graph_nodes = + (struct Node *)malloc(sizeof(struct Node) * num_of_nodes); + int *color = (int *)malloc(sizeof(int) * num_of_nodes); int start, edgeno; // initalize the memory int i; - for( i = 0; i < num_of_nodes; i++) - { - fscanf(fp,"%d %d",&start,&edgeno); + for (i = 0; i < num_of_nodes; i++) { + fscanf(fp, "%d %d", &start, &edgeno); h_graph_nodes[i].x = start; h_graph_nodes[i].y = edgeno; - color[i]=WHITE; + color[i] = WHITE; } - //read the source node from the file - fscanf(fp,"%d",&source); - fscanf(fp,"%d",&num_of_edges); - int id,cost; - struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges); - for(i=0; i < num_of_edges ; i++) - { - fscanf(fp,"%d",&id); - fscanf(fp,"%d",&cost); + // read the source node from the file + fscanf(fp, "%d", &source); + fscanf(fp, "%d", &num_of_edges); + int id, cost; + struct Edge *h_graph_edges = + (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges); + for (i = 0; i < 
num_of_edges; i++) { + fscanf(fp, "%d", &id); + fscanf(fp, "%d", &cost); h_graph_edges[i].x = id; h_graph_edges[i].y = cost; } - if(fp) + if (fp) fclose(fp); pb_InitializeTimerSet(&timers); // allocate mem for the result on host side - int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes); - for(i = 0; i < num_of_nodes; i++){ + int *h_cost = (int *)malloc(sizeof(int) * num_of_nodes); + for (i = 0; i < num_of_nodes; i++) { h_cost[i] = INF; } h_cost[source] = 0; @@ -151,17 +143,20 @@ int main( int argc, char** argv) cl_device_id clDevice; cl_device_type deviceType = CL_DEVICE_TYPE_GPU; cl_platform_id clPlatform; - OCL_ERRCK_RETVAL(clGetPlatformIDs(1,&clPlatform,NULL)); - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; + OCL_ERRCK_RETVAL(clGetPlatformIDs(1, &clPlatform, NULL)); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0); if (deviceFound < 0) { fprintf(stderr, "No suitable device was found\n"); exit(1); } - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); OCL_ERRCK_VAR(clStatus); pb_SetOpenCL(&clContext, &clCommandQueue); @@ -170,111 +165,151 @@ int main( int argc, char** argv) size_t program_length; const char *clSource_path = "src/opencl_base/kernel.cl"; clSource = oclLoadProgSource(clSource_path, "", &program_length); - //printf("Program Source:\n%s\n", clSource); - cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&clSource, &program_length, &clStatus); + // 
printf("Program Source:\n%s\n", clSource); + cl_program clProgram = clCreateProgramWithSource( + clContext, 1, (const char **)&clSource, &program_length, &clStatus); OCL_ERRCK_VAR(clStatus); char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base"); - OCL_ERRCK_RETVAL(clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL)); + sprintf(clOptions, "-I src/opencl_base"); + OCL_ERRCK_RETVAL( + clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL)); // Uncomment to view build log from compiler for debugging /* char *build_log; size_t ret_val_size; - clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - build_log = (char *)malloc(ret_val_size+1); - clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); - // there's no information in the reference whether the string is 0 terminated or not - build_log[ret_val_size] = '\0'; - printf("%s\n", build_log ); + clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, + NULL, &ret_val_size); build_log = (char *)malloc(ret_val_size+1); clStatus = + clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, + build_log, NULL); + // there's no information in the reference whether the string is 0 terminated + or not build_log[ret_val_size] = '\0'; printf("%s\n", build_log ); */ - cl_kernel BFS_kernel = clCreateKernel(clProgram,"BFS_kernel",&clStatus); + cl_kernel BFS_kernel = clCreateKernel(clProgram, "BFS_kernel", &clStatus); OCL_ERRCK_VAR(clStatus); - //Copy the Node list to device memory + // Copy the Node list to device memory cl_mem d_graph_nodes; - d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus); + d_graph_nodes = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + num_of_nodes * sizeof(struct Node), NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - 
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL)); - //Copy the Edge List to device Memory + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_nodes, CL_TRUE, + 0, num_of_nodes * sizeof(struct Node), + h_graph_nodes, 0, NULL, NULL)); + // Copy the Edge List to device Memory cl_mem d_graph_edges; - d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus); + d_graph_edges = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + num_of_edges * sizeof(struct Edge), NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_edges, CL_TRUE, + 0, num_of_edges * sizeof(struct Edge), + h_graph_edges, 0, NULL, NULL)); cl_mem d_color, d_cost, d_q1, d_q2, tail; - d_color = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); + d_color = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_cost = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_q1 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_q2 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + tail = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); OCL_ERRCK_VAR(clStatus); - 
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_color, CL_TRUE, 0, + num_of_nodes * sizeof(int), color, 0, + NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + num_of_nodes * sizeof(int), h_cost, 0, + NULL, NULL)); printf("Starting GPU kernel\n"); pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); int num_of_blocks; int num_of_threads_per_block; - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL)); - - int num_t;//number of threads - int k=0;//BFS level index - - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,2,sizeof(cl_mem),(void*)&d_graph_nodes)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,3,sizeof(cl_mem),(void*)&d_graph_edges)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,4,sizeof(cl_mem),(void*)&d_color)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,5,sizeof(cl_mem),(void*)&d_cost)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,6,sizeof(cl_mem),(void*)&tail)); - - do - { - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); - - if(num_t == 0){//frontier is empty + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &h_top, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_q1, CL_TRUE, 0, + sizeof(int), 
&source, 0, NULL, NULL)); + + int num_t; // number of threads + int k = 0; // BFS level index + + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 2, sizeof(cl_mem), (void *)&d_graph_nodes)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 3, sizeof(cl_mem), (void *)&d_graph_edges)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 4, sizeof(cl_mem), (void *)&d_color)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 5, sizeof(cl_mem), (void *)&d_cost)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 6, sizeof(cl_mem), (void *)&tail)); + + do { + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &num_t, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); + + if (num_t == 0) { // frontier is empty break; } - num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK); - num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t; + num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK); + num_of_threads_per_block = + num_t > MAX_THREADS_PER_BLOCK ? 
MAX_THREADS_PER_BLOCK : num_t; - size_t grid[1] = {num_of_blocks*num_of_threads_per_block}; + size_t grid[1] = {num_of_blocks * num_of_threads_per_block}; size_t block[1] = {num_of_threads_per_block}; - - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,7,sizeof(int),(void*)&num_t)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,9,sizeof(int),(void*)&k)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,10,sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,11,LOCAL_MEM_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,12,sizeof(int),NULL)); - if(k%2 == 0){ + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 7, sizeof(int), (void *)&num_t)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 9, sizeof(int), (void *)&k)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 10, sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 11, LOCAL_MEM_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 12, sizeof(int), NULL)); + if (k % 2 == 0) { int gray = GRAY0; - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray)); - } - else{ + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 8, sizeof(int), (void *)&gray)); + } else { int gray = GRAY1; - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 8, 
sizeof(int), (void *)&gray)); } - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel,1,0,grid,block,0,0,0)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel, 1, 0, + grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); k++; - } while(1); + } while (1); pb_SwitchToTimer(&timers, pb_TimerID_COPY); printf("GPU kernel done\n"); // copy result from device to host - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + num_of_nodes * sizeof(int), h_cost, 0, + NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_color, CL_TRUE, 0, + num_of_nodes * sizeof(int), color, 0, + NULL, NULL)); OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_nodes)); OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_edges)); @@ -285,14 +320,13 @@ int main( int argc, char** argv) pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - - //Store the result into a file - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - FILE *fp = fopen(params->outFile,"w"); + // Store the result into a file + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + FILE *fp = fopen(params->outFile, "w"); fprintf(fp, "%d\n", num_of_nodes); int j = 0; - for(j=0;j<num_of_nodes;j++) - fprintf(fp,"%d %d\n",j,h_cost[j]); + for (j = 0; j < num_of_nodes; j++) + fprintf(fp, "%d %d\n", j, h_cost[j]); fclose(fp); // cleanup memory free(h_graph_nodes); diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp index 57368eda9ada364e6edf6e1eccd35758fa349b62..38e60a1cbff3d9e4ce8d56204e9213943ea4fd55 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp +++ 
b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp @@ -4,41 +4,47 @@ #include <string.h> // -1 for NO suitable device found, 0 if an appropriate device was found -int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) { - - // Supported Device Requests (anything that returns cl_bool) - // CL_DEVICE_IMAGE_SUPPORT - // CL_DEVICE_HOST_UNIFIED_MEMORY - // CL_DEVICE_ERROR_CORRECTION_SUPPORT - // CL_DEVICE_AVAILABLE - // CL_DEVICE_COMPILER_AVAILABLE - +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, + cl_device_type *reqDeviceType, int numRequests, ...) { + + // Supported Device Requests (anything that returns cl_bool) + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_HOST_UNIFIED_MEMORY + // CL_DEVICE_ERROR_CORRECTION_SUPPORT + // CL_DEVICE_AVAILABLE + // CL_DEVICE_COMPILER_AVAILABLE + cl_uint numEntries = 16; cl_platform_id clPlatforms[numEntries]; cl_uint numPlatforms; - + cl_device_id clDevices[numEntries]; cl_uint numDevices; - OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) ); + OCL_SIMPLE_ERRCK_RETVAL( + clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms)); fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms); bool needDevice = true; - + for (int ip = 0; ip < numPlatforms && needDevice; ++ip) { cl_platform_id clPlatform = clPlatforms[ip]; - - OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) ); - fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, numDevices); - - for (int id = 0; (id < numDevices) && needDevice ; ++id) { + + OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, + numEntries, clDevices, &numDevices)); + fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, + numDevices); + + for (int id = 0; (id < numDevices) && needDevice; ++id) { cl_device_id clDevice = clDevices[id]; cl_device_type 
clDeviceType; bool canSatisfy = true; - + if (reqDeviceType != NULL) { - OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL)); + OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, + sizeof(cl_device_type), + &clDeviceType, NULL)); if (*reqDeviceType != CL_DEVICE_TYPE_ALL) { if (*reqDeviceType != clDeviceType) { canSatisfy = false; @@ -48,32 +54,34 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty va_list paramList; va_start(paramList, numRequests); - for (int i = 0; (i < numRequests) && canSatisfy ; ++i) { - - cl_device_info devReq = va_arg( paramList, cl_device_info ); + for (int i = 0; (i < numRequests) && canSatisfy; ++i) { + + cl_device_info devReq = va_arg(paramList, cl_device_info); cl_bool clInfoBool; size_t infoRetSize = sizeof(cl_bool); - - OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); + + OCL_SIMPLE_ERRCK_RETVAL( + clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); if (clInfoBool != true) { canSatisfy = false; } } - + va_end(paramList); if (canSatisfy) { *device = clDevice; *platform = clPlatform; needDevice = false; fprintf(stderr, "Chose Device Type: %s\n", - (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other" - ); + (clDeviceType == CL_DEVICE_TYPE_CPU) + ? "CPU" + : (clDeviceType == CL_DEVICE_TYPE_GPU) ? 
"GPU" : "other"); if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) { *reqDeviceType = clDeviceType; } } } // End checking all devices for a platform - } // End checking all platforms + } // End checking all platforms int retVal = -1; if (needDevice) { @@ -81,214 +89,213 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty } else { retVal = 0; } - + return retVal; } -const char* oclErrorString(cl_int error) -{ -// From NVIDIA SDK - static const char* errorString[] = { - "CL_SUCCESS", - "CL_DEVICE_NOT_FOUND", - "CL_DEVICE_NOT_AVAILABLE", - "CL_COMPILER_NOT_AVAILABLE", - "CL_MEM_OBJECT_ALLOCATION_FAILURE", - "CL_OUT_OF_RESOURCES", - "CL_OUT_OF_HOST_MEMORY", - "CL_PROFILING_INFO_NOT_AVAILABLE", - "CL_MEM_COPY_OVERLAP", - "CL_IMAGE_FORMAT_MISMATCH", - "CL_IMAGE_FORMAT_NOT_SUPPORTED", - "CL_BUILD_PROGRAM_FAILURE", - "CL_MAP_FAILURE", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "CL_INVALID_VALUE", - "CL_INVALID_DEVICE_TYPE", - "CL_INVALID_PLATFORM", - "CL_INVALID_DEVICE", - "CL_INVALID_CONTEXT", - "CL_INVALID_QUEUE_PROPERTIES", - "CL_INVALID_COMMAND_QUEUE", - "CL_INVALID_HOST_PTR", - "CL_INVALID_MEM_OBJECT", - "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", - "CL_INVALID_IMAGE_SIZE", - "CL_INVALID_SAMPLER", - "CL_INVALID_BINARY", - "CL_INVALID_BUILD_OPTIONS", - "CL_INVALID_PROGRAM", - "CL_INVALID_PROGRAM_EXECUTABLE", - "CL_INVALID_KERNEL_NAME", - "CL_INVALID_KERNEL_DEFINITION", - "CL_INVALID_KERNEL", - "CL_INVALID_ARG_INDEX", - "CL_INVALID_ARG_VALUE", - "CL_INVALID_ARG_SIZE", - "CL_INVALID_KERNEL_ARGS", - "CL_INVALID_WORK_DIMENSION", - "CL_INVALID_WORK_GROUP_SIZE", - "CL_INVALID_WORK_ITEM_SIZE", - "CL_INVALID_GLOBAL_OFFSET", - "CL_INVALID_EVENT_WAIT_LIST", - "CL_INVALID_EVENT", - "CL_INVALID_OPERATION", - "CL_INVALID_GL_OBJECT", - "CL_INVALID_BUFFER_SIZE", - "CL_INVALID_MIP_LEVEL", - "CL_INVALID_GLOBAL_WORK_SIZE", - }; - - const int errorCount = sizeof(errorString) / 
sizeof(errorString[0]); - - const int index = -error; - - return (index >= 0 && index < errorCount) ? errorString[index] : ""; +const char *oclErrorString(cl_int error) { + // From NVIDIA SDK + static const char *errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; } -const char* oclDebugErrString(cl_int error, cl_device_id device) -{ -// From NVIDIA SDK - static const char* errorString[] = { - "CL_SUCCESS", - "CL_DEVICE_NOT_FOUND", - "CL_DEVICE_NOT_AVAILABLE", - "CL_COMPILER_NOT_AVAILABLE", - "CL_MEM_OBJECT_ALLOCATION_FAILURE", - "CL_OUT_OF_RESOURCES", - "CL_OUT_OF_HOST_MEMORY", - "CL_PROFILING_INFO_NOT_AVAILABLE", - "CL_MEM_COPY_OVERLAP", - "CL_IMAGE_FORMAT_MISMATCH", - "CL_IMAGE_FORMAT_NOT_SUPPORTED", - "CL_BUILD_PROGRAM_FAILURE", - "CL_MAP_FAILURE", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "CL_INVALID_VALUE", - "CL_INVALID_DEVICE_TYPE", - "CL_INVALID_PLATFORM", - "CL_INVALID_DEVICE", - "CL_INVALID_CONTEXT", - "CL_INVALID_QUEUE_PROPERTIES", - "CL_INVALID_COMMAND_QUEUE", - "CL_INVALID_HOST_PTR", - "CL_INVALID_MEM_OBJECT", - "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", - "CL_INVALID_IMAGE_SIZE", - "CL_INVALID_SAMPLER", - "CL_INVALID_BINARY", - "CL_INVALID_BUILD_OPTIONS", - "CL_INVALID_PROGRAM", - "CL_INVALID_PROGRAM_EXECUTABLE", - "CL_INVALID_KERNEL_NAME", - "CL_INVALID_KERNEL_DEFINITION", - "CL_INVALID_KERNEL", - "CL_INVALID_ARG_INDEX", - "CL_INVALID_ARG_VALUE", - "CL_INVALID_ARG_SIZE", - "CL_INVALID_KERNEL_ARGS", - "CL_INVALID_WORK_DIMENSION", - "CL_INVALID_WORK_GROUP_SIZE", - "CL_INVALID_WORK_ITEM_SIZE", - "CL_INVALID_GLOBAL_OFFSET", - "CL_INVALID_EVENT_WAIT_LIST", - "CL_INVALID_EVENT", - "CL_INVALID_OPERATION", - "CL_INVALID_GL_OBJECT", - "CL_INVALID_BUFFER_SIZE", - "CL_INVALID_MIP_LEVEL", - "CL_INVALID_GLOBAL_WORK_SIZE", - }; - - const int errorCount = sizeof(errorString) / sizeof(errorString[0]); - - const int index = -error; - - if (index == 4) { - cl_uint maxMemAlloc = 0; - OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) ); - fprintf(stderr, " Device Maximum block allocation size: %lu\n", maxMemAlloc); - } - - return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; +const char *oclDebugErrString(cl_int error, cl_device_id device) { + // From NVIDIA SDK + static const char *errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + if (index == 4) { + cl_uint maxMemAlloc = 0; + OCL_SIMPLE_ERRCK_RETVAL( + clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), + &maxMemAlloc, NULL)); + fprintf(stderr, " Device Maximum block allocation size: %lu\n", + maxMemAlloc); + } + + return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; } -char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) -{ - // locals - FILE* pFileStream = NULL; - size_t szSourceLength; - - // open the OpenCL source code file - #ifdef _WIN32 // Windows version - if(fopen_s(&pFileStream, cFilename, "rb") != 0) - { - return NULL; - } - #else // Linux version - pFileStream = fopen(cFilename, "rb"); - if(pFileStream == 0) - { - return NULL; - } - #endif - - size_t szPreambleLength = strlen(cPreamble); - - // get the length of the source code - fseek(pFileStream, 0, SEEK_END); - szSourceLength = ftell(pFileStream); - fseek(pFileStream, 0, SEEK_SET); - - // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); - memcpy(cSourceString, cPreamble, szPreambleLength); - if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; - } - - // close the file and return the total length of the combined (preamble + source) string +char *oclLoadProgSource(const char *cFilename, const char *cPreamble, + size_t *szFinalLength) { + // locals + FILE *pFileStream = NULL; + size_t szSourceLength; + +// open the OpenCL source code file +#ifdef _WIN32 // Windows version + if (fopen_s(&pFileStream, cFilename, "rb") != 0) { + return NULL; + } +#else // Linux version + pFileStream = fopen(cFilename, "rb"); + if (pFileStream == 0) { + return NULL; + } +#endif + + size_t szPreambleLength = strlen(cPreamble); + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); + memcpy(cSourceString, cPreamble, szPreambleLength); + if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, + 
pFileStream) != 1) { fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength + szPreambleLength; - } - cSourceString[szSourceLength + szPreambleLength] = '\0'; + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + + // source) string + fclose(pFileStream); + if (szFinalLength != 0) { + *szFinalLength = szSourceLength + szPreambleLength; + } + cSourceString[szSourceLength + szPreambleLength] = '\0'; - return cSourceString; + return cSourceString; } diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h index 976c692055501532d65a1ac25e74630732fd2a86..27b084487c6289196337ca064b94f1353f8bbbad 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h @@ -2,26 +2,40 @@ #ifndef __OPENCL_COMMON_H_ #define __OPENCL_COMMON_H_ -#include <stdio.h> +#include <CL/cl.h> #include <stdarg.h> +#include <stdio.h> #include <string.h> -#include <CL/cl.h> -int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...); -const char* oclErrorString(cl_int error); -const char* oclDebugErrString(cl_int error, cl_device_id device); - -#define OCL_ERRCK_VAR(var) \ - { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); } - -#define OCL_ERRCK_RETVAL(s) \ - { cl_int clerr = (s);\ - if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); } - -#define OCL_SIMPLE_ERRCK_RETVAL(s) \ - { cl_int clerr = (s);\ - if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); } - -char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* 
szFinalLength); +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, + cl_device_type *reqDeviceType, int numRequests, ...); +const char *oclErrorString(cl_int error); +const char *oclDebugErrString(cl_int error, cl_device_id device); + +#define OCL_ERRCK_VAR(var) \ + { \ + if (var != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclErrorString(var)); \ + } + +#define OCL_ERRCK_RETVAL(s) \ + { \ + cl_int clerr = (s); \ + if (clerr != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclDebugErrString(clerr, clDevice)); \ + } + +#define OCL_SIMPLE_ERRCK_RETVAL(s) \ + { \ + cl_int clerr = (s); \ + if (clerr != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclErrorString(clerr)); \ + } + +char *oclLoadProgSource(const char *cFilename, const char *cPreamble, + size_t *szFinalLength); #endif diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h index 1a00ef98e054e50e654b0a52ccbb05ce136bab27..f9cdb59e9cd6cc39364fd9389ee39216646aedb2 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h @@ -1,7 +1,8 @@ #define MAX_THREADS_PER_BLOCK 256 -#define LOCAL_MEM_SIZE 1600 //This needs to be adjusted for certain graphs with high degrees -#define INF 2147483647//2^31-1 -#define UP_LIMIT 16677216//2^24 +#define LOCAL_MEM_SIZE \ + 1600 // This needs to be adjusted for certain graphs with high degrees +#define INF 2147483647 // 2^31-1 +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp index 
8e021463567b304f384993052692668559166fe6..9b8b502688abb01934b337bc7fb178b32fda4633 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp @@ -12,61 +12,56 @@ Copyright (c) 2010 University of Illinois at Urbana-Champaign. All rights reserved. - Permission to use, copy, modify and distribute this software and its documentation for - educational purpose is hereby granted without fee, provided that the above copyright - notice and this permission notice appear in all copies of this software and that you do - not sell the software. + Permission to use, copy, modify and distribute this software and its + documentation for educational purpose is hereby granted without fee, provided + that the above copyright notice and this permission notice appear in all + copies of this software and that you do not sell the software. - THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR - OTHERWISE. + THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, + IMPLIED OR OTHERWISE. 
Author: Lijiuan Luo (lluo3@uiuc.edu) - Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu) + Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu + (gengliu2@illinois.edu) */ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <math.h> -#include <CL/cl.h> -#include "parboil.h" #include "OpenCL_common.h" #include "config.h" +#include "parboil.h" +#include <CL/cl.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> - -#define CHECK_ERROR(errorMessage) \ -if(clStatus != CL_SUCCESS) \ -{ \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ -} +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ + } FILE *fp; -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { printf("Error 1!\n"); exit(1); } - fseek(fp,0,SEEK_END); + fseek(fp, 0, SEEK_END); long size = ftell(fp); rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { printf("Error 2!\n"); fclose(fp); exit(1); } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { + size_t res = fread(buffer, 1, size, fp); + if (res != size) { printf("Error 3!\n"); fclose(fp); exit(1); @@ -77,97 +72,98 @@ char* readFile(const char* fileName) } const int h_top = 1; const int zero = 0; -void runGPU(int argc, char** argv); +void runGPU(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Main Program //////////////////////////////////////////////////////////////////////////////// -int main( int argc, char** argv) -{ +int main(int argc, char **argv) { - //the number of nodes in the graph 
+ // the number of nodes in the graph int num_of_nodes = 0; - //the number of edges in the graph + // the number of edges in the graph int num_of_edges = 0; struct pb_Parameters *params; struct pb_TimerSet timers; params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) - { + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { fprintf(stderr, "Expecting one input filename\n"); exit(-1); } - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - //Read in Graph from a file - fp = fopen(params->inpFiles[0],"r"); - if(!fp) - { + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + // Read in Graph from a file + fp = fopen(params->inpFiles[0], "r"); + if (!fp) { printf("Error Reading graph file\n"); return 0; } int source; - fscanf(fp,"%d",&num_of_nodes); + fscanf(fp, "%d", &num_of_nodes); // allocate host memory - struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes); - int *color = (int*) malloc(sizeof(int)*num_of_nodes); + struct Node *h_graph_nodes = + (struct Node *)malloc(sizeof(struct Node) * num_of_nodes); + int *color = (int *)malloc(sizeof(int) * num_of_nodes); int start, edgeno; // initalize the memory int i; - for( i = 0; i < num_of_nodes; i++) - { - fscanf(fp,"%d %d",&start,&edgeno); + for (i = 0; i < num_of_nodes; i++) { + fscanf(fp, "%d %d", &start, &edgeno); h_graph_nodes[i].x = start; h_graph_nodes[i].y = edgeno; - color[i]=WHITE; + color[i] = WHITE; } - //read the source node from the file - fscanf(fp,"%d",&source); - fscanf(fp,"%d",&num_of_edges); - int id,cost; - struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges); - for(i=0; i < num_of_edges ; i++) - { - fscanf(fp,"%d",&id); - fscanf(fp,"%d",&cost); + // read the source node from the file + fscanf(fp, "%d", &source); + fscanf(fp, "%d", &num_of_edges); + int id, cost; + struct Edge *h_graph_edges = + (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges); + for (i = 0; i < 
num_of_edges; i++) { + fscanf(fp, "%d", &id); + fscanf(fp, "%d", &cost); h_graph_edges[i].x = id; h_graph_edges[i].y = cost; } - if(fp) + if (fp) fclose(fp); pb_InitializeTimerSet(&timers); // allocate mem for the result on host side pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes); - for(i = 0; i < num_of_nodes; i++){ + int *h_cost = (int *)malloc(sizeof(int) * num_of_nodes); + for (i = 0; i < num_of_nodes; i++) { h_cost[i] = INF; } h_cost[source] = 0; - //pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // pb_SwitchToTimer(&timers, pb_TimerID_COPY); cl_int clStatus; cl_device_id clDevice; cl_device_type deviceType = CL_DEVICE_TYPE_GPU; cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - cl_context clContext = clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + OCL_ERRCK_VAR(clStatus); - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); OCL_ERRCK_VAR(clStatus); pb_SetOpenCL(&clContext, &clCommandQueue); @@ -176,117 +172,157 @@ int main( int argc, char** argv) size_t 
program_length; const char *clSource_path = "src/opencl_cpu_baseline/kernel.cl"; clSource = oclLoadProgSource(clSource_path, "", &program_length); - //printf("Program Source:\n%s\n", clSource); - cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&clSource, &program_length, &clStatus); + // printf("Program Source:\n%s\n", clSource); + cl_program clProgram = clCreateProgramWithSource( + clContext, 1, (const char **)&clSource, &program_length, &clStatus); OCL_ERRCK_VAR(clStatus); char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base"); - OCL_ERRCK_RETVAL(clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL)); + sprintf(clOptions, "-I src/opencl_base"); + OCL_ERRCK_RETVAL( + clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL)); // Uncomment to view build log from compiler for debugging /* char *build_log; size_t ret_val_size; - clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - build_log = (char *)malloc(ret_val_size+1); - clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); - // there's no information in the reference whether the string is 0 terminated or not - build_log[ret_val_size] = '\0'; - printf("%s\n", build_log ); + clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, + NULL, &ret_val_size); build_log = (char *)malloc(ret_val_size+1); clStatus = + clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, + build_log, NULL); + // there's no information in the reference whether the string is 0 terminated + or not build_log[ret_val_size] = '\0'; printf("%s\n", build_log ); */ - cl_kernel BFS_kernel = clCreateKernel(clProgram,"BFS_kernel",&clStatus); + cl_kernel BFS_kernel = clCreateKernel(clProgram, "BFS_kernel", &clStatus); OCL_ERRCK_VAR(clStatus); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //Copy the Node list to device memory + // Copy the Node list to 
device memory cl_mem d_graph_nodes; - d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus); + d_graph_nodes = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + num_of_nodes * sizeof(struct Node), NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL)); - //Copy the Edge List to device Memory + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_nodes, CL_TRUE, + 0, num_of_nodes * sizeof(struct Node), + h_graph_nodes, 0, NULL, NULL)); + // Copy the Edge List to device Memory cl_mem d_graph_edges; - d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus); + d_graph_edges = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + num_of_edges * sizeof(struct Edge), NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_edges, CL_TRUE, + 0, num_of_edges * sizeof(struct Edge), + h_graph_edges, 0, NULL, NULL)); cl_mem d_color, d_cost, d_q1, d_q2, tail; - d_color = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); + d_color = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_cost = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_q1 = clCreateBuffer(clContext, 
CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_q2 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + tail = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); OCL_ERRCK_VAR(clStatus); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_color, CL_TRUE, 0, + num_of_nodes * sizeof(int), color, 0, + NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + num_of_nodes * sizeof(int), h_cost, 0, + NULL, NULL)); printf("Starting GPU kernel\n"); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); int num_of_blocks; int num_of_threads_per_block; - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL)); - + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &h_top, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_q1, CL_TRUE, 0, + sizeof(int), &source, 0, NULL, NULL)); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int num_t;//number of threads - int k=0;//BFS level index + int num_t; // number of threads + int k = 0; // BFS level index + + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 2, sizeof(cl_mem), (void *)&d_graph_nodes)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 3, sizeof(cl_mem), (void *)&d_graph_edges)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 4, 
sizeof(cl_mem), (void *)&d_color)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 5, sizeof(cl_mem), (void *)&d_cost)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 6, sizeof(cl_mem), (void *)&tail)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,2,sizeof(cl_mem),(void*)&d_graph_nodes)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,3,sizeof(cl_mem),(void*)&d_graph_edges)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,4,sizeof(cl_mem),(void*)&d_color)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,5,sizeof(cl_mem),(void*)&d_cost)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,6,sizeof(cl_mem),(void*)&tail)); + do { - do - { - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &num_t, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - if(num_t == 0){//frontier is empty + if (num_t == 0) { // frontier is empty break; } - num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK); - num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t; + num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK); + num_of_threads_per_block = + num_t > MAX_THREADS_PER_BLOCK ? 
MAX_THREADS_PER_BLOCK : num_t; - size_t grid[1] = {num_of_blocks*num_of_threads_per_block}; + size_t grid[1] = {num_of_blocks * num_of_threads_per_block}; size_t block[1] = {num_of_threads_per_block}; - - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,7,sizeof(int),(void*)&num_t)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,9,sizeof(int),(void*)&k)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,10,sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,11,LOCAL_MEM_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,12,sizeof(int),NULL)); - if(k%2 == 0){ + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 7, sizeof(int), (void *)&num_t)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 9, sizeof(int), (void *)&k)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 10, sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 11, LOCAL_MEM_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 12, sizeof(int), NULL)); + if (k % 2 == 0) { int gray = GRAY0; - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray)); - } - else{ + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 8, sizeof(int), (void *)&gray)); + } else { int gray = GRAY1; - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel, 8, 
sizeof(int), (void *)&gray)); } - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel,1,0,grid,block,0,0,0)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel, 1, 0, + grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); k++; - } while(1); + } while (1); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //printf("GPU kernel done\n"); + // printf("GPU kernel done\n"); // copy result from device to host - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + num_of_nodes * sizeof(int), h_cost, 0, + NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_color, CL_TRUE, 0, + num_of_nodes * sizeof(int), color, 0, + NULL, NULL)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -299,14 +335,13 @@ int main( int argc, char** argv) pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - - //Store the result into a file - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - FILE *fp = fopen(params->outFile,"w"); + // Store the result into a file + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + FILE *fp = fopen(params->outFile, "w"); fprintf(fp, "%d\n", num_of_nodes); int j = 0; - for(j=0;j<num_of_nodes;j++) - fprintf(fp,"%d %d\n",j,h_cost[j]); + for (j = 0; j < num_of_nodes; j++) + fprintf(fp, "%d %d\n", j, h_cost[j]); fclose(fp); // cleanup memory free(h_graph_nodes); diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp index 57368eda9ada364e6edf6e1eccd35758fa349b62..38e60a1cbff3d9e4ce8d56204e9213943ea4fd55 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp +++ 
b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp @@ -4,41 +4,47 @@ #include <string.h> // -1 for NO suitable device found, 0 if an appropriate device was found -int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) { - - // Supported Device Requests (anything that returns cl_bool) - // CL_DEVICE_IMAGE_SUPPORT - // CL_DEVICE_HOST_UNIFIED_MEMORY - // CL_DEVICE_ERROR_CORRECTION_SUPPORT - // CL_DEVICE_AVAILABLE - // CL_DEVICE_COMPILER_AVAILABLE - +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, + cl_device_type *reqDeviceType, int numRequests, ...) { + + // Supported Device Requests (anything that returns cl_bool) + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_HOST_UNIFIED_MEMORY + // CL_DEVICE_ERROR_CORRECTION_SUPPORT + // CL_DEVICE_AVAILABLE + // CL_DEVICE_COMPILER_AVAILABLE + cl_uint numEntries = 16; cl_platform_id clPlatforms[numEntries]; cl_uint numPlatforms; - + cl_device_id clDevices[numEntries]; cl_uint numDevices; - OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) ); + OCL_SIMPLE_ERRCK_RETVAL( + clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms)); fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms); bool needDevice = true; - + for (int ip = 0; ip < numPlatforms && needDevice; ++ip) { cl_platform_id clPlatform = clPlatforms[ip]; - - OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) ); - fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, numDevices); - - for (int id = 0; (id < numDevices) && needDevice ; ++id) { + + OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, + numEntries, clDevices, &numDevices)); + fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, + numDevices); + + for (int id = 0; (id < numDevices) && needDevice; ++id) { cl_device_id clDevice = clDevices[id]; cl_device_type 
clDeviceType; bool canSatisfy = true; - + if (reqDeviceType != NULL) { - OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL)); + OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, + sizeof(cl_device_type), + &clDeviceType, NULL)); if (*reqDeviceType != CL_DEVICE_TYPE_ALL) { if (*reqDeviceType != clDeviceType) { canSatisfy = false; @@ -48,32 +54,34 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty va_list paramList; va_start(paramList, numRequests); - for (int i = 0; (i < numRequests) && canSatisfy ; ++i) { - - cl_device_info devReq = va_arg( paramList, cl_device_info ); + for (int i = 0; (i < numRequests) && canSatisfy; ++i) { + + cl_device_info devReq = va_arg(paramList, cl_device_info); cl_bool clInfoBool; size_t infoRetSize = sizeof(cl_bool); - - OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); + + OCL_SIMPLE_ERRCK_RETVAL( + clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); if (clInfoBool != true) { canSatisfy = false; } } - + va_end(paramList); if (canSatisfy) { *device = clDevice; *platform = clPlatform; needDevice = false; fprintf(stderr, "Chose Device Type: %s\n", - (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other" - ); + (clDeviceType == CL_DEVICE_TYPE_CPU) + ? "CPU" + : (clDeviceType == CL_DEVICE_TYPE_GPU) ? 
"GPU" : "other"); if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) { *reqDeviceType = clDeviceType; } } } // End checking all devices for a platform - } // End checking all platforms + } // End checking all platforms int retVal = -1; if (needDevice) { @@ -81,214 +89,213 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty } else { retVal = 0; } - + return retVal; } -const char* oclErrorString(cl_int error) -{ -// From NVIDIA SDK - static const char* errorString[] = { - "CL_SUCCESS", - "CL_DEVICE_NOT_FOUND", - "CL_DEVICE_NOT_AVAILABLE", - "CL_COMPILER_NOT_AVAILABLE", - "CL_MEM_OBJECT_ALLOCATION_FAILURE", - "CL_OUT_OF_RESOURCES", - "CL_OUT_OF_HOST_MEMORY", - "CL_PROFILING_INFO_NOT_AVAILABLE", - "CL_MEM_COPY_OVERLAP", - "CL_IMAGE_FORMAT_MISMATCH", - "CL_IMAGE_FORMAT_NOT_SUPPORTED", - "CL_BUILD_PROGRAM_FAILURE", - "CL_MAP_FAILURE", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "CL_INVALID_VALUE", - "CL_INVALID_DEVICE_TYPE", - "CL_INVALID_PLATFORM", - "CL_INVALID_DEVICE", - "CL_INVALID_CONTEXT", - "CL_INVALID_QUEUE_PROPERTIES", - "CL_INVALID_COMMAND_QUEUE", - "CL_INVALID_HOST_PTR", - "CL_INVALID_MEM_OBJECT", - "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", - "CL_INVALID_IMAGE_SIZE", - "CL_INVALID_SAMPLER", - "CL_INVALID_BINARY", - "CL_INVALID_BUILD_OPTIONS", - "CL_INVALID_PROGRAM", - "CL_INVALID_PROGRAM_EXECUTABLE", - "CL_INVALID_KERNEL_NAME", - "CL_INVALID_KERNEL_DEFINITION", - "CL_INVALID_KERNEL", - "CL_INVALID_ARG_INDEX", - "CL_INVALID_ARG_VALUE", - "CL_INVALID_ARG_SIZE", - "CL_INVALID_KERNEL_ARGS", - "CL_INVALID_WORK_DIMENSION", - "CL_INVALID_WORK_GROUP_SIZE", - "CL_INVALID_WORK_ITEM_SIZE", - "CL_INVALID_GLOBAL_OFFSET", - "CL_INVALID_EVENT_WAIT_LIST", - "CL_INVALID_EVENT", - "CL_INVALID_OPERATION", - "CL_INVALID_GL_OBJECT", - "CL_INVALID_BUFFER_SIZE", - "CL_INVALID_MIP_LEVEL", - "CL_INVALID_GLOBAL_WORK_SIZE", - }; - - const int errorCount = sizeof(errorString) / 
sizeof(errorString[0]); - - const int index = -error; - - return (index >= 0 && index < errorCount) ? errorString[index] : ""; +const char *oclErrorString(cl_int error) { + // From NVIDIA SDK + static const char *errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; } -const char* oclDebugErrString(cl_int error, cl_device_id device) -{ -// From NVIDIA SDK - static const char* errorString[] = { - "CL_SUCCESS", - "CL_DEVICE_NOT_FOUND", - "CL_DEVICE_NOT_AVAILABLE", - "CL_COMPILER_NOT_AVAILABLE", - "CL_MEM_OBJECT_ALLOCATION_FAILURE", - "CL_OUT_OF_RESOURCES", - "CL_OUT_OF_HOST_MEMORY", - "CL_PROFILING_INFO_NOT_AVAILABLE", - "CL_MEM_COPY_OVERLAP", - "CL_IMAGE_FORMAT_MISMATCH", - "CL_IMAGE_FORMAT_NOT_SUPPORTED", - "CL_BUILD_PROGRAM_FAILURE", - "CL_MAP_FAILURE", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "CL_INVALID_VALUE", - "CL_INVALID_DEVICE_TYPE", - "CL_INVALID_PLATFORM", - "CL_INVALID_DEVICE", - "CL_INVALID_CONTEXT", - "CL_INVALID_QUEUE_PROPERTIES", - "CL_INVALID_COMMAND_QUEUE", - "CL_INVALID_HOST_PTR", - "CL_INVALID_MEM_OBJECT", - "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", - "CL_INVALID_IMAGE_SIZE", - "CL_INVALID_SAMPLER", - "CL_INVALID_BINARY", - "CL_INVALID_BUILD_OPTIONS", - "CL_INVALID_PROGRAM", - "CL_INVALID_PROGRAM_EXECUTABLE", - "CL_INVALID_KERNEL_NAME", - "CL_INVALID_KERNEL_DEFINITION", - "CL_INVALID_KERNEL", - "CL_INVALID_ARG_INDEX", - "CL_INVALID_ARG_VALUE", - "CL_INVALID_ARG_SIZE", - "CL_INVALID_KERNEL_ARGS", - "CL_INVALID_WORK_DIMENSION", - "CL_INVALID_WORK_GROUP_SIZE", - "CL_INVALID_WORK_ITEM_SIZE", - "CL_INVALID_GLOBAL_OFFSET", - "CL_INVALID_EVENT_WAIT_LIST", - "CL_INVALID_EVENT", - "CL_INVALID_OPERATION", - "CL_INVALID_GL_OBJECT", - "CL_INVALID_BUFFER_SIZE", - "CL_INVALID_MIP_LEVEL", - "CL_INVALID_GLOBAL_WORK_SIZE", - }; - - const int errorCount = sizeof(errorString) / sizeof(errorString[0]); - - const int index = -error; - - if (index == 4) { - cl_uint maxMemAlloc = 0; - OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) ); - fprintf(stderr, " Device Maximum block allocation size: %lu\n", maxMemAlloc); - } - - return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; +const char *oclDebugErrString(cl_int error, cl_device_id device) { + // From NVIDIA SDK + static const char *errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + if (index == 4) { + cl_uint maxMemAlloc = 0; + OCL_SIMPLE_ERRCK_RETVAL( + clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), + &maxMemAlloc, NULL)); + fprintf(stderr, " Device Maximum block allocation size: %lu\n", + maxMemAlloc); + } + + return (index >= 0 && index < errorCount) ? 
errorString[index] : ""; } -char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) -{ - // locals - FILE* pFileStream = NULL; - size_t szSourceLength; - - // open the OpenCL source code file - #ifdef _WIN32 // Windows version - if(fopen_s(&pFileStream, cFilename, "rb") != 0) - { - return NULL; - } - #else // Linux version - pFileStream = fopen(cFilename, "rb"); - if(pFileStream == 0) - { - return NULL; - } - #endif - - size_t szPreambleLength = strlen(cPreamble); - - // get the length of the source code - fseek(pFileStream, 0, SEEK_END); - szSourceLength = ftell(pFileStream); - fseek(pFileStream, 0, SEEK_SET); - - // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); - memcpy(cSourceString, cPreamble, szPreambleLength); - if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; - } - - // close the file and return the total length of the combined (preamble + source) string +char *oclLoadProgSource(const char *cFilename, const char *cPreamble, + size_t *szFinalLength) { + // locals + FILE *pFileStream = NULL; + size_t szSourceLength; + +// open the OpenCL source code file +#ifdef _WIN32 // Windows version + if (fopen_s(&pFileStream, cFilename, "rb") != 0) { + return NULL; + } +#else // Linux version + pFileStream = fopen(cFilename, "rb"); + if (pFileStream == 0) { + return NULL; + } +#endif + + size_t szPreambleLength = strlen(cPreamble); + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); + memcpy(cSourceString, cPreamble, szPreambleLength); + if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, + 
pFileStream) != 1) { fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength + szPreambleLength; - } - cSourceString[szSourceLength + szPreambleLength] = '\0'; + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + + // source) string + fclose(pFileStream); + if (szFinalLength != 0) { + *szFinalLength = szSourceLength + szPreambleLength; + } + cSourceString[szSourceLength + szPreambleLength] = '\0'; - return cSourceString; + return cSourceString; } diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h index 976c692055501532d65a1ac25e74630732fd2a86..27b084487c6289196337ca064b94f1353f8bbbad 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h @@ -2,26 +2,40 @@ #ifndef __OPENCL_COMMON_H_ #define __OPENCL_COMMON_H_ -#include <stdio.h> +#include <CL/cl.h> #include <stdarg.h> +#include <stdio.h> #include <string.h> -#include <CL/cl.h> -int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...); -const char* oclErrorString(cl_int error); -const char* oclDebugErrString(cl_int error, cl_device_id device); - -#define OCL_ERRCK_VAR(var) \ - { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); } - -#define OCL_ERRCK_RETVAL(s) \ - { cl_int clerr = (s);\ - if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); } - -#define OCL_SIMPLE_ERRCK_RETVAL(s) \ - { cl_int clerr = (s);\ - if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); } - -char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength); +int 
getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, + cl_device_type *reqDeviceType, int numRequests, ...); +const char *oclErrorString(cl_int error); +const char *oclDebugErrString(cl_int error, cl_device_id device); + +#define OCL_ERRCK_VAR(var) \ + { \ + if (var != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclErrorString(var)); \ + } + +#define OCL_ERRCK_RETVAL(s) \ + { \ + cl_int clerr = (s); \ + if (clerr != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclDebugErrString(clerr, clDevice)); \ + } + +#define OCL_SIMPLE_ERRCK_RETVAL(s) \ + { \ + cl_int clerr = (s); \ + if (clerr != CL_SUCCESS) \ + fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, \ + oclErrorString(clerr)); \ + } + +char *oclLoadProgSource(const char *cFilename, const char *cPreamble, + size_t *szFinalLength); #endif diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h index 36640fd99d5dc86b0509ab30724e419dbc4720c5..9cfe7257ba16f72cbec7e00faa1a078778e0ab50 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h @@ -1,10 +1,15 @@ -#define NUM_BIN 8 //the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU -#define EXP 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future architecture - //using EXP and shifting can speed up division operation -#define MOD_OP 7 // This variable is also related with NUM_BIN; may change in the future architecture; - //using MOD_OP and "bitwise and" can speed up mod operation -#define INF 2147483647//2^31-1 -#define UP_LIMIT 16677216//2^24 +#define NUM_BIN \ + 8 // the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU +#define EXP \ + 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future + // architecture + // using EXP 
and shifting can speed up division operation +#define MOD_OP \ + 7 // This variable is also related with NUM_BIN; may change in the future + // architecture; + // using MOD_OP and "bitwise and" can speed up mod operation +#define INF 2147483647 // 2^31-1 +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp index 0a1b13ee1c677de5a129dfcf0adb78ce293718e6..3f9bc775574f597bdcf69c6999553c3c37bd352d 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp @@ -9,36 +9,36 @@ Implementing Breadth first search on CUDA using algorithm given in DAC'10 paper "An Effective GPU Implementation of Breadth-First Search" - Copyright (c) 2010 University of Illinois at Urbana-Champaign. + Copyright (c) 2010 University of Illinois at Urbana-Champaign. All rights reserved. - Permission to use, copy, modify and distribute this software and its documentation for - educational purpose is hereby granted without fee, provided that the above copyright - notice and this permission notice appear in all copies of this software and that you do - not sell the software. + Permission to use, copy, modify and distribute this software and its + documentation for educational purpose is hereby granted without fee, provided + that the above copyright notice and this permission notice appear in all + copies of this software and that you do not sell the software. - THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR - OTHERWISE. + THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, + IMPLIED OR OTHERWISE. 
Author: Lijiuan Luo (lluo3@uiuc.edu) - Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu) + Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu + (gengliu2@illinois.edu) */ +#include "OpenCL_common.h" +#include "config.h" #include <CL/cl.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> #include <parboil.h> -#include "OpenCL_common.h" -#include "config.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> -#define CHECK_ERROR(errorMessage) \ -if(clStatus != CL_SUCCESS) \ -{ \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ -} +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ + } FILE *fp; struct Node { @@ -49,113 +49,110 @@ struct Edge { int x; int y; }; -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { printf("Error 1!\n"); exit(1); } - fseek(fp,0,SEEK_END); + fseek(fp, 0, SEEK_END); long size = ftell(fp); rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { printf("Error 2!\n"); fclose(fp); exit(1); } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { + size_t res = fread(buffer, 1, size, fp); + if (res != size) { printf("Error 3!\n"); fclose(fp); exit(1); } - fclose(fp); + fclose(fp); return buffer; } //#include "kernel.cl" -//Somehow "cudaMemset" does not work. So I use cudaMemcpy of constant variables for initialization +// Somehow "cudaMemset" does not work. 
So I use cudaMemcpy of constant variables +// for initialization const int h_top = 1; const int zero = 0; -int BFS_GPU(cl_mem d_graph_nodes,cl_mem d_graph_edges, cl_mem d_color, cl_mem d_cost, cl_mem d_q1, cl_mem d_q2, cl_mem tail, int * source, cl_int clStatus, cl_command_queue clCommandQueue, cl_kernel BFS_kernel_S, cl_kernel BFS_kernel_M, cl_kernel BFS_kernel_L, cl_device_id clDevice, cl_context clContext){ -} -void runGPU(int argc, char** argv); +int BFS_GPU(cl_mem d_graph_nodes, cl_mem d_graph_edges, cl_mem d_color, + cl_mem d_cost, cl_mem d_q1, cl_mem d_q2, cl_mem tail, int *source, + cl_int clStatus, cl_command_queue clCommandQueue, + cl_kernel BFS_kernel_S, cl_kernel BFS_kernel_M, + cl_kernel BFS_kernel_L, cl_device_id clDevice, + cl_context clContext) {} +void runGPU(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Main Program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char** argv) -{ - //the number of nodes in the graph - int num_of_nodes = 0; - //the number of edges in the graph +int main(int argc, char **argv) { + // the number of nodes in the graph + int num_of_nodes = 0; + // the number of edges in the graph int num_of_edges = 0; struct pb_Parameters *params; struct pb_TimerSet timers; params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) - { + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { fprintf(stderr, "Expecting one input filename\n"); exit(-1); } - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - //printf("Reading File\n"); - //Read in Graph from a file - fp = fopen(params->inpFiles[0],"r"); - if(!fp) - { + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + // printf("Reading File\n"); + // Read in Graph from a file + fp = fopen(params->inpFiles[0], "r"); + if (!fp) { printf("Error Reading graph file\n"); return 0; } int source; - 
fscanf(fp,"%d",&num_of_nodes); + fscanf(fp, "%d", &num_of_nodes); // allocate host memory - struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes); - int *color = (int*) malloc(sizeof(int)*num_of_nodes); - int start, edgeno; + struct Node *h_graph_nodes = + (struct Node *)malloc(sizeof(struct Node) * num_of_nodes); + int *color = (int *)malloc(sizeof(int) * num_of_nodes); + int start, edgeno; // initalize the memory int i; - for( i = 0; i < num_of_nodes; i++) - { - fscanf(fp,"%d %d",&start,&edgeno); + for (i = 0; i < num_of_nodes; i++) { + fscanf(fp, "%d %d", &start, &edgeno); h_graph_nodes[i].x = start; h_graph_nodes[i].y = edgeno; - color[i]=WHITE; + color[i] = WHITE; } - //read the source node from the file - fscanf(fp,"%d",&source); - fscanf(fp,"%d",&num_of_edges); - int id,cost; - struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges); - for(i=0; i < num_of_edges ; i++) - { - fscanf(fp,"%d",&id); - fscanf(fp,"%d",&cost); + // read the source node from the file + fscanf(fp, "%d", &source); + fscanf(fp, "%d", &num_of_edges); + int id, cost; + struct Edge *h_graph_edges = + (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges); + for (i = 0; i < num_of_edges; i++) { + fscanf(fp, "%d", &id); + fscanf(fp, "%d", &cost); h_graph_edges[i].x = id; h_graph_edges[i].y = cost; } - if(fp) - fclose(fp); + if (fp) + fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate mem for the result on host side - int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes); - for(i = 0; i < num_of_nodes; i++){ + int *h_cost = (int *)malloc(sizeof(int) * num_of_nodes); + for (i = 0; i < num_of_nodes; i++) { h_cost[i] = INF; } h_cost[source] = 0; @@ -165,319 +162,451 @@ int main(int argc, char** argv) cl_int clStatus; cl_device_id clDevice; cl_platform_id clPlatform; - OCL_ERRCK_RETVAL(clGetPlatformIDs(1,&clPlatform,NULL)); - cl_context_properties clCps[3] = 
{CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - OCL_ERRCK_RETVAL(clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL)); - size_t MAX_THREADS_PER_BLOCK = 0; - clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_THREADS_PER_BLOCK), &MAX_THREADS_PER_BLOCK, NULL); - if(MAX_THREADS_PER_BLOCK > 512) + OCL_ERRCK_RETVAL(clGetPlatformIDs(1, &clPlatform, NULL)); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + OCL_ERRCK_RETVAL( + clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL)); + size_t MAX_THREADS_PER_BLOCK = 0; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(MAX_THREADS_PER_BLOCK), + &MAX_THREADS_PER_BLOCK, NULL); + if (MAX_THREADS_PER_BLOCK > 512) MAX_THREADS_PER_BLOCK = 512; OCL_ERRCK_VAR(clStatus); int NUM_SM = 0; - clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(NUM_SM), &NUM_SM, NULL); + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(NUM_SM), &NUM_SM, NULL); OCL_ERRCK_VAR(clStatus); - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); OCL_ERRCK_VAR(clStatus); pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource_path = "src/opencl_nvidia/kernel.cl"; + const char *clSource_path = "src/opencl_nvidia/kernel.cl"; size_t program_length; - char* clSource = oclLoadProgSource(clSource_path, "", &program_length); - //printf("Program Source:\n%s\n", clSource); + char *clSource = oclLoadProgSource(clSource_path, "", &program_length); + // printf("Program Source:\n%s\n", 
clSource); printf("Program building ...\n"); - cl_program clProgram = clCreateProgramWithSource(clContext,1,(const char**)&clSource, &program_length,&clStatus); + cl_program clProgram = clCreateProgramWithSource( + clContext, 1, (const char **)&clSource, &program_length, &clStatus); printf("Program built\n"); OCL_ERRCK_VAR(clStatus); char clOptions[100]; - //printf("NUM_SM = %d, MAX_THREADS_PER_BLOCK = %d\n", NUM_SM, MAX_THREADS_PER_BLOCK); - sprintf(clOptions,"-I src/opencl_nvidia -DMAX_THREADS_PER_BLOCK=%d -DNUM_SM=%d", MAX_THREADS_PER_BLOCK, NUM_SM); - OCL_ERRCK_RETVAL(clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL)); + // printf("NUM_SM = %d, MAX_THREADS_PER_BLOCK = %d\n", NUM_SM, + // MAX_THREADS_PER_BLOCK); + sprintf(clOptions, + "-I src/opencl_nvidia -DMAX_THREADS_PER_BLOCK=%d -DNUM_SM=%d", + MAX_THREADS_PER_BLOCK, NUM_SM); + OCL_ERRCK_RETVAL( + clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL)); // Uncomment to view build log from compiler for debugging - + char *build_log; size_t ret_val_size; - clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - build_log = (char *)malloc(ret_val_size+1); - clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); - // there's no information in the reference whether the string is 0 terminated or not + clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, + NULL, &ret_val_size); + build_log = (char *)malloc(ret_val_size + 1); + clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, + ret_val_size, build_log, NULL); + // there's no information in the reference whether the string is 0 terminated + // or not build_log[ret_val_size] = '\0'; - printf("%s\n", build_log ); - - - //Small kernel: only 1 block - cl_kernel BFS_kernel_S = clCreateKernel(clProgram,"BFS_in_GPU_kernel",&clStatus); - //Medium kernel: 1 block per SM - cl_kernel BFS_kernel_M = 
clCreateKernel(clProgram,"BFS_kernel_multi_blk_inGPU",&clStatus); - //Large kernel: No restriction - cl_kernel BFS_kernel_L = clCreateKernel(clProgram,"BFS_kernel",&clStatus); + printf("%s\n", build_log); + + // Small kernel: only 1 block + cl_kernel BFS_kernel_S = + clCreateKernel(clProgram, "BFS_in_GPU_kernel", &clStatus); + // Medium kernel: 1 block per SM + cl_kernel BFS_kernel_M = + clCreateKernel(clProgram, "BFS_kernel_multi_blk_inGPU", &clStatus); + // Large kernel: No restriction + cl_kernel BFS_kernel_L = clCreateKernel(clProgram, "BFS_kernel", &clStatus); OCL_ERRCK_VAR(clStatus); - //Copy the Node list to device memory + // Copy the Node list to device memory cl_mem d_graph_nodes; - d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus); + d_graph_nodes = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + num_of_nodes * sizeof(struct Node), NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL)); - //Copy the Edge List to device Memory + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_nodes, CL_TRUE, + 0, num_of_nodes * sizeof(struct Node), + h_graph_nodes, 0, NULL, NULL)); + // Copy the Edge List to device Memory cl_mem d_graph_edges; - d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus); + d_graph_edges = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + num_of_edges * sizeof(struct Edge), NULL, &clStatus); OCL_ERRCK_VAR(clStatus); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_edges, CL_TRUE, + 0, num_of_edges * sizeof(struct Edge), + h_graph_edges, 0, NULL, NULL)); cl_mem d_color, d_cost, d_q1, d_q2, tail; - d_color = 
clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus); - tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); + d_color = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_cost = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_q1 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + d_q2 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, + num_of_nodes * sizeof(int), NULL, &clStatus); + tail = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); OCL_ERRCK_VAR(clStatus); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_color, CL_TRUE, 0, + num_of_nodes * sizeof(int), color, 0, + NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + num_of_nodes * sizeof(int), h_cost, 0, + NULL, NULL)); printf("Starting GPU kernel\n"); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - int num_of_blocks; + int num_of_blocks; int num_of_threads_per_block; - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, 
tail, CL_TRUE, 0, + sizeof(int), &h_top, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_q1, CL_TRUE, 0, + sizeof(int), &source, 0, NULL, NULL)); - int num_t;//number of threads - int k=0;//BFS level index + int num_t; // number of threads + int k = 0; // BFS level index cl_mem switch_kd, num_td, global_kt_d; - switch_kd = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); - num_td = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); - global_kt_d = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); + switch_kd = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); + num_td = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); + global_kt_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); OCL_ERRCK_VAR(clStatus); int switch_k; int global_kt = 0; - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,global_kt_d,CL_TRUE,0,sizeof(int),&global_kt,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, global_kt_d, CL_TRUE, 0, + sizeof(int), &global_kt, 0, NULL, + NULL)); cl_mem count; cl_mem num_of_nodes_vol; cl_mem stay_vol; - count = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); - num_of_nodes_vol = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); - stay_vol = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus); + count = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); + num_of_nodes_vol = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), + NULL, &clStatus); + stay_vol = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL, + &clStatus); OCL_ERRCK_VAR(clStatus); - 
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,count,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,num_of_nodes_vol,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,stay_vol,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, count, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, num_of_nodes_vol, + CL_TRUE, 0, sizeof(int), &zero, 0, NULL, + NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, stay_vol, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //BFS_kernel_S arguments setup - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,2,sizeof(cl_mem),(void*)&d_graph_nodes)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,3,sizeof(cl_mem),(void*)&d_graph_edges)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,4,sizeof(cl_mem),(void*)&d_color)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,5,sizeof(cl_mem),(void*)&d_cost)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,6,sizeof(cl_mem),(void*)&tail)); - - //BFS_kernel_M arguments setup - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,2,sizeof(cl_mem),(void*)&d_graph_nodes)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,3,sizeof(cl_mem),(void*)&d_graph_edges)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,4,sizeof(cl_mem),(void*)&d_color)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,5,sizeof(cl_mem),(void*)&d_cost)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,6,sizeof(cl_mem),(void*)&num_td)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,7,sizeof(cl_mem),(void*)&tail)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,10,sizeof(cl_mem),(void*)&switch_kd)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,11,sizeof(cl_mem),(void*)&global_kt_d)); - //volatile mem - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,19,sizeof(cl_mem),(void*)&count)); - 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,20,sizeof(cl_mem),(void*)&num_of_nodes_vol)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,21,sizeof(cl_mem),(void*)&stay_vol)); - - //BFS_kernel_L arguments setup - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,2,sizeof(cl_mem),(void*)&d_graph_nodes)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,3,sizeof(cl_mem),(void*)&d_graph_edges)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,4,sizeof(cl_mem),(void*)&d_color)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,5,sizeof(cl_mem),(void*)&d_cost)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,6,sizeof(cl_mem),(void*)&tail)); + // BFS_kernel_S arguments setup + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 2, sizeof(cl_mem), (void *)&d_graph_nodes)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 3, sizeof(cl_mem), (void *)&d_graph_edges)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 4, sizeof(cl_mem), (void *)&d_color)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 5, sizeof(cl_mem), (void *)&d_cost)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 6, sizeof(cl_mem), (void *)&tail)); + + // BFS_kernel_M arguments setup + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 2, sizeof(cl_mem), (void *)&d_graph_nodes)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 3, sizeof(cl_mem), (void *)&d_graph_edges)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 4, sizeof(cl_mem), (void *)&d_color)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 5, sizeof(cl_mem), (void *)&d_cost)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 6, sizeof(cl_mem), (void *)&num_td)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 7, sizeof(cl_mem), (void *)&tail)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 10, sizeof(cl_mem), (void *)&switch_kd)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 11, sizeof(cl_mem), (void *)&global_kt_d)); + // volatile mem + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 19, sizeof(cl_mem), (void 
*)&count)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 20, sizeof(cl_mem), + (void *)&num_of_nodes_vol)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 21, sizeof(cl_mem), (void *)&stay_vol)); + + // BFS_kernel_L arguments setup + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 2, sizeof(cl_mem), (void *)&d_graph_nodes)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 3, sizeof(cl_mem), (void *)&d_graph_edges)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 4, sizeof(cl_mem), (void *)&d_color)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 5, sizeof(cl_mem), (void *)&d_cost)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 6, sizeof(cl_mem), (void *)&tail)); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - do - { - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL)); + do { + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &num_t, 0, NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0, + sizeof(int), &zero, 0, NULL, NULL)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - if(num_t == 0){//frontier is empty + if (num_t == 0) { // frontier is empty break; } num_of_blocks = 1; num_of_threads_per_block = num_t; - if(num_of_threads_per_block <NUM_BIN) + if (num_of_threads_per_block < NUM_BIN) num_of_threads_per_block = NUM_BIN; - if(num_t>MAX_THREADS_PER_BLOCK) - { - num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK); + if (num_t > MAX_THREADS_PER_BLOCK) { + num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK); num_of_threads_per_block = MAX_THREADS_PER_BLOCK; } - if(num_of_blocks == 1)//will call "BFS_in_GPU_kernel" - num_of_threads_per_block = MAX_THREADS_PER_BLOCK; - if(num_of_blocks >1 && num_of_blocks <= NUM_SM)// will call "BFS_kernel_multi_blk_inGPU" + if (num_of_blocks == 1) // will call 
"BFS_in_GPU_kernel" + num_of_threads_per_block = MAX_THREADS_PER_BLOCK; + if (num_of_blocks > 1 && + num_of_blocks <= NUM_SM) // will call "BFS_kernel_multi_blk_inGPU" num_of_blocks = NUM_SM; - //assume "num_of_blocks" can not be very large - size_t grid[1] = {num_of_blocks*num_of_threads_per_block}; + // assume "num_of_blocks" can not be very large + size_t grid[1] = {num_of_blocks * num_of_threads_per_block}; size_t block[1] = {num_of_threads_per_block}; - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,7,sizeof(int),(void*)&num_t)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,9,sizeof(int),(void*)&k)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 7, sizeof(int), (void *)&num_t)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 9, sizeof(int), (void *)&k)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,9,sizeof(int),(void*)&k)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 9, sizeof(int), (void *)&k)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,7,sizeof(int),(void*)&num_t)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,9,sizeof(int),(void*)&k)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 7, sizeof(int), (void *)&num_t)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 9, sizeof(int), (void *)&k)); - if(k%2 == 0){ + if (k % 2 == 0) { int gray = GRAY0; - if(num_of_blocks == 1) { - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,0,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,1,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,8,sizeof(int),(void*)&gray)); - //shared_mem - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,10,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,12,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,13,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,14,MAX_THREADS_PER_BLOCK*sizeof(int),NULL)); - 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,15,sizeof(int),NULL)); + if (num_of_blocks == 1) { + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 0, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 1, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 8, sizeof(int), (void *)&gray)); + // shared_mem + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 10, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_S, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 12, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 13, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_S, 14, MAX_THREADS_PER_BLOCK * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 15, sizeof(int), NULL)); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_S,1,0,grid,block,0,0,0)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_S, 1, + 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); } else if (num_of_blocks <= NUM_SM) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,num_td,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, num_td, CL_TRUE, + 0, sizeof(int), &num_t, 0, NULL, + NULL)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,0,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,1,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,8,sizeof(int),(void*)&gray)); - //shared_mem - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,12,NUM_BIN*sizeof(int),NULL)); - 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,13,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,14,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,15,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,16,sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,17,sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,18,sizeof(int),NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 0, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 1, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 8, sizeof(int), (void *)&gray)); + // shared_mem + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 12, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_M, 13, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 14, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 15, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 16, sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL)); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_M,1,0,grid,block,0,0,0)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1, + 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,switch_kd,CL_TRUE,0,sizeof(int),&switch_k,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, switch_kd, CL_TRUE, + 0, sizeof(int), &switch_k, 0, NULL, + NULL)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - if(!switch_k){ + if (!switch_k) { k--; } } else { - 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,0,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,1,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,8,sizeof(int),(void*)&gray)); - //shared_mem - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,10,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,12,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,13,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,14,sizeof(int),NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 0, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 1, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 8, sizeof(int), (void *)&gray)); + // shared_mem + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 10, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_L, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 12, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL)); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_L,1,0,grid,block,0,0,0)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1, + 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); } - } - else { + } else { int gray = GRAY1; - if(num_of_blocks == 1) { - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,0,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,1,sizeof(cl_mem),(void*)&d_q1)); - 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,8,sizeof(int),(void*)&gray)); - //shared_mem - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,10,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,12,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,13,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,14,MAX_THREADS_PER_BLOCK*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,15,sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_S,1,0,grid,block,0,0,0)); + if (num_of_blocks == 1) { + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 0, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 1, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 8, sizeof(int), (void *)&gray)); + // shared_mem + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 10, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_S, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 12, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_S, 13, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_S, 14, MAX_THREADS_PER_BLOCK * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 15, sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_S, 1, + 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); } else if (num_of_blocks <= NUM_SM) { - OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,num_td,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,0,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,1,sizeof(cl_mem),(void*)&d_q1)); - 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,8,sizeof(int),(void*)&gray)); - //shared_mem - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,12,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,13,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,14,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,15,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,16,sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,17,sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,18,sizeof(int),NULL)); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, num_td, CL_TRUE, + 0, sizeof(int), &num_t, 0, NULL, + NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 0, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 1, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 8, sizeof(int), (void *)&gray)); + // shared_mem + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 12, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_M, 13, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 14, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_M, 15, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 16, sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL)); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_M,1,0,grid,block,0,0,0)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1, + 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - 
OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,switch_kd,CL_TRUE,0,sizeof(int),&switch_k,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, switch_kd, CL_TRUE, + 0, sizeof(int), &switch_k, 0, NULL, + NULL)); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - if(!switch_k){ + if (!switch_k) { k--; } } else { - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,0,sizeof(cl_mem),(void*)&d_q2)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,1,sizeof(cl_mem),(void*)&d_q1)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,8,sizeof(int),(void*)&gray)); - //shared_mem - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,10,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,12,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,13,NUM_BIN*sizeof(int),NULL)); - OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,14,sizeof(int),NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 0, sizeof(cl_mem), (void *)&d_q2)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 1, sizeof(cl_mem), (void *)&d_q1)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 8, sizeof(int), (void *)&gray)); + // shared_mem + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 10, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg( + BFS_kernel_L, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 12, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL( + clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL)); + OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL)); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_L,1,0,grid,block,0,0,0)); + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1, + 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - } } k++; - } while(1); + } while (1); pb_SwitchToTimer(&timers, pb_TimerID_COPY); // copy result from device to host - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL)); - OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_cost, CL_TRUE, 0, + num_of_nodes * sizeof(int), h_cost, 0, + NULL, NULL)); + OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_color, CL_TRUE, 0, + num_of_nodes * sizeof(int), color, 0, + NULL, NULL)); OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_nodes)); OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_edges)); OCL_ERRCK_RETVAL(clReleaseMemObject(d_color)); OCL_ERRCK_RETVAL(clReleaseMemObject(d_cost)); OCL_ERRCK_RETVAL(clReleaseMemObject(tail)); - + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - //Store the result into a file - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - FILE *fp = fopen(params->outFile,"w"); + // Store the result into a file + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + FILE *fp = fopen(params->outFile, "w"); fprintf(fp, "%d\n", num_of_nodes); int j = 0; - for(j=0;j<num_of_nodes;j++) - fprintf(fp,"%d %d\n",j,h_cost[j]); + for (j = 0; j < num_of_nodes; j++) + fprintf(fp, "%d %d\n", j, h_cost[j]); fclose(fp); // cleanup memory diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h b/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h index 1a00ef98e054e50e654b0a52ccbb05ce136bab27..f9cdb59e9cd6cc39364fd9389ee39216646aedb2 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h +++ b/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h @@ -1,7 +1,8 @@ #define MAX_THREADS_PER_BLOCK 256 -#define LOCAL_MEM_SIZE 1600 //This needs to be adjusted for certain graphs with high degrees -#define INF 2147483647//2^31-1 -#define UP_LIMIT 16677216//2^24 +#define 
LOCAL_MEM_SIZE \ + 1600 // This needs to be adjusted for certain graphs with high degrees +#define INF 2147483647 // 2^31-1 +#define UP_LIMIT 16677216 // 2^24 #define WHITE 16677217 #define GRAY 16677218 #define GRAY0 16677219 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp index 6227ef498f10eb82e685f4dab518caf17e7757ac..9491218e5e93d39fc1bda4fac3c14770ee48645b 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp @@ -12,84 +12,84 @@ Copyright (c) 2010 University of Illinois at Urbana-Champaign. All rights reserved. - Permission to use, copy, modify and distribute this software and its documentation for - educational purpose is hereby granted without fee, provided that the above copyright - notice and this permission notice appear in all copies of this software and that you do - not sell the software. + Permission to use, copy, modify and distribute this software and its + documentation for educational purpose is hereby granted without fee, provided + that the above copyright notice and this permission notice appear in all + copies of this software and that you do not sell the software. - THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR - OTHERWISE. + THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, + IMPLIED OR OTHERWISE. 
Author: Lijiuan Luo (lluo3@uiuc.edu) - Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu) + Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu + (gengliu2@illinois.edu) */ -#include <stdlib.h> +#include "config.h" +#include "parboil.h" +#include <math.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <math.h> -#include "parboil.h" -#include "config.h" #include <visc.h> /********** Define colors for BFS -1) the definition of White, gray and black comes from the text book "Introduction to Algorithms" -2) For path search problems, people may choose to use different colors to record the found paths. -Therefore we reserve numbers (0-16677216) for this purpose. Only nodes with colors bigger than -UP_LIMIT are free to visit -3) We define two gray shades to differentiate between the new frontier nodes and the old frontier nodes that - have not been marked BLACK +1) the definition of White, gray and black comes from the text book +"Introduction to Algorithms" 2) For path search problems, people may choose to +use different colors to record the found paths. Therefore we reserve numbers +(0-16677216) for this purpose. Only nodes with colors bigger than UP_LIMIT are +free to visit 3) We define two gray shades to differentiate between the new +frontier nodes and the old frontier nodes that have not been marked BLACK *************/ //#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable //#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable //#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable /***************************************************************************** -This is the most general version of BFS kernel, i.e. 
no assumption about #block in the grid -\param q1: the array to hold the current frontier -\param q2: the array to hold the new frontier -\param g_graph_nodes: the nodes in the input graph -\param g_graph_edges: the edges i nthe input graph -\param g_color: the colors of nodes -\param g_cost: the costs of nodes -\param no_of_nodes: the number of nodes in the current frontier -\param tail: pointer to the location of the tail of the new frontier. *tail is the size of the new frontier -\param gray_shade: the shade of the gray in current BFS propagation. See GRAY0, GRAY1 macro definitions for more details -\param k: the level of current propagation in the BFS tree. k= 0 for the first propagation. +This is the most general version of BFS kernel, i.e. no assumption about #block +in the grid \param q1: the array to hold the current frontier \param q2: the +array to hold the new frontier \param g_graph_nodes: the nodes in the input +graph \param g_graph_edges: the edges i nthe input graph \param g_color: the +colors of nodes \param g_cost: the costs of nodes \param no_of_nodes: the number +of nodes in the current frontier \param tail: pointer to the location of the +tail of the new frontier. *tail is the size of the new frontier \param +gray_shade: the shade of the gray in current BFS propagation. See GRAY0, GRAY1 +macro definitions for more details \param k: the level of current propagation in +the BFS tree. k= 0 for the first propagation. 
***********************************************************************/ -//typedef struct { +// typedef struct { //} VoidRetTy; typedef struct __attribute__((__packed__)) { - int* q1; size_t bytesq1; - int* q2; size_t bytesq2; - struct Node* graph_nodes; size_t bytes_graph_nodes; - struct Edge* graph_edges; size_t bytes_graph_edges; - int* color; size_t bytes_color; - int* cost; size_t bytes_cost; - int* tail; size_t bytes_tail; + int *q1; + size_t bytesq1; + int *q2; + size_t bytesq2; + struct Node *graph_nodes; + size_t bytes_graph_nodes; + struct Edge *graph_edges; + size_t bytes_graph_edges; + int *color; + size_t bytes_color; + int *cost; + size_t bytes_cost; + int *tail; + size_t bytes_tail; int no_of_nodes; int gray_shade; int k; long block; long grid; - //VoidRetTy* out; + // VoidRetTy* out; } RootIn; -void packData(RootIn* args, - int* q1, size_t bytesq1, - int* q2, size_t bytesq2, - struct Node* graph_nodes, size_t bytes_graph_nodes, - struct Edge* graph_edges, size_t bytes_graph_edges, - int* color, size_t bytes_color, - int* cost, size_t bytes_cost, - int* tail, size_t bytes_tail, - int no_of_nodes, - int gray_shade, - int k, - long block, - long grid) { +void packData(RootIn *args, int *q1, size_t bytesq1, int *q2, size_t bytesq2, + struct Node *graph_nodes, size_t bytes_graph_nodes, + struct Edge *graph_edges, size_t bytes_graph_edges, int *color, + size_t bytes_color, int *cost, size_t bytes_cost, int *tail, + size_t bytes_tail, int no_of_nodes, int gray_shade, int k, + long block, long grid) { args->q1 = q1; args->bytesq1 = bytesq1; args->q2 = q2; @@ -113,78 +113,72 @@ void packData(RootIn* args, void Allocation(long block) { // Memory shared between threadblocks - void* local_q_tail = __visc__malloc(sizeof(int)); - void* local_q = __visc__malloc(LOCAL_MEM_SIZE*sizeof(int)); - void* shift = __visc__malloc(sizeof(int)); - - __visc__return(6, local_q_tail, sizeof(int), local_q, LOCAL_MEM_SIZE*sizeof(int), shift, sizeof(int)); + void *local_q_tail = 
__visc__malloc(sizeof(int)); + void *local_q = __visc__malloc(LOCAL_MEM_SIZE * sizeof(int)); + void *shift = __visc__malloc(sizeof(int)); + + __visc__return(6, local_q_tail, sizeof(int), local_q, + LOCAL_MEM_SIZE * sizeof(int), shift, sizeof(int)); } -//VoidRetTy -void -BFSLeaf(int *q1, size_t bytesq1, - int *q2, size_t bytesq2, - struct Node *g_graph_nodes, size_t bytesg_graph_nodes, - struct Edge *g_graph_edges, size_t bytesg_graph_edges, - int *g_color, size_t bytesg_color, - int *g_cost, size_t bytesg_cost, - int *tail, size_t bytestail, - int no_of_nodes, - int gray_shade, - int k, - // data local to thread block. The next three arguments should - // ideally be placed in local memory - int *local_q_tail, size_t byteslocal_q_tail, - int *local_q, size_t byteslocal_q, - int *shift, size_t bytesshift - ) -{ +// VoidRetTy +void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2, + struct Node *g_graph_nodes, size_t bytesg_graph_nodes, + struct Edge *g_graph_edges, size_t bytesg_graph_edges, + int *g_color, size_t bytesg_color, int *g_cost, size_t bytesg_cost, + int *tail, size_t bytestail, int no_of_nodes, int gray_shade, + int k, + // data local to thread block. 
The next three arguments should + // ideally be placed in local memory + int *local_q_tail, size_t byteslocal_q_tail, int *local_q, + size_t byteslocal_q, int *shift, size_t bytesshift) { __visc__hint(visc::DEVICE); __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, - 4, q2, g_color, g_cost, tail); + 4, q2, g_color, g_cost, tail); - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); int lx = __visc__getNodeInstanceID_x(thisNode); int gx = __visc__getNodeInstanceID_x(parentNode); int dimx = __visc__getNumNodeInstances_x(thisNode); - if(lx == 0){ - *local_q_tail = 0;//initialize the tail of w-queue + if (lx == 0) { + *local_q_tail = 0; // initialize the tail of w-queue } - __visc__barrier(); + __visc__barrier(); - //first, propagate and add the new frontier elements into w-queues - //int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0); + // first, propagate and add the new frontier elements into w-queues + // int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0); int tid = gx * dimx + lx; - if( tid<no_of_nodes) - { - int pid = q1[tid]; //the current frontier node, or the parent node of the new frontier nodes + if (tid < no_of_nodes) { + int pid = q1[tid]; // the current frontier node, or the parent node of the + // new frontier nodes g_color[pid] = BLACK; int cur_cost = g_cost[pid]; - //into + // into struct Node cur_node = g_graph_nodes[pid]; - for(int i=cur_node.x; i<cur_node.y + cur_node.x; i++)//visit each neighbor of the - //current frontier node. + for (int i = cur_node.x; i < cur_node.y + cur_node.x; + i++) // visit each neighbor of the + // current frontier node. 
{ struct Edge cur_edge = g_graph_edges[i]; int id = cur_edge.x; int cost = cur_edge.y; cost += cur_cost; - int orig_cost = __visc__atomic_min(&g_cost[id],cost); - if(orig_cost > cost){//the node should be visited - if(g_color[id] > UP_LIMIT){ - int old_color = __visc__atomic_xchg(&g_color[id],gray_shade); - //this guarantees that only one thread will push this node - //into a queue - if(old_color != gray_shade) { - //atomic operation guarantees the correctness - //even if multiple warps are executing simultaneously + int orig_cost = __visc__atomic_min(&g_cost[id], cost); + if (orig_cost > cost) { // the node should be visited + if (g_color[id] > UP_LIMIT) { + int old_color = __visc__atomic_xchg(&g_color[id], gray_shade); + // this guarantees that only one thread will push this node + // into a queue + if (old_color != gray_shade) { + // atomic operation guarantees the correctness + // even if multiple warps are executing simultaneously int index = __visc__atomic_add(local_q_tail, 1); local_q[index] = id; } @@ -192,111 +186,100 @@ BFSLeaf(int *q1, size_t bytesq1, } } } - - __visc__barrier(); - - if(lx == 0){ - int tot_sum = *local_q_tail; - //the offset or "shift" of the block-level queue within the grid-level queue - //is determined by atomic operation - *shift = __visc__atomic_add (tail,tot_sum); + + __visc__barrier(); + + if (lx == 0) { + int tot_sum = *local_q_tail; + // the offset or "shift" of the block-level queue within the grid-level + // queue is determined by atomic operation + *shift = __visc__atomic_add(tail, tot_sum); } - - __visc__barrier(); - //shift within a w-queue + __visc__barrier(); + + // shift within a w-queue int local_shift = lx; - while(local_shift < *local_q_tail){ + while (local_shift < *local_q_tail) { q2[*shift + local_shift] = local_q[local_shift]; - //multiple threads are copying elements at the same time, - //so we shift by multiple elements for next iteration + // multiple threads are copying elements at the same time, + // so we 
shift by multiple elements for next iteration local_shift += dimx; } } -//VoidRetTy -void BlockingBFS(int *q1, size_t bytesq1, - int *q2, size_t bytesq2, - struct Node *g_graph_nodes, size_t bytesg_graph_nodes, - struct Edge *g_graph_edges, size_t bytesg_graph_edges, - int *g_color, size_t bytesg_color, - int *g_cost, size_t bytesg_cost, - int *tail, size_t bytestail, - int no_of_nodes, - int gray_shade, - int k, - long block, - // data local to thread block. The next three arguments should - // ideally be placed in local memory - int *local_q_tail, size_t byteslocal_q_tail, - int *local_q, size_t byteslocal_q, - int *shift, size_t bytesshift) { +// VoidRetTy +void BlockingBFS(int *q1, size_t bytesq1, int *q2, size_t bytesq2, + struct Node *g_graph_nodes, size_t bytesg_graph_nodes, + struct Edge *g_graph_edges, size_t bytesg_graph_edges, + int *g_color, size_t bytesg_color, int *g_cost, + size_t bytesg_cost, int *tail, size_t bytestail, + int no_of_nodes, int gray_shade, int k, long block, + // data local to thread block. 
The next three arguments should + // ideally be placed in local memory + int *local_q_tail, size_t byteslocal_q_tail, int *local_q, + size_t byteslocal_q, int *shift, size_t bytesshift) { __visc__hint(visc::CPU_TARGET); __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, - 4, q2, g_color, g_cost, tail); + 4, q2, g_color, g_cost, tail); - void* AllocationNode = __visc__createNodeND(0, Allocation); - void* BFSLeafNode = __visc__createNodeND(1, BFSLeaf, block); + void *AllocationNode = __visc__createNodeND(0, Allocation); + void *BFSLeafNode = __visc__createNodeND(1, BFSLeaf, block); // Bind edges __visc__bindIn(AllocationNode, 17, 0, 0); // Bind block - __visc__bindIn(BFSLeafNode, 0, 0, 0); // Bind q1 - __visc__bindIn(BFSLeafNode, 1, 1, 0); // Bind bytes_q1 - __visc__bindIn(BFSLeafNode, 2, 2, 0); // Bind q2 - __visc__bindIn(BFSLeafNode, 3, 3, 0); // Bind bytes_q2 - __visc__bindIn(BFSLeafNode, 4, 4, 0); // Bind graph_nodes - __visc__bindIn(BFSLeafNode, 5, 5, 0); // Bind bytes_graph_nodes - __visc__bindIn(BFSLeafNode, 6, 6, 0); // Bind graph_edges - __visc__bindIn(BFSLeafNode, 7, 7, 0); // Bind bytes_graph_edges - __visc__bindIn(BFSLeafNode, 8, 8, 0); // Bind color - __visc__bindIn(BFSLeafNode, 9, 9, 0); // Bind bytes_color - __visc__bindIn(BFSLeafNode, 10, 10, 0); // Bind cost - __visc__bindIn(BFSLeafNode, 11, 11, 0); // Bind bytes_cost - __visc__bindIn(BFSLeafNode, 12, 12, 0); // Bind tail - __visc__bindIn(BFSLeafNode, 13, 13, 0); // Bind bytes_tail - __visc__bindIn(BFSLeafNode, 14, 14, 0); // Bind no_of_nodes - __visc__bindIn(BFSLeafNode, 15, 15, 0); // Bind gray_shade - __visc__bindIn(BFSLeafNode, 16, 16, 0); // Bind k - + __visc__bindIn(BFSLeafNode, 0, 0, 0); // Bind q1 + __visc__bindIn(BFSLeafNode, 1, 1, 0); // Bind bytes_q1 + __visc__bindIn(BFSLeafNode, 2, 2, 0); // Bind q2 + __visc__bindIn(BFSLeafNode, 3, 3, 0); // Bind bytes_q2 + __visc__bindIn(BFSLeafNode, 4, 4, 0); // Bind graph_nodes + __visc__bindIn(BFSLeafNode, 5, 5, 0); // Bind 
bytes_graph_nodes + __visc__bindIn(BFSLeafNode, 6, 6, 0); // Bind graph_edges + __visc__bindIn(BFSLeafNode, 7, 7, 0); // Bind bytes_graph_edges + __visc__bindIn(BFSLeafNode, 8, 8, 0); // Bind color + __visc__bindIn(BFSLeafNode, 9, 9, 0); // Bind bytes_color + __visc__bindIn(BFSLeafNode, 10, 10, 0); // Bind cost + __visc__bindIn(BFSLeafNode, 11, 11, 0); // Bind bytes_cost + __visc__bindIn(BFSLeafNode, 12, 12, 0); // Bind tail + __visc__bindIn(BFSLeafNode, 13, 13, 0); // Bind bytes_tail + __visc__bindIn(BFSLeafNode, 14, 14, 0); // Bind no_of_nodes + __visc__bindIn(BFSLeafNode, 15, 15, 0); // Bind gray_shade + __visc__bindIn(BFSLeafNode, 16, 16, 0); // Bind k + // Create Edges between AllocationNode and BFSLeafNodeNode - __visc__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail - __visc__edge(AllocationNode, BFSLeafNode, 1, 1, 18, 0); // Edge bytes_local_q_tail - __visc__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q - __visc__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q - __visc__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift - __visc__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift + __visc__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail + __visc__edge(AllocationNode, BFSLeafNode, 1, 1, 18, + 0); // Edge bytes_local_q_tail + __visc__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q + __visc__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q + __visc__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift + __visc__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift } -//VoidRetTy -void BFS_Root(int *q1, size_t bytesq1, - int *q2, size_t bytesq2, +// VoidRetTy +void BFS_Root(int *q1, size_t bytesq1, int *q2, size_t bytesq2, struct Node *g_graph_nodes, size_t bytesg_graph_nodes, struct Edge *g_graph_edges, size_t bytesg_graph_edges, - int *g_color, size_t bytesg_color, - int *g_cost, size_t 
bytesg_cost, - int *tail, size_t bytestail, - int no_of_nodes, - int gray_shade, - int k, - long block, - long grid) { + int *g_color, size_t bytesg_color, int *g_cost, + size_t bytesg_cost, int *tail, size_t bytestail, int no_of_nodes, + int gray_shade, int k, long block, long grid) { __visc__hint(visc::CPU_TARGET); - __visc__attributes( 6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, - 4, q2, g_color, g_cost, tail); - void* BlockingBFSNode = __visc__createNodeND(1, BlockingBFS, grid); + __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + 4, q2, g_color, g_cost, tail); + void *BlockingBFSNode = __visc__createNodeND(1, BlockingBFS, grid); // Bind edges - __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 - __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 - __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 - __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 - __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes - __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes - __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges - __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges - __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color - __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color + __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 + __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 + __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 + __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 + __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes + __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes + __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges + __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges + __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color + __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind 
cost __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail @@ -305,39 +288,34 @@ void BFS_Root(int *q1, size_t bytesq1, __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block - - } -void BFS_Wrapper( - int *q1, size_t bytesq1, // 0, 1 - int *q2, size_t bytesq2, // 2, 3 - struct Node *g_graph_nodes, size_t bytesg_graph_nodes, // 4, 5 - struct Edge *g_graph_edges, size_t bytesg_graph_edges, // 6, 7 - int *g_color, size_t bytesg_color, // 8, 9 - int *g_cost, size_t bytesg_cost, // 10, 11 - int *tail, size_t bytestail, // 12, 13 - int no_of_nodes, int gray_shade, // 14, 15 - int k, long block, long grid // 16 - 18 +void BFS_Wrapper(int *q1, size_t bytesq1, // 0, 1 + int *q2, size_t bytesq2, // 2, 3 + struct Node *g_graph_nodes, size_t bytesg_graph_nodes, // 4, 5 + struct Edge *g_graph_edges, size_t bytesg_graph_edges, // 6, 7 + int *g_color, size_t bytesg_color, // 8, 9 + int *g_cost, size_t bytesg_cost, // 10, 11 + int *tail, size_t bytestail, // 12, 13 + int no_of_nodes, int gray_shade, // 14, 15 + int k, long block, long grid // 16 - 18 ) { __visc__hint(visc::CPU_TARGET); - __visc__attributes( - 6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, - 4, q2, g_color, g_cost, tail - ); - void* BlockingBFSNode = __visc__createNodeND(0, BFS_Root); + __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + 4, q2, g_color, g_cost, tail); + void *BlockingBFSNode = __visc__createNodeND(0, BFS_Root); // Bind edges - __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 - __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 - __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 - __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 - __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes - __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind 
bytes_graph_nodes - __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges - __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges - __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color - __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color + __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 + __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 + __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 + __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 + __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes + __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes + __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges + __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges + __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color + __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail @@ -345,35 +323,31 @@ void BFS_Wrapper( __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k - __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block - __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid + __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block + __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid } FILE *fp; -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { printf("Error 1!\n"); exit(1); } - fseek(fp,0,SEEK_END); + fseek(fp, 0, SEEK_END); long size = ftell(fp); rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { + char *buffer = (char *)malloc(sizeof(char) * size); + if 
(buffer == NULL) { printf("Error 2!\n"); fclose(fp); exit(1); } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { + size_t res = fread(buffer, 1, size, fp); + if (res != size) { printf("Error 3!\n"); fclose(fp); exit(1); @@ -384,63 +358,60 @@ char* readFile(const char* fileName) } const int h_top = 1; const int zero = 0; -void runGPU(int argc, char** argv); +void runGPU(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Main Program //////////////////////////////////////////////////////////////////////////////// -int main( int argc, char** argv) -{ +int main(int argc, char **argv) { - //the number of nodes in the graph + // the number of nodes in the graph int num_of_nodes = 0; - //the number of edges in the graph + // the number of edges in the graph int num_of_edges = 0; struct pb_Parameters *params; struct pb_TimerSet timers; params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) - { + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { fprintf(stderr, "Expecting one input filename\n"); exit(-1); } - //Read in Graph from a file - fp = fopen(params->inpFiles[0],"r"); - if(!fp) - { + // Read in Graph from a file + fp = fopen(params->inpFiles[0], "r"); + if (!fp) { printf("Error Reading graph file\n"); return 0; } int source; - fscanf(fp,"%d",&num_of_nodes); + fscanf(fp, "%d", &num_of_nodes); // allocate host memory - struct Node* graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes); - int *color = (int*) malloc(sizeof(int)*num_of_nodes); + struct Node *graph_nodes = + (struct Node *)malloc(sizeof(struct Node) * num_of_nodes); + int *color = (int *)malloc(sizeof(int) * num_of_nodes); int start, edgeno; // initalize the memory int i; - for( i = 0; i < num_of_nodes; i++) - { - fscanf(fp,"%d %d",&start,&edgeno); + for (i = 0; i < num_of_nodes; i++) { + fscanf(fp, "%d %d", &start, &edgeno); graph_nodes[i].x = 
start; graph_nodes[i].y = edgeno; - color[i]=WHITE; + color[i] = WHITE; } - //read the source node from the file - fscanf(fp,"%d",&source); - fscanf(fp,"%d",&num_of_edges); - int id,edge_cost; - struct Edge* graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges); - for(i=0; i < num_of_edges ; i++) - { - fscanf(fp,"%d",&id); - fscanf(fp,"%d",&edge_cost); + // read the source node from the file + fscanf(fp, "%d", &source); + fscanf(fp, "%d", &num_of_edges); + int id, edge_cost; + struct Edge *graph_edges = + (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges); + for (i = 0; i < num_of_edges; i++) { + fscanf(fp, "%d", &id); + fscanf(fp, "%d", &edge_cost); graph_edges[i].x = id; graph_edges[i].y = edge_cost; } - if(fp) + if (fp) fclose(fp); pb_InitializeTimerSet(&timers); @@ -448,19 +419,19 @@ int main( int argc, char** argv) pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate mem for the result on host side - int* cost = (int*) malloc( sizeof(int)*num_of_nodes); - for(i = 0; i < num_of_nodes; i++){ + int *cost = (int *)malloc(sizeof(int) * num_of_nodes); + for (i = 0; i < num_of_nodes; i++) { cost[i] = INF; } cost[source] = 0; - size_t bytes_graph_nodes = num_of_nodes* sizeof(struct Node); - size_t bytes_graph_edges = num_of_edges* sizeof(struct Edge); - size_t bytes_cost = sizeof(int) * num_of_nodes; + size_t bytes_graph_nodes = num_of_nodes * sizeof(struct Node); + size_t bytes_graph_edges = num_of_edges * sizeof(struct Edge); + size_t bytes_cost = sizeof(int) * num_of_nodes; - int* q1 = (int*) malloc(sizeof(int)*num_of_nodes); - int* q2 = (int*) malloc(sizeof(int)*num_of_nodes); - int* tail = (int*) malloc(sizeof(int)); + int *q1 = (int *)malloc(sizeof(int) * num_of_nodes); + int *q2 = (int *)malloc(sizeof(int) * num_of_nodes); + int *tail = (int *)malloc(sizeof(int)); llvm_visc_track_mem(graph_nodes, bytes_graph_nodes); llvm_visc_track_mem(graph_edges, bytes_graph_edges); @@ -478,50 +449,38 @@ int main( int argc, char** argv) // 
Initializations. Can some of these be done in the graph. That way we can // move these arrays completely in the graph *tail = h_top; - // Potential source of inefficiency. - //Entire array would be copied intially + // Potential source of inefficiency. + // Entire array would be copied intially cost[0] = zero; q1[0] = source; - int num_t;//number of threads - int k=0;//BFS level index + int num_t; // number of threads + int k = 0; // BFS level index int gray; long grid = num_of_blocks; long block = num_of_threads_per_block; // Pack data in struct - RootIn* args = (RootIn*) malloc(sizeof(RootIn)); - packData(args, - q1, bytes_cost, - q2, bytes_cost, - graph_nodes, bytes_graph_nodes, - graph_edges, bytes_graph_edges, - color, bytes_cost, - cost, bytes_cost, - tail, sizeof(int), - num_of_nodes, - gray, - k, - block, - grid - ); - + RootIn *args = (RootIn *)malloc(sizeof(RootIn)); + packData(args, q1, bytes_cost, q2, bytes_cost, graph_nodes, bytes_graph_nodes, + graph_edges, bytes_graph_edges, color, bytes_cost, cost, bytes_cost, + tail, sizeof(int), num_of_nodes, gray, k, block, grid); pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - do - { + do { llvm_visc_request_mem(tail, sizeof(int)); num_t = *tail; - //printf("tail for iteration %d = %d\n",k, num_t); + // printf("tail for iteration %d = %d\n",k, num_t); *tail = 0; - //tail = 0; + // tail = 0; - if(num_t == 0){//frontier is empty + if (num_t == 0) { // frontier is empty break; } - num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK); - num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t; + num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK); + num_of_threads_per_block = + num_t > MAX_THREADS_PER_BLOCK ? 
MAX_THREADS_PER_BLOCK : num_t; args->grid = num_of_blocks; args->block = num_of_threads_per_block; @@ -529,33 +488,32 @@ int main( int argc, char** argv) args->no_of_nodes = num_t; args->k = k; - if(k%2 == 0){ + if (k % 2 == 0) { args->gray_shade = GRAY0; - } - else{ + } else { args->gray_shade = GRAY1; } - //void* bfsDFG = __visc__node(BFS_kernel, 2, 1, block, grid, 17, - //q1, bytes_cost, - //q2, bytes_cost, - //graph_nodes, bytes_graph_nodes, - //graph_edges, bytes_graph_edges, - //color, bytes_cost, - //cost, bytes_cost, - //tail, sizeof(int), - //num_of_nodes, - //gray, - //k, - //0); - void* bfsDFG = __visc__launch(0, BFS_Wrapper, (void*) args); + // void* bfsDFG = __visc__node(BFS_kernel, 2, 1, block, grid, 17, + // q1, bytes_cost, + // q2, bytes_cost, + // graph_nodes, bytes_graph_nodes, + // graph_edges, bytes_graph_edges, + // color, bytes_cost, + // cost, bytes_cost, + // tail, sizeof(int), + // num_of_nodes, + // gray, + // k, + // 0); + void *bfsDFG = __visc__launch(0, BFS_Wrapper, (void *)args); __visc__wait(bfsDFG); // Swap q1 and q2 // Swap q1 and q2 - int* temp = args->q1; + int *temp = args->q1; args->q1 = args->q2; args->q2 = temp; k++; - } while(1); + } while (1); // copy result from device to host pb_SwitchToTimer(&timers, pb_TimerID_COPY); @@ -577,13 +535,13 @@ int main( int argc, char** argv) pb_PrintTimerSet(&timers); __visc__cleanup(); - //Store the result into a file - //FIXME: color is not even printed. Why are we reading it back?? - FILE *fp = fopen(params->outFile,"w"); + // Store the result into a file + // FIXME: color is not even printed. Why are we reading it back?? 
+ FILE *fp = fopen(params->outFile, "w"); fprintf(fp, "%d\n", num_of_nodes); int j = 0; - for(j=0;j<num_of_nodes;j++) - fprintf(fp,"%d %d\n",j,cost[j]); + for (j = 0; j < num_of_nodes; j++) + fprintf(fp, "%d %d\n", j, cost[j]); fclose(fp); // cleanup memory free(graph_nodes); @@ -593,7 +551,6 @@ int main( int argc, char** argv) free(q1); free(q2); - pb_FreeParameters(params); return 0; } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms *atom); +void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c index 14d183cc985acc6f3c6c1a2c1af5598314c186fa..e54192c9d32a1512e73ac1ea98689dda1d7d0169 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c @@ -6,25 +6,24 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include 
"cutoff.h" #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -39,8 +38,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -62,7 +61,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, zmax; @@ -75,44 +74,45 @@ extern int cpu_compute_cutoff_potential_lattice( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { 
+ first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -123,26 +123,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; 
+ if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -150,27 +157,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! 
*/ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; *pg += e; } #endif @@ -178,7 +184,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -186,8 +192,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h index 477e5649b6ff4f58690fb80a017f8bcec86d135c..0f8b0ff96aaab0c84bfca49c112b717d568815b9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h @@ -15,46 +15,44 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; - - /* Lowest corner of lattice */ - Vec3 lo; - - /* Lattice spacing */ - float h; - } LatticeDim; - - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. 
- */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; - - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); - - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); - - int remove_exclusions( - Lattice *lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms *atom /* array of atoms */ - ); +/* A structure to record how points in 3D space map to array + elements. Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). +*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; + + /* Lowest corner of lattice */ + Vec3 lo; + + /* Lattice spacing */ + float h; +} LatticeDim; + +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. 
+*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; + +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); + +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); + +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); + +int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c index 9598bda26b98a3f26bc36c9b616f11048c2e5860..26769b76d1dac3310e6f5066f3393133091d6477 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c @@ -6,22 +6,20 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -33,8 +31,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about 
each atom */ int n; int i, j, k; @@ -62,44 +60,45 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y 
- ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -110,42 +109,49 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ - if (r2 < a2) *pg = 0; + /* If atom and lattice point are too close, set the lattice value + * to zero */ + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over 
gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c index 9b8ef2014dc7deab1e9238be8e9d9ea1d0cf4a38..d361c16a34a6821dff328235a3c8fd59283734bd 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c @@ -6,27 +6,26 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include "parboil.h" #include "atom.h" #include "cutoff.h" #include "output.h" +#include "parboil.h" +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #define ERRTOL 1e-4f -#define NOKERNELS 0 -#define CUTOFF1 1 -#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 - +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 int appenddata(const char *filename, int size, double time) { FILE *fp; - fp=fopen(filename, "a"); + fp = fopen(filename, "a"); if (fp == NULL) { printf("error appending to file %s..\n", filename); return -1; @@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) { return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { LatticeDim ret; - ret.nx = (int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; ret.lo = lo; ret.h = h; return ret; } -Lattice * -create_lattice(LatticeDim dim) -{ +Lattice *create_lattice(LatticeDim dim) { int size; Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); @@ -75,10 +70,7 @@ create_lattice(LatticeDim dim) return lat; } - 
-void -destroy_lattice(Lattice *lat) -{ +void destroy_lattice(Lattice *lat) { if (lat) { free(lat->lattice); free(lat); @@ -90,13 +82,13 @@ int main(int argc, char *argv[]) { LatticeDim lattice_dim; Lattice *cpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, hi; /* Bounding box with padding */ + Vec3 min_ext, max_ext; /* Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance */ int n; @@ -136,9 +128,10 @@ int main(int argc, char *argv[]) { printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; - hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); lattice_dim = lattice_from_bounding_box(lo, hi, h); cpu_lattice = create_lattice(lattice_dim); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c @@ -6,16 +6,14 @@ *cr 
***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <inttypes.h> -#include <math.h> #include "atom.h" #include "cutoff.h" +#include <inttypes.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); } @@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice) { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); } @@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h @@ -15,8 +15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef __cplusplus } diff --git 
a/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom "pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; } - atom = (Atom *) tmp; + atom = (Atom 
*)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ } /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms *atom); +void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms 
*atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c index 14d183cc985acc6f3c6c1a2c1af5598314c186fa..e54192c9d32a1512e73ac1ea98689dda1d7d0169 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c @@ -6,25 +6,24 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include "cutoff.h" #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -39,8 +38,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -62,7 +61,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, zmax; @@ -75,44 +74,45 @@ extern int cpu_compute_cutoff_potential_lattice( 
get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ 
- ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -123,26 +123,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -150,27 +157,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! 
*/ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; *pg += e; } #endif @@ -178,7 +184,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -186,8 +192,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h index d5949745afaa90d234b949f75ac9e534931c748c..7c5d265a9b2e865f82a197642e1a1a4201cc0e78 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h @@ -17,54 +17,51 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; +/* A structure to record how points in 3D space map to array + elements. Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). 
+*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; - /* Lowest corner of lattice */ - Vec3 lo; + /* Lowest corner of lattice */ + Vec3 lo; - /* Lattice spacing */ - float h; - } LatticeDim; + /* Lattice spacing */ + float h; +} LatticeDim; - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. - */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. +*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); - int gpu_compute_cutoff_potential_lattice6overlap( - struct pb_TimerSet *timers, /* for measuring execution time */ - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ - ); +int gpu_compute_cutoff_potential_lattice6overlap( + struct pb_TimerSet *timers, /* for measuring execution time */ + Lattice *lattice, float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ +); - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); - int remove_exclusions( - Lattice *lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms *atom /* array of atoms */ - ); 
+int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c index 9598bda26b98a3f26bc36c9b616f11048c2e5860..26769b76d1dac3310e6f5066f3393133091d6477 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c @@ -6,22 +6,20 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -33,8 +31,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -62,44 +60,45 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - 
minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - 
radius; @@ -110,42 +109,49 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ - if (r2 < a2) *pg = 0; + /* If atom and lattice point are too close, set the lattice value + * to zero */ + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c index 6cada17e4ede54d75d0f611259847ffb3cffb707..763ddcbb7316795e2de433a4ea0dd9be467fc831 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c +++ 
b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c @@ -6,27 +6,26 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include "parboil.h" #include "atom.h" #include "cutoff.h" #include "output.h" +#include "parboil.h" +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #define ERRTOL 1e-4f -#define NOKERNELS 0 -#define CUTOFF1 1 -#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 - +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 int appenddata(const char *filename, int size, double time) { FILE *fp; - fp=fopen(filename, "a"); + fp = fopen(filename, "a"); if (fp == NULL) { printf("error appending to file %s..\n", filename); return -1; @@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) { return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { LatticeDim ret; - ret.nx = (int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; ret.lo = lo; ret.h = h; return ret; } -Lattice * -create_lattice(LatticeDim dim) -{ +Lattice *create_lattice(LatticeDim dim) { int size; Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); @@ -75,10 +70,7 @@ create_lattice(LatticeDim dim) return lat; } - -void -destroy_lattice(Lattice *lat) -{ +void destroy_lattice(Lattice *lat) { if (lat) { free(lat->lattice); free(lat); @@ -90,13 +82,13 @@ int main(int argc, char *argv[]) { LatticeDim lattice_dim; Lattice *gpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, hi; /* Bounding box with padding */ + Vec3 min_ext, max_ext; /* 
Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance */ int n; @@ -136,9 +128,10 @@ int main(int argc, char *argv[]) { printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; - hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); lattice_dim = lattice_from_bounding_box(lo, hi, h); gpu_lattice = create_lattice(lattice_dim); @@ -147,7 +140,8 @@ int main(int argc, char *argv[]) { * CUDA kernel, with overlapped GPU/CPU computation * (enter and exit with the 'compute' timer active) */ - if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) { + if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, + atom, 0)) { fprintf(stderr, "Computation failed\n"); exit(1); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c @@ -6,16 +6,14 @@ *cr 
***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <inttypes.h> -#include <math.h> #include "atom.h" #include "cutoff.h" +#include <inttypes.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); } @@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice) { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); } @@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h @@ -15,8 +15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef __cplusplus } diff --git 
a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom "pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; } - atom = (Atom *) tmp; + atom = (Atom 
*)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ } /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms *atom); +void get_atom_extent(Vec3 
*lo, Vec3 *hi, Atoms *atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c index f409c29e46276417918da1db9c1a785d1eaa39ae..ba029f4c8f3271ea45666e36b2a24de5ac9bbff5 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c @@ -6,26 +6,25 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include "parboil.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include "parboil.h" -#include "cutoff.h" #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -40,8 +39,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -63,7 +62,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, 
zmax; @@ -76,44 +75,45 @@ extern int cpu_compute_cutoff_potential_lattice( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = 
atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -124,26 +124,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -151,27 +158,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! 
*/ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; *pg += e; } #endif @@ -179,7 +185,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -187,8 +193,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h index a4e8d1ae94a901c0e07ec15ef216bac6c544007e..e8f8978e93cb1f235774d246385810af1254bde2 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h @@ -17,54 +17,51 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; +/* A structure to record how points in 3D space map to array + elements. 
Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). +*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; - /* Lowest corner of lattice */ - Vec3 lo; + /* Lowest corner of lattice */ + Vec3 lo; - /* Lattice spacing */ - float h; - } LatticeDim; + /* Lattice spacing */ + float h; +} LatticeDim; - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. - */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. +*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); - int gpu_compute_cutoff_potential_lattice( - struct pb_TimerSet *timers, - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atom, /* array of atoms */ - int verbose /* print info/debug messages */ - ); +int gpu_compute_cutoff_potential_lattice( + struct pb_TimerSet *timers, Lattice *lattice, + float cutoff, /* cutoff distance */ + Atoms *atom, /* array of atoms */ + int verbose /* print info/debug messages */ +); - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); - int remove_exclusions( - Lattice *lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms 
*atom /* array of atoms */ - ); +int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c index 9598bda26b98a3f26bc36c9b616f11048c2e5860..26769b76d1dac3310e6f5066f3393133091d6477 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c @@ -6,22 +6,20 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -33,8 +31,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -62,44 +60,45 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * 
inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find 
extent of surrounding box of grid points */ ia = ic - radius; @@ -110,42 +109,49 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ - if (r2 < a2) *pg = 0; + /* If atom and lattice point are too close, set the lattice value + * to zero */ + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c index 905b7226312a9b0736b1b6d4deb18118c92476cc..3819e18adfbc33f7943fab4784cfddd513eab60b 100644 --- 
a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c @@ -6,27 +6,26 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include "parboil.h" #include "atom.h" #include "cutoff.h" #include "output.h" +#include "parboil.h" +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #define ERRTOL 1e-4f -#define NOKERNELS 0 -#define CUTOFF1 1 -#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 - +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 int appenddata(const char *filename, int size, double time) { FILE *fp; - fp=fopen(filename, "a"); + fp = fopen(filename, "a"); if (fp == NULL) { printf("error appending to file %s..\n", filename); return -1; @@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) { return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { LatticeDim ret; - ret.nx = (int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; ret.lo = lo; ret.h = h; return ret; } -Lattice * -create_lattice(LatticeDim dim) -{ +Lattice *create_lattice(LatticeDim dim) { int size; Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); @@ -75,10 +70,7 @@ create_lattice(LatticeDim dim) return lat; } - -void -destroy_lattice(Lattice *lat) -{ +void destroy_lattice(Lattice *lat) { if (lat) { free(lat->lattice); free(lat); @@ -90,13 +82,13 @@ int main(int argc, char *argv[]) { LatticeDim lattice_dim; Lattice *gpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, 
hi; /* Bounding box with padding */ + Vec3 min_ext, max_ext; /* Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance */ int n; @@ -136,9 +128,10 @@ int main(int argc, char *argv[]) { printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; - hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); lattice_dim = lattice_from_bounding_box(lo, hi, h); gpu_lattice = create_lattice(lattice_dim); @@ -148,7 +141,8 @@ int main(int argc, char *argv[]) { * Run CUDA kernel * (enter and exit with the 'compute' timer active) */ - if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0)) { + if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, + 0)) { fprintf(stderr, "Computation failed\n"); exit(1); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c @@ 
-6,16 +6,14 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <inttypes.h> -#include <math.h> #include "atom.h" #include "cutoff.h" +#include <inttypes.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); } @@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice) { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); } @@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h @@ -15,8 +15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef 
__cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom "pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; 
} - atom = (Atom *) tmp; + atom = (Atom *)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ } /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms 
*atom); +void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c index 372903e6b00d7600d71d0596be3f1287fd8e927f..5ad77220e3656676845975992adba245153510d7 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c @@ -6,25 +6,24 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include "cutoff.h" #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -39,8 +38,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -62,7 +61,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, zmax; @@ -75,49 
+74,49 @@ extern int cpu_compute_cutoff_potential_lattice( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } -#pragma omp parallel for private (n, q, x, y, z, ic, jc, kc, ia, ib, ja, jb, ka, kb, \ - xstart, ystart, dz, k, koff, dz2, j, dy, jkoff, \ - dydz2, dx, index, pg, i, r2, s, e \ - ) +#pragma omp parallel for private(n, q, x, y, z, ic, jc, kc, ia, ib, ja, jb, \ + ka, kb, xstart, ystart, dz, k, koff, dz2, j, \ + 
dy, jkoff, dydz2, dx, index, pg, i, r2, s, e) /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -128,26 +127,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -155,27 +161,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if 
defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; #pragma omp atomic *pg += e; @@ -185,7 +190,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -193,8 +198,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h index 477e5649b6ff4f58690fb80a017f8bcec86d135c..0f8b0ff96aaab0c84bfca49c112b717d568815b9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h @@ -15,46 +15,44 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. 
Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; - - /* Lowest corner of lattice */ - Vec3 lo; - - /* Lattice spacing */ - float h; - } LatticeDim; - - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. - */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; - - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); - - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); - - int remove_exclusions( - Lattice *lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms *atom /* array of atoms */ - ); +/* A structure to record how points in 3D space map to array + elements. Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). +*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; + + /* Lowest corner of lattice */ + Vec3 lo; + + /* Lattice spacing */ + float h; +} LatticeDim; + +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. 
+*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; + +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); + +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); + +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); + +int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c index e157941114a0a51e8c60080d726d02d8e62d9fd4..ac36cc63fe87bb39485e989e3fe8e784699a0eb6 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c @@ -6,22 +6,20 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" -#include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -33,8 +31,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point 
radius about each atom */ int n; int i, j, k; @@ -62,48 +60,49 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } -#pragma omp parallel for private(n, x, y, z, q, ic, jc, kc, ia, ib, ja, jb, \ - ka, kb, xstart, ystart, dz, k, koff, dz2, \ - dy, j, jkoff, dydz2, dx, index, pg, i, r2) +#pragma omp parallel for private(n, x, y, z, q, ic, jc, kc, ia, ib, ja, jb, 
\ + ka, kb, xstart, ystart, dz, k, koff, dz2, dy, \ + j, jkoff, dydz2, dx, index, pg, i, r2) /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -114,45 +113,52 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + 
r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ + /* If atom and lattice point are too close, set the lattice value + * to zero */ -//All threads are writing the same value -//No need for an atomic update - if (r2 < a2) *pg = 0; + // All threads are writing the same value + // No need for an atomic update + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c index 9b8ef2014dc7deab1e9238be8e9d9ea1d0cf4a38..d361c16a34a6821dff328235a3c8fd59283734bd 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c @@ -6,27 +6,26 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include "parboil.h" #include "atom.h" #include "cutoff.h" #include "output.h" +#include "parboil.h" +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #define ERRTOL 1e-4f -#define NOKERNELS 0 -#define CUTOFF1 1 -#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 - +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 int appenddata(const char *filename, int size, double time) { FILE *fp; - fp=fopen(filename, "a"); + fp = fopen(filename, "a"); if (fp == NULL) { printf("error appending to file %s..\n", filename); return -1; @@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) { return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { LatticeDim ret; - ret.nx = 
(int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; ret.lo = lo; ret.h = h; return ret; } -Lattice * -create_lattice(LatticeDim dim) -{ +Lattice *create_lattice(LatticeDim dim) { int size; Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); @@ -75,10 +70,7 @@ create_lattice(LatticeDim dim) return lat; } - -void -destroy_lattice(Lattice *lat) -{ +void destroy_lattice(Lattice *lat) { if (lat) { free(lat->lattice); free(lat); @@ -90,13 +82,13 @@ int main(int argc, char *argv[]) { LatticeDim lattice_dim; Lattice *cpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, hi; /* Bounding box with padding */ + Vec3 min_ext, max_ext; /* Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance */ int n; @@ -136,9 +128,10 @@ int main(int argc, char *argv[]) { printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; - hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); 
lattice_dim = lattice_from_bounding_box(lo, hi, h); cpu_lattice = create_lattice(lattice_dim); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c @@ -6,16 +6,14 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> -#include <inttypes.h> -#include <math.h> #include "atom.h" #include "cutoff.h" +#include <inttypes.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); } @@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice) { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); } @@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h 
b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h @@ -15,8 +15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom "pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, 
"ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; } - atom = (Atom *) tmp; + atom = (Atom *)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ } /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct 
Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms *atom); +void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c index 5f440752a5951de65f5e0e51bba214fea37157e8..faca2a682a351f894b3cebcd9ccd8c176f6250b1 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c @@ -6,26 +6,25 @@ *cr ***************************************************************************/ +#include "atom.h" +#include "cutoff.h" +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> -#include "atom.h" -#include "cutoff.h" #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -40,8 +39,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) 
ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -63,7 +62,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, zmax; @@ -76,44 +75,45 @@ extern int cpu_compute_cutoff_potential_lattice( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - 
minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -124,26 +124,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) 
continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -151,27 +158,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; *pg += e; } #endif @@ -179,7 +185,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -187,8 +193,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c index f0567d19c76f19f1c3632b8784d50ef6f077b7cd..dcd0a629cb9cd765683415895376144816765e64 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c @@ -8,11 
+8,11 @@ #include <CL/cl.h> +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" @@ -23,15 +23,13 @@ // we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used. typedef cl_int4 xyz; -//extern "C" int gpu_compute_cutoff_potential_lattice( +// extern "C" int gpu_compute_cutoff_potential_lattice( int gpu_compute_cutoff_potential_lattice( - struct pb_TimerSet *timers, - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ - ) -{ + struct pb_TimerSet *timers, Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -45,8 +43,8 @@ int gpu_compute_cutoff_potential_lattice( xyz nbrlist[NBRLIST_MAXLEN]; int nbrlistlen = 0; - int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */ - int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */ + int binHistoFull[BIN_DEPTH + 1] = {0}; /* clear every array element */ + int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */ int num_excluded = 0; int xRegionDim, yRegionDim, zRegionDim; @@ -80,9 +78,9 @@ int gpu_compute_cutoff_potential_lattice( // The "compute" timer should be active upon entry to this function /* pad lattice to be factor of 8 in each dimension */ - xRegionDim = (int) ceilf(nx/8.f); - yRegionDim = (int) ceilf(ny/8.f); - zRegionDim = (int) ceilf(nz/8.f); + xRegionDim = (int)ceilf(nx / 8.f); + yRegionDim = (int)ceilf(ny / 8.f); + zRegionDim = (int)ceilf(nz / 8.f); lnx = 8 * xRegionDim; lny = 8 * yRegionDim; @@ -90,35 +88,36 @@ int gpu_compute_cutoff_potential_lattice( lnall = lnx * lny * lnz; /* will receive energies from OpenCL */ - regionZeroAddr = 
(float *) malloc(lnall * sizeof(float)); + regionZeroAddr = (float *)malloc(lnall * sizeof(float)); /* create bins */ - c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ - binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c; - binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c; - binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c; + c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ + binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c; + binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c; + binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c; nbins = binDim.x * binDim.y * binDim.z; - binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4)); + binBaseAddr = (cl_float4 *)calloc(nbins * BIN_DEPTH, sizeof(cl_float4)); binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - bincntBaseAddr = (int *) calloc(nbins, sizeof(int)); + bincntBaseAddr = (int *)calloc(nbins, sizeof(int)); bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c; /* create neighbor list */ - if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) { + if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) { float s = sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); int cnt = 0; /* develop neighbor list around 1 cell */ - if (2*c + 1 > NBRLIST_DIM) { + if (2 * c + 1 > NBRLIST_DIM) { fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-1)/2 * BIN_LENGTH); + (NBRLIST_DIM - 1) / 2 * BIN_LENGTH); return -1; } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; + for (k = -c; k <= c; k++) { + for (j = -c; j <= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2) + continue; nbrlist[cnt].x = i; nbrlist[cnt].y = j; nbrlist[cnt].z = k; @@ -127,21 +126,21 
@@ int gpu_compute_cutoff_potential_lattice( } } nbrlistlen = cnt; - } - else if (8*h <= 2*BIN_LENGTH) { - float s = 2.f*sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); + } else if (8 * h <= 2 * BIN_LENGTH) { + float s = 2.f * sqrtf(3); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); int cnt = 0; /* develop neighbor list around 3-cube of cells */ - if (2*c + 3 > NBRLIST_DIM) { + if (2 * c + 3 > NBRLIST_DIM) { fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-3)/2 * BIN_LENGTH); + (NBRLIST_DIM - 3) / 2 * BIN_LENGTH); return -1; } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; + for (k = -c; k <= c; k++) { + for (j = -c; j <= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2) + continue; nbrlist[cnt].x = i; nbrlist[cnt].y = j; nbrlist[cnt].z = k; @@ -150,8 +149,7 @@ int gpu_compute_cutoff_potential_lattice( } } nbrlistlen = cnt; - } - else { + } else { fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH); return -1; } @@ -159,43 +157,39 @@ int gpu_compute_cutoff_potential_lattice( /* perform geometric hashing of atoms into bins */ { /* array of extra atoms, permit average of one extra per bin */ - Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom)); + Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom)); int extra_len = 0; - - for (n = 0; n < natoms; n++) { + + for (n = 0; n < natoms; n++) { cl_float4 p; p.x = atom[n].x - xlo; p.y = atom[n].y - ylo; p.z = atom[n].z - zlo; p.w = atom[n].q; - i = (int) floorf(p.x * BIN_INVLEN); - j = (int) floorf(p.y * BIN_INVLEN); - k = (int) floorf(p.z * BIN_INVLEN); - if (i >= -c && i < binDim.x - c && - j >= -c && j < binDim.y - c && - k >= -c && k < binDim.z - c && - atom[n].q != 0) { - int index = (k * binDim.y + j) * binDim.x + i; - cl_float4 *bin = binZeroAddr + index * BIN_DEPTH; - int bindex = 
bincntZeroAddr[index]; - if (bindex < BIN_DEPTH) { - /* copy atom into bin and increase counter for this bin */ - bin[bindex] = p; - bincntZeroAddr[index]++; - } - else { - /* add index to array of extra atoms to be computed with CPU */ - if (extra_len >= nbins) { - fprintf(stderr, "exceeded space for storing extra atoms\n"); - return -1; - } - extra_atoms[extra_len] = atom[n]; - extra_len++; - } - } - else { - /* excluded atoms are either outside bins or neutrally charged */ - num_excluded++; + i = (int)floorf(p.x * BIN_INVLEN); + j = (int)floorf(p.y * BIN_INVLEN); + k = (int)floorf(p.z * BIN_INVLEN); + if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c && + k >= -c && k < binDim.z - c && atom[n].q != 0) { + int index = (k * binDim.y + j) * binDim.x + i; + cl_float4 *bin = binZeroAddr + index * BIN_DEPTH; + int bindex = bincntZeroAddr[index]; + if (bindex < BIN_DEPTH) { + /* copy atom into bin and increase counter for this bin */ + bin[bindex] = p; + bincntZeroAddr[index]++; + } else { + /* add index to array of extra atoms to be computed with CPU */ + if (extra_len >= nbins) { + fprintf(stderr, "exceeded space for storing extra atoms\n"); + return -1; + } + extra_atoms[extra_len] = atom[n]; + extra_len++; + } + } else { + /* excluded atoms are either outside bins or neutrally charged */ + num_excluded++; } } @@ -207,24 +201,24 @@ int gpu_compute_cutoff_potential_lattice( /* bin stats */ sum = total = 0; - for (n = 0; n < nbins; n++) { - binHistoFull[ bincntBaseAddr[n] ]++; + for (n = 0; n < nbins; n++) { + binHistoFull[bincntBaseAddr[n]]++; sum += bincntBaseAddr[n]; total += BIN_DEPTH; } - avgFillFull = sum / (float) total; + avgFillFull = sum / (float)total; sum = total = 0; - for (k = 0; k < binDim.z - 2*c; k++) { - for (j = 0; j < binDim.y - 2*c; j++) { - for (i = 0; i < binDim.x - 2*c; i++) { + for (k = 0; k < binDim.z - 2 * c; k++) { + for (j = 0; j < binDim.y - 2 * c; j++) { + for (i = 0; i < binDim.x - 2 * c; i++) { int index = (k * binDim.y + 
j) * binDim.x + i; - binHistoCover[ bincntZeroAddr[index] ]++; + binHistoCover[bincntZeroAddr[index]]++; sum += bincntZeroAddr[index]; total += BIN_DEPTH; } } } - avgFillCover = sum / (float) total; + avgFillCover = sum / (float)total; if (verbose) { /* report */ @@ -233,25 +227,25 @@ int gpu_compute_cutoff_potential_lattice( printf("cutoff distance = %g\n", cutoff); printf("\n"); printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz); - printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h); + printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h); printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz); - printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h); - printf("number of bytes for lattice data = %u\n", lnall*sizeof(float)); + printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h); + printf("number of bytes for lattice data = %u\n", lnall * sizeof(float)); printf("\n"); printf("bin padding thickness = %d\n", c); - printf("bin cover dimensions = %d %d %d\n", - binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c); + printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c, + binDim.y - 2 * c, binDim.z - 2 * c); printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z); printf("number of bins = %d\n", nbins); printf("total number of atom slots = %d\n", nbins * BIN_DEPTH); printf("%% overhead space = %g\n", - (natoms / (double) (nbins * BIN_DEPTH)) * 100); + (natoms / (double)(nbins * BIN_DEPTH)) * 100); printf("number of bytes for bin data = %u\n", - nbins * BIN_DEPTH * sizeof(cl_float4)); + nbins * BIN_DEPTH * sizeof(cl_float4)); printf("\n"); printf("bin histogram with padding:\n"); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]); sum += binHistoFull[n]; } @@ -260,7 +254,7 @@ int gpu_compute_cutoff_potential_lattice( printf("\n"); printf("bin histogram 
excluding padding:\n"); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]); sum += binHistoCover[n]; } @@ -268,24 +262,27 @@ int gpu_compute_cutoff_potential_lattice( printf(" %% average fill: %g\n", avgFillCover * 100); printf("\n"); printf("number of extra atoms = %d\n", extra->size); - printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100); + printf("%% atoms that are extra = %g\n", + (extra->size / (double)natoms) * 100); printf("\n"); /* sanity check on bins */ sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { sum += n * binHistoFull[n]; } sum += extra->size + num_excluded; printf("sanity check on bin histogram with edges: " - "sum + others = %d\n", sum); + "sum + others = %d\n", + sum); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { sum += n * binHistoCover[n]; } sum += extra->size + num_excluded; printf("sanity check on bin histogram excluding edges: " - "sum + others = %d\n", sum); + "sum + others = %d\n", + sum); printf("\n"); /* neighbor list */ @@ -295,34 +292,39 @@ int gpu_compute_cutoff_potential_lattice( cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs"); - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, 
&clStatus); CHECK_ERROR("clCreateContextFromType") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + + const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base"); //-cl-nv-verbose + sprintf(clOptions, "-I src/opencl_base"); //-cl-nv-verbose - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus); + cl_kernel clKernel = + clCreateKernel(clProgram, "opencl_cutoff_potential_lattice", &clStatus); CHECK_ERROR("clCreateKernel") /* setup OpenCL kernel parameters */ @@ -337,66 +339,75 @@ int gpu_compute_cutoff_potential_lattice( pb_SwitchToTimer(timers, pb_TimerID_COPY); if (verbose) { printf("Allocating %.2fMB on OpenCL device for potentials\n", - lnall * sizeof(float) / (double) (1024*1024)); + lnall * sizeof(float) / (double)(1024 * 1024)); } - regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus); + regionZeroCl = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + lnall * sizeof(float), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float)); + clMemSet(clCommandQueue, regionZeroCl, 0, lnall * sizeof(float)); if (verbose) { printf("Allocating %.2fMB on 
OpenCL device for atom bins\n", - nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024)); + nbins * BIN_DEPTH * sizeof(cl_float4) / (double)(1024 * 1024)); } - binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus); + binBaseCl = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nbins * BIN_DEPTH * sizeof(cl_float4), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL); + + clStatus = clEnqueueWriteBuffer(clCommandQueue, binBaseCl, CL_TRUE, 0, + nbins * BIN_DEPTH * sizeof(cl_float4), + binBaseAddr, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - //Sub buffers are not supported in OpenCL v1.0 - int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; + // Sub buffers are not supported in OpenCL v1.0 + int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus); + NbrListLen = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(int), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, NbrListLen, CL_TRUE, 0, + sizeof(int), &nbrlistlen, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus); + NbrList = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + NBRLIST_MAXLEN * sizeof(xyz), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL); + clStatus = + clEnqueueWriteBuffer(clCommandQueue, NbrList, CL_TRUE, 0, + nbrlistlen * sizeof(xyz), nbrlist, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - if (verbose) + if (verbose) printf("\n"); - - clStatus = 
clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x)); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y)); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset); - clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h); - clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2); - clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2); - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl); - clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen); - clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList); + clStatus = clSetKernelArg(clKernel, 0, sizeof(int), &(binDim.x)); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), &(binDim.y)); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &binBaseCl); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &offset); + clStatus = clSetKernelArg(clKernel, 4, sizeof(float), &h); + clStatus = clSetKernelArg(clKernel, 5, sizeof(float), &cutoff2); + clStatus = clSetKernelArg(clKernel, 6, sizeof(float), &inv_cutoff2); + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &regionZeroCl); + clStatus = clSetKernelArg(clKernel, 9, sizeof(cl_mem), &NbrListLen); + clStatus = clSetKernelArg(clKernel, 10, sizeof(cl_mem), &NbrList); CHECK_ERROR("clSetKernelArg") - /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ pb_SwitchToTimer(timers, pb_TimerID_KERNEL); printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); - for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { + for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { printf(" computing plane %d\r", zRegionIndex); fflush(stdout); - clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex); + clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &zRegionIndex); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL); 
+ clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, + gridDim, blockDim, 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") @@ -406,7 +417,9 @@ int gpu_compute_cutoff_potential_lattice( /* copy result regions from OpenCL device */ pb_SwitchToTimer(timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL); + clStatus = + clEnqueueReadBuffer(clCommandQueue, regionZeroCl, CL_TRUE, 0, + lnall * sizeof(float), regionZeroAddr, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") /* free OpenCL memory allocations */ @@ -421,25 +434,26 @@ int gpu_compute_cutoff_potential_lattice( clStatus = clReleaseCommandQueue(clCommandQueue); clStatus = clReleaseContext(clContext); - free((void*)clSource[0]); + free((void *)clSource[0]); /* transpose regions back into lattice */ pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); - for (k = 0; k < nz; k++) { + for (k = 0; k < nz; k++) { zRegionIndex = (k >> 3); zOffset = (k & 7); - for (j = 0; j < ny; j++) { + for (j = 0; j < ny; j++) { yRegionIndex = (j >> 3); yOffset = (j & 7); - for (i = 0; i < nx; i++) { + for (i = 0; i < nx; i++) { xRegionIndex = (i >> 3); xOffset = (i & 7); - thisRegion = regionZeroAddr - + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim - + xRegionIndex) * REGION_SIZE; + thisRegion = regionZeroAddr + + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim + + xRegionIndex) * + REGION_SIZE; indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset; index = (k * ny + j) * nx + i; @@ -454,7 +468,7 @@ int gpu_compute_cutoff_potential_lattice( printf("computing extra atoms on CPU\n"); if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) { fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed " - "for extra atoms\n"); + "for extra atoms\n"); return -1; } printf("\n"); diff --git 
a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h index 883738c120465a53fd8af91a1d3845994d5144d3..c3e011bd14bda43b07a1eb82b0c436d18d1c8356 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h @@ -15,54 +15,51 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; +/* A structure to record how points in 3D space map to array + elements. Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). +*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; - /* Lowest corner of lattice */ - Vec3 lo; + /* Lowest corner of lattice */ + Vec3 lo; - /* Lattice spacing */ - float h; - } LatticeDim; + /* Lattice spacing */ + float h; +} LatticeDim; - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. - */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. 
+*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); - int gpu_compute_cutoff_potential_lattice( - struct pb_TimerSet *timers, - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atom, /* array of atoms */ - int verbose /* print info/debug messages */ - ); +int gpu_compute_cutoff_potential_lattice( + struct pb_TimerSet *timers, Lattice *lattice, + float cutoff, /* cutoff distance */ + Atoms *atom, /* array of atoms */ + int verbose /* print info/debug messages */ +); - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); - int remove_exclusions( - Lattice *lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms *atom /* array of atoms */ - ); +int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c @@ -6,24 +6,22 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> 
-#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -35,8 +33,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -64,44 +62,45 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < 
atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -112,42 +111,49 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * 
gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ - if (r2 < a2) *pg = 0; + /* If atom and lattice point are too close, set the lattice value + * to zero */ + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h index fed9553c9629207ff7592e3a1ff320eed027c1fb..adb557123d07c16baabba79d727ff8cfd2c3ad83 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h @@ -4,22 +4,24 @@ #ifdef __DEVICE_EMULATION__ #define DEBUG /* define which grid block and which thread to examine */ -#define BX 0 -#define BY 0 -#define TX 0 -#define TY 0 -#define TZ 0 -#define EMU(code) do { \ - if (blockIdx.x==BX && blockIdx.y==BY && \ - threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \ - code; \ - } \ -} while (0) -#define INT(n) printf("%s = %d\n", #n, n) -#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) -#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) -#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \ - (double)(f).y, (double)(f).z, (double)(f).w) 
+#define BX 0 +#define BY 0 +#define TX 0 +#define TY 0 +#define TZ 0 +#define EMU(code) \ + do { \ + if (blockIdx.x == BX && blockIdx.y == BY && threadIdx.x == TX && \ + threadIdx.y == TY && threadIdx.z == TZ) { \ + code; \ + } \ + } while (0) +#define INT(n) printf("%s = %d\n", #n, n) +#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) +#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) +#define FLOAT4(f) \ + printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y, \ + (double)(f).z, (double)(f).w) #else #define EMU(code) #define INT(n) @@ -29,12 +31,11 @@ #endif /* report error from OpenCL */ -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } /* @@ -45,7 +46,7 @@ * reserve enough memory for 11^3 stencil of grid cells * this fits within 16K of memory */ -#define NBRLIST_DIM 11 +#define NBRLIST_DIM 11 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM) /* @@ -54,16 +55,16 @@ * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms, * should permit scheduling of up to 3 thread blocks per SM */ -#define BIN_DEPTH 8 /* max number of atoms per bin */ -#define BIN_SIZE 32 /* size of bin in floats */ -#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ +#define BIN_DEPTH 8 /* max number of atoms per bin */ +#define BIN_SIZE 32 /* size of bin in floats */ +#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ -#define BIN_LENGTH 4.f /* spatial length in Angstroms */ -#define BIN_INVLEN (1.f / BIN_LENGTH) +#define BIN_LENGTH 4.f /* spatial length in Angstroms */ +#define BIN_INVLEN (1.f / BIN_LENGTH) /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin * so that bin fill should be 80% (for 
non-empty regions of space) */ -#define REGION_SIZE 512 /* number of floats in lattice region */ -#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */ +#define REGION_SIZE 512 /* number of floats in lattice region */ +#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */ #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c index 9bc6837371b847b8e7a2dd99945ce635d48d1f66..4bc31d3e9b78ddf32d4a3617e69c7bc3e4bc62c6 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c @@ -6,11 +6,11 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" @@ -18,16 +18,15 @@ #define ERRTOL 1e-4f -#define NOKERNELS 0 -#define CUTOFF1 1 -#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 - +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 int appenddata(const char *filename, int size, double time) { FILE *fp; - fp=fopen(filename, "a"); + fp = fopen(filename, "a"); if (fp == NULL) { printf("error appending to file %s..\n", filename); return -1; @@ -37,23 +36,19 @@ int appenddata(const char *filename, int size, double time) { return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { LatticeDim ret; - ret.nx = (int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; ret.lo = lo; ret.h = h; return ret; } -Lattice * 
-create_lattice(LatticeDim dim) -{ +Lattice *create_lattice(LatticeDim dim) { int size; Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); @@ -76,10 +71,7 @@ create_lattice(LatticeDim dim) return lat; } - -void -destroy_lattice(Lattice *lat) -{ +void destroy_lattice(Lattice *lat) { if (lat) { free(lat->lattice); free(lat); @@ -91,13 +83,13 @@ int main(int argc, char *argv[]) { LatticeDim lattice_dim; Lattice *gpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, hi; /* Bounding box with padding */ + Vec3 min_ext, max_ext; /* Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance */ int n; @@ -137,9 +129,10 @@ int main(int argc, char *argv[]) { printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; - hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); lattice_dim = lattice_from_bounding_box(lo, hi, h); gpu_lattice = create_lattice(lattice_dim); @@ -149,7 +142,8 @@ int main(int argc, char *argv[]) { * Run OpenCL kernel * (Begin and end with COMPUTE timer active) */ - if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0)) { + if 
(gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, + 0)) { fprintf(stderr, "Computation failed\n"); exit(1); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) 
{ + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h @@ -2,14 +2,13 @@ #define __OCLH__ void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c index 36ee7e2b06e7650a1d096f2f3f80f8894f24cdf8..73fa63903a84d3cc741917d020f198133d898062 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c @@ -6,18 +6,16 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> #include <inttypes.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> #include "atom.h" #include "cutoff.h" -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -38,9 +36,9 @@ write_lattice_summary(const char 
*filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); } @@ -49,7 +47,7 @@ write_lattice_summary(const char *filename, Lattice *lattice) { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); } @@ -58,8 +56,8 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h @@ -15,8 +15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include 
<string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom "pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; } - atom = (Atom *) tmp; + atom = (Atom *)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ 
} /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms *atom); +void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c index f0fbdc79f25679053ae2b8fbcd997db178b5a4d4..475a4666e1a6366873dc49d18d311b76ef6cde38 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c @@ -6,11 +6,11 @@ *cr 
***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" @@ -18,15 +18,14 @@ #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -41,8 +40,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -64,7 +63,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, zmax; @@ -77,44 +76,45 @@ extern int cpu_compute_cutoff_potential_lattice( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - 
minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -125,26 +125,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + 
radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -152,27 +159,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! 
*/ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; *pg += e; } #endif @@ -180,7 +186,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -188,8 +194,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h index 955c788f658ae823e103ea4d040ba4f8c6179fef..13378e5e9be17209476e71e749b44be6733bb8d9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h @@ -15,54 +15,51 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; +/* A structure to record how points in 3D space map to array + elements. 
Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). +*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; - /* Lowest corner of lattice */ - Vec3 lo; + /* Lowest corner of lattice */ + Vec3 lo; - /* Lattice spacing */ - float h; - } LatticeDim; + /* Lattice spacing */ + float h; +} LatticeDim; - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. - */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. +*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); - int gpu_compute_cutoff_potential_lattice6overlap( - struct pb_TimerSet *timers, /* for measuring execution time */ - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ - ); +int gpu_compute_cutoff_potential_lattice6overlap( + struct pb_TimerSet *timers, /* for measuring execution time */ + Lattice *lattice, float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ +); - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); - int remove_exclusions( - Lattice 
*lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms *atom /* array of atoms */ - ); +int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c index 15e7aae1e160eb242d10c31911fee3fefdb50889..06f856c1a0fa43dc95cb896450baa42f74c047fd 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c @@ -7,19 +7,19 @@ ***************************************************************************/ #include <CL/cl.h> +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#include "ocl.h" #include "macros.h" +#include "ocl.h" -//OpenCL v1.0 -//cl_int3 not defined +// OpenCL v1.0 +// cl_int3 not defined #ifdef CL_VERSION_1_1 #if CL_VERSION_1_1 != 1 typedef cl_int4 cl_int3; @@ -37,15 +37,13 @@ const cl_version_check = 0; // we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used. 
typedef cl_int4 xyz; -//extern "C" int gpu_compute_cutoff_potential_lattice6overlap( +// extern "C" int gpu_compute_cutoff_potential_lattice6overlap( int gpu_compute_cutoff_potential_lattice6overlap( - struct pb_TimerSet *timers, /* for measuring execution time */ - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ - ) -{ + struct pb_TimerSet *timers, /* for measuring execution time */ + Lattice *lattice, float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -59,8 +57,8 @@ int gpu_compute_cutoff_potential_lattice6overlap( xyz nbrlist[NBRLIST_MAXLEN]; int nbrlistlen = 0; - int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */ - int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */ + int binHistoFull[BIN_DEPTH + 1] = {0}; /* clear every array element */ + int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */ int num_excluded = 0; int xRegionDim, yRegionDim, zRegionDim; @@ -92,16 +90,16 @@ int gpu_compute_cutoff_potential_lattice6overlap( size_t gridDim[3], blockDim[3]; #ifdef NEIGHBOR_COUNT - double neighbor_count = 0; /* used to profile the number of atoms near a - * lattice point */ + double neighbor_count = 0; /* used to profile the number of atoms near a + * lattice point */ #endif // Caller has made the "compute" timer active /* pad lattice to be factor of 8 in each dimension */ - xRegionDim = (int) ceilf(nx/8.f); - yRegionDim = (int) ceilf(ny/8.f); - zRegionDim = (int) ceilf(nz/8.f); + xRegionDim = (int)ceilf(nx / 8.f); + yRegionDim = (int)ceilf(ny / 8.f); + zRegionDim = (int)ceilf(nz / 8.f); lnx = 8 * xRegionDim; lny = 8 * yRegionDim; @@ -109,35 +107,36 @@ int gpu_compute_cutoff_potential_lattice6overlap( lnall = lnx * lny * lnz; /* will receive energies from OpenCL */ - 
regionZeroAddr = (ener_t *) malloc(lnall * sizeof(float)); + regionZeroAddr = (ener_t *)malloc(lnall * sizeof(float)); /* create bins */ - c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ - binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c; - binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c; - binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c; + c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ + binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c; + binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c; + binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c; nbins = binDim.x * binDim.y * binDim.z; - binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4)); + binBaseAddr = (cl_float4 *)calloc(nbins * BIN_DEPTH, sizeof(cl_float4)); binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - bincntBaseAddr = (int *) calloc(nbins, sizeof(int)); + bincntBaseAddr = (int *)calloc(nbins, sizeof(int)); bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c; /* create neighbor list */ - if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) { + if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) { float s = sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); int cnt = 0; /* develop neighbor list around 1 cell */ - if (2*c + 1 > NBRLIST_DIM) { + if (2 * c + 1 > NBRLIST_DIM) { fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-1)/2 * BIN_LENGTH); + (NBRLIST_DIM - 1) / 2 * BIN_LENGTH); return -1; } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; + for (k = -c; k <= c; k++) { + for (j = -c; j <= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2) + continue; nbrlist[cnt].x = i; nbrlist[cnt].y = j; nbrlist[cnt].z = k; 
@@ -146,21 +145,21 @@ int gpu_compute_cutoff_potential_lattice6overlap( } } nbrlistlen = cnt; - } - else if (8*h <= 2*BIN_LENGTH) { - float s = 2.f*sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); + } else if (8 * h <= 2 * BIN_LENGTH) { + float s = 2.f * sqrtf(3); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); int cnt = 0; /* develop neighbor list around 3-cube of cells */ - if (2*c + 3 > NBRLIST_DIM) { + if (2 * c + 3 > NBRLIST_DIM) { fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-3)/2 * BIN_LENGTH); + (NBRLIST_DIM - 3) / 2 * BIN_LENGTH); return -1; } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; + for (k = -c; k <= c; k++) { + for (j = -c; j <= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2) + continue; nbrlist[cnt].x = i; nbrlist[cnt].y = j; nbrlist[cnt].z = k; @@ -169,8 +168,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( } } nbrlistlen = cnt; - } - else { + } else { fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH); return -1; } @@ -178,43 +176,39 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* perform geometric hashing of atoms into bins */ { /* array of extra atoms, permit average of one extra per bin */ - Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom)); + Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom)); int extra_len = 0; - - for (n = 0; n < natoms; n++) { + + for (n = 0; n < natoms; n++) { cl_float4 p; p.x = atom[n].x - xlo; p.y = atom[n].y - ylo; p.z = atom[n].z - zlo; p.w = atom[n].q; - i = (int) floorf(p.x * BIN_INVLEN); - j = (int) floorf(p.y * BIN_INVLEN); - k = (int) floorf(p.z * BIN_INVLEN); - if (i >= -c && i < binDim.x - c && - j >= -c && j < binDim.y - c && - k >= -c && k < binDim.z - c && - atom[n].q != 0) { - int index = (k * binDim.y + j) * binDim.x + i; - cl_float4 *bin = binZeroAddr + 
index * BIN_DEPTH; - int bindex = bincntZeroAddr[index]; - if (bindex < BIN_DEPTH) { - /* copy atom into bin and increase counter for this bin */ - bin[bindex] = p; - bincntZeroAddr[index]++; - } - else { - /* add index to array of extra atoms to be computed with CPU */ - if (extra_len >= nbins) { - fprintf(stderr, "exceeded space for storing extra atoms\n"); - return -1; - } - extra_atoms[extra_len] = atom[n]; - extra_len++; - } - } - else { - /* excluded atoms are either outside bins or neutrally charged */ - num_excluded++; + i = (int)floorf(p.x * BIN_INVLEN); + j = (int)floorf(p.y * BIN_INVLEN); + k = (int)floorf(p.z * BIN_INVLEN); + if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c && + k >= -c && k < binDim.z - c && atom[n].q != 0) { + int index = (k * binDim.y + j) * binDim.x + i; + cl_float4 *bin = binZeroAddr + index * BIN_DEPTH; + int bindex = bincntZeroAddr[index]; + if (bindex < BIN_DEPTH) { + /* copy atom into bin and increase counter for this bin */ + bin[bindex] = p; + bincntZeroAddr[index]++; + } else { + /* add index to array of extra atoms to be computed with CPU */ + if (extra_len >= nbins) { + fprintf(stderr, "exceeded space for storing extra atoms\n"); + return -1; + } + extra_atoms[extra_len] = atom[n]; + extra_len++; + } + } else { + /* excluded atoms are either outside bins or neutrally charged */ + num_excluded++; } } @@ -226,24 +220,24 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* bin stats */ sum = total = 0; - for (n = 0; n < nbins; n++) { - binHistoFull[ bincntBaseAddr[n] ]++; + for (n = 0; n < nbins; n++) { + binHistoFull[bincntBaseAddr[n]]++; sum += bincntBaseAddr[n]; total += BIN_DEPTH; } - avgFillFull = sum / (float) total; + avgFillFull = sum / (float)total; sum = total = 0; - for (k = 0; k < binDim.z - 2*c; k++) { - for (j = 0; j < binDim.y - 2*c; j++) { - for (i = 0; i < binDim.x - 2*c; i++) { + for (k = 0; k < binDim.z - 2 * c; k++) { + for (j = 0; j < binDim.y - 2 * c; j++) { + for (i = 0; i < binDim.x - 
2 * c; i++) { int index = (k * binDim.y + j) * binDim.x + i; - binHistoCover[ bincntZeroAddr[index] ]++; + binHistoCover[bincntZeroAddr[index]]++; sum += bincntZeroAddr[index]; total += BIN_DEPTH; } } } - avgFillCover = sum / (float) total; + avgFillCover = sum / (float)total; if (verbose) { /* report */ @@ -252,25 +246,25 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("cutoff distance = %g\n", cutoff); printf("\n"); printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz); - printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h); + printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h); printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz); - printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h); - printf("number of bytes for lattice data = %u\n", lnall*sizeof(float)); + printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h); + printf("number of bytes for lattice data = %u\n", lnall * sizeof(float)); printf("\n"); printf("bin padding thickness = %d\n", c); - printf("bin cover dimensions = %d %d %d\n", - binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c); + printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c, + binDim.y - 2 * c, binDim.z - 2 * c); printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z); printf("number of bins = %d\n", nbins); printf("total number of atom slots = %d\n", nbins * BIN_DEPTH); printf("%% overhead space = %g\n", - (natoms / (double) (nbins * BIN_DEPTH)) * 100); + (natoms / (double)(nbins * BIN_DEPTH)) * 100); printf("number of bytes for bin data = %u\n", - nbins * BIN_DEPTH * sizeof(cl_float4)); + nbins * BIN_DEPTH * sizeof(cl_float4)); printf("\n"); printf("bin histogram with padding:\n"); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]); sum += binHistoFull[n]; } @@ -279,7 +273,7 @@ int 
gpu_compute_cutoff_potential_lattice6overlap( printf("\n"); printf("bin histogram excluding padding:\n"); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]); sum += binHistoCover[n]; } @@ -287,125 +281,145 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf(" %% average fill: %g\n", avgFillCover * 100); printf("\n"); printf("number of extra atoms = %d\n", extra->size); - printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100); + printf("%% atoms that are extra = %g\n", + (extra->size / (double)natoms) * 100); printf("\n"); /* sanity check on bins */ sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { sum += n * binHistoFull[n]; } sum += extra->size + num_excluded; printf("sanity check on bin histogram with edges: " - "sum + others = %d\n", sum); + "sum + others = %d\n", + sum); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { sum += n * binHistoCover[n]; } sum += extra->size + num_excluded; printf("sanity check on bin histogram excluding edges: " - "sum + others = %d\n", sum); + "sum + others = %d\n", + sum); printf("\n"); /* neighbor list */ printf("neighbor list length = %d\n", nbrlistlen); printf("\n"); } - + cl_int clStatus; cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - cl_context clContext = clCreateContextFromType(clCps, 
CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + + const char *clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-I src/opencl_nvidia -DVERSION_CHECK=%d", cl_version_check); //-cl-nv-verbose + sprintf(clOptions, "-I src/opencl_nvidia -DVERSION_CHECK=%d", + cl_version_check); //-cl-nv-verbose - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice6overlap",&clStatus); + + cl_kernel clKernel = clCreateKernel( + clProgram, "opencl_cutoff_potential_lattice6overlap", &clStatus); CHECK_ERROR("clCreateKernel") /* setup OpenCL kernel parameters */ blockDim[0] = 8; blockDim[1] = 2; blockDim[2] = 8; - gridDim[0] = xRegionDim*blockDim[0]; - gridDim[1] = yRegionDim*blockDim[1]; - gridDim[2] = 1*blockDim[2]; + gridDim[0] = xRegionDim * blockDim[0]; + gridDim[1] = yRegionDim * blockDim[1]; + gridDim[2] = 1 * blockDim[2]; /* allocate and initialize memory on OpenCL device */ 
pb_SwitchToTimer(timers, pb_TimerID_COPY); if (verbose) { printf("Allocating %.2fMB on OpenCL device for potentials\n", - lnall * sizeof(float) / (double) (1024*1024)); + lnall * sizeof(float) / (double)(1024 * 1024)); } - - regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(ener_t),NULL,&clStatus); + + regionZeroCl = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + lnall * sizeof(ener_t), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(ener_t)); + clMemSet(clCommandQueue, regionZeroCl, 0, lnall * sizeof(ener_t)); if (verbose) { printf("Allocating %.2fMB on OpenCL device for atom bins\n", - nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024)); + nbins * BIN_DEPTH * sizeof(cl_float4) / (double)(1024 * 1024)); } - binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus); + binBaseCl = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nbins * BIN_DEPTH * sizeof(cl_float4), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL); + + clStatus = clEnqueueWriteBuffer(clCommandQueue, binBaseCl, CL_TRUE, 0, + nbins * BIN_DEPTH * sizeof(cl_float4), + binBaseAddr, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - //Sub buffers are not supported in OpenCL v1.0 - int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; + // Sub buffers are not supported in OpenCL v1.0 + int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus); + NbrListLen = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(int), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, NbrListLen, CL_TRUE, 0, + 
sizeof(int), &nbrlistlen, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus); + NbrList = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + NBRLIST_MAXLEN * sizeof(xyz), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL); + clStatus = + clEnqueueWriteBuffer(clCommandQueue, NbrList, CL_TRUE, 0, + nbrlistlen * sizeof(xyz), nbrlist, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - if (verbose) + if (verbose) printf("\n"); pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); - clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x)); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y)); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset); - clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h); - clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2); - clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2); - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),®ionZeroCl); - clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen); - clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList); + clStatus = clSetKernelArg(clKernel, 0, sizeof(int), &(binDim.x)); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), &(binDim.y)); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &binBaseCl); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &offset); + clStatus = clSetKernelArg(clKernel, 4, sizeof(float), &h); + clStatus = clSetKernelArg(clKernel, 5, sizeof(float), &cutoff2); + clStatus = clSetKernelArg(clKernel, 6, sizeof(float), &inv_cutoff2); + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), ®ionZeroCl); + clStatus = clSetKernelArg(clKernel, 9, sizeof(cl_mem), &NbrListLen); + clStatus = clSetKernelArg(clKernel, 10, sizeof(cl_mem), 
&NbrList); CHECK_ERROR("clSetKernelArg") /*cl_command_queue cutoffstream;*/ @@ -414,21 +428,22 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); - if(verbose) + if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); - for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { + for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { #ifndef NO_DEBUG printf(" computing plane %d\n", zRegionIndex); -#endif - clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex); +#endif + clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &zRegionIndex); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, + gridDim, blockDim, 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } - /* + /* * handle extra atoms on the CPU, concurrently with the GPU calculations */ @@ -437,10 +452,10 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("computing extra atoms on CPU\n"); if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) { fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed " - "for extra atoms\n"); + "for extra atoms\n"); return -1; } - if(verbose) + if (verbose) printf("\n"); } @@ -458,7 +473,9 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* copy result regions from OpenCL device */ pb_SwitchToTimer(timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(ener_t),regionZeroAddr,0,NULL,NULL); + clStatus = clEnqueueReadBuffer(clCommandQueue, regionZeroCl, CL_TRUE, 0, + lnall * sizeof(ener_t), regionZeroAddr, 0, + NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") /* free OpenCL memory allocations */ @@ -473,28 
+490,29 @@ int gpu_compute_cutoff_potential_lattice6overlap( clStatus = clReleaseCommandQueue(clCommandQueue); clStatus = clReleaseContext(clContext); - free((void*)clSource[0]); + free((void *)clSource[0]); /* * transpose on CPU, updating, producing the final lattice */ /* transpose regions back into lattice */ pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); - for (k = 0; k < nz; k++) { + for (k = 0; k < nz; k++) { zRegionIndex = (k >> 3); zOffset = (k & 7); - for (j = 0; j < ny; j++) { + for (j = 0; j < ny; j++) { yRegionIndex = (j >> 3); yOffset = (j & 7); - for (i = 0; i < nx; i++) { + for (i = 0; i < nx; i++) { xRegionIndex = (i >> 3); xOffset = (i & 7); - thisRegion = regionZeroAddr - + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim - + xRegionIndex) * REGION_SIZE; + thisRegion = regionZeroAddr + + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim + + xRegionIndex) * + REGION_SIZE; indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset; index = (k * ny + j) * nx + i; @@ -502,7 +520,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( #ifndef NEIGHBOR_COUNT lattice->lattice[index] += thisRegion[indexRegion]; #else - neighbor_count += thisRegion[indexRegion]; + neighbor_count += thisRegion[indexRegion]; #endif } } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c @@ -6,24 +6,22 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define 
INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -35,8 +33,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -64,44 +62,45 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) 
floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -112,42 +111,49 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; 
dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ - if (r2 < a2) *pg = 0; + /* If atom and lattice point are too close, set the lattice value + * to zero */ + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h index 513e65d64f72d2b9603a9d7e594417feffb324a5..2bd0ad46d3ac72073a85e97d3c7b51fc999fb006 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h @@ -4,22 +4,24 @@ #ifdef __DEVICE_EMULATION__ #define DEBUG /* define which grid block and which thread to examine */ -#define BX 0 -#define BY 0 -#define TX 0 -#define TY 0 -#define TZ 0 -#define EMU(code) do { \ - if (blockIdx.x==BX && blockIdx.y==BY && \ - threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \ - code; \ - } \ -} while (0) -#define INT(n) printf("%s = %d\n", #n, n) -#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) -#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) -#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \ - (double)(f).y, (double)(f).z, (double)(f).w) +#define BX 0 +#define BY 0 +#define TX 0 +#define TY 0 +#define TZ 0 +#define EMU(code) \ + do { \ + if (blockIdx.x == BX && 
blockIdx.y == BY && threadIdx.x == TX && \ + threadIdx.y == TY && threadIdx.z == TZ) { \ + code; \ + } \ + } while (0) +#define INT(n) printf("%s = %d\n", #n, n) +#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) +#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) +#define FLOAT4(f) \ + printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y, \ + (double)(f).z, (double)(f).w) #else #define EMU(code) #define INT(n) @@ -29,13 +31,12 @@ #endif // report error from OpenCL -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Errorcode = %d\n", clStatus); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Errorcode = %d\n", clStatus); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #undef OPENCL11 @@ -48,7 +49,7 @@ * reserve enough memory for 11^3 stencil of grid cells * this fits within 16K of memory */ -#define NBRLIST_DIM 11 +#define NBRLIST_DIM 11 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM) /* Normally, we're summing electrostatic potential. 
However, for @@ -57,7 +58,7 @@ */ #undef NEIGHBOR_COUNT //#define NEIGHBOR_COUNT - + #ifndef NEIGHBOR_COUNT typedef float ener_t; #else @@ -70,16 +71,16 @@ typedef int ener_t; * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms, * should permit scheduling of up to 3 thread blocks per SM */ -#define BIN_DEPTH 8 /* max number of atoms per bin */ -#define BIN_SIZE 32 /* size of bin in floats */ -#define BIN_SHIFT 5 /* # of bits to shift for mul/div by BIN_SIZE */ -#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ +#define BIN_DEPTH 8 /* max number of atoms per bin */ +#define BIN_SIZE 32 /* size of bin in floats */ +#define BIN_SHIFT 5 /* # of bits to shift for mul/div by BIN_SIZE */ +#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ -#define BIN_LENGTH 4.f /* spatial length in Angstroms */ -#define BIN_INVLEN (1.f / BIN_LENGTH) +#define BIN_LENGTH 4.f /* spatial length in Angstroms */ +#define BIN_INVLEN (1.f / BIN_LENGTH) /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin * so that bin fill should be 80% (for non-empty regions of space) */ -#define REGION_SIZE 512 /* number of floats in lattice region */ +#define REGION_SIZE 512 /* number of floats in lattice region */ #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c index 1e00f3e562d12e4bfd628a497eb56e03cfa9e2f4..bae7ca7339d41724520e1242a9b4d154c1cb073c 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c @@ -6,11 +6,11 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" @@ -18,16 +18,15 @@ #define ERRTOL 1e-4f -#define NOKERNELS 
0 -#define CUTOFF1 1 -#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 - +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 int appenddata(const char *filename, int size, double time) { FILE *fp; - fp=fopen(filename, "a"); + fp = fopen(filename, "a"); if (fp == NULL) { printf("error appending to file %s..\n", filename); return -1; @@ -37,23 +36,19 @@ int appenddata(const char *filename, int size, double time) { return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { LatticeDim ret; - ret.nx = (int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; ret.lo = lo; ret.h = h; return ret; } -Lattice * -create_lattice(LatticeDim dim) -{ +Lattice *create_lattice(LatticeDim dim) { int size; Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); @@ -76,10 +71,7 @@ create_lattice(LatticeDim dim) return lat; } - -void -destroy_lattice(Lattice *lat) -{ +void destroy_lattice(Lattice *lat) { if (lat) { free(lat->lattice); free(lat); @@ -91,13 +83,13 @@ int main(int argc, char *argv[]) { LatticeDim lattice_dim; Lattice *gpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, hi; /* Bounding box with padding */ + Vec3 min_ext, max_ext; /* Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance 
*/ int n; @@ -138,9 +130,10 @@ int main(int argc, char *argv[]) { printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; - hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); lattice_dim = lattice_from_bounding_box(lo, hi, h); gpu_lattice = create_lattice(lattice_dim); @@ -149,7 +142,8 @@ int main(int argc, char *argv[]) { * OpenCL kernel, with overlapped GPU/CPU computation * (Enter and exit the function with the COMPUTE timer active) */ - if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) { + if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, + atom, 0)) { fprintf(stderr, "Computation failed\n"); exit(1); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = 
ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h @@ -2,14 +2,13 @@ #define __OCLH__ void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: 
%d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c index ac45761fb86afd598dfe24f2ecead5622cf00954..145f59cc065131db3461a04f9674a94afbf0cfb5 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c @@ -6,18 +6,16 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> #include <inttypes.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> #include "atom.h" #include "cutoff.h" -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -38,21 +36,21 @@ write_lattice_summary(const char *filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); - //fprintf(outfile,"%f\n",tmp); + // fprintf(outfile,"%f\n",tmp); } /* Write the size of a lattice plane */ { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); - //fprintf(outfile,"%u\n",tmp); + // fprintf(outfile,"%u\n",tmp); } /* Write the plane of lattice data at z=0 and z = nz-1 */ @@ -60,11 +58,11 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), 
outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); -//int i; - //for(i=0;i<100;i++) - //fprintf(outfile,"%f ",lattice_data[i]); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); + // int i; + // for(i=0;i<100;i++) + // fprintf(outfile,"%f ",lattice_data[i]); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h index 2ddd39227e6c043207897e923f9c7076452eff52..78a5f846e2feda2d1142ae0e1ea4f5edb4eb5ad6 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h @@ -15,8 +15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom 
"pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; } - atom = (Atom *) tmp; + atom = (Atom *)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ } /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = 
atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms *atom); +void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c index f0fbdc79f25679053ae2b8fbcd997db178b5a4d4..475a4666e1a6366873dc49d18d311b76ef6cde38 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c @@ -6,11 +6,11 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" @@ -18,15 +18,14 @@ #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice 
*/ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -41,8 +40,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -64,7 +63,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, zmax; @@ -77,44 +76,45 @@ extern int cpu_compute_cutoff_potential_lattice( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; 
n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -125,26 +125,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; 
k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -152,27 +159,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! 
*/ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; *pg += e; } #endif @@ -180,7 +186,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -188,8 +194,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h index 955c788f658ae823e103ea4d040ba4f8c6179fef..13378e5e9be17209476e71e749b44be6733bb8d9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h @@ -15,54 +15,51 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; +/* A structure to record how points in 3D space map to array + elements. Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). 
+*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; - /* Lowest corner of lattice */ - Vec3 lo; + /* Lowest corner of lattice */ + Vec3 lo; - /* Lattice spacing */ - float h; - } LatticeDim; + /* Lattice spacing */ + float h; +} LatticeDim; - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. - */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. +*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); - int gpu_compute_cutoff_potential_lattice6overlap( - struct pb_TimerSet *timers, /* for measuring execution time */ - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ - ); +int gpu_compute_cutoff_potential_lattice6overlap( + struct pb_TimerSet *timers, /* for measuring execution time */ + Lattice *lattice, float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ +); - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); - int remove_exclusions( - Lattice *lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms *atom /* array of atoms */ - ); 
+int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c index be32a48fc61636bfaeff041d831a93b9c18e708f..96ebeafbdf377a2d2e6e8e7f2cf5e1e58a3e7a6a 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c @@ -7,19 +7,19 @@ ***************************************************************************/ #include <CL/cl.h> +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#include "ocl.h" #include "macros.h" +#include "ocl.h" -//OpenCL v1.0 -//cl_int3 not defined +// OpenCL v1.0 +// cl_int3 not defined #ifdef CL_VERSION_1_1 #if CL_VERSION_1_1 != 1 typedef cl_int4 cl_int3; @@ -37,15 +37,13 @@ const cl_version_check = 0; // we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used. 
typedef cl_int4 xyz; -//extern "C" int gpu_compute_cutoff_potential_lattice6overlap( +// extern "C" int gpu_compute_cutoff_potential_lattice6overlap( int gpu_compute_cutoff_potential_lattice6overlap( - struct pb_TimerSet *timers, /* for measuring execution time */ - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ - ) -{ + struct pb_TimerSet *timers, /* for measuring execution time */ + Lattice *lattice, float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -59,8 +57,8 @@ int gpu_compute_cutoff_potential_lattice6overlap( xyz nbrlist[NBRLIST_MAXLEN]; int nbrlistlen = 0; - int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */ - int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */ + int binHistoFull[BIN_DEPTH + 1] = {0}; /* clear every array element */ + int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */ int num_excluded = 0; int xRegionDim, yRegionDim, zRegionDim; @@ -92,16 +90,16 @@ int gpu_compute_cutoff_potential_lattice6overlap( size_t gridDim[3], blockDim[3]; #ifdef NEIGHBOR_COUNT - double neighbor_count = 0; /* used to profile the number of atoms near a - * lattice point */ + double neighbor_count = 0; /* used to profile the number of atoms near a + * lattice point */ #endif // Caller has made the "compute" timer active /* pad lattice to be factor of 8 in each dimension */ - xRegionDim = (int) ceilf(nx/8.f); - yRegionDim = (int) ceilf(ny/8.f); - zRegionDim = (int) ceilf(nz/8.f); + xRegionDim = (int)ceilf(nx / 8.f); + yRegionDim = (int)ceilf(ny / 8.f); + zRegionDim = (int)ceilf(nz / 8.f); lnx = 8 * xRegionDim; lny = 8 * yRegionDim; @@ -109,35 +107,36 @@ int gpu_compute_cutoff_potential_lattice6overlap( lnall = lnx * lny * lnz; /* will receive energies from OpenCL */ - 
regionZeroAddr = (ener_t *) malloc(lnall * sizeof(float)); + regionZeroAddr = (ener_t *)malloc(lnall * sizeof(float)); /* create bins */ - c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ - binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c; - binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c; - binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c; + c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ + binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c; + binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c; + binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c; nbins = binDim.x * binDim.y * binDim.z; - binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4)); + binBaseAddr = (cl_float4 *)calloc(nbins * BIN_DEPTH, sizeof(cl_float4)); binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - bincntBaseAddr = (int *) calloc(nbins, sizeof(int)); + bincntBaseAddr = (int *)calloc(nbins, sizeof(int)); bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c; /* create neighbor list */ - if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) { + if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) { float s = sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); int cnt = 0; /* develop neighbor list around 1 cell */ - if (2*c + 1 > NBRLIST_DIM) { + if (2 * c + 1 > NBRLIST_DIM) { fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-1)/2 * BIN_LENGTH); + (NBRLIST_DIM - 1) / 2 * BIN_LENGTH); return -1; } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; + for (k = -c; k <= c; k++) { + for (j = -c; j <= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2) + continue; nbrlist[cnt].x = i; nbrlist[cnt].y = j; nbrlist[cnt].z = k; 
@@ -146,21 +145,21 @@ int gpu_compute_cutoff_potential_lattice6overlap( } } nbrlistlen = cnt; - } - else if (8*h <= 2*BIN_LENGTH) { - float s = 2.f*sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); + } else if (8 * h <= 2 * BIN_LENGTH) { + float s = 2.f * sqrtf(3); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); int cnt = 0; /* develop neighbor list around 3-cube of cells */ - if (2*c + 3 > NBRLIST_DIM) { + if (2 * c + 3 > NBRLIST_DIM) { fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-3)/2 * BIN_LENGTH); + (NBRLIST_DIM - 3) / 2 * BIN_LENGTH); return -1; } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; + for (k = -c; k <= c; k++) { + for (j = -c; j <= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2) + continue; nbrlist[cnt].x = i; nbrlist[cnt].y = j; nbrlist[cnt].z = k; @@ -169,8 +168,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( } } nbrlistlen = cnt; - } - else { + } else { fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH); return -1; } @@ -178,43 +176,39 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* perform geometric hashing of atoms into bins */ { /* array of extra atoms, permit average of one extra per bin */ - Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom)); + Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom)); int extra_len = 0; - - for (n = 0; n < natoms; n++) { + + for (n = 0; n < natoms; n++) { cl_float4 p; p.x = atom[n].x - xlo; p.y = atom[n].y - ylo; p.z = atom[n].z - zlo; p.w = atom[n].q; - i = (int) floorf(p.x * BIN_INVLEN); - j = (int) floorf(p.y * BIN_INVLEN); - k = (int) floorf(p.z * BIN_INVLEN); - if (i >= -c && i < binDim.x - c && - j >= -c && j < binDim.y - c && - k >= -c && k < binDim.z - c && - atom[n].q != 0) { - int index = (k * binDim.y + j) * binDim.x + i; - cl_float4 *bin = binZeroAddr + 
index * BIN_DEPTH; - int bindex = bincntZeroAddr[index]; - if (bindex < BIN_DEPTH) { - /* copy atom into bin and increase counter for this bin */ - bin[bindex] = p; - bincntZeroAddr[index]++; - } - else { - /* add index to array of extra atoms to be computed with CPU */ - if (extra_len >= nbins) { - fprintf(stderr, "exceeded space for storing extra atoms\n"); - return -1; - } - extra_atoms[extra_len] = atom[n]; - extra_len++; - } - } - else { - /* excluded atoms are either outside bins or neutrally charged */ - num_excluded++; + i = (int)floorf(p.x * BIN_INVLEN); + j = (int)floorf(p.y * BIN_INVLEN); + k = (int)floorf(p.z * BIN_INVLEN); + if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c && + k >= -c && k < binDim.z - c && atom[n].q != 0) { + int index = (k * binDim.y + j) * binDim.x + i; + cl_float4 *bin = binZeroAddr + index * BIN_DEPTH; + int bindex = bincntZeroAddr[index]; + if (bindex < BIN_DEPTH) { + /* copy atom into bin and increase counter for this bin */ + bin[bindex] = p; + bincntZeroAddr[index]++; + } else { + /* add index to array of extra atoms to be computed with CPU */ + if (extra_len >= nbins) { + fprintf(stderr, "exceeded space for storing extra atoms\n"); + return -1; + } + extra_atoms[extra_len] = atom[n]; + extra_len++; + } + } else { + /* excluded atoms are either outside bins or neutrally charged */ + num_excluded++; } } @@ -226,24 +220,24 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* bin stats */ sum = total = 0; - for (n = 0; n < nbins; n++) { - binHistoFull[ bincntBaseAddr[n] ]++; + for (n = 0; n < nbins; n++) { + binHistoFull[bincntBaseAddr[n]]++; sum += bincntBaseAddr[n]; total += BIN_DEPTH; } - avgFillFull = sum / (float) total; + avgFillFull = sum / (float)total; sum = total = 0; - for (k = 0; k < binDim.z - 2*c; k++) { - for (j = 0; j < binDim.y - 2*c; j++) { - for (i = 0; i < binDim.x - 2*c; i++) { + for (k = 0; k < binDim.z - 2 * c; k++) { + for (j = 0; j < binDim.y - 2 * c; j++) { + for (i = 0; i < binDim.x - 
2 * c; i++) { int index = (k * binDim.y + j) * binDim.x + i; - binHistoCover[ bincntZeroAddr[index] ]++; + binHistoCover[bincntZeroAddr[index]]++; sum += bincntZeroAddr[index]; total += BIN_DEPTH; } } } - avgFillCover = sum / (float) total; + avgFillCover = sum / (float)total; if (verbose) { /* report */ @@ -252,25 +246,25 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("cutoff distance = %g\n", cutoff); printf("\n"); printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz); - printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h); + printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h); printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz); - printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h); - printf("number of bytes for lattice data = %u\n", lnall*sizeof(float)); + printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h); + printf("number of bytes for lattice data = %u\n", lnall * sizeof(float)); printf("\n"); printf("bin padding thickness = %d\n", c); - printf("bin cover dimensions = %d %d %d\n", - binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c); + printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c, + binDim.y - 2 * c, binDim.z - 2 * c); printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z); printf("number of bins = %d\n", nbins); printf("total number of atom slots = %d\n", nbins * BIN_DEPTH); printf("%% overhead space = %g\n", - (natoms / (double) (nbins * BIN_DEPTH)) * 100); + (natoms / (double)(nbins * BIN_DEPTH)) * 100); printf("number of bytes for bin data = %u\n", - nbins * BIN_DEPTH * sizeof(cl_float4)); + nbins * BIN_DEPTH * sizeof(cl_float4)); printf("\n"); printf("bin histogram with padding:\n"); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]); sum += binHistoFull[n]; } @@ -279,7 +273,7 @@ int 
gpu_compute_cutoff_potential_lattice6overlap( printf("\n"); printf("bin histogram excluding padding:\n"); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]); sum += binHistoCover[n]; } @@ -287,122 +281,141 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf(" %% average fill: %g\n", avgFillCover * 100); printf("\n"); printf("number of extra atoms = %d\n", extra->size); - printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100); + printf("%% atoms that are extra = %g\n", + (extra->size / (double)natoms) * 100); printf("\n"); /* sanity check on bins */ sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { sum += n * binHistoFull[n]; } sum += extra->size + num_excluded; printf("sanity check on bin histogram with edges: " - "sum + others = %d\n", sum); + "sum + others = %d\n", + sum); sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { + for (n = 0; n <= BIN_DEPTH; n++) { sum += n * binHistoCover[n]; } sum += extra->size + num_excluded; printf("sanity check on bin histogram excluding edges: " - "sum + others = %d\n", sum); + "sum + others = %d\n", + sum); printf("\n"); /* neighbor list */ printf("neighbor list length = %d\n", nbrlistlen); printf("\n"); } - + cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs"); - cl_context clContext = 
clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + + const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-I src/opencl_nvidia -DVERSION_CHECK=%d", cl_version_check); //-cl-nv-verbose + sprintf(clOptions, "-I src/opencl_nvidia -DVERSION_CHECK=%d", + cl_version_check); //-cl-nv-verbose - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice6overlap",&clStatus); + + cl_kernel clKernel = clCreateKernel( + clProgram, "opencl_cutoff_potential_lattice6overlap", &clStatus); CHECK_ERROR("clCreateKernel") /* setup OpenCL kernel parameters */ blockDim[0] = 8; blockDim[1] = 2; blockDim[2] = 8; - gridDim[0] = xRegionDim*blockDim[0]; - gridDim[1] = yRegionDim*blockDim[1]; - gridDim[2] = 1*blockDim[2]; + gridDim[0] = xRegionDim * blockDim[0]; + gridDim[1] = yRegionDim * blockDim[1]; + gridDim[2] = 1 * blockDim[2]; /* allocate and initialize memory on OpenCL device */ pb_SwitchToTimer(timers, pb_TimerID_COPY); if (verbose) { printf("Allocating %.2fMB on OpenCL 
device for potentials\n", - lnall * sizeof(float) / (double) (1024*1024)); + lnall * sizeof(float) / (double)(1024 * 1024)); } - - regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(ener_t),NULL,&clStatus); + + regionZeroCl = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + lnall * sizeof(ener_t), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(ener_t)); + clMemSet(clCommandQueue, regionZeroCl, 0, lnall * sizeof(ener_t)); if (verbose) { printf("Allocating %.2fMB on OpenCL device for atom bins\n", - nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024)); + nbins * BIN_DEPTH * sizeof(cl_float4) / (double)(1024 * 1024)); } - binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus); + binBaseCl = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nbins * BIN_DEPTH * sizeof(cl_float4), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL); + + clStatus = clEnqueueWriteBuffer(clCommandQueue, binBaseCl, CL_TRUE, 0, + nbins * BIN_DEPTH * sizeof(cl_float4), + binBaseAddr, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - //Sub buffers are not supported in OpenCL v1.0 - int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; + // Sub buffers are not supported in OpenCL v1.0 + int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus); + NbrListLen = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(int), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, NbrListLen, CL_TRUE, 0, + sizeof(int), &nbrlistlen, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - NbrList = 
clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus); + NbrList = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + NBRLIST_MAXLEN * sizeof(xyz), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL); + clStatus = + clEnqueueWriteBuffer(clCommandQueue, NbrList, CL_TRUE, 0, + nbrlistlen * sizeof(xyz), nbrlist, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - if (verbose) + if (verbose) printf("\n"); pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); - clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x)); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y)); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset); - clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h); - clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2); - clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2); - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl); - clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen); - clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList); + clStatus = clSetKernelArg(clKernel, 0, sizeof(int), &(binDim.x)); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), &(binDim.y)); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &binBaseCl); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &offset); + clStatus = clSetKernelArg(clKernel, 4, sizeof(float), &h); + clStatus = clSetKernelArg(clKernel, 5, sizeof(float), &cutoff2); + clStatus = clSetKernelArg(clKernel, 6, sizeof(float), &inv_cutoff2); + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &regionZeroCl); + clStatus = clSetKernelArg(clKernel, 9, sizeof(cl_mem), &NbrListLen); + clStatus = clSetKernelArg(clKernel, 10, sizeof(cl_mem), &NbrList); CHECK_ERROR("clSetKernelArg") /*cl_command_queue cutoffstream;*/ @@ -411,21 +424,22
@@ int gpu_compute_cutoff_potential_lattice6overlap( /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); - if(verbose) + if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); - for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { + for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { #ifndef NO_DEBUG printf(" computing plane %d\n", zRegionIndex); -#endif - clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex); +#endif + clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &zRegionIndex); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, + gridDim, blockDim, 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } - /* + /* * handle extra atoms on the CPU, concurrently with the GPU calculations */ @@ -434,10 +448,10 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("computing extra atoms on CPU\n"); if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) { fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed " - "for extra atoms\n"); + "for extra atoms\n"); return -1; } - if(verbose) + if (verbose) printf("\n"); } @@ -455,7 +469,9 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* copy result regions from OpenCL device */ pb_SwitchToTimer(timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(ener_t),regionZeroAddr,0,NULL,NULL); + clStatus = clEnqueueReadBuffer(clCommandQueue, regionZeroCl, CL_TRUE, 0, + lnall * sizeof(ener_t), regionZeroAddr, 0, + NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") /* free OpenCL memory allocations */ @@ -470,28 +486,29 @@ int gpu_compute_cutoff_potential_lattice6overlap( clStatus = 
clReleaseCommandQueue(clCommandQueue); clStatus = clReleaseContext(clContext); - free((void*)clSource[0]); + free((void *)clSource[0]); /* * transpose on CPU, updating, producing the final lattice */ /* transpose regions back into lattice */ pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); - for (k = 0; k < nz; k++) { + for (k = 0; k < nz; k++) { zRegionIndex = (k >> 3); zOffset = (k & 7); - for (j = 0; j < ny; j++) { + for (j = 0; j < ny; j++) { yRegionIndex = (j >> 3); yOffset = (j & 7); - for (i = 0; i < nx; i++) { + for (i = 0; i < nx; i++) { xRegionIndex = (i >> 3); xOffset = (i & 7); - thisRegion = regionZeroAddr - + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim - + xRegionIndex) * REGION_SIZE; + thisRegion = regionZeroAddr + + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim + + xRegionIndex) * + REGION_SIZE; indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset; index = (k * ny + j) * nx + i; @@ -499,7 +516,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( #ifndef NEIGHBOR_COUNT lattice->lattice[index] += thisRegion[indexRegion]; #else - neighbor_count += thisRegion[indexRegion]; + neighbor_count += thisRegion[indexRegion]; #endif } } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c @@ -6,24 +6,22 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - 
float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -35,8 +33,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -64,44 +62,45 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - 
gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -112,42 +111,49 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy 
+ dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ - if (r2 < a2) *pg = 0; + /* If atom and lattice point are too close, set the lattice value + * to zero */ + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h index 513e65d64f72d2b9603a9d7e594417feffb324a5..2bd0ad46d3ac72073a85e97d3c7b51fc999fb006 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h @@ -4,22 +4,24 @@ #ifdef __DEVICE_EMULATION__ #define DEBUG /* define which grid block and which thread to examine */ -#define BX 0 -#define BY 0 -#define TX 0 -#define TY 0 -#define TZ 0 -#define EMU(code) do { \ - if (blockIdx.x==BX && blockIdx.y==BY && \ - threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \ - code; \ - } \ -} while (0) -#define INT(n) printf("%s = %d\n", #n, n) -#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) -#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) -#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \ - (double)(f).y, (double)(f).z, (double)(f).w) +#define BX 0 +#define BY 0 +#define TX 0 +#define TY 0 +#define TZ 0 +#define EMU(code) \ + do { \ + if (blockIdx.x == BX && blockIdx.y == BY && threadIdx.x == TX && \ + threadIdx.y == TY && threadIdx.z == TZ) { \ + code; \ + } \ + } while (0) +#define INT(n) 
printf("%s = %d\n", #n, n) +#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) +#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) +#define FLOAT4(f) \ + printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y, \ + (double)(f).z, (double)(f).w) #else #define EMU(code) #define INT(n) @@ -29,13 +31,12 @@ #endif // report error from OpenCL -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Errorcode = %d\n", clStatus); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Errorcode = %d\n", clStatus); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #undef OPENCL11 @@ -48,7 +49,7 @@ * reserve enough memory for 11^3 stencil of grid cells * this fits within 16K of memory */ -#define NBRLIST_DIM 11 +#define NBRLIST_DIM 11 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM) /* Normally, we're summing electrostatic potential. 
However, for @@ -57,7 +58,7 @@ */ #undef NEIGHBOR_COUNT //#define NEIGHBOR_COUNT - + #ifndef NEIGHBOR_COUNT typedef float ener_t; #else @@ -70,16 +71,16 @@ typedef int ener_t; * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms, * should permit scheduling of up to 3 thread blocks per SM */ -#define BIN_DEPTH 8 /* max number of atoms per bin */ -#define BIN_SIZE 32 /* size of bin in floats */ -#define BIN_SHIFT 5 /* # of bits to shift for mul/div by BIN_SIZE */ -#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ +#define BIN_DEPTH 8 /* max number of atoms per bin */ +#define BIN_SIZE 32 /* size of bin in floats */ +#define BIN_SHIFT 5 /* # of bits to shift for mul/div by BIN_SIZE */ +#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ -#define BIN_LENGTH 4.f /* spatial length in Angstroms */ -#define BIN_INVLEN (1.f / BIN_LENGTH) +#define BIN_LENGTH 4.f /* spatial length in Angstroms */ +#define BIN_INVLEN (1.f / BIN_LENGTH) /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin * so that bin fill should be 80% (for non-empty regions of space) */ -#define REGION_SIZE 512 /* number of floats in lattice region */ +#define REGION_SIZE 512 /* number of floats in lattice region */ #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c index 1e00f3e562d12e4bfd628a497eb56e03cfa9e2f4..bae7ca7339d41724520e1242a9b4d154c1cb073c 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c @@ -6,11 +6,11 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" @@ -18,16 +18,15 @@ #define ERRTOL 1e-4f -#define NOKERNELS 0 -#define CUTOFF1 1 
-#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 - +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 int appenddata(const char *filename, int size, double time) { FILE *fp; - fp=fopen(filename, "a"); + fp = fopen(filename, "a"); if (fp == NULL) { printf("error appending to file %s..\n", filename); return -1; @@ -37,23 +36,19 @@ int appenddata(const char *filename, int size, double time) { return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { LatticeDim ret; - ret.nx = (int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; ret.lo = lo; ret.h = h; return ret; } -Lattice * -create_lattice(LatticeDim dim) -{ +Lattice *create_lattice(LatticeDim dim) { int size; Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); @@ -76,10 +71,7 @@ create_lattice(LatticeDim dim) return lat; } - -void -destroy_lattice(Lattice *lat) -{ +void destroy_lattice(Lattice *lat) { if (lat) { free(lat->lattice); free(lat); @@ -91,13 +83,13 @@ int main(int argc, char *argv[]) { LatticeDim lattice_dim; Lattice *gpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, hi; /* Bounding box with padding */ + Vec3 min_ext, max_ext; /* Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance */ int n; @@ -138,9 
+130,10 @@ int main(int argc, char *argv[]) { printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; - hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); lattice_dim = lattice_from_bounding_box(lo, hi, h); gpu_lattice = create_lattice(lattice_dim); @@ -149,7 +142,8 @@ int main(int argc, char *argv[]) { * OpenCL kernel, with overlapped GPU/CPU computation * (Enter and exit the function with the COMPUTE timer active) */ - if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) { + if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, + atom, 0)) { fprintf(stderr, "Computation failed\n"); exit(1); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + 
long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h @@ -2,14 +2,13 @@ #define __OCLH__ void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != 
CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c index ac45761fb86afd598dfe24f2ecead5622cf00954..145f59cc065131db3461a04f9674a94afbf0cfb5 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c @@ -6,18 +6,16 @@ *cr ***************************************************************************/ -#include <stdio.h> -#include <stdlib.h> #include <inttypes.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> #include "atom.h" #include "cutoff.h" -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -38,21 +36,21 @@ write_lattice_summary(const char *filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); - //fprintf(outfile,"%f\n",tmp); + // fprintf(outfile,"%f\n",tmp); } /* Write the size of a lattice plane */ { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); - //fprintf(outfile,"%u\n",tmp); + // fprintf(outfile,"%u\n",tmp); } /* Write the plane of lattice data at z=0 and z = nz-1 */ @@ -60,11 +58,11 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); -//int i; - 
//for(i=0;i<100;i++) - //fprintf(outfile,"%f ",lattice_data[i]); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); + // int i; + // for(i=0;i<100;i++) + // fprintf(outfile,"%f ",lattice_data[i]); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h index 2ddd39227e6c043207897e923f9c7076452eff52..78a5f846e2feda2d1142ae0e1ea4f5edb4eb5ad6 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h @@ -15,8 +15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom "pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return 
NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; } - atom = (Atom *) tmp; + atom = (Atom *)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ } /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h 
b/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h @@ -13,22 +13,22 @@ extern "C" { #endif - typedef struct Atom_t { - float x, y, z, q; - } Atom; - - typedef struct Atoms_t { - Atom *atoms; - int size; - } Atoms; - - typedef struct Vec3_t { - float x, y, z; - } Vec3; - - Atoms *read_atom_file(const char *fname); - void free_atom(Atoms *atom); - void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); +typedef struct Atom_t { + float x, y, z, q; +} Atom; + +typedef struct Atoms_t { + Atom *atoms; + int size; +} Atoms; + +typedef struct Vec3_t { + float x, y, z; +} Vec3; + +Atoms *read_atom_file(const char *fname); +void free_atom(Atoms *atom); +void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c index f0fbdc79f25679053ae2b8fbcd997db178b5a4d4..475a4666e1a6366873dc49d18d311b76ef6cde38 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c @@ -6,11 +6,11 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" @@ -18,15 +18,14 @@ #undef DEBUG_PASS_RATE #define CHECK_CYLINDER_CPU -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int +cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float 
cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -41,8 +40,8 @@ extern int cpu_compute_cutoff_potential_lattice( const float inv_a2 = 1.f / a2; float s; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -64,7 +63,7 @@ extern int cpu_compute_cutoff_potential_lattice( int ncell, nxcell, nycell, nzcell; int *first, *next; float inv_cellen = INV_CELLEN; - Vec3 minext, maxext; /* Extent of atom bounding box */ + Vec3 minext, maxext; /* Extent of atom bounding box */ float xmin, ymin, zmin; float xmax, ymax, zmax; @@ -77,44 +76,45 @@ extern int cpu_compute_cutoff_potential_lattice( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(natoms * sizeof(int)); - for (n = 0; n < natoms; n++) { + next = (int *)malloc(natoms * sizeof(int)); + for (n = 0; n < natoms; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < natoms; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) 
floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < natoms; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ -125,26 +125,33 @@ extern int cpu_compute_cutoff_potential_lattice( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = 
ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; #ifdef CHECK_CYLINDER_CPU - if (dydz2 >= a2) continue; + if (dydz2 >= a2) + continue; #endif dx = xstart; @@ -152,27 +159,26 @@ extern int cpu_compute_cutoff_potential_lattice( pg = lattice->lattice + index; #if defined(__INTEL_COMPILER) - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s; - *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ + e = q * (1 / sqrtf(r2)) * s; + *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */ } #else - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; - if (r2 >= a2) - { + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; + if (r2 >= a2) { #ifdef DEBUG_PASS_RATE - fail_count++; + fail_count++; #endif - continue; - } + continue; + } #ifdef DEBUG_PASS_RATE - pass_count++; + pass_count++; #endif s = (1.f - r2 * inv_a2); - e = q * (1/sqrtf(r2)) * s * s; + e = q * (1 / sqrtf(r2)) * s * s; *pg += e; } #endif @@ -180,7 +186,7 @@ extern int cpu_compute_cutoff_potential_lattice( } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); @@ -188,8 +194,8 @@ extern int cpu_compute_cutoff_potential_lattice( /* For debugging: print the number of times that the test passed/failed */ #ifdef DEBUG_PASS_RATE - printf ("Pass :%lld\n", pass_count); - printf ("Fail :%lld\n", fail_count); + printf("Pass :%lld\n", pass_count); + printf("Fail :%lld\n", fail_count); #endif return 0; diff --git 
a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h index 477e5649b6ff4f58690fb80a017f8bcec86d135c..0f8b0ff96aaab0c84bfca49c112b717d568815b9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h @@ -15,46 +15,44 @@ extern "C" { #define SHIFTED - /* A structure to record how points in 3D space map to array - elements. Array element (z, y, x) - where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz - maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). - */ - typedef struct LatticeDim_t { - /* Number of lattice points in x, y, z dimensions */ - int nx, ny, nz; - - /* Lowest corner of lattice */ - Vec3 lo; - - /* Lattice spacing */ - float h; - } LatticeDim; - - /* An electric potential field sampled on a regular grid. The - lattice size and grid point positions are specified by 'dim'. - */ - typedef struct Lattice_t { - LatticeDim dim; - float *lattice; - } Lattice; - - LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); - - Lattice *create_lattice(LatticeDim dim); - void destroy_lattice(Lattice *); - - int cpu_compute_cutoff_potential_lattice( - Lattice *lattice, /* the lattice */ - float cutoff, /* cutoff distance */ - Atoms *atoms /* array of atoms */ - ); - - int remove_exclusions( - Lattice *lattice, /* the lattice */ - float exclcutoff, /* exclusion cutoff distance */ - Atoms *atom /* array of atoms */ - ); +/* A structure to record how points in 3D space map to array + elements. Array element (z, y, x) + where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz + maps to coordinate (xlo, ylo, zlo) + h * (x, y, z). +*/ +typedef struct LatticeDim_t { + /* Number of lattice points in x, y, z dimensions */ + int nx, ny, nz; + + /* Lowest corner of lattice */ + Vec3 lo; + + /* Lattice spacing */ + float h; +} LatticeDim; + +/* An electric potential field sampled on a regular grid. The + lattice size and grid point positions are specified by 'dim'. 
+*/ +typedef struct Lattice_t { + LatticeDim dim; + float *lattice; +} Lattice; + +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h); + +Lattice *create_lattice(LatticeDim dim); +void destroy_lattice(Lattice *); + +int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */ + float cutoff, /* cutoff distance */ + Atoms *atoms /* array of atoms */ +); + +int remove_exclusions(Lattice *lattice, /* the lattice */ + float exclcutoff, /* exclusion cutoff distance */ + Atoms *atom /* array of atoms */ +); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c index 7d3b880dafe70f877b596a1e1143489dcef19d2f..31b966e6f4cff21afee17e1ecd33103ec333d08c 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c @@ -7,19 +7,19 @@ ***************************************************************************/ #include <CL/cl.h> +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#include "ocl.h" #include "macros.h" +#include "ocl.h" -//OpenCL v1.0 -//cl_int3 not defined +// OpenCL v1.0 +// cl_int3 not defined #ifdef CL_VERSION_1_1 #if CL_VERSION_1_1 != 1 typedef cl_int4 cl_int3; @@ -34,5 +34,4 @@ const cl_version_check = 0; // we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used. 
typedef cl_int4 xyz; -//extern "C" int gpu_compute_cutoff_potential_lattice6overlap( - +// extern "C" int gpu_compute_cutoff_potential_lattice6overlap( diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c @@ -6,24 +6,22 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#define CELLEN 4.f -#define INV_CELLEN (1.f/CELLEN) +#define CELLEN 4.f +#define INV_CELLEN (1.f / CELLEN) -extern int remove_exclusions( - Lattice *lattice, /* the lattice */ - float cutoff, /* exclusion cutoff distance */ - Atoms *atoms /* array of atoms */ - ) -{ +extern int remove_exclusions(Lattice *lattice, /* the lattice */ + float cutoff, /* exclusion cutoff distance */ + Atoms *atoms /* array of atoms */ +) { int nx = lattice->dim.nx; int ny = lattice->dim.ny; int nz = lattice->dim.nz; @@ -35,8 +33,8 @@ extern int remove_exclusions( const float a2 = cutoff * cutoff; const float inv_gridspacing = 1.f / gridspacing; - const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1; - /* lattice point radius about each atom */ + const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1; + /* lattice point radius about each atom */ int n; int i, j, k; @@ -64,44 +62,45 @@ extern int remove_exclusions( get_atom_extent(&minext, &maxext, atoms); /* number of cells in each dimension */ - nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1; - nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1; - nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1; + nxcell = (int)floorf((maxext.x - minext.x) * 
inv_cellen) + 1; + nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1; + nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1; ncell = nxcell * nycell * nzcell; /* allocate for cursor link list implementation */ - first = (int *) malloc(ncell * sizeof(int)); - for (gindex = 0; gindex < ncell; gindex++) { + first = (int *)malloc(ncell * sizeof(int)); + for (gindex = 0; gindex < ncell; gindex++) { first[gindex] = -1; } - next = (int *) malloc(atoms->size * sizeof(int)); - for (n = 0; n < atoms->size; n++) { + next = (int *)malloc(atoms->size * sizeof(int)); + for (n = 0; n < atoms->size; n++) { next[n] = -1; } /* geometric hashing */ - for (n = 0; n < atoms->size; n++) { - if (0==atom[n].q) continue; /* skip any non-contributing atoms */ - i = (int) floorf((atom[n].x - minext.x) * inv_cellen); - j = (int) floorf((atom[n].y - minext.y) * inv_cellen); - k = (int) floorf((atom[n].z - minext.z) * inv_cellen); - gindex = (k*nycell + j)*nxcell + i; + for (n = 0; n < atoms->size; n++) { + if (0 == atom[n].q) + continue; /* skip any non-contributing atoms */ + i = (int)floorf((atom[n].x - minext.x) * inv_cellen); + j = (int)floorf((atom[n].y - minext.y) * inv_cellen); + k = (int)floorf((atom[n].z - minext.z) * inv_cellen); + gindex = (k * nycell + j) * nxcell + i; next[n] = first[gindex]; first[gindex] = n; } /* traverse the grid cells */ - for (gindex = 0; gindex < ncell; gindex++) { - for (n = first[gindex]; n != -1; n = next[n]) { + for (gindex = 0; gindex < ncell; gindex++) { + for (n = first[gindex]; n != -1; n = next[n]) { x = atom[n].x - xlo; y = atom[n].y - ylo; z = atom[n].z - zlo; q = atom[n].q; /* find closest grid point with position less than or equal to atom */ - ic = (int) (x * inv_gridspacing); - jc = (int) (y * inv_gridspacing); - kc = (int) (z * inv_gridspacing); + ic = (int)(x * inv_gridspacing); + jc = (int)(y * inv_gridspacing); + kc = (int)(z * inv_gridspacing); /* find extent of surrounding box of grid points */ ia = ic - radius; @@ 
-112,42 +111,49 @@ extern int remove_exclusions( kb = kc + radius + 1; /* trim box edges so that they are within grid point lattice */ - if (ia < 0) ia = 0; - if (ib >= nx) ib = nx-1; - if (ja < 0) ja = 0; - if (jb >= ny) jb = ny-1; - if (ka < 0) ka = 0; - if (kb >= nz) kb = nz-1; + if (ia < 0) + ia = 0; + if (ib >= nx) + ib = nx - 1; + if (ja < 0) + ja = 0; + if (jb >= ny) + jb = ny - 1; + if (ka < 0) + ka = 0; + if (kb >= nz) + kb = nz - 1; /* loop over surrounding grid points */ - xstart = ia*gridspacing - x; - ystart = ja*gridspacing - y; - dz = ka*gridspacing - z; - for (k = ka; k <= kb; k++, dz += gridspacing) { - koff = k*ny; - dz2 = dz*dz; + xstart = ia * gridspacing - x; + ystart = ja * gridspacing - y; + dz = ka * gridspacing - z; + for (k = ka; k <= kb; k++, dz += gridspacing) { + koff = k * ny; + dz2 = dz * dz; dy = ystart; - for (j = ja; j <= jb; j++, dy += gridspacing) { - jkoff = (koff + j)*nx; - dydz2 = dy*dy + dz2; + for (j = ja; j <= jb; j++, dy += gridspacing) { + jkoff = (koff + j) * nx; + dydz2 = dy * dy + dz2; dx = xstart; index = jkoff + ia; pg = lattice->lattice + index; - for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { - r2 = dx*dx + dydz2; + for (i = ia; i <= ib; i++, pg++, dx += gridspacing) { + r2 = dx * dx + dydz2; - /* If atom and lattice point are too close, set the lattice value - * to zero */ - if (r2 < a2) *pg = 0; + /* If atom and lattice point are too close, set the lattice value + * to zero */ + if (r2 < a2) + *pg = 0; } } } /* end loop over surrounding grid points */ } /* end loop over atoms in a gridcell */ - } /* end loop over gridcells */ + } /* end loop over gridcells */ /* free memory */ free(next); diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h index 2fcf28332ac0169dadbd3a3367c43399c651663b..9095917846a0cbcef7b00da03f6e7fcedaabdd84 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h +++ 
b/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h @@ -4,22 +4,24 @@ #ifdef __DEVICE_EMULATION__ #define DEBUG /* define which grid block and which thread to examine */ -#define BX 0 -#define BY 0 -#define TX 0 -#define TY 0 -#define TZ 0 -#define EMU(code) do { \ - if (blockIdx.x==BX && blockIdx.y==BY && \ - threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \ - code; \ - } \ -} while (0) -#define INT(n) printf("%s = %d\n", #n, n) -#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) -#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) -#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \ - (double)(f).y, (double)(f).z, (double)(f).w) +#define BX 0 +#define BY 0 +#define TX 0 +#define TY 0 +#define TZ 0 +#define EMU(code) \ + do { \ + if (blockIdx.x == BX && blockIdx.y == BY && threadIdx.x == TX && \ + threadIdx.y == TY && threadIdx.z == TZ) { \ + code; \ + } \ + } while (0) +#define INT(n) printf("%s = %d\n", #n, n) +#define FLOAT(f) printf("%s = %g\n", #f, (double)(f)) +#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z) +#define FLOAT4(f) \ + printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y, \ + (double)(f).z, (double)(f).w) #else #define EMU(code) #define INT(n) @@ -29,12 +31,11 @@ #endif // report error from OpenCL -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #undef OPENCL11 @@ -47,7 +48,7 @@ * reserve enough memory for 11^3 stencil of grid cells * this fits within 16K of memory */ -#define NBRLIST_DIM 11 +#define NBRLIST_DIM 11 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM) /* Normally, we're summing electrostatic potential. 
However, for @@ -56,7 +57,7 @@ */ #undef NEIGHBOR_COUNT //#define NEIGHBOR_COUNT - + #ifndef NEIGHBOR_COUNT typedef float ener_t; #else @@ -69,16 +70,16 @@ typedef int ener_t; * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms, * should permit scheduling of up to 3 thread blocks per SM */ -#define BIN_DEPTH 8 /* max number of atoms per bin */ -#define BIN_SIZE 32 /* size of bin in floats */ -#define BIN_SHIFT 5 /* # of bits to shift for mul/div by BIN_SIZE */ -#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ +#define BIN_DEPTH 8 /* max number of atoms per bin */ +#define BIN_SIZE 32 /* size of bin in floats */ +#define BIN_SHIFT 5 /* # of bits to shift for mul/div by BIN_SIZE */ +#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */ -#define BIN_LENGTH 4.f /* spatial length in Angstroms */ -#define BIN_INVLEN (1.f / BIN_LENGTH) +#define BIN_LENGTH 4.f /* spatial length in Angstroms */ +#define BIN_INVLEN (1.f / BIN_LENGTH) /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin * so that bin fill should be 80% (for non-empty regions of space) */ -#define REGION_SIZE 512 /* number of floats in lattice region */ +#define REGION_SIZE 512 /* number of floats in lattice region */ #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp index c26621737c4c5979d863ccb7b42a8d4132f1b5c1..caf99a5b37daaa28af83cd058c138af1270feff9 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp @@ -6,16 +6,16 @@ *cr ***************************************************************************/ +#include <math.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <parboil.h> #include "atom.h" #include "cutoff.h" -#include "output.h" #include "macros.h" +#include "output.h" #include <visc.h> #define ERRTOL 1e-4f @@ 
-23,1047 +23,985 @@ #define NO_DEBUG //#undef NO_DEBUG -#define NOKERNELS 0 -#define CUTOFF1 1 -#define CUTOFF6 32 -#define CUTOFF6OVERLAP 64 -#define CUTOFFCPU 16384 +#define NOKERNELS 0 +#define CUTOFF1 1 +#define CUTOFF6 32 +#define CUTOFF6OVERLAP 64 +#define CUTOFFCPU 16384 -#define mul24(x,y) (x)*(y) +#define mul24(x, y) (x) * (y) // =================== CUTCP Graph ============================= - // Define a type for a 3D coordinate. Only 3 vector components are needed. // Using int4 type because int3 support is missing on some platforms. -typedef struct __attribute__((__packed__)){ - int x; - int y; - int z; - int w; +typedef struct __attribute__((__packed__)) { + int x; + int y; + int z; + int w; } xyz; // May want to align these -typedef struct __attribute__((__packed__)) __attribute__((aligned(16))){ - float x; - float y; - float z; - float w; +typedef struct __attribute__((__packed__)) __attribute__((aligned(16))) { + float x; + float y; + float z; + float w; } float4; extern float rsqrt(float x); void Allocation(long block) { - // Memory shared between threadblocks - size_t bytes_AtomBinCache = sizeof(float)*BIN_CACHE_MAXLEN * BIN_DEPTH * 4; - void* AtomBinCache = __visc__malloc(bytes_AtomBinCache); - - size_t bytes_myBinIndex = sizeof(xyz); - void* myBinIndex = __visc__malloc(bytes_myBinIndex); - __visc__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex, bytes_myBinIndex); + // Memory shared between threadblocks + size_t bytes_AtomBinCache = sizeof(float) * BIN_CACHE_MAXLEN * BIN_DEPTH * 4; + void *AtomBinCache = __visc__malloc(bytes_AtomBinCache); + + size_t bytes_myBinIndex = sizeof(xyz); + void *myBinIndex = __visc__malloc(bytes_myBinIndex); + __visc__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex, + bytes_myBinIndex); } -void CUTCPLeaf( - int binDim_x, - int binDim_y, - float *binBaseAddr, size_t bytes_binBaseAddr, - int offset, - float h, /* lattice spacing */ - float cutoff2, /* square of cutoff distance */ - float inv_cutoff2, - 
ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */ - int zRegionIndex, - // constant memory arguments the next two - int *NbrListLen, size_t bytes_NbrListLen, - xyz *NbrList, size_t bytes_NbrList, - // local memory args - float* AtomBinCache, size_t bytes_AtomBinCache, - int* myBinIndex, size_t bytes_myBinIndex -) -{ - __visc__hint(visc::DEVICE); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int ly = __visc__getNodeInstanceID_y(thisNode); - int lz = __visc__getNodeInstanceID_z(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); - int dimx = __visc__getNumNodeInstances_x(thisNode); - int dimy = __visc__getNumNodeInstances_y(thisNode); - int gdimx = __visc__getNumNodeInstances_x(parentNode); - int gdimy = __visc__getNumNodeInstances_y(parentNode); - - float* binZeroAddr = binBaseAddr + 4*offset; - - /*__local float AtomBinCache[BIN_CACHE_MAXLEN * BIN_DEPTH * 4];*/ - ener_t *myRegionAddr; - /*__local xyz myBinIndex;*/ - - const int xRegionIndex = gx; - const int yRegionIndex = gy; - - /* thread id */ - const int tid = (lz*dimy+ly)*dimx+lx; - - /* neighbor index */ - int nbrid; - - /* this is the start of the sub-region indexed by tid */ - myRegionAddr = regionZeroAddr + ((zRegionIndex*gdimy - + yRegionIndex)*gdimx + xRegionIndex)*REGION_SIZE; - - /* spatial coordinate of this lattice point */ - float x = (8 * xRegionIndex + lx) * h; - float y = (8 * yRegionIndex + ly) * h; - float z = (8 * zRegionIndex + lz) * h; - - int totalbins = 0; - int numbins; - - /* bin number determined by center of region */ - myBinIndex[0] = (int) floor((8 * xRegionIndex + 4) * h * BIN_INVLEN); - myBinIndex[1] = (int) floor((8 * yRegionIndex + 4) * h * BIN_INVLEN); - myBinIndex[2] = 
(int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN); - - /* first neighbor in list for me to cache */ - nbrid = (tid >> 4); - - numbins = BIN_CACHE_MAXLEN; +void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, + size_t bytes_binBaseAddr, int offset, + float h, /* lattice spacing */ + float cutoff2, /* square of cutoff distance */ + float inv_cutoff2, ener_t *regionZeroAddr, + size_t bytes_regionZeroAddr, /* address of lattice regions + starting at origin */ + int zRegionIndex, + // constant memory arguments the next two + int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, + size_t bytes_NbrList, + // local memory args + float *AtomBinCache, size_t bytes_AtomBinCache, int *myBinIndex, + size_t bytes_myBinIndex) { + __visc__hint(visc::DEVICE); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + regionZeroAddr); + + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int ly = __visc__getNodeInstanceID_y(thisNode); + int lz = __visc__getNodeInstanceID_z(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); + int dimx = __visc__getNumNodeInstances_x(thisNode); + int dimy = __visc__getNumNodeInstances_y(thisNode); + int gdimx = __visc__getNumNodeInstances_x(parentNode); + int gdimy = __visc__getNumNodeInstances_y(parentNode); + + float *binZeroAddr = binBaseAddr + 4 * offset; + + /*__local float AtomBinCache[BIN_CACHE_MAXLEN * BIN_DEPTH * 4];*/ + ener_t *myRegionAddr; + /*__local xyz myBinIndex;*/ + + const int xRegionIndex = gx; + const int yRegionIndex = gy; + + /* thread id */ + const int tid = (lz * dimy + ly) * dimx + lx; + + /* neighbor index */ + int nbrid; + + /* this is the start of the sub-region indexed by tid */ + myRegionAddr = + regionZeroAddr + + ((zRegionIndex * gdimy + yRegionIndex) * gdimx + xRegionIndex) * + REGION_SIZE; + + /* spatial coordinate of this lattice 
point */ + float x = (8 * xRegionIndex + lx) * h; + float y = (8 * yRegionIndex + ly) * h; + float z = (8 * zRegionIndex + lz) * h; + + int totalbins = 0; + int numbins; + + /* bin number determined by center of region */ + myBinIndex[0] = (int)floor((8 * xRegionIndex + 4) * h * BIN_INVLEN); + myBinIndex[1] = (int)floor((8 * yRegionIndex + 4) * h * BIN_INVLEN); + myBinIndex[2] = (int)floor((8 * zRegionIndex + 4) * h * BIN_INVLEN); + + /* first neighbor in list for me to cache */ + nbrid = (tid >> 4); + + numbins = BIN_CACHE_MAXLEN; #ifndef NEIGHBOR_COUNT - ener_t energy0 = 0.f; - ener_t energy1 = 0.f; - ener_t energy2 = 0.f; - ener_t energy3 = 0.f; + ener_t energy0 = 0.f; + ener_t energy1 = 0.f; + ener_t energy2 = 0.f; + ener_t energy3 = 0.f; #else - ener_t energy0 = 0, energy1 = 0, energy2 = 0, energy3 = 0; + ener_t energy0 = 0, energy1 = 0, energy2 = 0, energy3 = 0; #endif - for (totalbins = 0; totalbins < *NbrListLen; totalbins += numbins) { + for (totalbins = 0; totalbins < *NbrListLen; totalbins += numbins) { - int bincnt; + int bincnt; - /* start of where to write in shared memory */ - int startoff = BIN_SIZE * (tid >> 4); + /* start of where to write in shared memory */ + int startoff = BIN_SIZE * (tid >> 4); - /* each half-warp to cache up to 4 atom bins */ - for (bincnt = 0; bincnt < 4 && nbrid < *NbrListLen; bincnt++, nbrid += 8) { + /* each half-warp to cache up to 4 atom bins */ + for (bincnt = 0; bincnt < 4 && nbrid < *NbrListLen; bincnt++, nbrid += 8) { - int i = myBinIndex[0] + NbrList[nbrid].x; - int j = myBinIndex[1] + NbrList[nbrid].y; - int k = myBinIndex[2] + NbrList[nbrid].z; + int i = myBinIndex[0] + NbrList[nbrid].x; + int j = myBinIndex[1] + NbrList[nbrid].y; + int k = myBinIndex[2] + NbrList[nbrid].z; - /* determine global memory location of atom bin */ - float *p_global = (( float *) binZeroAddr) - + (((mul24(k, binDim_y) + j)*binDim_x + i) << BIN_SHIFT); + /* determine global memory location of atom bin */ + float *p_global = + ((float 
*)binZeroAddr) + + (((mul24(k, binDim_y) + j) * binDim_x + i) << BIN_SHIFT); - /* coalesced read from global memory - - * retain same ordering in shared memory for now */ - int binIndex = startoff + (bincnt << (3 + BIN_SHIFT)); - int tidmask = tid & 15; + /* coalesced read from global memory - + * retain same ordering in shared memory for now */ + int binIndex = startoff + (bincnt << (3 + BIN_SHIFT)); + int tidmask = tid & 15; - AtomBinCache[binIndex + tidmask ] = p_global[tidmask ]; - AtomBinCache[binIndex + tidmask+16] = p_global[tidmask+16]; - } + AtomBinCache[binIndex + tidmask] = p_global[tidmask]; + AtomBinCache[binIndex + tidmask + 16] = p_global[tidmask + 16]; + } - __visc__barrier(); - /* no warp divergence */ - if (totalbins + BIN_CACHE_MAXLEN > *NbrListLen) { - numbins = *NbrListLen - totalbins; - } + __visc__barrier(); + /* no warp divergence */ + if (totalbins + BIN_CACHE_MAXLEN > *NbrListLen) { + numbins = *NbrListLen - totalbins; + } - int stopbin = (numbins << BIN_SHIFT); - for (bincnt = 0; bincnt < stopbin; bincnt+=BIN_SIZE) { - int i; + int stopbin = (numbins << BIN_SHIFT); + for (bincnt = 0; bincnt < stopbin; bincnt += BIN_SIZE) { + int i; - for (i = 0; i < BIN_DEPTH; i++) { + for (i = 0; i < BIN_DEPTH; i++) { - int off = bincnt + (i<<2); + int off = bincnt + (i << 2); - float aq = AtomBinCache[off + 3]; - if (0.f == aq) - break; /* no more atoms in bin */ + float aq = AtomBinCache[off + 3]; + if (0.f == aq) + break; /* no more atoms in bin */ - float dx = AtomBinCache[off ] - x; - float dz = AtomBinCache[off + 2] - z; - float dxdz2 = dx*dx + dz*dz; - float dy = AtomBinCache[off + 1] - y; - float r2 = dy*dy + dxdz2; + float dx = AtomBinCache[off] - x; + float dz = AtomBinCache[off + 2] - z; + float dxdz2 = dx * dx + dz * dz; + float dy = AtomBinCache[off + 1] - y; + float r2 = dy * dy + dxdz2; #ifndef NEIGHBOR_COUNT - if (r2 < cutoff2) - { - float s = (1.f - r2 * inv_cutoff2); - energy0 += aq * rsqrt(r2) * s * s; - //energy0 += aq * 
(1.0/__visc__sqrt(r2)) * s * s; - } + if (r2 < cutoff2) { + float s = (1.f - r2 * inv_cutoff2); + energy0 += aq * rsqrt(r2) * s * s; + // energy0 += aq * (1.0/__visc__sqrt(r2)) * s * s; + } #else - energy0 += (r2 < cutoff2); + energy0 += (r2 < cutoff2); #endif - dy -= 2.0f*h; - r2 = dy*dy + dxdz2; + dy -= 2.0f * h; + r2 = dy * dy + dxdz2; #ifndef NEIGHBOR_COUNT - if (r2 < cutoff2) - { - float s = (1.f - r2 * inv_cutoff2); - energy1 += aq * rsqrt(r2) * s * s; - //energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s; - } + if (r2 < cutoff2) { + float s = (1.f - r2 * inv_cutoff2); + energy1 += aq * rsqrt(r2) * s * s; + // energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s; + } #else - energy1 += (r2 < cutoff2); + energy1 += (r2 < cutoff2); #endif - dy -= 2.0f*h; - r2 = dy*dy + dxdz2; + dy -= 2.0f * h; + r2 = dy * dy + dxdz2; #ifndef NEIGHBOR_COUNT - if (r2 < cutoff2) - { - float s = (1.f - r2 * inv_cutoff2); - energy2 += aq * rsqrt(r2) * s * s; - //energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s; - } + if (r2 < cutoff2) { + float s = (1.f - r2 * inv_cutoff2); + energy2 += aq * rsqrt(r2) * s * s; + // energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s; + } #else - energy2 += (r2 < cutoff2); + energy2 += (r2 < cutoff2); #endif - dy -= 2.0f*h; - r2 = dy*dy + dxdz2; + dy -= 2.0f * h; + r2 = dy * dy + dxdz2; #ifndef NEIGHBOR_COUNT - if (r2 < cutoff2) - { - float s = (1.f - r2 * inv_cutoff2); - energy3 += aq * rsqrt(r2) * s * s; - //energy3 += aq * (1.0/rsqrt(r2)) * s * s; - } + if (r2 < cutoff2) { + float s = (1.f - r2 * inv_cutoff2); + energy3 += aq * rsqrt(r2) * s * s; + // energy3 += aq * (1.0/rsqrt(r2)) * s * s; + } #else - energy3 += (r2 < cutoff2); + energy3 += (r2 < cutoff2); #endif - } /* end loop over atoms in bin */ - } /* end loop over cached atom bins */ - __visc__barrier(); - } /* end loop over neighbor list */ - - /* store into global memory */ - myRegionAddr[(tid>>4)*64 + (tid&15) ] = energy0; - myRegionAddr[(tid>>4)*64 + (tid&15) + 16] = energy1; - 
myRegionAddr[(tid>>4)*64 + (tid&15) + 32] = energy2; - myRegionAddr[(tid>>4)*64 + (tid&15) + 48] = energy3; + } /* end loop over atoms in bin */ + } /* end loop over cached atom bins */ + __visc__barrier(); + } /* end loop over neighbor list */ + + /* store into global memory */ + myRegionAddr[(tid >> 4) * 64 + (tid & 15)] = energy0; + myRegionAddr[(tid >> 4) * 64 + (tid & 15) + 16] = energy1; + myRegionAddr[(tid >> 4) * 64 + (tid & 15) + 32] = energy2; + myRegionAddr[(tid >> 4) * 64 + (tid & 15) + 48] = energy3; } -void BlockingCUTCP( - int binDim_x, - int binDim_y, - float4 *binBaseAddr, size_t bytes_binBaseAddr, - int offset, - float h, /* lattice spacing */ - float cutoff2, /* square of cutoff distance */ - float inv_cutoff2, - ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */ - int zRegionIndex, - // constant memory arguments the next two - int *NbrListLen, size_t bytes_NbrListLen, - xyz *NbrList, size_t bytes_NbrList, - long blockx, - long blocky, - long blockz -) { - - __visc__hint(visc::CPU_TARGET); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - - void* AllocationNode = __visc__createNodeND(0, Allocation); - void* CUTCPLeafNode = __visc__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz); - - // Bind Inputs - __visc__bindIn(AllocationNode, 15, 0, 0); // Bind blockx - __visc__bindIn(CUTCPLeafNode, 0, 0, 0); // Bind binDim_x - __visc__bindIn(CUTCPLeafNode, 1, 1, 0); // Bind binDim_y - __visc__bindIn(CUTCPLeafNode, 2, 2, 0); // Bind binBaseAddr - __visc__bindIn(CUTCPLeafNode, 3, 3, 0); // Bind bytes_binBaseAddr - __visc__bindIn(CUTCPLeafNode, 4, 4, 0); // Bind offset - __visc__bindIn(CUTCPLeafNode, 5, 5, 0); // Bind h - __visc__bindIn(CUTCPLeafNode, 6, 6, 0); // Bind cutoff2 - __visc__bindIn(CUTCPLeafNode, 7, 7, 0); // Bind inv_cutoff2 - __visc__bindIn(CUTCPLeafNode, 8, 8, 0); // Bind regionZeroAddr - __visc__bindIn(CUTCPLeafNode, 9, 9, 0); // Bind 
bytes_regionZeroAddr - __visc__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex - __visc__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen - __visc__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen - __visc__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList - __visc__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList - - // Create Edges - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16, 0); // Edge bytes_AtomBinCache - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex - __visc__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18, 0); // Edge bytes_myBinIndex - +void BlockingCUTCP(int binDim_x, int binDim_y, float4 *binBaseAddr, + size_t bytes_binBaseAddr, int offset, + float h, /* lattice spacing */ + float cutoff2, /* square of cutoff distance */ + float inv_cutoff2, ener_t *regionZeroAddr, + size_t bytes_regionZeroAddr, /* address of lattice regions + starting at origin */ + int zRegionIndex, + // constant memory arguments the next two + int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, + size_t bytes_NbrList, long blockx, long blocky, + long blockz) { + + __visc__hint(visc::CPU_TARGET); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + regionZeroAddr); + + void *AllocationNode = __visc__createNodeND(0, Allocation); + void *CUTCPLeafNode = + __visc__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz); + + // Bind Inputs + __visc__bindIn(AllocationNode, 15, 0, 0); // Bind blockx + __visc__bindIn(CUTCPLeafNode, 0, 0, 0); // Bind binDim_x + __visc__bindIn(CUTCPLeafNode, 1, 1, 0); // Bind binDim_y + __visc__bindIn(CUTCPLeafNode, 2, 2, 0); // Bind binBaseAddr + __visc__bindIn(CUTCPLeafNode, 3, 3, 0); // Bind bytes_binBaseAddr + __visc__bindIn(CUTCPLeafNode, 4, 4, 0); // Bind offset + __visc__bindIn(CUTCPLeafNode, 5, 5, 0); // Bind h + __visc__bindIn(CUTCPLeafNode, 6, 6, 0); // Bind cutoff2 + 
__visc__bindIn(CUTCPLeafNode, 7, 7, 0); // Bind inv_cutoff2 + __visc__bindIn(CUTCPLeafNode, 8, 8, 0); // Bind regionZeroAddr + __visc__bindIn(CUTCPLeafNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __visc__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex + __visc__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen + __visc__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen + __visc__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList + __visc__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList + + // Create Edges + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16, + 0); // Edge bytes_AtomBinCache + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18, + 0); // Edge bytes_myBinIndex } typedef struct __attribute__((__packed__)) { - int binDim_x; - int binDim_y; - float4 *binBaseAddr; - size_t bytes_binBaseAddr; - int offset; - float h; /* lattice spacing */ - float cutoff2; /* square of cutoff distance */ - float inv_cutoff2; - ener_t *regionZeroAddr; - size_t bytes_regionZeroAddr; /* address of lattice regions starting at origin */ - int zRegionIndex; - // constant memory arguments the next two - int *NbrListLen; - size_t bytes_NbrListLen; - xyz *NbrList; - size_t bytes_NbrList; - long blockx; - long blocky; - long blockz; - long gridx; - long gridy; - long gridz; + int binDim_x; + int binDim_y; + float4 *binBaseAddr; + size_t bytes_binBaseAddr; + int offset; + float h; /* lattice spacing */ + float cutoff2; /* square of cutoff distance */ + float inv_cutoff2; + ener_t *regionZeroAddr; + size_t + bytes_regionZeroAddr; /* address of lattice regions starting at origin */ + int zRegionIndex; + // constant memory arguments the next two + int *NbrListLen; + size_t bytes_NbrListLen; + xyz *NbrList; + size_t bytes_NbrList; + long blockx; + long blocky; + long blockz; + long gridx; + long 
gridy; + long gridz; } RootIn; -void packData( - RootIn* args, - int binDim_x, - int binDim_y, - float4 *binBaseAddr, size_t bytes_binBaseAddr, - int offset, - float h, /* lattice spacing */ - float cutoff2, /* square of cutoff distance */ - float inv_cutoff2, - ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */ - int zRegionIndex, - // constant memory arguments the next two - int *NbrListLen, size_t bytes_NbrListLen, - xyz *NbrList, size_t bytes_NbrList, - long blockx, - long blocky, - long blockz, - long gridx, - long gridy, - long gridz -) { - args->binDim_x = binDim_x; - args->binDim_y = binDim_y; - args->binBaseAddr = binBaseAddr; - args->bytes_binBaseAddr = bytes_binBaseAddr; - args->offset = offset; - args->h = h; /* lattice spacing */ - args->cutoff2 = cutoff2; /* square of cutoff distance */ - args->inv_cutoff2 = inv_cutoff2; - args->regionZeroAddr = regionZeroAddr; - args->bytes_regionZeroAddr = bytes_regionZeroAddr; /* address of lattice regions starting at origin */ - args->zRegionIndex = zRegionIndex; - // constant memory arguments the next two - args->NbrListLen = NbrListLen; - args->bytes_NbrListLen = bytes_NbrListLen; - args->NbrList = NbrList; - args->bytes_NbrList = bytes_NbrList; - args->blockx = blockx; - args->blocky = blocky; - args->blockz = blockz; - args->gridx = gridx; - args->gridy = gridy; - args->gridz = gridz; - +void packData(RootIn *args, int binDim_x, int binDim_y, float4 *binBaseAddr, + size_t bytes_binBaseAddr, int offset, + float h, /* lattice spacing */ + float cutoff2, /* square of cutoff distance */ + float inv_cutoff2, ener_t *regionZeroAddr, + size_t bytes_regionZeroAddr, /* address of lattice regions + starting at origin */ + int zRegionIndex, + // constant memory arguments the next two + int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, + size_t bytes_NbrList, long blockx, long blocky, long blockz, + long gridx, long gridy, long gridz) { + args->binDim_x = 
binDim_x; + args->binDim_y = binDim_y; + args->binBaseAddr = binBaseAddr; + args->bytes_binBaseAddr = bytes_binBaseAddr; + args->offset = offset; + args->h = h; /* lattice spacing */ + args->cutoff2 = cutoff2; /* square of cutoff distance */ + args->inv_cutoff2 = inv_cutoff2; + args->regionZeroAddr = regionZeroAddr; + args->bytes_regionZeroAddr = + bytes_regionZeroAddr; /* address of lattice regions starting at origin */ + args->zRegionIndex = zRegionIndex; + // constant memory arguments the next two + args->NbrListLen = NbrListLen; + args->bytes_NbrListLen = bytes_NbrListLen; + args->NbrList = NbrList; + args->bytes_NbrList = bytes_NbrList; + args->blockx = blockx; + args->blocky = blocky; + args->blockz = blockz; + args->gridx = gridx; + args->gridy = gridy; + args->gridz = gridz; } -void CUTCPRoot( - int binDim_x, - int binDim_y, - float4 *binBaseAddr, size_t bytes_binBaseAddr, - int offset, - float h, /* lattice spacing */ - float cutoff2, /* square of cutoff distance */ - float inv_cutoff2, - ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */ - int zRegionIndex, - // constant memory arguments the next two - int *NbrListLen, size_t bytes_NbrListLen, - xyz *NbrList, size_t bytes_NbrList, - long blockx, - long blocky, - long blockz, - long gridx, - long gridy, - long gridz -) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - - void* BlockingCUTCPNode = __visc__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz); - - // Bind Inputs - __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x - __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y - __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset - __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h - 
__visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 - __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 - __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex - __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen - __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen - __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList - __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList - __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx - __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky - __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz - +void CUTCPRoot(int binDim_x, int binDim_y, float4 *binBaseAddr, + size_t bytes_binBaseAddr, int offset, + float h, /* lattice spacing */ + float cutoff2, /* square of cutoff distance */ + float inv_cutoff2, ener_t *regionZeroAddr, + size_t bytes_regionZeroAddr, /* address of lattice regions + starting at origin */ + int zRegionIndex, + // constant memory arguments the next two + int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, + size_t bytes_NbrList, long blockx, long blocky, long blockz, + long gridx, long gridy, long gridz) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + regionZeroAddr); + + void *BlockingCUTCPNode = + __visc__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz); + + // Bind Inputs + __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x + __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y + __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset + __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h + 
__visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 + __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 + __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex + __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen + __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen + __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList + __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList + __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx + __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky + __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz } - -void CUTCPWrapper( - int binDim_x, - int binDim_y, - float4 *binBaseAddr, size_t bytes_binBaseAddr, - int offset, - float h, /* lattice spacing */ - float cutoff2, /* square of cutoff distance */ - float inv_cutoff2, - ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */ - int zRegionIndex, - // constant memory arguments the next two - int *NbrListLen, size_t bytes_NbrListLen, - xyz *NbrList, size_t bytes_NbrList, - long blockx, - long blocky, - long blockz, - long gridx, - long gridy, - long gridz -) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - - void* BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot); - - // Bind Inputs - __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x - __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y - __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr - __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset - __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h - __visc__bindIn(BlockingCUTCPNode, 
6, 6, 0); // Bind cutoff2 - __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 - __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex - __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen - __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen - __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList - __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList - __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx - __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky - __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz - __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx - __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy - __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz +void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr, + size_t bytes_binBaseAddr, int offset, + float h, /* lattice spacing */ + float cutoff2, /* square of cutoff distance */ + float inv_cutoff2, ener_t *regionZeroAddr, + size_t bytes_regionZeroAddr, /* address of lattice regions + starting at origin */ + int zRegionIndex, + // constant memory arguments the next two + int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, + size_t bytes_NbrList, long blockx, long blocky, long blockz, + long gridx, long gridy, long gridz) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + regionZeroAddr); + + void *BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot); + + // Bind Inputs + __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x + __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y + __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr + 
__visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset + __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h + __visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 + __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 + __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex + __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen + __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen + __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList + __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList + __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx + __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky + __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz + __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx + __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy + __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz } // ==================== Host Code ============================== int gpu_compute_cutoff_potential_lattice6overlap( - struct pb_TimerSet *timers, /* for measuring execution time */ - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ + struct pb_TimerSet *timers, /* for measuring execution time */ + Lattice *lattice, float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ ); int appenddata(const char *filename, int size, double time) { - FILE *fp; - fp=fopen(filename, "a"); - if (fp == NULL) { - printf("error appending to file %s..\n", filename); - return -1; - } - fprintf(fp, "%d %.3f\n", size, time); - fclose(fp); - return 0; + FILE *fp; + fp = fopen(filename, "a"); + if (fp == NULL) { + printf("error appending to file %s..\n", 
filename); + return -1; + } + fprintf(fp, "%d %.3f\n", size, time); + fclose(fp); + return 0; } -LatticeDim -lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) -{ - LatticeDim ret; +LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) { + LatticeDim ret; - ret.nx = (int) floorf((hi.x-lo.x)/h) + 1; - ret.ny = (int) floorf((hi.y-lo.y)/h) + 1; - ret.nz = (int) floorf((hi.z-lo.z)/h) + 1; - ret.lo = lo; - ret.h = h; + ret.nx = (int)floorf((hi.x - lo.x) / h) + 1; + ret.ny = (int)floorf((hi.y - lo.y) / h) + 1; + ret.nz = (int)floorf((hi.z - lo.z) / h) + 1; + ret.lo = lo; + ret.h = h; - return ret; + return ret; } -Lattice * -create_lattice(LatticeDim dim) -{ - int size; - Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); +Lattice *create_lattice(LatticeDim dim) { + int size; + Lattice *lat = (Lattice *)malloc(sizeof(Lattice)); - if (lat == NULL) { - fprintf(stderr, "Out of memory\n"); - exit(1); - } + if (lat == NULL) { + fprintf(stderr, "Out of memory\n"); + exit(1); + } - lat->dim = dim; + lat->dim = dim; - /* Round up the allocated size to a multiple of 8 */ - size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7; - lat->lattice = (float *)calloc(size, sizeof(float)); + /* Round up the allocated size to a multiple of 8 */ + size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7; + lat->lattice = (float *)calloc(size, sizeof(float)); - if (lat->lattice == NULL) { - fprintf(stderr, "Out of memory\n"); - exit(1); - } + if (lat->lattice == NULL) { + fprintf(stderr, "Out of memory\n"); + exit(1); + } - return lat; + return lat; } - -void -destroy_lattice(Lattice *lat) -{ - if (lat) { - free(lat->lattice); - free(lat); - } +void destroy_lattice(Lattice *lat) { + if (lat) { + free(lat->lattice); + free(lat); + } } int main(int argc, char *argv[]) { - Atoms *atom; + Atoms *atom; - LatticeDim lattice_dim; - Lattice *gpu_lattice; - Vec3 min_ext, max_ext; /* Bounding box of atoms */ - Vec3 lo, hi; /* Bounding box with padding */ + LatticeDim lattice_dim; + Lattice *gpu_lattice; + Vec3 
min_ext, max_ext; /* Bounding box of atoms */ + Vec3 lo, hi; /* Bounding box with padding */ - float h = 0.5f; /* Lattice spacing */ - float cutoff = 12.f; /* Cutoff radius */ - float exclcutoff = 1.f; /* Radius for exclusion */ - float padding = 0.5f; /* Bounding box padding distance */ + float h = 0.5f; /* Lattice spacing */ + float cutoff = 12.f; /* Cutoff radius */ + float exclcutoff = 1.f; /* Radius for exclusion */ + float padding = 0.5f; /* Bounding box padding distance */ - int n; + int n; - struct pb_Parameters *parameters; - struct pb_TimerSet timers; + struct pb_Parameters *parameters; + struct pb_TimerSet timers; - /* Read input parameters */ - parameters = pb_ReadParameters(&argc, argv); - if (parameters == NULL) { - exit(1); - } - - /* Expect one input file */ - if (pb_Parameters_CountInputs(parameters) != 1) { - fprintf(stderr, "Expecting one input file\n"); - exit(1); - } - - - //pb_SwitchToTimer(&timers, pb_TimerID_IO); + /* Read input parameters */ + parameters = pb_ReadParameters(&argc, argv); + if (parameters == NULL) { + exit(1); + } - { - const char *pqrfilename = parameters->inpFiles[0]; - - if (!(atom = read_atom_file(pqrfilename))) { - fprintf(stderr, "read_atom_file() failed\n"); - exit(1); - } - printf("read %d atoms from file '%s'\n", atom->size, pqrfilename); - } - - pb_InitializeTimerSet(&timers); - __visc__init(); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - /* find extent of domain */ - get_atom_extent(&min_ext, &max_ext, atom); - printf("extent of domain is:\n"); - printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z); - printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); - - printf("padding domain by %g Angstroms\n", padding); - lo = (Vec3) { - min_ext.x - padding, min_ext.y - padding, min_ext.z - padding - }; - hi = (Vec3) { - max_ext.x + padding, max_ext.y + padding, max_ext.z + padding - }; - printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); - - lattice_dim = 
lattice_from_bounding_box(lo, hi, h); - gpu_lattice = create_lattice(lattice_dim); - - /* - * OpenCL kernel, with overlapped GPU/CPU computation - * (Enter and exit the function with the COMPUTE timer active) - */ - if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) { - fprintf(stderr, "Computation failed\n"); - exit(1); - } + /* Expect one input file */ + if (pb_Parameters_CountInputs(parameters) != 1) { + fprintf(stderr, "Expecting one input file\n"); + exit(1); + } - /* - * Zero the lattice points that are too close to an atom. This is - * necessary for numerical stability. - */ - if (remove_exclusions(gpu_lattice, exclcutoff, atom)) { - fprintf(stderr, "remove_exclusions() failed for gpu lattice\n"); - exit(1); - } + // pb_SwitchToTimer(&timers, pb_TimerID_IO); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - __visc__cleanup(); - + { + const char *pqrfilename = parameters->inpFiles[0]; - /* Print output */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - if (parameters->outFile) { - write_lattice_summary(parameters->outFile, gpu_lattice); + if (!(atom = read_atom_file(pqrfilename))) { + fprintf(stderr, "read_atom_file() failed\n"); + exit(1); } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - /* Cleanup */ - destroy_lattice(gpu_lattice); - free_atom(atom); - - pb_FreeParameters(parameters); - return 0; + printf("read %d atoms from file '%s'\n", atom->size, pqrfilename); + } + + pb_InitializeTimerSet(&timers); + __visc__init(); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + /* find extent of domain */ + get_atom_extent(&min_ext, &max_ext, atom); + printf("extent of domain is:\n"); + printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z); + printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); + + printf("padding domain by %g Angstroms\n", padding); + lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; + hi = (Vec3){max_ext.x + padding, max_ext.y 
+ padding, max_ext.z + padding}; + printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y, + hi.z - lo.z); + + lattice_dim = lattice_from_bounding_box(lo, hi, h); + gpu_lattice = create_lattice(lattice_dim); + + /* + * OpenCL kernel, with overlapped GPU/CPU computation + * (Enter and exit the function with the COMPUTE timer active) + */ + if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, + atom, 0)) { + fprintf(stderr, "Computation failed\n"); + exit(1); + } + + /* + * Zero the lattice points that are too close to an atom. This is + * necessary for numerical stability. + */ + if (remove_exclusions(gpu_lattice, exclcutoff, atom)) { + fprintf(stderr, "remove_exclusions() failed for gpu lattice\n"); + exit(1); + } + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + __visc__cleanup(); + + /* Print output */ + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (parameters->outFile) { + write_lattice_summary(parameters->outFile, gpu_lattice); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + /* Cleanup */ + destroy_lattice(gpu_lattice); + free_atom(atom); + + pb_FreeParameters(parameters); + return 0; } int gpu_compute_cutoff_potential_lattice6overlap( - struct pb_TimerSet *timers, /* for measuring execution time */ - Lattice *lattice, - float cutoff, /* cutoff distance */ - Atoms *atoms, /* array of atoms */ - int verbose /* print info/debug messages */ -) -{ - int nx = lattice->dim.nx; - int ny = lattice->dim.ny; - int nz = lattice->dim.nz; - float xlo = lattice->dim.lo.x; - float ylo = lattice->dim.lo.y; - float zlo = lattice->dim.lo.z; - float h = lattice->dim.h; - int natoms = atoms->size; - Atom *atom = atoms->atoms; - - xyz nbrlist[NBRLIST_MAXLEN]; - size_t bytes_nbrlist = sizeof(xyz) * NBRLIST_MAXLEN; - int* nbrlistlen = (int*) malloc(sizeof(int)); - - int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */ - int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array 
element */ - int num_excluded = 0; - - int xRegionDim, yRegionDim, zRegionDim; - int xRegionIndex, yRegionIndex, zRegionIndex; - int xOffset, yOffset, zOffset; - int lnx, lny, lnz, lnall; - ener_t *regionZeroAddr, *thisRegion; - int index, indexRegion; - - int c; - xyz binDim; - int nbins; - float4 *binBaseAddr, *binZeroAddr; - int *bincntBaseAddr, *bincntZeroAddr; - Atoms *extra = NULL; - - int i, j, k, n; - int sum, total; - - float avgFillFull, avgFillCover; - const float cutoff2 = cutoff * cutoff; - const float inv_cutoff2 = 1.f / cutoff2; - - long gridDim[3], blockDim[3]; + struct pb_TimerSet *timers, /* for measuring execution time */ + Lattice *lattice, float cutoff, /* cutoff distance */ + Atoms *atoms, /* array of atoms */ + int verbose /* print info/debug messages */ +) { + int nx = lattice->dim.nx; + int ny = lattice->dim.ny; + int nz = lattice->dim.nz; + float xlo = lattice->dim.lo.x; + float ylo = lattice->dim.lo.y; + float zlo = lattice->dim.lo.z; + float h = lattice->dim.h; + int natoms = atoms->size; + Atom *atom = atoms->atoms; + + xyz nbrlist[NBRLIST_MAXLEN]; + size_t bytes_nbrlist = sizeof(xyz) * NBRLIST_MAXLEN; + int *nbrlistlen = (int *)malloc(sizeof(int)); + + int binHistoFull[BIN_DEPTH + 1] = {0}; /* clear every array element */ + int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */ + int num_excluded = 0; + + int xRegionDim, yRegionDim, zRegionDim; + int xRegionIndex, yRegionIndex, zRegionIndex; + int xOffset, yOffset, zOffset; + int lnx, lny, lnz, lnall; + ener_t *regionZeroAddr, *thisRegion; + int index, indexRegion; + + int c; + xyz binDim; + int nbins; + float4 *binBaseAddr, *binZeroAddr; + int *bincntBaseAddr, *bincntZeroAddr; + Atoms *extra = NULL; + + int i, j, k, n; + int sum, total; + + float avgFillFull, avgFillCover; + const float cutoff2 = cutoff * cutoff; + const float inv_cutoff2 = 1.f / cutoff2; + + long gridDim[3], blockDim[3]; #ifdef NEIGHBOR_COUNT - double neighbor_count = 0; /* used to profile the number 
of atoms near a - * lattice point */ + double neighbor_count = 0; /* used to profile the number of atoms near a + * lattice point */ #endif - // Caller has made the "compute" timer active - - /* pad lattice to be factor of 8 in each dimension */ - xRegionDim = (int) ceilf(nx/8.f); - yRegionDim = (int) ceilf(ny/8.f); - zRegionDim = (int) ceilf(nz/8.f); - - lnx = 8 * xRegionDim; - lny = 8 * yRegionDim; - lnz = 8 * zRegionDim; - lnall = lnx * lny * lnz; - - /* will receive energies from OpenCL */ - size_t bytes_regionZeroAddr = lnall * sizeof(ener_t); - regionZeroAddr = (ener_t *) malloc(bytes_regionZeroAddr); - - /* create bins */ - c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ - binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c; - binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c; - binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c; - nbins = binDim.x * binDim.y * binDim.z; - binBaseAddr = (float4 *) calloc(nbins * BIN_DEPTH, sizeof(float4)); - size_t bytes_binBaseAddr = nbins * BIN_DEPTH * sizeof(float4); - - binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - - bincntBaseAddr = (int *) calloc(nbins, sizeof(int)); - bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c; - - /* create neighbor list */ - if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) { - float s = sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); - int cnt = 0; - /* develop neighbor list around 1 cell */ - if (2*c + 1 > NBRLIST_DIM) { - fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-1)/2 * BIN_LENGTH); - return -1; - } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; - nbrlist[cnt].x = i; - nbrlist[cnt].y = j; - nbrlist[cnt].z = k; - cnt++; - } - } - } - *nbrlistlen = cnt; + // Caller has made the "compute" timer active + + /* pad lattice to be factor of 8 in each dimension */ + 
xRegionDim = (int)ceilf(nx / 8.f); + yRegionDim = (int)ceilf(ny / 8.f); + zRegionDim = (int)ceilf(nz / 8.f); + + lnx = 8 * xRegionDim; + lny = 8 * yRegionDim; + lnz = 8 * zRegionDim; + lnall = lnx * lny * lnz; + + /* will receive energies from OpenCL */ + size_t bytes_regionZeroAddr = lnall * sizeof(ener_t); + regionZeroAddr = (ener_t *)malloc(bytes_regionZeroAddr); + + /* create bins */ + c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */ + binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c; + binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c; + binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c; + nbins = binDim.x * binDim.y * binDim.z; + binBaseAddr = (float4 *)calloc(nbins * BIN_DEPTH, sizeof(float4)); + size_t bytes_binBaseAddr = nbins * BIN_DEPTH * sizeof(float4); + + binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; + + bincntBaseAddr = (int *)calloc(nbins, sizeof(int)); + bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c; + + /* create neighbor list */ + if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) { + float s = sqrtf(3); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); + int cnt = 0; + /* develop neighbor list around 1 cell */ + if (2 * c + 1 > NBRLIST_DIM) { + fprintf(stderr, "must have cutoff <= %f\n", + (NBRLIST_DIM - 1) / 2 * BIN_LENGTH); + return -1; } - else if (8*h <= 2*BIN_LENGTH) { - float s = 2.f*sqrtf(3); - float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH); - int cnt = 0; - /* develop neighbor list around 3-cube of cells */ - if (2*c + 3 > NBRLIST_DIM) { - fprintf(stderr, "must have cutoff <= %f\n", - (NBRLIST_DIM-3)/2 * BIN_LENGTH); - return -1; - } - for (k = -c; k <= c; k++) { - for (j = -c; j <= c; j++) { - for (i = -c; i <= c; i++) { - if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue; - nbrlist[cnt].x = i; - nbrlist[cnt].y = j; - nbrlist[cnt].z = k; - cnt++; - } - } + for (k = -c; k <= c; k++) { + for (j = -c; j 
<= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2) + continue; + nbrlist[cnt].x = i; + nbrlist[cnt].y = j; + nbrlist[cnt].z = k; + cnt++; } - *nbrlistlen = cnt; + } } - else { - fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH); - return -1; + *nbrlistlen = cnt; + } else if (8 * h <= 2 * BIN_LENGTH) { + float s = 2.f * sqrtf(3); + float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH); + int cnt = 0; + /* develop neighbor list around 3-cube of cells */ + if (2 * c + 3 > NBRLIST_DIM) { + fprintf(stderr, "must have cutoff <= %f\n", + (NBRLIST_DIM - 3) / 2 * BIN_LENGTH); + return -1; } - - /* perform geometric hashing of atoms into bins */ - { - /* array of extra atoms, permit average of one extra per bin */ - Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom)); - int extra_len = 0; - - for (n = 0; n < natoms; n++) { - float4 p; - p.x = atom[n].x - xlo; - p.y = atom[n].y - ylo; - p.z = atom[n].z - zlo; - p.w = atom[n].q; - i = (int) floorf(p.x * BIN_INVLEN); - j = (int) floorf(p.y * BIN_INVLEN); - k = (int) floorf(p.z * BIN_INVLEN); - if (i >= -c && i < binDim.x - c && - j >= -c && j < binDim.y - c && - k >= -c && k < binDim.z - c && - atom[n].q != 0) { - int index = (k * binDim.y + j) * binDim.x + i; - float4 *bin = binZeroAddr + index * BIN_DEPTH; - int bindex = bincntZeroAddr[index]; - if (bindex < BIN_DEPTH) { - /* copy atom into bin and increase counter for this bin */ - bin[bindex] = p; - bincntZeroAddr[index]++; - } - else { - /* add index to array of extra atoms to be computed with CPU */ - if (extra_len >= nbins) { - fprintf(stderr, "exceeded space for storing extra atoms\n"); - return -1; - } - extra_atoms[extra_len] = atom[n]; - extra_len++; - } - } - else { - /* excluded atoms are either outside bins or neutrally charged */ - num_excluded++; - } + for (k = -c; k <= c; k++) { + for (j = -c; j <= c; j++) { + for (i = -c; i <= c; i++) { + if ((i * i + j * j + k * k) * BIN_LENGTH * 
BIN_LENGTH >= r2) + continue; + nbrlist[cnt].x = i; + nbrlist[cnt].y = j; + nbrlist[cnt].z = k; + cnt++; } - - /* Save result */ - extra = (Atoms *)malloc(sizeof(Atoms)); - extra->atoms = extra_atoms; - extra->size = extra_len; + } + } + *nbrlistlen = cnt; + } else { + fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH); + return -1; + } + + /* perform geometric hashing of atoms into bins */ + { + /* array of extra atoms, permit average of one extra per bin */ + Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom)); + int extra_len = 0; + + for (n = 0; n < natoms; n++) { + float4 p; + p.x = atom[n].x - xlo; + p.y = atom[n].y - ylo; + p.z = atom[n].z - zlo; + p.w = atom[n].q; + i = (int)floorf(p.x * BIN_INVLEN); + j = (int)floorf(p.y * BIN_INVLEN); + k = (int)floorf(p.z * BIN_INVLEN); + if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c && + k >= -c && k < binDim.z - c && atom[n].q != 0) { + int index = (k * binDim.y + j) * binDim.x + i; + float4 *bin = binZeroAddr + index * BIN_DEPTH; + int bindex = bincntZeroAddr[index]; + if (bindex < BIN_DEPTH) { + /* copy atom into bin and increase counter for this bin */ + bin[bindex] = p; + bincntZeroAddr[index]++; + } else { + /* add index to array of extra atoms to be computed with CPU */ + if (extra_len >= nbins) { + fprintf(stderr, "exceeded space for storing extra atoms\n"); + return -1; + } + extra_atoms[extra_len] = atom[n]; + extra_len++; + } + } else { + /* excluded atoms are either outside bins or neutrally charged */ + num_excluded++; + } } - /* bin stats */ - sum = total = 0; - for (n = 0; n < nbins; n++) { - binHistoFull[ bincntBaseAddr[n] ]++; - sum += bincntBaseAddr[n]; + /* Save result */ + extra = (Atoms *)malloc(sizeof(Atoms)); + extra->atoms = extra_atoms; + extra->size = extra_len; + } + + /* bin stats */ + sum = total = 0; + for (n = 0; n < nbins; n++) { + binHistoFull[bincntBaseAddr[n]]++; + sum += bincntBaseAddr[n]; + total += BIN_DEPTH; + } + avgFillFull = sum / (float)total; + sum 
= total = 0; + for (k = 0; k < binDim.z - 2 * c; k++) { + for (j = 0; j < binDim.y - 2 * c; j++) { + for (i = 0; i < binDim.x - 2 * c; i++) { + int index = (k * binDim.y + j) * binDim.x + i; + binHistoCover[bincntZeroAddr[index]]++; + sum += bincntZeroAddr[index]; total += BIN_DEPTH; + } } - avgFillFull = sum / (float) total; - sum = total = 0; - for (k = 0; k < binDim.z - 2*c; k++) { - for (j = 0; j < binDim.y - 2*c; j++) { - for (i = 0; i < binDim.x - 2*c; i++) { - int index = (k * binDim.y + j) * binDim.x + i; - binHistoCover[ bincntZeroAddr[index] ]++; - sum += bincntZeroAddr[index]; - total += BIN_DEPTH; - } - } + } + avgFillCover = sum / (float)total; + + if (verbose) { + /* report */ + printf("number of atoms = %d\n", natoms); + printf("lattice spacing = %g\n", h); + printf("cutoff distance = %g\n", cutoff); + printf("\n"); + printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz); + printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h); + printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz); + printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h); + printf("number of bytes for lattice data = %lu\n", lnall * sizeof(float)); + printf("\n"); + printf("bin padding thickness = %d\n", c); + printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c, + binDim.y - 2 * c, binDim.z - 2 * c); + printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z); + printf("number of bins = %d\n", nbins); + printf("total number of atom slots = %d\n", nbins * BIN_DEPTH); + printf("%% overhead space = %g\n", + (natoms / (double)(nbins * BIN_DEPTH)) * 100); + printf("number of bytes for bin data = %lu\n", + nbins * BIN_DEPTH * sizeof(float4)); + printf("\n"); + printf("bin histogram with padding:\n"); + sum = 0; + for (n = 0; n <= BIN_DEPTH; n++) { + printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]); + sum += binHistoFull[n]; } - avgFillCover = sum / (float) total; - - if (verbose) { - /* 
report */ - printf("number of atoms = %d\n", natoms); - printf("lattice spacing = %g\n", h); - printf("cutoff distance = %g\n", cutoff); - printf("\n"); - printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz); - printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h); - printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz); - printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h); - printf("number of bytes for lattice data = %lu\n", lnall*sizeof(float)); - printf("\n"); - printf("bin padding thickness = %d\n", c); - printf("bin cover dimensions = %d %d %d\n", - binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c); - printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z); - printf("number of bins = %d\n", nbins); - printf("total number of atom slots = %d\n", nbins * BIN_DEPTH); - printf("%% overhead space = %g\n", - (natoms / (double) (nbins * BIN_DEPTH)) * 100); - printf("number of bytes for bin data = %lu\n", - nbins * BIN_DEPTH * sizeof(float4)); - printf("\n"); - printf("bin histogram with padding:\n"); - sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { - printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]); - sum += binHistoFull[n]; - } - printf(" total number of bins: %d\n", sum); - printf(" %% average fill: %g\n", avgFillFull * 100); - printf("\n"); - printf("bin histogram excluding padding:\n"); - sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { - printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]); - sum += binHistoCover[n]; - } - printf(" total number of bins: %d\n", sum); - printf(" %% average fill: %g\n", avgFillCover * 100); - printf("\n"); - printf("number of extra atoms = %d\n", extra->size); - printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100); - printf("\n"); - - /* sanity check on bins */ - sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { - sum += n * binHistoFull[n]; - } - sum += extra->size + num_excluded; - printf("sanity check on bin 
histogram with edges: " - "sum + others = %d\n", sum); - sum = 0; - for (n = 0; n <= BIN_DEPTH; n++) { - sum += n * binHistoCover[n]; - } - sum += extra->size + num_excluded; - printf("sanity check on bin histogram excluding edges: " - "sum + others = %d\n", sum); - printf("\n"); - - /* neighbor list */ - printf("neighbor list length = %d\n", *nbrlistlen); - printf("\n"); + printf(" total number of bins: %d\n", sum); + printf(" %% average fill: %g\n", avgFillFull * 100); + printf("\n"); + printf("bin histogram excluding padding:\n"); + sum = 0; + for (n = 0; n <= BIN_DEPTH; n++) { + printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]); + sum += binHistoCover[n]; } - - // Track visc data - llvm_visc_track_mem(regionZeroAddr, bytes_regionZeroAddr); - llvm_visc_track_mem(binBaseAddr, bytes_binBaseAddr); - llvm_visc_track_mem(nbrlistlen, sizeof(int)); - llvm_visc_track_mem(nbrlist, bytes_nbrlist); - - /* setup OpenCL kernel parameters */ - blockDim[0] = 8; - blockDim[1] = 2; - blockDim[2] = 8; - gridDim[0] = xRegionDim; - gridDim[1] = yRegionDim; - gridDim[2] = 1; - - /* allocate and initialize memory on OpenCL device */ - if (verbose) { - printf("Allocating %.2fMB on OpenCL device for potentials\n", - lnall * sizeof(float) / (double) (1024*1024)); + printf(" total number of bins: %d\n", sum); + printf(" %% average fill: %g\n", avgFillCover * 100); + printf("\n"); + printf("number of extra atoms = %d\n", extra->size); + printf("%% atoms that are extra = %g\n", + (extra->size / (double)natoms) * 100); + printf("\n"); + + /* sanity check on bins */ + sum = 0; + for (n = 0; n <= BIN_DEPTH; n++) { + sum += n * binHistoFull[n]; } - - memset(regionZeroAddr,0,lnall*sizeof(ener_t)); - - if (verbose) { - printf("Allocating %.2fMB on OpenCL device for atom bins\n", - nbins * BIN_DEPTH * sizeof(float4) / (double) (1024*1024)); + sum += extra->size + num_excluded; + printf("sanity check on bin histogram with edges: " + "sum + others = %d\n", + sum); + sum = 0; + for 
(n = 0; n <= BIN_DEPTH; n++) { + sum += n * binHistoCover[n]; } - - //Sub buffers are not supported in OpenCL v1.0 - int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; - - if (verbose) - printf("\n"); - - - RootIn* args = (RootIn*) malloc(sizeof(RootIn)); - packData( args, - binDim.x, - binDim.y, - binBaseAddr, - bytes_binBaseAddr, - offset, - h, - cutoff2, - inv_cutoff2, - regionZeroAddr, - bytes_regionZeroAddr, - zRegionIndex, - nbrlistlen, - (size_t )sizeof(int), - nbrlist, - bytes_nbrlist, - blockDim[0], - blockDim[1], - blockDim[2], - gridDim[0], - gridDim[1], - gridDim[2] - ); - - /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ - pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); - void* CUTCP_DFG; - if(verbose) - printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); - for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { + sum += extra->size + num_excluded; + printf("sanity check on bin histogram excluding edges: " + "sum + others = %d\n", + sum); + printf("\n"); + + /* neighbor list */ + printf("neighbor list length = %d\n", *nbrlistlen); + printf("\n"); + } + + // Track visc data + llvm_visc_track_mem(regionZeroAddr, bytes_regionZeroAddr); + llvm_visc_track_mem(binBaseAddr, bytes_binBaseAddr); + llvm_visc_track_mem(nbrlistlen, sizeof(int)); + llvm_visc_track_mem(nbrlist, bytes_nbrlist); + + /* setup OpenCL kernel parameters */ + blockDim[0] = 8; + blockDim[1] = 2; + blockDim[2] = 8; + gridDim[0] = xRegionDim; + gridDim[1] = yRegionDim; + gridDim[2] = 1; + + /* allocate and initialize memory on OpenCL device */ + if (verbose) { + printf("Allocating %.2fMB on OpenCL device for potentials\n", + lnall * sizeof(float) / (double)(1024 * 1024)); + } + + memset(regionZeroAddr, 0, lnall * sizeof(ener_t)); + + if (verbose) { + printf("Allocating %.2fMB on OpenCL device for atom bins\n", + nbins * BIN_DEPTH * sizeof(float4) / (double)(1024 * 1024)); + } + + // Sub buffers are not supported in OpenCL v1.0 
+ int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH; + + if (verbose) + printf("\n"); + + RootIn *args = (RootIn *)malloc(sizeof(RootIn)); + packData(args, binDim.x, binDim.y, binBaseAddr, bytes_binBaseAddr, offset, h, + cutoff2, inv_cutoff2, regionZeroAddr, bytes_regionZeroAddr, + zRegionIndex, nbrlistlen, (size_t)sizeof(int), nbrlist, + bytes_nbrlist, blockDim[0], blockDim[1], blockDim[2], gridDim[0], + gridDim[1], gridDim[2]); + + /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ + pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); + void *CUTCP_DFG; + if (verbose) + printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); + for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { #ifndef NO_DEBUG - printf(" computing plane %d\n", zRegionIndex); - fflush(stdout); + printf(" computing plane %d\n", zRegionIndex); + fflush(stdout); #endif - args->zRegionIndex = zRegionIndex; - - CUTCP_DFG = __visc__launch(0, CUTCPWrapper, (void*)args); - __visc__wait(CUTCP_DFG); - //llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t)); - } - - /* - * handle extra atoms on the CPU, concurrently with the GPU calculations - */ + args->zRegionIndex = zRegionIndex; - pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); - if (extra->size > 0) { - if(verbose) { - printf("computing extra atoms on CPU\n"); - } + CUTCP_DFG = __visc__launch(0, CUTCPWrapper, (void *)args); + __visc__wait(CUTCP_DFG); + // llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t)); + } - pb_SwitchToTimer(timers, visc_TimerID_MISC); + /* + * handle extra atoms on the CPU, concurrently with the GPU calculations + */ - if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) { - fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed " - "for extra atoms\n"); - return -1; - } - pb_SwitchToTimer(timers, visc_TimerID_MISC); - printf("\n"); + pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); + if (extra->size > 0) { + if (verbose) { + 
printf("computing extra atoms on CPU\n"); } - if(verbose) - printf("Finished OpenCL kernel calls \n"); - - /* copy result regions from OpenCL device */ - pb_SwitchToTimer(timers, pb_TimerID_COPY); - - llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t)); - /* - * transpose on CPU, updating, producing the final lattice - */ - /* transpose regions back into lattice */ - pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); - for (k = 0; k < nz; k++) { - zRegionIndex = (k >> 3); - zOffset = (k & 7); + pb_SwitchToTimer(timers, visc_TimerID_MISC); - for (j = 0; j < ny; j++) { - yRegionIndex = (j >> 3); - yOffset = (j & 7); - - for (i = 0; i < nx; i++) { - xRegionIndex = (i >> 3); - xOffset = (i & 7); - - thisRegion = regionZeroAddr - + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim - + xRegionIndex) * REGION_SIZE; - - indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset; - index = (k * ny + j) * nx + i; + if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) { + fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed " + "for extra atoms\n"); + return -1; + } + pb_SwitchToTimer(timers, visc_TimerID_MISC); + printf("\n"); + } + if (verbose) + printf("Finished OpenCL kernel calls \n"); + + /* copy result regions from OpenCL device */ + pb_SwitchToTimer(timers, pb_TimerID_COPY); + + llvm_visc_request_mem(regionZeroAddr, lnall * sizeof(ener_t)); + + /* + * transpose on CPU, updating, producing the final lattice + */ + /* transpose regions back into lattice */ + pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); + for (k = 0; k < nz; k++) { + zRegionIndex = (k >> 3); + zOffset = (k & 7); + + for (j = 0; j < ny; j++) { + yRegionIndex = (j >> 3); + yOffset = (j & 7); + + for (i = 0; i < nx; i++) { + xRegionIndex = (i >> 3); + xOffset = (i & 7); + + thisRegion = regionZeroAddr + + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim + + xRegionIndex) * + REGION_SIZE; + + indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset; + index = (k * ny + j) * nx + 
i; #ifndef NEIGHBOR_COUNT - lattice->lattice[index] += thisRegion[indexRegion]; + lattice->lattice[index] += thisRegion[indexRegion]; #else - neighbor_count += thisRegion[indexRegion]; + neighbor_count += thisRegion[indexRegion]; #endif - } - } + } } + } #ifdef NEIGHBOR_COUNT - printf("Neighbor count: %f\n", (float)neighbor_count); + printf("Neighbor count: %f\n", (float)neighbor_count); #endif - /* cleanup memory allocations */ - free(regionZeroAddr); - free(binBaseAddr); - free(bincntBaseAddr); - free_atom(extra); + /* cleanup memory allocations */ + free(regionZeroAddr); + free(binBaseAddr); + free(bincntBaseAddr); + free_atom(extra); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + 
fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h @@ -2,14 +2,13 @@ #define __OCLH__ void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c index ac45761fb86afd598dfe24f2ecead5622cf00954..145f59cc065131db3461a04f9674a94afbf0cfb5 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c @@ -6,18 +6,16 @@ *cr ***************************************************************************/ 
-#include <stdio.h> -#include <stdlib.h> #include <inttypes.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> #include "atom.h" #include "cutoff.h" -void -write_lattice_summary(const char *filename, Lattice *lattice) -{ +void write_lattice_summary(const char *filename, Lattice *lattice) { float *lattice_data = lattice->lattice; int nx = lattice->dim.nx; int ny = lattice->dim.ny; @@ -38,21 +36,21 @@ write_lattice_summary(const char *filename, Lattice *lattice) int i; for (i = 0; i < nx * ny * nz; i++) - abspotential += fabs((double) lattice_data[i]); + abspotential += fabs((double)lattice_data[i]); - tmp = (float) abspotential; + tmp = (float)abspotential; fwrite(&tmp, 1, sizeof(float), outfile); - //fprintf(outfile,"%f\n",tmp); + // fprintf(outfile,"%f\n",tmp); } /* Write the size of a lattice plane */ { uint32_t tmp; - tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny); + tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny); fwrite(&tmp, 1, sizeof(uint32_t), outfile); - //fprintf(outfile,"%u\n",tmp); + // fprintf(outfile,"%u\n",tmp); } /* Write the plane of lattice data at z=0 and z = nz-1 */ @@ -60,11 +58,11 @@ write_lattice_summary(const char *filename, Lattice *lattice) int plane_size = nx * ny; fwrite(lattice_data, plane_size, sizeof(float), outfile); - fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float), - outfile); -//int i; - //for(i=0;i<100;i++) - //fprintf(outfile,"%f ",lattice_data[i]); + fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float), + outfile); + // int i; + // for(i=0;i<100;i++) + // fprintf(outfile,"%f ",lattice_data[i]); } /* Cleanup */ diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h index 2ddd39227e6c043207897e923f9c7076452eff52..78a5f846e2feda2d1142ae0e1ea4f5edb4eb5ad6 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h @@ -15,8 
+15,7 @@ extern "C" { #endif -void -write_lattice_summary(const char *filename, Lattice *lattice); +void write_lattice_summary(const char *filename, Lattice *lattice); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c @@ -6,36 +6,33 @@ *cr ***************************************************************************/ +#include "atom.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include "atom.h" - #define LINELEN 96 #define INITLEN 20 - -Atoms *read_atom_file(const char *fname) -{ +Atoms *read_atom_file(const char *fname) { FILE *file; char line[LINELEN]; - Atom *atom; /* Atom array */ - int len = INITLEN; /* Size of atom array */ - int cnt = 0; /* Number of atoms read */ + Atom *atom; /* Atom array */ + int len = INITLEN; /* Size of atom array */ + int cnt = 0; /* Number of atoms read */ /* open atom "pqr" file */ file = fopen(fname, "r"); - if (NULL==file) { + if (NULL == file) { fprintf(stderr, "can't open file \"%s\" for reading\n", fname); return NULL; } /* allocate initial atom array */ - atom = (Atom *) malloc(len * sizeof(Atom)); - if (NULL==atom) { + atom = (Atom *)malloc(len * sizeof(Atom)); + if (NULL == atom) { fprintf(stderr, "can't allocate memory\n"); return NULL; } @@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname) while (fgets(line, LINELEN, file) != NULL) { if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) { - continue; /* skip anything that isn't an atom record */ + continue; /* skip anything that isn't an atom record */ } - if (cnt==len) { /* extend atom array */ - void *tmp = realloc(atom, 2*len*sizeof(Atom)); - if (NULL==tmp) { + if (cnt == len) { /* 
extend atom array */ + void *tmp = realloc(atom, 2 * len * sizeof(Atom)); + if (NULL == tmp) { fprintf(stderr, "can't allocate more memory\n"); return NULL; } - atom = (Atom *) tmp; + atom = (Atom *)tmp; len *= 2; } /* read position coordinates and charge from atom record */ if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x), - &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { - fprintf(stderr, "atom record %d does not have expected format\n", cnt+1); + &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) { + fprintf(stderr, "atom record %d does not have expected format\n", + cnt + 1); return NULL; } - cnt++; /* count atoms as we store them */ + cnt++; /* count atoms as we store them */ } /* verify EOF and close file */ - if ( !feof(file) ) { + if (!feof(file)) { fprintf(stderr, "did not find EOF\n"); return NULL; } @@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname) } } - -void free_atom(Atoms *atom) -{ +void free_atom(Atoms *atom) { if (atom) { free(atom->atoms); free(atom); } } -void -get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) -{ +void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) { Atom *atoms = atom->atoms; int natoms = atom->size; Vec3 lo; diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h index ce9ce82c4acc351d7d239f3053023e964490eabe..0cd4bd055875c814b1712939b73179f7607043ad 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h @@ -14,7 +14,7 @@ #define OMEGA (1.95f) -#define OUTPUT_PRECISION float +#define OUTPUT_PRECISION float #define BOOL int #define TRUE (-1) diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c index 81294ac4455b4a92dfe80b7cb5d0ac0696a4b027..e6ea7c4d621e8470680a125bca11f70a634f2a56 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c @@ 
-4,9 +4,8 @@ #include "lbm.h" #include <math.h> -#include <stdlib.h> #include <stdio.h> - +#include <stdlib.h> #if !defined(SPEC_CPU) #ifdef _OPENMP @@ -16,674 +15,757 @@ /*############################################################################*/ -#define DFL1 (1.0/ 3.0) -#define DFL2 (1.0/18.0) -#define DFL3 (1.0/36.0) +#define DFL1 (1.0 / 3.0) +#define DFL2 (1.0 / 18.0) +#define DFL3 (1.0 / 36.0) /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES, - size = sizeof( LBM_Grid ) + 2*margin*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES, + size = sizeof(LBM_Grid) + 2 * margin * sizeof(float); - *ptr = malloc( size ); - if( ! *ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + *ptr = malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } #if !defined(SPEC_CPU) - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); #endif - *ptr += margin; + *ptr += margin; } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES; +void LBM_freeGrid(float **ptr) { + const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES; - free( *ptr-margin ); - *ptr = NULL; + free(*ptr - margin); + *ptr = NULL; } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP #pragma omp parallel for #endif #endif - SWEEP_START( 0, 0, -2, 
0, 0, SIZE_Z+2 ) - LOCAL( grid, C ) = DFL1; - LOCAL( grid, N ) = DFL2; - LOCAL( grid, S ) = DFL2; - LOCAL( grid, E ) = DFL2; - LOCAL( grid, W ) = DFL2; - LOCAL( grid, T ) = DFL2; - LOCAL( grid, B ) = DFL2; - LOCAL( grid, NE ) = DFL3; - LOCAL( grid, NW ) = DFL3; - LOCAL( grid, SE ) = DFL3; - LOCAL( grid, SW ) = DFL3; - LOCAL( grid, NT ) = DFL3; - LOCAL( grid, NB ) = DFL3; - LOCAL( grid, ST ) = DFL3; - LOCAL( grid, SB ) = DFL3; - LOCAL( grid, ET ) = DFL3; - LOCAL( grid, EB ) = DFL3; - LOCAL( grid, WT ) = DFL3; - LOCAL( grid, WB ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END + SWEEP_START(0, 0, -2, 0, 0, SIZE_Z + 2) + LOCAL(grid, C) = DFL1; + LOCAL(grid, N) = DFL2; + LOCAL(grid, S) = DFL2; + LOCAL(grid, E) = DFL2; + LOCAL(grid, W) = DFL2; + LOCAL(grid, T) = DFL2; + LOCAL(grid, B) = DFL2; + LOCAL(grid, NE) = DFL3; + LOCAL(grid, NW) = DFL3; + LOCAL(grid, SE) = DFL3; + LOCAL(grid, SW) = DFL3; + LOCAL(grid, NT) = DFL3; + LOCAL(grid, NB) = DFL3; + LOCAL(grid, ST) = DFL3; + LOCAL(grid, SB) = DFL3; + LOCAL(grid, ET) = DFL3; + LOCAL(grid, EB) = DFL3; + LOCAL(grid, WT) = DFL3; + LOCAL(grid, WB) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /*############################################################################*/ -void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 ) { - LBM_GridPtr aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2) { + LBM_GridPtr aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' 
) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( x, y ) +#pragma omp parallel for private(x, y) #endif #endif - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_initializeSpecialCellsForChannel( LBM_Grid grid ) { - int x, y, z; +void LBM_initializeSpecialCellsForChannel(LBM_Grid grid) { + int x, y, z; - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( x, y ) +#pragma omp parallel for private(x, y) #endif #endif - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( 
x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - - if( (z == 0 || z == SIZE_Z-1) && - ! TEST_FLAG( grid, x, y, z, OBSTACLE )) - SET_FLAG( grid, x, y, z, IN_OUT_FLOW ); - } - } - } - } + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + + if ((z == 0 || z == SIZE_Z - 1) && + !TEST_FLAG(grid, x, y, z, OBSTACLE)) + SET_FLAG(grid, x, y, z, IN_OUT_FLOW); + } + } + } + } } /*############################################################################*/ -void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ) { - SWEEP_VAR +void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid) { + SWEEP_VAR - float ux, uy, uz, u2, rho; + float ux, uy, uz, u2, rho; - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( ux, uy, uz, u2, rho ) +#pragma omp parallel for private(ux, uy, uz, u2, rho) #endif #endif - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - if( TEST_FLAG_SWEEP( srcGrid, OBSTACLE )) { - DST_C ( dstGrid ) = SRC_C ( srcGrid ); - DST_S ( dstGrid ) = SRC_N ( srcGrid ); - DST_N ( dstGrid ) = SRC_S ( srcGrid ); - DST_W ( dstGrid ) = SRC_E ( srcGrid ); - DST_E ( dstGrid ) = SRC_W ( srcGrid ); - DST_B ( dstGrid ) = SRC_T ( srcGrid ); - DST_T ( dstGrid ) = SRC_B ( srcGrid ); - DST_SW( dstGrid ) = SRC_NE( srcGrid ); - DST_SE( dstGrid ) = SRC_NW( srcGrid ); - DST_NW( dstGrid ) = SRC_SE( srcGrid ); - DST_NE( dstGrid ) = SRC_SW( srcGrid ); - DST_SB( dstGrid ) = SRC_NT( srcGrid ); - DST_ST( dstGrid ) = SRC_NB( srcGrid ); - DST_NB( dstGrid ) = SRC_ST( srcGrid ); - DST_NT( dstGrid ) = SRC_SB( srcGrid ); - DST_WB( dstGrid ) = SRC_ET( srcGrid ); - DST_WT( dstGrid ) = SRC_EB( srcGrid ); - DST_EB( dstGrid ) = SRC_WT( srcGrid ); - DST_ET( dstGrid ) = SRC_WB( srcGrid ); - continue; - } - - 
rho = + SRC_C ( srcGrid ) + SRC_N ( srcGrid ) - + SRC_S ( srcGrid ) + SRC_E ( srcGrid ) - + SRC_W ( srcGrid ) + SRC_T ( srcGrid ) - + SRC_B ( srcGrid ) + SRC_NE( srcGrid ) - + SRC_NW( srcGrid ) + SRC_SE( srcGrid ) - + SRC_SW( srcGrid ) + SRC_NT( srcGrid ) - + SRC_NB( srcGrid ) + SRC_ST( srcGrid ) - + SRC_SB( srcGrid ) + SRC_ET( srcGrid ) - + SRC_EB( srcGrid ) + SRC_WT( srcGrid ) - + SRC_WB( srcGrid ); - - ux = + SRC_E ( srcGrid ) - SRC_W ( srcGrid ) - + SRC_NE( srcGrid ) - SRC_NW( srcGrid ) - + SRC_SE( srcGrid ) - SRC_SW( srcGrid ) - + SRC_ET( srcGrid ) + SRC_EB( srcGrid ) - - SRC_WT( srcGrid ) - SRC_WB( srcGrid ); - uy = + SRC_N ( srcGrid ) - SRC_S ( srcGrid ) - + SRC_NE( srcGrid ) + SRC_NW( srcGrid ) - - SRC_SE( srcGrid ) - SRC_SW( srcGrid ) - + SRC_NT( srcGrid ) + SRC_NB( srcGrid ) - - SRC_ST( srcGrid ) - SRC_SB( srcGrid ); - uz = + SRC_T ( srcGrid ) - SRC_B ( srcGrid ) - + SRC_NT( srcGrid ) - SRC_NB( srcGrid ) - + SRC_ST( srcGrid ) - SRC_SB( srcGrid ) - + SRC_ET( srcGrid ) - SRC_EB( srcGrid ) - + SRC_WT( srcGrid ) - SRC_WB( srcGrid ); - - ux /= rho; - uy /= rho; - uz /= rho; - - if( TEST_FLAG_SWEEP( srcGrid, ACCEL )) { - ux = 0.005f; - uy = 0.002f; - uz = 0.000f; - } - - u2 = 1.5f * (ux*ux + uy*uy + uz*uz); - DST_C ( dstGrid ) = (1.0f-OMEGA)*SRC_C ( srcGrid ) + DFL1*OMEGA*rho*(1.0f - u2); - - DST_N ( dstGrid ) = (1.0f-OMEGA)*SRC_N ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uy*(4.5f*uy + 3.0f) - u2); - DST_S ( dstGrid ) = (1.0f-OMEGA)*SRC_S ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uy*(4.5f*uy - 3.0f) - u2); - DST_E ( dstGrid ) = (1.0f-OMEGA)*SRC_E ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + ux*(4.5f*ux + 3.0f) - u2); - DST_W ( dstGrid ) = (1.0f-OMEGA)*SRC_W ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + ux*(4.5f*ux - 3.0f) - u2); - DST_T ( dstGrid ) = (1.0f-OMEGA)*SRC_T ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uz*(4.5f*uz + 3.0f) - u2); - DST_B ( dstGrid ) = (1.0f-OMEGA)*SRC_B ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uz*(4.5f*uz - 3.0f) - u2); - - DST_NE( dstGrid ) = (1.0f-OMEGA)*SRC_NE( srcGrid 
) + DFL3*OMEGA*rho*(1.0f + (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2); - DST_NW( dstGrid ) = (1.0f-OMEGA)*SRC_NW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2); - DST_SE( dstGrid ) = (1.0f-OMEGA)*SRC_SE( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2); - DST_SW( dstGrid ) = (1.0f-OMEGA)*SRC_SW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2); - DST_NT( dstGrid ) = (1.0f-OMEGA)*SRC_NT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2); - DST_NB( dstGrid ) = (1.0f-OMEGA)*SRC_NB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2); - DST_ST( dstGrid ) = (1.0f-OMEGA)*SRC_ST( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2); - DST_SB( dstGrid ) = (1.0f-OMEGA)*SRC_SB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2); - DST_ET( dstGrid ) = (1.0f-OMEGA)*SRC_ET( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2); - DST_EB( dstGrid ) = (1.0f-OMEGA)*SRC_EB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2); - DST_WT( dstGrid ) = (1.0f-OMEGA)*SRC_WT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2); - DST_WB( dstGrid ) = (1.0f-OMEGA)*SRC_WB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2); - SWEEP_END + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + if (TEST_FLAG_SWEEP(srcGrid, OBSTACLE)) { + DST_C(dstGrid) = SRC_C(srcGrid); + DST_S(dstGrid) = SRC_N(srcGrid); + DST_N(dstGrid) = SRC_S(srcGrid); + DST_W(dstGrid) = SRC_E(srcGrid); + DST_E(dstGrid) = SRC_W(srcGrid); + DST_B(dstGrid) = SRC_T(srcGrid); + DST_T(dstGrid) = SRC_B(srcGrid); + DST_SW(dstGrid) = SRC_NE(srcGrid); + DST_SE(dstGrid) = SRC_NW(srcGrid); + DST_NW(dstGrid) = SRC_SE(srcGrid); + DST_NE(dstGrid) = SRC_SW(srcGrid); + DST_SB(dstGrid) = SRC_NT(srcGrid); + DST_ST(dstGrid) = SRC_NB(srcGrid); + DST_NB(dstGrid) = SRC_ST(srcGrid); + DST_NT(dstGrid) = SRC_SB(srcGrid); + 
DST_WB(dstGrid) = SRC_ET(srcGrid); + DST_WT(dstGrid) = SRC_EB(srcGrid); + DST_EB(dstGrid) = SRC_WT(srcGrid); + DST_ET(dstGrid) = SRC_WB(srcGrid); + continue; + } + + rho = +SRC_C(srcGrid) + SRC_N(srcGrid) + SRC_S(srcGrid) + SRC_E(srcGrid) + + SRC_W(srcGrid) + SRC_T(srcGrid) + SRC_B(srcGrid) + SRC_NE(srcGrid) + + SRC_NW(srcGrid) + SRC_SE(srcGrid) + SRC_SW(srcGrid) + SRC_NT(srcGrid) + + SRC_NB(srcGrid) + SRC_ST(srcGrid) + SRC_SB(srcGrid) + SRC_ET(srcGrid) + + SRC_EB(srcGrid) + SRC_WT(srcGrid) + SRC_WB(srcGrid); + + ux = +SRC_E(srcGrid) - SRC_W(srcGrid) + SRC_NE(srcGrid) - SRC_NW(srcGrid) + + SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_ET(srcGrid) + SRC_EB(srcGrid) - + SRC_WT(srcGrid) - SRC_WB(srcGrid); + uy = +SRC_N(srcGrid) - SRC_S(srcGrid) + SRC_NE(srcGrid) + SRC_NW(srcGrid) - + SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_NT(srcGrid) + SRC_NB(srcGrid) - + SRC_ST(srcGrid) - SRC_SB(srcGrid); + uz = +SRC_T(srcGrid) - SRC_B(srcGrid) + SRC_NT(srcGrid) - SRC_NB(srcGrid) + + SRC_ST(srcGrid) - SRC_SB(srcGrid) + SRC_ET(srcGrid) - SRC_EB(srcGrid) + + SRC_WT(srcGrid) - SRC_WB(srcGrid); + + ux /= rho; + uy /= rho; + uz /= rho; + + if (TEST_FLAG_SWEEP(srcGrid, ACCEL)) { + ux = 0.005f; + uy = 0.002f; + uz = 0.000f; + } + + u2 = 1.5f * (ux * ux + uy * uy + uz * uz); + DST_C(dstGrid) = + (1.0f - OMEGA) * SRC_C(srcGrid) + DFL1 * OMEGA * rho * (1.0f - u2); + + DST_N(dstGrid) = (1.0f - OMEGA) * SRC_N(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy + 3.0f) - u2); + DST_S(dstGrid) = (1.0f - OMEGA) * SRC_S(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy - 3.0f) - u2); + DST_E(dstGrid) = (1.0f - OMEGA) * SRC_E(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux + 3.0f) - u2); + DST_W(dstGrid) = (1.0f - OMEGA) * SRC_W(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux - 3.0f) - u2); + DST_T(dstGrid) = (1.0f - OMEGA) * SRC_T(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + uz * (4.5f * uz + 3.0f) - u2); + DST_B(dstGrid) = (1.0f - OMEGA) * SRC_B(srcGrid) + + DFL2 * OMEGA * 
rho * (1.0f + uz * (4.5f * uz - 3.0f) - u2); + + DST_NE(dstGrid) = (1.0f - OMEGA) * SRC_NE(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux + uy) * (4.5f * (+ux + uy) + 3.0f) - u2); + DST_NW(dstGrid) = (1.0f - OMEGA) * SRC_NW(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux + uy) * (4.5f * (-ux + uy) + 3.0f) - u2); + DST_SE(dstGrid) = (1.0f - OMEGA) * SRC_SE(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux - uy) * (4.5f * (+ux - uy) + 3.0f) - u2); + DST_SW(dstGrid) = (1.0f - OMEGA) * SRC_SW(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux - uy) * (4.5f * (-ux - uy) + 3.0f) - u2); + DST_NT(dstGrid) = (1.0f - OMEGA) * SRC_NT(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+uy + uz) * (4.5f * (+uy + uz) + 3.0f) - u2); + DST_NB(dstGrid) = (1.0f - OMEGA) * SRC_NB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+uy - uz) * (4.5f * (+uy - uz) + 3.0f) - u2); + DST_ST(dstGrid) = (1.0f - OMEGA) * SRC_ST(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-uy + uz) * (4.5f * (-uy + uz) + 3.0f) - u2); + DST_SB(dstGrid) = (1.0f - OMEGA) * SRC_SB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-uy - uz) * (4.5f * (-uy - uz) + 3.0f) - u2); + DST_ET(dstGrid) = (1.0f - OMEGA) * SRC_ET(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux + uz) * (4.5f * (+ux + uz) + 3.0f) - u2); + DST_EB(dstGrid) = (1.0f - OMEGA) * SRC_EB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux - uz) * (4.5f * (+ux - uz) + 3.0f) - u2); + DST_WT(dstGrid) = (1.0f - OMEGA) * SRC_WT(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux + uz) * (4.5f * (-ux + uz) + 3.0f) - u2); + DST_WB(dstGrid) = (1.0f - OMEGA) * SRC_WB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux - uz) * (4.5f * (-ux - uz) + 3.0f) - u2); + SWEEP_END } /*############################################################################*/ -void LBM_handleInOutFlow( LBM_Grid srcGrid ) { - float ux , uy , uz , rho , - ux1, uy1, uz1, rho1, - ux2, uy2, uz2, rho2, - u2, px, py; - SWEEP_VAR +void LBM_handleInOutFlow(LBM_Grid srcGrid) { + float ux, uy, uz, rho, ux1, uy1, uz1, 
rho1, ux2, uy2, uz2, rho2, u2, px, py; + SWEEP_VAR - /* inflow */ - /*voption indep*/ + /* inflow */ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \ - ux2, uy2, uz2, rho2, u2, px, py ) +#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2, \ + uy2, uz2, rho2, u2, px, py) #endif #endif - SWEEP_START( 0, 0, 0, 0, 0, 1 ) - rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WB ); - rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 
0, 0, 2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WB ); - - rho = 2.0*rho1 - rho2; - - px = (SWEEP_X / (0.5*(SIZE_X-1))) - 1.0; - py = (SWEEP_Y / (0.5*(SIZE_Y-1))) - 1.0; - ux = 0.00; - uy = 0.00; - uz = 0.01 * (1.0-px*px) * (1.0-py*py); - - u2 = 1.5 * (ux*ux + uy*uy + uz*uz); - - LOCAL( srcGrid, C ) = DFL1*rho*(1.0 - u2); - - LOCAL( srcGrid, N ) = DFL2*rho*(1.0 + uy*(4.5*uy + 3.0) - u2); - LOCAL( srcGrid, S ) = DFL2*rho*(1.0 + uy*(4.5*uy - 3.0) - u2); - LOCAL( srcGrid, E ) = DFL2*rho*(1.0 + ux*(4.5*ux + 3.0) - u2); - LOCAL( srcGrid, W ) = DFL2*rho*(1.0 + ux*(4.5*ux - 3.0) - u2); - LOCAL( srcGrid, T ) = DFL2*rho*(1.0 + uz*(4.5*uz + 3.0) - u2); - LOCAL( srcGrid, B ) = DFL2*rho*(1.0 + uz*(4.5*uz - 3.0) - u2); - - LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2); - LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2); - LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2); - LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2); - LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2); - LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2); - LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2); - LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2); - LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2); - LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2); - SWEEP_END - - /* outflow */ - /*voption indep*/ + SWEEP_START(0, 0, 0, 0, 0, 1) + rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, N) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, W) + + 
GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WB); + rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, N) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WB); + + rho = 2.0 * rho1 - rho2; + + px = (SWEEP_X / (0.5 * (SIZE_X - 1))) - 1.0; + py = (SWEEP_Y / (0.5 * (SIZE_Y - 1))) - 1.0; + ux = 0.00; + uy = 0.00; + uz = 0.01 * (1.0 - px * px) * (1.0 - py * py); + + u2 = 1.5 * (ux * ux + uy * uy + uz * uz); + + LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2); + + LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2); + LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2); + LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2); + LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2); + 
LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2); + LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2); + + LOCAL(srcGrid, NE) = + DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2); + LOCAL(srcGrid, NW) = + DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2); + LOCAL(srcGrid, SE) = + DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2); + LOCAL(srcGrid, SW) = + DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2); + LOCAL(srcGrid, NT) = + DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2); + LOCAL(srcGrid, NB) = + DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ST) = + DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2); + LOCAL(srcGrid, SB) = + DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ET) = + DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2); + LOCAL(srcGrid, EB) = + DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2); + LOCAL(srcGrid, WT) = + DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2); + LOCAL(srcGrid, WB) = + DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2); + SWEEP_END + + /* outflow */ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \ - ux2, uy2, uz2, rho2, u2, px, py ) +#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2, \ + uy2, uz2, rho2, u2, px, py) #endif #endif - SWEEP_START( 0, 0, SIZE_Z-1, 0, 0, SIZE_Z ) - rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) + 
GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB ); - ux1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB ); - uy1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ); - uz1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB ); - - ux1 /= rho1; - uy1 /= rho1; - uz1 /= rho1; - - rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, 
S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB ); - ux2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB ); - uy2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, S ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ); - uz2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) - + 
GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB ); - - ux2 /= rho2; - uy2 /= rho2; - uz2 /= rho2; - - rho = 1.0; - - ux = 2*ux1 - ux2; - uy = 2*uy1 - uy2; - uz = 2*uz1 - uz2; - - u2 = 1.5 * (ux*ux + uy*uy + uz*uz); - - LOCAL( srcGrid, C ) = DFL1*rho*(1.0 - u2); - - LOCAL( srcGrid, N ) = DFL2*rho*(1.0 + uy*(4.5*uy + 3.0) - u2); - LOCAL( srcGrid, S ) = DFL2*rho*(1.0 + uy*(4.5*uy - 3.0) - u2); - LOCAL( srcGrid, E ) = DFL2*rho*(1.0 + ux*(4.5*ux + 3.0) - u2); - LOCAL( srcGrid, W ) = DFL2*rho*(1.0 + ux*(4.5*ux - 3.0) - u2); - LOCAL( srcGrid, T ) = DFL2*rho*(1.0 + uz*(4.5*uz + 3.0) - u2); - LOCAL( srcGrid, B ) = DFL2*rho*(1.0 + uz*(4.5*uz - 3.0) - u2); - - LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2); - LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2); - LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2); - LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2); - LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2); - LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2); - LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2); - LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2); - LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2); - LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2); - SWEEP_END + SWEEP_START(0, 0, SIZE_Z - 1, 0, 0, SIZE_Z) + rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) + + 
GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); + ux1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); + uy1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB); + uz1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); + + ux1 /= rho1; + uy1 /= rho1; + uz1 /= rho1; + + rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) + 
+ GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); + ux2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); + uy2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB); + uz2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) + + 
GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); + + ux2 /= rho2; + uy2 /= rho2; + uz2 /= rho2; + + rho = 1.0; + + ux = 2 * ux1 - ux2; + uy = 2 * uy1 - uy2; + uz = 2 * uz1 - uz2; + + u2 = 1.5 * (ux * ux + uy * uy + uz * uz); + + LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2); + + LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2); + LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2); + LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2); + LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2); + LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2); + LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2); + + LOCAL(srcGrid, NE) = + DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2); + LOCAL(srcGrid, NW) = + DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2); + LOCAL(srcGrid, SE) = + DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2); + LOCAL(srcGrid, SW) = + DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2); + LOCAL(srcGrid, NT) = + DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2); + LOCAL(srcGrid, NB) = + DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ST) = + DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2); + LOCAL(srcGrid, SB) = + DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ET) = + DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2); + LOCAL(srcGrid, EB) = + DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2); + LOCAL(srcGrid, WT) = + DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2); + LOCAL(srcGrid, WB) = + DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2); + SWEEP_END } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - 
nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = + LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, 
nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = +LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* 
big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } 
/*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - int x, y, z; - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? "wb" : "w") ); - - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - rho = + GRID_ENTRY( grid, x, y, z, C ) + GRID_ENTRY( grid, x, y, z, N ) - + GRID_ENTRY( grid, x, y, z, S ) + GRID_ENTRY( grid, x, y, z, E ) - + GRID_ENTRY( grid, x, y, z, W ) + GRID_ENTRY( grid, x, y, z, T ) - + GRID_ENTRY( grid, x, y, z, B ) + GRID_ENTRY( grid, x, y, z, NE ) - + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE ) - + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT ) - + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST ) - + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET ) - + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT ) - + GRID_ENTRY( grid, x, y, z, WB ); - ux = + GRID_ENTRY( grid, x, y, z, E ) - GRID_ENTRY( grid, x, y, z, W ) - + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) - + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) - - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - uy = + GRID_ENTRY( grid, x, y, z, N ) - GRID_ENTRY( grid, x, y, z, S ) - + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) - - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) - - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ); - uz = + GRID_ENTRY( grid, x, y, z, T ) - GRID_ENTRY( grid, x, y, z, B ) - + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) - + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) - + GRID_ENTRY( grid, x, y, 
z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) - + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - } - } - } - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + int x, y, z; + OUTPUT_PRECISION rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? "wb" : "w")); + + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) + + GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) + + GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) + + GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) + + GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) + + GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) + + GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) + + GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) + + GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) + + GRID_ENTRY(grid, x, y, z, WB); + ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) + + GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) + + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) - + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) + + GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) - + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) - + 
GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB); + uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) + + GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) + + GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) + + GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) + + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + } + } + } + + fclose(file); } /*############################################################################*/ -void LBM_compareVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - int x, y, z; - float rho, ux, uy, uz; - OUTPUT_PRECISION fileUx, fileUy, fileUz, - dUx, dUy, dUz, - diff2, maxDiff2 = -1e+30; - - FILE* file = fopen( filename, (binary ? 
"rb" : "r") ); - - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - rho = + GRID_ENTRY( grid, x, y, z, C ) + GRID_ENTRY( grid, x, y, z, N ) - + GRID_ENTRY( grid, x, y, z, S ) + GRID_ENTRY( grid, x, y, z, E ) - + GRID_ENTRY( grid, x, y, z, W ) + GRID_ENTRY( grid, x, y, z, T ) - + GRID_ENTRY( grid, x, y, z, B ) + GRID_ENTRY( grid, x, y, z, NE ) - + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE ) - + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT ) - + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST ) - + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET ) - + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT ) - + GRID_ENTRY( grid, x, y, z, WB ); - ux = + GRID_ENTRY( grid, x, y, z, E ) - GRID_ENTRY( grid, x, y, z, W ) - + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) - + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) - - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - uy = + GRID_ENTRY( grid, x, y, z, N ) - GRID_ENTRY( grid, x, y, z, S ) - + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) - - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) - - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ); - uz = + GRID_ENTRY( grid, x, y, z, T ) - GRID_ENTRY( grid, x, y, z, B ) - + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) - + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) - + GRID_ENTRY( grid, x, y, z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) - + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - loadValue( file, &fileUx ); - loadValue( file, &fileUy ); - loadValue( file, &fileUz ); - } 
- else { - if( sizeof( OUTPUT_PRECISION ) == sizeof( double )) { - fscanf( file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz ); - } - else { - fscanf( file, "%f %f %f\n", &fileUx, &fileUy, &fileUz ); - } - } - - dUx = ux - fileUx; - dUy = uy - fileUy; - dUz = uz - fileUz; - diff2 = dUx*dUx + dUy*dUy + dUz*dUz; - if( diff2 > maxDiff2 ) maxDiff2 = diff2; - } - } - } +void LBM_compareVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + int x, y, z; + float rho, ux, uy, uz; + OUTPUT_PRECISION fileUx, fileUy, fileUz, dUx, dUy, dUz, diff2, + maxDiff2 = -1e+30; + + FILE *file = fopen(filename, (binary ? "rb" : "r")); + + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) + + GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) + + GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) + + GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) + + GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) + + GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) + + GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) + + GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) + + GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) + + GRID_ENTRY(grid, x, y, z, WB); + ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) + + GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) + + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) - + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) + + GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) - + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) - + GRID_ENTRY(grid, x, y, z, ST) 
- GRID_ENTRY(grid, x, y, z, SB); + uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) + + GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) + + GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) + + GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) + + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + loadValue(file, &fileUx); + loadValue(file, &fileUy); + loadValue(file, &fileUz); + } else { + if (sizeof(OUTPUT_PRECISION) == sizeof(double)) { + fscanf(file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz); + } else { + fscanf(file, "%f %f %f\n", &fileUx, &fileUy, &fileUz); + } + } + + dUx = ux - fileUx; + dUy = uy - fileUy; + dUz = uz - fileUz; + diff2 = dUx * dUx + dUy * dUy + dUz * dUz; + if (diff2 > maxDiff2) + maxDiff2 = diff2; + } + } + } #if defined(SPEC_CPU) - printf( "LBM_compareVelocityField: maxDiff = %e \n\n", - sqrt( maxDiff2 ) ); + printf("LBM_compareVelocityField: maxDiff = %e \n\n", sqrt(maxDiff2)); #else - printf( "LBM_compareVelocityField: maxDiff = %e ==> %s\n\n", - sqrt( maxDiff2 ), - sqrt( maxDiff2 ) > 1e-5 ? "##### ERROR #####" : "OK" ); + printf("LBM_compareVelocityField: maxDiff = %e ==> %s\n\n", sqrt(maxDiff2), + sqrt(maxDiff2) > 1e-5 ? 
"##### ERROR #####" : "OK"); #endif - fclose( file ); + fclose(file); } - diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h index e35818c0b300593f382a61131e7a35584d35cee1..94189f0f2bcc080ed79e42941b5a0638649d46e3 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h @@ -18,30 +18,31 @@ typedef enum {C = 0, NT, NB, ST, SB, ET, EB, WT, WB, FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; - */ + */ #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; - +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_initializeSpecialCellsForChannel( LBM_Grid grid ); -void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 ); -void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ); -void LBM_handleInOutFlow( LBM_Grid srcGrid ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); -void LBM_compareVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_initializeSpecialCellsForChannel(LBM_Grid grid); +void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2); +void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid); +void LBM_handleInOutFlow(LBM_Grid srcGrid); +void 
LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); +void LBM_compareVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h index 42c999e204dffc83c1affe8d56e086dcf1815b43..92b4c1b21dc9d87531691b3fce4bd1ff01b201f8 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h @@ -3,163 +3,204 @@ #ifndef _LBM_MACROS_H_ #define _LBM_MACROS_H_ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; -#define SIZE (120) -#define SIZE_X (1*SIZE) -#define SIZE_Y (1*SIZE) +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; +#define SIZE (120) +#define SIZE_X (1 * SIZE) +#define SIZE_Y (1 * SIZE) #define SIZE_Z (150) /*############################################################################*/ -typedef float LBM_Grid[SIZE_Z*SIZE_Y*SIZE_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float LBM_Grid[SIZE_Z * SIZE_Y * SIZE_X * N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ -#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \ - (y)*SIZE_X+(z)*SIZE_X*SIZE_Y)) +#define CALC_INDEX(x, y, z, e) \ + ((e) + N_CELL_ENTRIES * ((x) + (y)*SIZE_X + (z)*SIZE_X * SIZE_Y)) #define SWEEP_VAR int i; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( i = CALC_INDEX(x1, y1, z1, 0); \ - i < CALC_INDEX(x2, y2, z2, 0); \ - i += N_CELL_ENTRIES ) { +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (i = CALC_INDEX(x1, y1, z1, 0); i < 
CALC_INDEX(x2, y2, z2, 0); \ + i += N_CELL_ENTRIES) { #define SWEEP_END } -#define SWEEP_X ((i / N_CELL_ENTRIES) % SIZE_X) +#define SWEEP_X ((i / N_CELL_ENTRIES) % SIZE_X) #define SWEEP_Y (((i / N_CELL_ENTRIES) / SIZE_X) % SIZE_Y) -#define SWEEP_Z ((i / N_CELL_ENTRIES) / (SIZE_X*SIZE_Y)) - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_Z ((i / N_CELL_ENTRIES) / (SIZE_X * SIZE_Y)) + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX(dx, dy, dz, e) + (i)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) 
(GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #define COLLIDE_STREAM #ifdef COLLIDE_STREAM -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) 
-#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) 
+#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* COLLIDE_STREAM */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define 
SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* COLLIDE_STREAM */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* const _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *const _aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } 
+#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c index 85600dbfdf20059a71694b7ae72f0243ee5c82eb..6985e3e58b300a7fad88ed4623340562693c80bd 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c @@ -8,10 +8,10 @@ #include <stdlib.h> #if defined(SPEC_CPU) -# include <time.h> +#include <time.h> #else -# include <sys/times.h> -# include <unistd.h> +#include <sys/times.h> +#include <unistd.h> #endif #include <sys/stat.h> @@ -23,168 +23,169 @@ static LBM_GridPtr srcGrid, dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; +int main(int nArgs, char *arg[]) { + MAIN_Param param; #if !defined(SPEC_CPU) - MAIN_Time time; + MAIN_Time time; #endif - int t; + int t; - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); 
- MAIN_initialize( ¶m ); + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); + MAIN_initialize(¶m); #if !defined(SPEC_CPU) - MAIN_startClock( &time ); + MAIN_startClock(&time); #endif - for( t = 1; t <= param.nTimeSteps; t++ ) { - if( param.simType == CHANNEL ) { - LBM_handleInOutFlow( *srcGrid ); - } + for (t = 1; t <= param.nTimeSteps; t++) { + if (param.simType == CHANNEL) { + LBM_handleInOutFlow(*srcGrid); + } - LBM_performStreamCollide( *srcGrid, *dstGrid ); - LBM_swapGrids( &srcGrid, &dstGrid ); + LBM_performStreamCollide(*srcGrid, *dstGrid); + LBM_swapGrids(&srcGrid, &dstGrid); - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); - //LBM_showGridStatistics( *srcGrid ); - } - } + if ((t & 63) == 0) { + printf("timestep: %i\n", t); + // LBM_showGridStatistics( *srcGrid ); + } + } #if !defined(SPEC_CPU) - MAIN_stopClock( &time, ¶m ); + MAIN_stopClock(&time, ¶m); #endif - MAIN_finalize( ¶m ); + MAIN_finalize(¶m); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); - return 0; + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params) { - struct stat fileStat; - - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } - - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) 
fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } - } - else param->obstacleFilename = NULL; - - param->resultFilename = params->outFile; - param->action = STORE; - param->simType = LDC; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; + + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); + } + } else + param->obstacleFilename = NULL; + + param->resultFilename = params->outFile; + param->action = STORE; + param->simType = LDC; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - const char actionString[3][32] = {"nothing", "compare", "store"}; - const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"}; - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - actionString[param->action], simTypeString[param->simType], - (param->obstacleFilename == NULL) ? 
"<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + const char actionString[3][32] = {"nothing", "compare", "store"}; + const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"}; + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, actionString[param->action], + simTypeString[param->simType], + (param->obstacleFilename == NULL) ? "<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param ) { - LBM_allocateGrid( (float**) &srcGrid ); - LBM_allocateGrid( (float**) &dstGrid ); +void MAIN_initialize(const MAIN_Param *param) { + LBM_allocateGrid((float **)&srcGrid); + LBM_allocateGrid((float **)&dstGrid); - LBM_initializeGrid( *srcGrid ); - LBM_initializeGrid( *dstGrid ); + LBM_initializeGrid(*srcGrid); + LBM_initializeGrid(*dstGrid); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( *srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( *dstGrid, param->obstacleFilename ); - } + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(*srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(*dstGrid, param->obstacleFilename); + } - if( param->simType == CHANNEL ) { - LBM_initializeSpecialCellsForChannel( *srcGrid ); - LBM_initializeSpecialCellsForChannel( *dstGrid ); - } - else { - LBM_initializeSpecialCellsForLDC( *srcGrid ); - LBM_initializeSpecialCellsForLDC( *dstGrid ); - } + if (param->simType == CHANNEL) { + LBM_initializeSpecialCellsForChannel(*srcGrid); + LBM_initializeSpecialCellsForChannel(*dstGrid); + } else { + LBM_initializeSpecialCellsForLDC(*srcGrid); + LBM_initializeSpecialCellsForLDC(*dstGrid); + } - 
LBM_showGridStatistics( *srcGrid ); + LBM_showGridStatistics(*srcGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param ) { - LBM_showGridStatistics( *srcGrid ); +void MAIN_finalize(const MAIN_Param *param) { + LBM_showGridStatistics(*srcGrid); - if( param->action == COMPARE ) - LBM_compareVelocityField( *srcGrid, param->resultFilename, TRUE ); - if( param->action == STORE ) - LBM_storeVelocityField( *srcGrid, param->resultFilename, TRUE ); + if (param->action == COMPARE) + LBM_compareVelocityField(*srcGrid, param->resultFilename, TRUE); + if (param->action == STORE) + LBM_storeVelocityField(*srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &srcGrid ); - LBM_freeGrid( (float**) &dstGrid ); + LBM_freeGrid((float **)&srcGrid); + LBM_freeGrid((float **)&dstGrid); } #if !defined(SPEC_CPU) /*############################################################################*/ -void MAIN_startClock( MAIN_Time* time ) { - time->timeScale = 1.0 / sysconf( _SC_CLK_TCK ); - time->tickStart = times( &(time->timeStart) ); +void MAIN_startClock(MAIN_Time *time) { + time->timeScale = 1.0 / sysconf(_SC_CLK_TCK); + time->tickStart = times(&(time->timeStart)); } - /*############################################################################*/ -void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param ) { - time->tickStop = times( &(time->timeStop) ); - - printf( "MAIN_stopClock:\n" - "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n", - (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale, - (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale, - (time->timeStop.tms_utime - time->timeStart.tms_utime + - time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale, - (time->tickStop - time->tickStart ) * time->timeScale, - 1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps / - (time->tickStop - time->tickStart ) / 
time->timeScale ); +void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param) { + time->tickStop = times(&(time->timeStop)); + + printf( + "MAIN_stopClock:\n" + "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n", + (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale, + (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale, + (time->timeStop.tms_utime - time->timeStart.tms_utime + + time->timeStop.tms_stime - time->timeStart.tms_stime) * + time->timeScale, + (time->tickStop - time->tickStart) * time->timeScale, + 1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps / + (time->tickStop - time->tickStart) / time->timeScale); } #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h index e207f4158f06a1cdf74ccc4fd0eb982543de0f87..4eb16dd70d0a121488ae657442b7e950a0afd16a 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h @@ -18,34 +18,35 @@ #if !defined(SPEC_CPU) typedef struct { - float timeScale; - clock_t tickStart, tickStop; - struct tms timeStart, timeStop; + float timeScale; + clock_t tickStart, tickStop; + struct tms timeStart, timeStop; } MAIN_Time; #endif -typedef enum {NOTHING = 0, COMPARE, STORE} MAIN_Action; -typedef enum {LDC = 0, CHANNEL} MAIN_SimType; +typedef enum { NOTHING = 0, COMPARE, STORE } MAIN_Action; +typedef enum { LDC = 0, CHANNEL } MAIN_SimType; typedef struct { - int nTimeSteps; - char* resultFilename; - MAIN_Action action; - MAIN_SimType simType; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + MAIN_Action action; + MAIN_SimType simType; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* 
param ); -void MAIN_finalize( const MAIN_Param* param ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param); +void MAIN_finalize(const MAIN_Param *param); #if !defined(SPEC_CPU) -void MAIN_startClock( MAIN_Time* time ); -void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param ); +void MAIN_startClock(MAIN_Time *time); +void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param); #endif /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h index 793109e2a547fc68978582f5f9514814a9272b58..9c3c00905cdf4ceab6a84341766c375499a8dc1a 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h @@ -13,30 +13,30 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension -//Align rows of X to 128-bytes +// Changeable settings +// Padding in each dimension +// Align rows of X to 128-bytes #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*(SIZE_Z)) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*(PADDED_Z)) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * (SIZE_Z)) +#define TOTAL_PADDED_CELLS (PADDED_X * 
PADDED_Y * (PADDED_Z)) -//Flattening function +// Flattening function // This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) // Set this value to 1 for GATHER, or 0 for SCATTER #if 0 @@ -45,7 +45,7 @@ #define SCATTER #endif -//CUDA block size (not trivially changeable here) +// CUDA block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h index 6f3b1138805911d1053c3c78ebcf9496bec7604f..10c32ddd3761120bfb1073fe3c9ee65a9ef8cff6 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h @@ -13,21 +13,38 @@ /*############################################################################*/ - /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; - +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #include "layout_config.h" #include "lbm_macros.h" @@ -36,23 +53,23 @@ typedef enum {OBSTACLE = 1 << 0, #ifdef __cplusplus extern "C" { #endif -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void 
LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( LBM_GridPtr grid1, LBM_GridPtr grid2 ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(LBM_GridPtr grid1, LBM_GridPtr grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* CUDA ***********************************************************************/ -void CUDA_LBM_allocateGrid( float** ptr ); -void CUDA_LBM_freeGrid( float** ptr ); -void CUDA_LBM_initializeGrid( float** d_grid, float** h_grid ); -void CUDA_LBM_getDeviceGrid( float** d_grid, float** h_grid ); -void CUDA_LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ); +void CUDA_LBM_allocateGrid(float **ptr); +void CUDA_LBM_freeGrid(float **ptr); +void CUDA_LBM_initializeGrid(float **d_grid, float **h_grid); +void CUDA_LBM_getDeviceGrid(float **d_grid, float **h_grid); +void CUDA_LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid); #ifdef __cplusplus } #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h index 9c3934c01f32db7b7b8e2075dda275f0c140f75a..7f5e522da17823325e1f74bd405faa4b560c1a31 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h @@ -19,154 +19,175 @@ /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float 
LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) 
(GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) 
(LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) 
(NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* SCATTER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) 
(LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* SCATTER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, 
y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc index bc238e27340f6fdd16f96be059e5fc831d598609..6ed10cad68cd7c02b427ad58dbe784d5fd15b6aa 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc @@ -18,159 +18,156 @@ /*############################################################################*/ static LBM_Grid CUDA_srcGrid, CUDA_dstGrid; - /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; - - pb_InitializeTimerSet(&timers); - struct pb_Parameters* params; - 
params = pb_ReadParameters(&nArgs, arg); - - - static LBM_GridPtr TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); - - MAIN_initialize( ¶m ); - - for( t = 1; t <= param.nTimeSteps; t++ ) { - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - CUDA_LBM_performStreamCollide( CUDA_srcGrid, CUDA_dstGrid ); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_swapGrids( &CUDA_srcGrid, &CUDA_dstGrid ); - - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; + + pb_InitializeTimerSet(&timers); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + + static LBM_GridPtr TEMP_srcGrid; + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); + + MAIN_initialize(¶m); + + for (t = 1; t <= param.nTimeSteps; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + CUDA_LBM_performStreamCollide(CUDA_srcGrid, CUDA_dstGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_swapGrids(&CUDA_srcGrid, &CUDA_dstGrid); + + if ((t & 63) == 0) { + printf("timestep: %i\n", t); #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - } - } + } + } - MAIN_finalize( ¶m ); + MAIN_finalize(¶m); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); - return 0; + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; - - if( nArgs < 2 
) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } - - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } - } - else param->obstacleFilename = NULL; - - param->resultFilename = params->outFile; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; + + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); + } + } else + param->obstacleFilename = NULL; + + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, 
SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? "<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; +void MAIN_initialize(const MAIN_Param *param) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( 
TEMP_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //Setup DEVICE datastructures - CUDA_LBM_allocateGrid( (float**) &CUDA_srcGrid ); - CUDA_LBM_allocateGrid( (float**) &CUDA_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // Setup DEVICE datastructures + CUDA_LBM_allocateGrid((float **)&CUDA_srcGrid); + CUDA_LBM_allocateGrid((float **)&CUDA_dstGrid); - //Initialize DEVICE datastructures - CUDA_LBM_initializeGrid( (float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid ); - CUDA_LBM_initializeGrid( (float**)&CUDA_dstGrid, (float**)&TEMP_dstGrid ); + // Initialize DEVICE datastructures + CUDA_LBM_initializeGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid); + CUDA_LBM_initializeGrid((float **)&CUDA_dstGrid, (float **)&TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param ) { - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param) { + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + CUDA_LBM_getDeviceGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); - CUDA_LBM_freeGrid( (float**) &CUDA_srcGrid ); - CUDA_LBM_freeGrid( (float**) &CUDA_dstGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); + CUDA_LBM_freeGrid((float **)&CUDA_srcGrid); + CUDA_LBM_freeGrid((float **)&CUDA_dstGrid); } - diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h index 1bb326e9a9b7df66cd6dda5c411c60e7db962da3..2094b326226a5a97ee15ac32dccdb8dde1e41583 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h @@ -16,27 +16,30 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param ); -void MAIN_finalize( const MAIN_Param* param ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param); +void MAIN_finalize(const MAIN_Param *param); /*############################################################################*/ #ifndef __MCUDA__ -#define CUDA_ERRCK \ - {cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__, cudaGetErrorString(err)); \ - exit(-1); \ - } \ +#define CUDA_ERRCK \ + { \ + cudaError_t err; \ + if ((err = 
cudaGetLastError()) != cudaSuccess) { \ + fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__, \ + cudaGetErrorString(err)); \ + exit(-1); \ + } \ } #else #define CUDA_ERRCK diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h index 0b7efa770cddad6d71af56c7bf695391a21ad48a..497829547c0dfb739f5c3bb3b22c0c871935054f 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h @@ -13,34 +13,34 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension -//Note that the padding in the highest Cartesian dimension -// must be at least 4 to simplify the kernel by avoiding +// Changeable settings +// Padding in each dimension +// Note that the padding in the highest Cartesian dimension +// must be at least 4 to simplify the kernel by avoiding // out-of-bounds access checks. 
#define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*(SIZE_Z)) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * (SIZE_Z)) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) -//Flattening function +// Flattening function // This macro will be used to map a 3-D index and element to a value -// The macro below implements the equivalent of a 3-D array of +// The macro below implements the equivalent of a 3-D array of // 20-element structures in C standard layout. -#define CALC_INDEX(x,y,z,e) ( e + N_CELL_ENTRIES*\ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (e + N_CELL_ENTRIES * ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) // Set this value to 1 for GATHER, or 0 for SCATTER #if 0 @@ -49,7 +49,7 @@ #define SCATTER #endif -//CUDA block size (not trivially changeable here) +// CUDA block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h index 6f3b1138805911d1053c3c78ebcf9496bec7604f..10c32ddd3761120bfb1073fe3c9ee65a9ef8cff6 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h @@ -13,21 +13,38 @@ /*############################################################################*/ - /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, 
N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; - +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #include "layout_config.h" #include "lbm_macros.h" @@ -36,23 +53,23 @@ typedef enum {OBSTACLE = 1 << 0, #ifdef __cplusplus extern "C" { #endif -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( LBM_GridPtr grid1, LBM_GridPtr grid2 ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(LBM_GridPtr grid1, LBM_GridPtr grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* CUDA ***********************************************************************/ -void CUDA_LBM_allocateGrid( float** ptr ); -void CUDA_LBM_freeGrid( float** ptr ); -void CUDA_LBM_initializeGrid( float** d_grid, float** h_grid ); -void CUDA_LBM_getDeviceGrid( float** d_grid, float** h_grid ); -void CUDA_LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ); +void CUDA_LBM_allocateGrid(float **ptr); +void CUDA_LBM_freeGrid(float **ptr); +void CUDA_LBM_initializeGrid(float **d_grid, float **h_grid); +void CUDA_LBM_getDeviceGrid(float **d_grid, float **h_grid); +void 
CUDA_LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid); #ifdef __cplusplus } #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h index 9c3934c01f32db7b7b8e2075dda275f0c140f75a..7f5e522da17823325e1f74bd405faa4b560c1a31 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h @@ -19,154 +19,175 @@ /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define 
NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, 
-1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define 
SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* SCATTER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) 
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) 
(LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* SCATTER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc index bc238e27340f6fdd16f96be059e5fc831d598609..6ed10cad68cd7c02b427ad58dbe784d5fd15b6aa 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc @@ -18,159 +18,156 @@ /*############################################################################*/ static LBM_Grid CUDA_srcGrid, CUDA_dstGrid; - /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; - - pb_InitializeTimerSet(&timers); - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); - - - static LBM_GridPtr TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); - - MAIN_initialize( ¶m ); - - for( t = 1; t <= param.nTimeSteps; t++ ) { - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - CUDA_LBM_performStreamCollide( CUDA_srcGrid, CUDA_dstGrid ); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_swapGrids( &CUDA_srcGrid, &CUDA_dstGrid ); - - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; + + pb_InitializeTimerSet(&timers); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + + static LBM_GridPtr TEMP_srcGrid; + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); + + MAIN_initialize(¶m); + + for (t = 1; t <= param.nTimeSteps; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + CUDA_LBM_performStreamCollide(CUDA_srcGrid, CUDA_dstGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_swapGrids(&CUDA_srcGrid, &CUDA_dstGrid); + + if ((t & 63) == 0) { + printf("timestep: %i\n", 
t); #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - } - } + } + } - MAIN_finalize( ¶m ); + MAIN_finalize(¶m); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); - return 0; + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; - - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } - - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } - } - else param->obstacleFilename = NULL; - - param->resultFilename = params->outFile; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; + + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if 
(fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); + } + } else + param->obstacleFilename = NULL; + + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? 
"<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; +void MAIN_initialize(const MAIN_Param *param) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //Setup DEVICE datastructures - CUDA_LBM_allocateGrid( (float**) &CUDA_srcGrid ); - CUDA_LBM_allocateGrid( (float**) &CUDA_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // Setup DEVICE datastructures + CUDA_LBM_allocateGrid((float **)&CUDA_srcGrid); + CUDA_LBM_allocateGrid((float **)&CUDA_dstGrid); - //Initialize DEVICE datastructures - CUDA_LBM_initializeGrid( (float**)&CUDA_srcGrid, 
(float**)&TEMP_srcGrid ); - CUDA_LBM_initializeGrid( (float**)&CUDA_dstGrid, (float**)&TEMP_dstGrid ); + // Initialize DEVICE datastructures + CUDA_LBM_initializeGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid); + CUDA_LBM_initializeGrid((float **)&CUDA_dstGrid, (float **)&TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param ) { - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param) { + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + CUDA_LBM_getDeviceGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); - CUDA_LBM_freeGrid( (float**) &CUDA_srcGrid ); - CUDA_LBM_freeGrid( (float**) &CUDA_dstGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); + CUDA_LBM_freeGrid((float **)&CUDA_srcGrid); + CUDA_LBM_freeGrid((float **)&CUDA_dstGrid); } - diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h index 
1bb326e9a9b7df66cd6dda5c411c60e7db962da3..2094b326226a5a97ee15ac32dccdb8dde1e41583 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h @@ -16,27 +16,30 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param ); -void MAIN_finalize( const MAIN_Param* param ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param); +void MAIN_finalize(const MAIN_Param *param); /*############################################################################*/ #ifndef __MCUDA__ -#define CUDA_ERRCK \ - {cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__, cudaGetErrorString(err)); \ - exit(-1); \ - } \ +#define CUDA_ERRCK \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__, \ + cudaGetErrorString(err)); \ + exit(-1); \ + } \ } #else #define CUDA_ERRCK diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h index ce9ce82c4acc351d7d239f3053023e964490eabe..0cd4bd055875c814b1712939b73179f7607043ad 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h @@ -14,7 +14,7 @@ #define OMEGA (1.95f) -#define OUTPUT_PRECISION float 
+#define OUTPUT_PRECISION float #define BOOL int #define TRUE (-1) diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c index 81294ac4455b4a92dfe80b7cb5d0ac0696a4b027..e6ea7c4d621e8470680a125bca11f70a634f2a56 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c @@ -4,9 +4,8 @@ #include "lbm.h" #include <math.h> -#include <stdlib.h> #include <stdio.h> - +#include <stdlib.h> #if !defined(SPEC_CPU) #ifdef _OPENMP @@ -16,674 +15,757 @@ /*############################################################################*/ -#define DFL1 (1.0/ 3.0) -#define DFL2 (1.0/18.0) -#define DFL3 (1.0/36.0) +#define DFL1 (1.0 / 3.0) +#define DFL2 (1.0 / 18.0) +#define DFL3 (1.0 / 36.0) /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES, - size = sizeof( LBM_Grid ) + 2*margin*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES, + size = sizeof(LBM_Grid) + 2 * margin * sizeof(float); - *ptr = malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + *ptr = malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } #if !defined(SPEC_CPU) - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); #endif - *ptr += margin; + *ptr += margin; } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES; +void LBM_freeGrid(float **ptr) { + const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES; - free( *ptr-margin ); - *ptr = NULL; + free(*ptr - margin); + *ptr = NULL; } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP #pragma omp parallel for #endif #endif - SWEEP_START( 0, 0, -2, 0, 0, SIZE_Z+2 ) - LOCAL( grid, C ) = DFL1; - LOCAL( grid, N ) = DFL2; - LOCAL( grid, S ) = DFL2; - LOCAL( grid, E ) = DFL2; - LOCAL( grid, W ) = DFL2; - LOCAL( grid, T ) = DFL2; - LOCAL( grid, B ) = DFL2; - LOCAL( grid, NE ) = DFL3; - LOCAL( grid, NW ) = DFL3; - LOCAL( grid, SE ) = DFL3; - LOCAL( grid, SW ) = DFL3; - LOCAL( grid, NT ) = DFL3; - LOCAL( grid, NB ) = DFL3; - LOCAL( grid, ST ) = DFL3; - LOCAL( grid, SB ) = DFL3; - LOCAL( grid, ET ) = DFL3; - LOCAL( grid, EB ) = DFL3; - LOCAL( grid, WT ) = DFL3; - LOCAL( grid, WB ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END + SWEEP_START(0, 0, -2, 0, 0, SIZE_Z + 2) + LOCAL(grid, C) = DFL1; + LOCAL(grid, N) = DFL2; + LOCAL(grid, S) = DFL2; + LOCAL(grid, E) = DFL2; + LOCAL(grid, W) = DFL2; + LOCAL(grid, T) = DFL2; + LOCAL(grid, B) = DFL2; + LOCAL(grid, NE) = 
DFL3; + LOCAL(grid, NW) = DFL3; + LOCAL(grid, SE) = DFL3; + LOCAL(grid, SW) = DFL3; + LOCAL(grid, NT) = DFL3; + LOCAL(grid, NB) = DFL3; + LOCAL(grid, ST) = DFL3; + LOCAL(grid, SB) = DFL3; + LOCAL(grid, ET) = DFL3; + LOCAL(grid, EB) = DFL3; + LOCAL(grid, WT) = DFL3; + LOCAL(grid, WB) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /*############################################################################*/ -void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 ) { - LBM_GridPtr aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2) { + LBM_GridPtr aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' 
) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( x, y ) +#pragma omp parallel for private(x, y) #endif #endif - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_initializeSpecialCellsForChannel( LBM_Grid grid ) { - int x, y, z; +void LBM_initializeSpecialCellsForChannel(LBM_Grid grid) { + int x, y, z; - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( x, y ) +#pragma omp parallel for private(x, y) #endif #endif - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( 
x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - - if( (z == 0 || z == SIZE_Z-1) && - ! TEST_FLAG( grid, x, y, z, OBSTACLE )) - SET_FLAG( grid, x, y, z, IN_OUT_FLOW ); - } - } - } - } + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + + if ((z == 0 || z == SIZE_Z - 1) && + !TEST_FLAG(grid, x, y, z, OBSTACLE)) + SET_FLAG(grid, x, y, z, IN_OUT_FLOW); + } + } + } + } } /*############################################################################*/ -void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ) { - SWEEP_VAR +void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid) { + SWEEP_VAR - float ux, uy, uz, u2, rho; + float ux, uy, uz, u2, rho; - /*voption indep*/ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( ux, uy, uz, u2, rho ) +#pragma omp parallel for private(ux, uy, uz, u2, rho) #endif #endif - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - if( TEST_FLAG_SWEEP( srcGrid, OBSTACLE )) { - DST_C ( dstGrid ) = SRC_C ( srcGrid ); - DST_S ( dstGrid ) = SRC_N ( srcGrid ); - DST_N ( dstGrid ) = SRC_S ( srcGrid ); - DST_W ( dstGrid ) = SRC_E ( srcGrid ); - DST_E ( dstGrid ) = SRC_W ( srcGrid ); - DST_B ( dstGrid ) = SRC_T ( srcGrid ); - DST_T ( dstGrid ) = SRC_B ( srcGrid ); - DST_SW( dstGrid ) = SRC_NE( srcGrid ); - DST_SE( dstGrid ) = SRC_NW( srcGrid ); - DST_NW( dstGrid ) = SRC_SE( srcGrid ); - DST_NE( dstGrid ) = SRC_SW( srcGrid ); - DST_SB( dstGrid ) = SRC_NT( srcGrid ); - DST_ST( dstGrid ) = SRC_NB( srcGrid ); - DST_NB( dstGrid ) = SRC_ST( srcGrid ); - DST_NT( dstGrid ) = SRC_SB( srcGrid ); - DST_WB( dstGrid ) = SRC_ET( srcGrid ); - DST_WT( dstGrid ) = SRC_EB( srcGrid ); - DST_EB( dstGrid ) = SRC_WT( srcGrid ); - DST_ET( dstGrid ) = SRC_WB( srcGrid ); - continue; - } - - 
rho = + SRC_C ( srcGrid ) + SRC_N ( srcGrid ) - + SRC_S ( srcGrid ) + SRC_E ( srcGrid ) - + SRC_W ( srcGrid ) + SRC_T ( srcGrid ) - + SRC_B ( srcGrid ) + SRC_NE( srcGrid ) - + SRC_NW( srcGrid ) + SRC_SE( srcGrid ) - + SRC_SW( srcGrid ) + SRC_NT( srcGrid ) - + SRC_NB( srcGrid ) + SRC_ST( srcGrid ) - + SRC_SB( srcGrid ) + SRC_ET( srcGrid ) - + SRC_EB( srcGrid ) + SRC_WT( srcGrid ) - + SRC_WB( srcGrid ); - - ux = + SRC_E ( srcGrid ) - SRC_W ( srcGrid ) - + SRC_NE( srcGrid ) - SRC_NW( srcGrid ) - + SRC_SE( srcGrid ) - SRC_SW( srcGrid ) - + SRC_ET( srcGrid ) + SRC_EB( srcGrid ) - - SRC_WT( srcGrid ) - SRC_WB( srcGrid ); - uy = + SRC_N ( srcGrid ) - SRC_S ( srcGrid ) - + SRC_NE( srcGrid ) + SRC_NW( srcGrid ) - - SRC_SE( srcGrid ) - SRC_SW( srcGrid ) - + SRC_NT( srcGrid ) + SRC_NB( srcGrid ) - - SRC_ST( srcGrid ) - SRC_SB( srcGrid ); - uz = + SRC_T ( srcGrid ) - SRC_B ( srcGrid ) - + SRC_NT( srcGrid ) - SRC_NB( srcGrid ) - + SRC_ST( srcGrid ) - SRC_SB( srcGrid ) - + SRC_ET( srcGrid ) - SRC_EB( srcGrid ) - + SRC_WT( srcGrid ) - SRC_WB( srcGrid ); - - ux /= rho; - uy /= rho; - uz /= rho; - - if( TEST_FLAG_SWEEP( srcGrid, ACCEL )) { - ux = 0.005f; - uy = 0.002f; - uz = 0.000f; - } - - u2 = 1.5f * (ux*ux + uy*uy + uz*uz); - DST_C ( dstGrid ) = (1.0f-OMEGA)*SRC_C ( srcGrid ) + DFL1*OMEGA*rho*(1.0f - u2); - - DST_N ( dstGrid ) = (1.0f-OMEGA)*SRC_N ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uy*(4.5f*uy + 3.0f) - u2); - DST_S ( dstGrid ) = (1.0f-OMEGA)*SRC_S ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uy*(4.5f*uy - 3.0f) - u2); - DST_E ( dstGrid ) = (1.0f-OMEGA)*SRC_E ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + ux*(4.5f*ux + 3.0f) - u2); - DST_W ( dstGrid ) = (1.0f-OMEGA)*SRC_W ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + ux*(4.5f*ux - 3.0f) - u2); - DST_T ( dstGrid ) = (1.0f-OMEGA)*SRC_T ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uz*(4.5f*uz + 3.0f) - u2); - DST_B ( dstGrid ) = (1.0f-OMEGA)*SRC_B ( srcGrid ) + DFL2*OMEGA*rho*(1.0f + uz*(4.5f*uz - 3.0f) - u2); - - DST_NE( dstGrid ) = (1.0f-OMEGA)*SRC_NE( srcGrid 
) + DFL3*OMEGA*rho*(1.0f + (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2); - DST_NW( dstGrid ) = (1.0f-OMEGA)*SRC_NW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2); - DST_SE( dstGrid ) = (1.0f-OMEGA)*SRC_SE( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2); - DST_SW( dstGrid ) = (1.0f-OMEGA)*SRC_SW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2); - DST_NT( dstGrid ) = (1.0f-OMEGA)*SRC_NT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2); - DST_NB( dstGrid ) = (1.0f-OMEGA)*SRC_NB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2); - DST_ST( dstGrid ) = (1.0f-OMEGA)*SRC_ST( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2); - DST_SB( dstGrid ) = (1.0f-OMEGA)*SRC_SB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2); - DST_ET( dstGrid ) = (1.0f-OMEGA)*SRC_ET( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2); - DST_EB( dstGrid ) = (1.0f-OMEGA)*SRC_EB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2); - DST_WT( dstGrid ) = (1.0f-OMEGA)*SRC_WT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2); - DST_WB( dstGrid ) = (1.0f-OMEGA)*SRC_WB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2); - SWEEP_END + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + if (TEST_FLAG_SWEEP(srcGrid, OBSTACLE)) { + DST_C(dstGrid) = SRC_C(srcGrid); + DST_S(dstGrid) = SRC_N(srcGrid); + DST_N(dstGrid) = SRC_S(srcGrid); + DST_W(dstGrid) = SRC_E(srcGrid); + DST_E(dstGrid) = SRC_W(srcGrid); + DST_B(dstGrid) = SRC_T(srcGrid); + DST_T(dstGrid) = SRC_B(srcGrid); + DST_SW(dstGrid) = SRC_NE(srcGrid); + DST_SE(dstGrid) = SRC_NW(srcGrid); + DST_NW(dstGrid) = SRC_SE(srcGrid); + DST_NE(dstGrid) = SRC_SW(srcGrid); + DST_SB(dstGrid) = SRC_NT(srcGrid); + DST_ST(dstGrid) = SRC_NB(srcGrid); + DST_NB(dstGrid) = SRC_ST(srcGrid); + DST_NT(dstGrid) = SRC_SB(srcGrid); + 
DST_WB(dstGrid) = SRC_ET(srcGrid); + DST_WT(dstGrid) = SRC_EB(srcGrid); + DST_EB(dstGrid) = SRC_WT(srcGrid); + DST_ET(dstGrid) = SRC_WB(srcGrid); + continue; + } + + rho = +SRC_C(srcGrid) + SRC_N(srcGrid) + SRC_S(srcGrid) + SRC_E(srcGrid) + + SRC_W(srcGrid) + SRC_T(srcGrid) + SRC_B(srcGrid) + SRC_NE(srcGrid) + + SRC_NW(srcGrid) + SRC_SE(srcGrid) + SRC_SW(srcGrid) + SRC_NT(srcGrid) + + SRC_NB(srcGrid) + SRC_ST(srcGrid) + SRC_SB(srcGrid) + SRC_ET(srcGrid) + + SRC_EB(srcGrid) + SRC_WT(srcGrid) + SRC_WB(srcGrid); + + ux = +SRC_E(srcGrid) - SRC_W(srcGrid) + SRC_NE(srcGrid) - SRC_NW(srcGrid) + + SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_ET(srcGrid) + SRC_EB(srcGrid) - + SRC_WT(srcGrid) - SRC_WB(srcGrid); + uy = +SRC_N(srcGrid) - SRC_S(srcGrid) + SRC_NE(srcGrid) + SRC_NW(srcGrid) - + SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_NT(srcGrid) + SRC_NB(srcGrid) - + SRC_ST(srcGrid) - SRC_SB(srcGrid); + uz = +SRC_T(srcGrid) - SRC_B(srcGrid) + SRC_NT(srcGrid) - SRC_NB(srcGrid) + + SRC_ST(srcGrid) - SRC_SB(srcGrid) + SRC_ET(srcGrid) - SRC_EB(srcGrid) + + SRC_WT(srcGrid) - SRC_WB(srcGrid); + + ux /= rho; + uy /= rho; + uz /= rho; + + if (TEST_FLAG_SWEEP(srcGrid, ACCEL)) { + ux = 0.005f; + uy = 0.002f; + uz = 0.000f; + } + + u2 = 1.5f * (ux * ux + uy * uy + uz * uz); + DST_C(dstGrid) = + (1.0f - OMEGA) * SRC_C(srcGrid) + DFL1 * OMEGA * rho * (1.0f - u2); + + DST_N(dstGrid) = (1.0f - OMEGA) * SRC_N(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy + 3.0f) - u2); + DST_S(dstGrid) = (1.0f - OMEGA) * SRC_S(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy - 3.0f) - u2); + DST_E(dstGrid) = (1.0f - OMEGA) * SRC_E(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux + 3.0f) - u2); + DST_W(dstGrid) = (1.0f - OMEGA) * SRC_W(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux - 3.0f) - u2); + DST_T(dstGrid) = (1.0f - OMEGA) * SRC_T(srcGrid) + + DFL2 * OMEGA * rho * (1.0f + uz * (4.5f * uz + 3.0f) - u2); + DST_B(dstGrid) = (1.0f - OMEGA) * SRC_B(srcGrid) + + DFL2 * OMEGA * 
rho * (1.0f + uz * (4.5f * uz - 3.0f) - u2); + + DST_NE(dstGrid) = (1.0f - OMEGA) * SRC_NE(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux + uy) * (4.5f * (+ux + uy) + 3.0f) - u2); + DST_NW(dstGrid) = (1.0f - OMEGA) * SRC_NW(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux + uy) * (4.5f * (-ux + uy) + 3.0f) - u2); + DST_SE(dstGrid) = (1.0f - OMEGA) * SRC_SE(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux - uy) * (4.5f * (+ux - uy) + 3.0f) - u2); + DST_SW(dstGrid) = (1.0f - OMEGA) * SRC_SW(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux - uy) * (4.5f * (-ux - uy) + 3.0f) - u2); + DST_NT(dstGrid) = (1.0f - OMEGA) * SRC_NT(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+uy + uz) * (4.5f * (+uy + uz) + 3.0f) - u2); + DST_NB(dstGrid) = (1.0f - OMEGA) * SRC_NB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+uy - uz) * (4.5f * (+uy - uz) + 3.0f) - u2); + DST_ST(dstGrid) = (1.0f - OMEGA) * SRC_ST(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-uy + uz) * (4.5f * (-uy + uz) + 3.0f) - u2); + DST_SB(dstGrid) = (1.0f - OMEGA) * SRC_SB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-uy - uz) * (4.5f * (-uy - uz) + 3.0f) - u2); + DST_ET(dstGrid) = (1.0f - OMEGA) * SRC_ET(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux + uz) * (4.5f * (+ux + uz) + 3.0f) - u2); + DST_EB(dstGrid) = (1.0f - OMEGA) * SRC_EB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (+ux - uz) * (4.5f * (+ux - uz) + 3.0f) - u2); + DST_WT(dstGrid) = (1.0f - OMEGA) * SRC_WT(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux + uz) * (4.5f * (-ux + uz) + 3.0f) - u2); + DST_WB(dstGrid) = (1.0f - OMEGA) * SRC_WB(srcGrid) + + DFL3 * OMEGA * rho * + (1.0f + (-ux - uz) * (4.5f * (-ux - uz) + 3.0f) - u2); + SWEEP_END } /*############################################################################*/ -void LBM_handleInOutFlow( LBM_Grid srcGrid ) { - float ux , uy , uz , rho , - ux1, uy1, uz1, rho1, - ux2, uy2, uz2, rho2, - u2, px, py; - SWEEP_VAR +void LBM_handleInOutFlow(LBM_Grid srcGrid) { + float ux, uy, uz, rho, ux1, uy1, uz1, 
rho1, ux2, uy2, uz2, rho2, u2, px, py; + SWEEP_VAR - /* inflow */ - /*voption indep*/ + /* inflow */ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \ - ux2, uy2, uz2, rho2, u2, px, py ) +#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2, \ + uy2, uz2, rho2, u2, px, py) #endif #endif - SWEEP_START( 0, 0, 0, 0, 0, 1 ) - rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WB ); - rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 
0, 0, 2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WB ); - - rho = 2.0*rho1 - rho2; - - px = (SWEEP_X / (0.5*(SIZE_X-1))) - 1.0; - py = (SWEEP_Y / (0.5*(SIZE_Y-1))) - 1.0; - ux = 0.00; - uy = 0.00; - uz = 0.01 * (1.0-px*px) * (1.0-py*py); - - u2 = 1.5 * (ux*ux + uy*uy + uz*uz); - - LOCAL( srcGrid, C ) = DFL1*rho*(1.0 - u2); - - LOCAL( srcGrid, N ) = DFL2*rho*(1.0 + uy*(4.5*uy + 3.0) - u2); - LOCAL( srcGrid, S ) = DFL2*rho*(1.0 + uy*(4.5*uy - 3.0) - u2); - LOCAL( srcGrid, E ) = DFL2*rho*(1.0 + ux*(4.5*ux + 3.0) - u2); - LOCAL( srcGrid, W ) = DFL2*rho*(1.0 + ux*(4.5*ux - 3.0) - u2); - LOCAL( srcGrid, T ) = DFL2*rho*(1.0 + uz*(4.5*uz + 3.0) - u2); - LOCAL( srcGrid, B ) = DFL2*rho*(1.0 + uz*(4.5*uz - 3.0) - u2); - - LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2); - LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2); - LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2); - LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2); - LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2); - LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2); - LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2); - LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2); - LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2); - LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2); - SWEEP_END - - /* outflow */ - /*voption indep*/ + SWEEP_START(0, 0, 0, 0, 0, 1) + rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, N) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, W) + + 
GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WB); + rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, N) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WB); + + rho = 2.0 * rho1 - rho2; + + px = (SWEEP_X / (0.5 * (SIZE_X - 1))) - 1.0; + py = (SWEEP_Y / (0.5 * (SIZE_Y - 1))) - 1.0; + ux = 0.00; + uy = 0.00; + uz = 0.01 * (1.0 - px * px) * (1.0 - py * py); + + u2 = 1.5 * (ux * ux + uy * uy + uz * uz); + + LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2); + + LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2); + LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2); + LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2); + LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2); + 
LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2); + LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2); + + LOCAL(srcGrid, NE) = + DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2); + LOCAL(srcGrid, NW) = + DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2); + LOCAL(srcGrid, SE) = + DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2); + LOCAL(srcGrid, SW) = + DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2); + LOCAL(srcGrid, NT) = + DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2); + LOCAL(srcGrid, NB) = + DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ST) = + DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2); + LOCAL(srcGrid, SB) = + DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ET) = + DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2); + LOCAL(srcGrid, EB) = + DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2); + LOCAL(srcGrid, WT) = + DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2); + LOCAL(srcGrid, WB) = + DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2); + SWEEP_END + + /* outflow */ + /*voption indep*/ #if !defined(SPEC_CPU) #ifdef _OPENMP -#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \ - ux2, uy2, uz2, rho2, u2, px, py ) +#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2, \ + uy2, uz2, rho2, u2, px, py) #endif #endif - SWEEP_START( 0, 0, SIZE_Z-1, 0, 0, SIZE_Z ) - rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) + 
GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB ); - ux1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB ); - uy1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ); - uz1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB ); - - ux1 /= rho1; - uy1 /= rho1; - uz1 /= rho1; - - rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, C ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, 
S ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB ); - ux2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB ); - uy2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, S ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) - - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ); - uz2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ) - + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) - + 
GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB ); - - ux2 /= rho2; - uy2 /= rho2; - uz2 /= rho2; - - rho = 1.0; - - ux = 2*ux1 - ux2; - uy = 2*uy1 - uy2; - uz = 2*uz1 - uz2; - - u2 = 1.5 * (ux*ux + uy*uy + uz*uz); - - LOCAL( srcGrid, C ) = DFL1*rho*(1.0 - u2); - - LOCAL( srcGrid, N ) = DFL2*rho*(1.0 + uy*(4.5*uy + 3.0) - u2); - LOCAL( srcGrid, S ) = DFL2*rho*(1.0 + uy*(4.5*uy - 3.0) - u2); - LOCAL( srcGrid, E ) = DFL2*rho*(1.0 + ux*(4.5*ux + 3.0) - u2); - LOCAL( srcGrid, W ) = DFL2*rho*(1.0 + ux*(4.5*ux - 3.0) - u2); - LOCAL( srcGrid, T ) = DFL2*rho*(1.0 + uz*(4.5*uz + 3.0) - u2); - LOCAL( srcGrid, B ) = DFL2*rho*(1.0 + uz*(4.5*uz - 3.0) - u2); - - LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2); - LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2); - LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2); - LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2); - LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2); - LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2); - LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2); - LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2); - LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2); - LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2); - LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2); - SWEEP_END + SWEEP_START(0, 0, SIZE_Z - 1, 0, 0, SIZE_Z) + rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) + + 
GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); + ux1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); + uy1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB); + uz1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); + + ux1 /= rho1; + uy1 /= rho1; + uz1 /= rho1; + + rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, C) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) + 
+ GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); + ux2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); + uy2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB); + uz2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) + + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) + + 
GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) - + GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); + + ux2 /= rho2; + uy2 /= rho2; + uz2 /= rho2; + + rho = 1.0; + + ux = 2 * ux1 - ux2; + uy = 2 * uy1 - uy2; + uz = 2 * uz1 - uz2; + + u2 = 1.5 * (ux * ux + uy * uy + uz * uz); + + LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2); + + LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2); + LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2); + LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2); + LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2); + LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2); + LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2); + + LOCAL(srcGrid, NE) = + DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2); + LOCAL(srcGrid, NW) = + DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2); + LOCAL(srcGrid, SE) = + DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2); + LOCAL(srcGrid, SW) = + DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2); + LOCAL(srcGrid, NT) = + DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2); + LOCAL(srcGrid, NB) = + DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ST) = + DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2); + LOCAL(srcGrid, SB) = + DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2); + LOCAL(srcGrid, ET) = + DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2); + LOCAL(srcGrid, EB) = + DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2); + LOCAL(srcGrid, WT) = + DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2); + LOCAL(srcGrid, WB) = + DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2); + SWEEP_END } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - 
nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = + LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, 
nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = +LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* 
big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } 
/*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - int x, y, z; - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? "wb" : "w") ); - - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - rho = + GRID_ENTRY( grid, x, y, z, C ) + GRID_ENTRY( grid, x, y, z, N ) - + GRID_ENTRY( grid, x, y, z, S ) + GRID_ENTRY( grid, x, y, z, E ) - + GRID_ENTRY( grid, x, y, z, W ) + GRID_ENTRY( grid, x, y, z, T ) - + GRID_ENTRY( grid, x, y, z, B ) + GRID_ENTRY( grid, x, y, z, NE ) - + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE ) - + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT ) - + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST ) - + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET ) - + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT ) - + GRID_ENTRY( grid, x, y, z, WB ); - ux = + GRID_ENTRY( grid, x, y, z, E ) - GRID_ENTRY( grid, x, y, z, W ) - + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) - + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) - - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - uy = + GRID_ENTRY( grid, x, y, z, N ) - GRID_ENTRY( grid, x, y, z, S ) - + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) - - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) - - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ); - uz = + GRID_ENTRY( grid, x, y, z, T ) - GRID_ENTRY( grid, x, y, z, B ) - + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) - + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) - + GRID_ENTRY( grid, x, y, 
z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) - + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - } - } - } - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + int x, y, z; + OUTPUT_PRECISION rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? "wb" : "w")); + + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) + + GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) + + GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) + + GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) + + GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) + + GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) + + GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) + + GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) + + GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) + + GRID_ENTRY(grid, x, y, z, WB); + ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) + + GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) + + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) - + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) + + GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) - + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) - + 
GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB); + uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) + + GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) + + GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) + + GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) + + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + } + } + } + + fclose(file); } /*############################################################################*/ -void LBM_compareVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - int x, y, z; - float rho, ux, uy, uz; - OUTPUT_PRECISION fileUx, fileUy, fileUz, - dUx, dUy, dUz, - diff2, maxDiff2 = -1e+30; - - FILE* file = fopen( filename, (binary ? 
"rb" : "r") ); - - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - rho = + GRID_ENTRY( grid, x, y, z, C ) + GRID_ENTRY( grid, x, y, z, N ) - + GRID_ENTRY( grid, x, y, z, S ) + GRID_ENTRY( grid, x, y, z, E ) - + GRID_ENTRY( grid, x, y, z, W ) + GRID_ENTRY( grid, x, y, z, T ) - + GRID_ENTRY( grid, x, y, z, B ) + GRID_ENTRY( grid, x, y, z, NE ) - + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE ) - + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT ) - + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST ) - + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET ) - + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT ) - + GRID_ENTRY( grid, x, y, z, WB ); - ux = + GRID_ENTRY( grid, x, y, z, E ) - GRID_ENTRY( grid, x, y, z, W ) - + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) - + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) - - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - uy = + GRID_ENTRY( grid, x, y, z, N ) - GRID_ENTRY( grid, x, y, z, S ) - + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) - - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) - + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) - - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ); - uz = + GRID_ENTRY( grid, x, y, z, T ) - GRID_ENTRY( grid, x, y, z, B ) - + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) - + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) - + GRID_ENTRY( grid, x, y, z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) - + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - loadValue( file, &fileUx ); - loadValue( file, &fileUy ); - loadValue( file, &fileUz ); - } 
- else { - if( sizeof( OUTPUT_PRECISION ) == sizeof( double )) { - fscanf( file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz ); - } - else { - fscanf( file, "%f %f %f\n", &fileUx, &fileUy, &fileUz ); - } - } - - dUx = ux - fileUx; - dUy = uy - fileUy; - dUz = uz - fileUz; - diff2 = dUx*dUx + dUy*dUy + dUz*dUz; - if( diff2 > maxDiff2 ) maxDiff2 = diff2; - } - } - } +void LBM_compareVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + int x, y, z; + float rho, ux, uy, uz; + OUTPUT_PRECISION fileUx, fileUy, fileUz, dUx, dUy, dUz, diff2, + maxDiff2 = -1e+30; + + FILE *file = fopen(filename, (binary ? "rb" : "r")); + + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) + + GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) + + GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) + + GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) + + GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) + + GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) + + GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) + + GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) + + GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) + + GRID_ENTRY(grid, x, y, z, WB); + ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) + + GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) + + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) - + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) + + GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) - + GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) + + GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) - + GRID_ENTRY(grid, x, y, z, ST) 
- GRID_ENTRY(grid, x, y, z, SB); + uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) + + GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) + + GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) + + GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) + + GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + loadValue(file, &fileUx); + loadValue(file, &fileUy); + loadValue(file, &fileUz); + } else { + if (sizeof(OUTPUT_PRECISION) == sizeof(double)) { + fscanf(file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz); + } else { + fscanf(file, "%f %f %f\n", &fileUx, &fileUy, &fileUz); + } + } + + dUx = ux - fileUx; + dUy = uy - fileUy; + dUz = uz - fileUz; + diff2 = dUx * dUx + dUy * dUy + dUz * dUz; + if (diff2 > maxDiff2) + maxDiff2 = diff2; + } + } + } #if defined(SPEC_CPU) - printf( "LBM_compareVelocityField: maxDiff = %e \n\n", - sqrt( maxDiff2 ) ); + printf("LBM_compareVelocityField: maxDiff = %e \n\n", sqrt(maxDiff2)); #else - printf( "LBM_compareVelocityField: maxDiff = %e ==> %s\n\n", - sqrt( maxDiff2 ), - sqrt( maxDiff2 ) > 1e-5 ? "##### ERROR #####" : "OK" ); + printf("LBM_compareVelocityField: maxDiff = %e ==> %s\n\n", sqrt(maxDiff2), + sqrt(maxDiff2) > 1e-5 ? 
"##### ERROR #####" : "OK"); #endif - fclose( file ); + fclose(file); } - diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h index e35818c0b300593f382a61131e7a35584d35cee1..94189f0f2bcc080ed79e42941b5a0638649d46e3 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h @@ -18,30 +18,31 @@ typedef enum {C = 0, NT, NB, ST, SB, ET, EB, WT, WB, FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; - */ + */ #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; - +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_initializeSpecialCellsForChannel( LBM_Grid grid ); -void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 ); -void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ); -void LBM_handleInOutFlow( LBM_Grid srcGrid ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); -void LBM_compareVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_initializeSpecialCellsForChannel(LBM_Grid grid); +void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2); +void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid); +void LBM_handleInOutFlow(LBM_Grid srcGrid); +void 
LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); +void LBM_compareVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h index 42c999e204dffc83c1affe8d56e086dcf1815b43..92b4c1b21dc9d87531691b3fce4bd1ff01b201f8 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h @@ -3,163 +3,204 @@ #ifndef _LBM_MACROS_H_ #define _LBM_MACROS_H_ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; -#define SIZE (120) -#define SIZE_X (1*SIZE) -#define SIZE_Y (1*SIZE) +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; +#define SIZE (120) +#define SIZE_X (1 * SIZE) +#define SIZE_Y (1 * SIZE) #define SIZE_Z (150) /*############################################################################*/ -typedef float LBM_Grid[SIZE_Z*SIZE_Y*SIZE_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float LBM_Grid[SIZE_Z * SIZE_Y * SIZE_X * N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ -#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \ - (y)*SIZE_X+(z)*SIZE_X*SIZE_Y)) +#define CALC_INDEX(x, y, z, e) \ + ((e) + N_CELL_ENTRIES * ((x) + (y)*SIZE_X + (z)*SIZE_X * SIZE_Y)) #define SWEEP_VAR int i; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( i = CALC_INDEX(x1, y1, z1, 0); \ - i < CALC_INDEX(x2, y2, z2, 0); \ - i += N_CELL_ENTRIES ) { +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (i = CALC_INDEX(x1, y1, z1, 0); i 
< CALC_INDEX(x2, y2, z2, 0); \ + i += N_CELL_ENTRIES) { #define SWEEP_END } -#define SWEEP_X ((i / N_CELL_ENTRIES) % SIZE_X) +#define SWEEP_X ((i / N_CELL_ENTRIES) % SIZE_X) #define SWEEP_Y (((i / N_CELL_ENTRIES) / SIZE_X) % SIZE_Y) -#define SWEEP_Z ((i / N_CELL_ENTRIES) / (SIZE_X*SIZE_Y)) - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_Z ((i / N_CELL_ENTRIES) / (SIZE_X * SIZE_Y)) + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX(dx, dy, dz, e) + (i)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) 
(GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #define COLLIDE_STREAM #ifdef COLLIDE_STREAM -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) 
-#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) 
+#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* COLLIDE_STREAM */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define 
SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* COLLIDE_STREAM */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* const _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *const _aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } 
+#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c index 85600dbfdf20059a71694b7ae72f0243ee5c82eb..6985e3e58b300a7fad88ed4623340562693c80bd 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c @@ -8,10 +8,10 @@ #include <stdlib.h> #if defined(SPEC_CPU) -# include <time.h> +#include <time.h> #else -# include <sys/times.h> -# include <unistd.h> +#include <sys/times.h> +#include <unistd.h> #endif #include <sys/stat.h> @@ -23,168 +23,169 @@ static LBM_GridPtr srcGrid, dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; +int main(int nArgs, char *arg[]) { + MAIN_Param param; #if !defined(SPEC_CPU) - MAIN_Time time; + MAIN_Time time; #endif - int t; + int t; - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - 
MAIN_printInfo( ¶m ); - MAIN_initialize( ¶m ); + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); + MAIN_initialize(¶m); #if !defined(SPEC_CPU) - MAIN_startClock( &time ); + MAIN_startClock(&time); #endif - for( t = 1; t <= param.nTimeSteps; t++ ) { - if( param.simType == CHANNEL ) { - LBM_handleInOutFlow( *srcGrid ); - } + for (t = 1; t <= param.nTimeSteps; t++) { + if (param.simType == CHANNEL) { + LBM_handleInOutFlow(*srcGrid); + } - LBM_performStreamCollide( *srcGrid, *dstGrid ); - LBM_swapGrids( &srcGrid, &dstGrid ); + LBM_performStreamCollide(*srcGrid, *dstGrid); + LBM_swapGrids(&srcGrid, &dstGrid); - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); - //LBM_showGridStatistics( *srcGrid ); - } - } + if ((t & 63) == 0) { + printf("timestep: %i\n", t); + // LBM_showGridStatistics( *srcGrid ); + } + } #if !defined(SPEC_CPU) - MAIN_stopClock( &time, ¶m ); + MAIN_stopClock(&time, ¶m); #endif - MAIN_finalize( ¶m ); + MAIN_finalize(¶m); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); - return 0; + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params) { - struct stat fileStat; - - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } - - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - 
param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } - } - else param->obstacleFilename = NULL; - - param->resultFilename = params->outFile; - param->action = STORE; - param->simType = LDC; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; + + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); + } + } else + param->obstacleFilename = NULL; + + param->resultFilename = params->outFile; + param->action = STORE; + param->simType = LDC; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - const char actionString[3][32] = {"nothing", "compare", "store"}; - const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"}; - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - actionString[param->action], simTypeString[param->simType], - (param->obstacleFilename == NULL) ? 
"<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + const char actionString[3][32] = {"nothing", "compare", "store"}; + const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"}; + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, actionString[param->action], + simTypeString[param->simType], + (param->obstacleFilename == NULL) ? "<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param ) { - LBM_allocateGrid( (float**) &srcGrid ); - LBM_allocateGrid( (float**) &dstGrid ); +void MAIN_initialize(const MAIN_Param *param) { + LBM_allocateGrid((float **)&srcGrid); + LBM_allocateGrid((float **)&dstGrid); - LBM_initializeGrid( *srcGrid ); - LBM_initializeGrid( *dstGrid ); + LBM_initializeGrid(*srcGrid); + LBM_initializeGrid(*dstGrid); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( *srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( *dstGrid, param->obstacleFilename ); - } + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(*srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(*dstGrid, param->obstacleFilename); + } - if( param->simType == CHANNEL ) { - LBM_initializeSpecialCellsForChannel( *srcGrid ); - LBM_initializeSpecialCellsForChannel( *dstGrid ); - } - else { - LBM_initializeSpecialCellsForLDC( *srcGrid ); - LBM_initializeSpecialCellsForLDC( *dstGrid ); - } + if (param->simType == CHANNEL) { + LBM_initializeSpecialCellsForChannel(*srcGrid); + LBM_initializeSpecialCellsForChannel(*dstGrid); + } else { + LBM_initializeSpecialCellsForLDC(*srcGrid); + LBM_initializeSpecialCellsForLDC(*dstGrid); + } - 
LBM_showGridStatistics( *srcGrid ); + LBM_showGridStatistics(*srcGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param ) { - LBM_showGridStatistics( *srcGrid ); +void MAIN_finalize(const MAIN_Param *param) { + LBM_showGridStatistics(*srcGrid); - if( param->action == COMPARE ) - LBM_compareVelocityField( *srcGrid, param->resultFilename, TRUE ); - if( param->action == STORE ) - LBM_storeVelocityField( *srcGrid, param->resultFilename, TRUE ); + if (param->action == COMPARE) + LBM_compareVelocityField(*srcGrid, param->resultFilename, TRUE); + if (param->action == STORE) + LBM_storeVelocityField(*srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &srcGrid ); - LBM_freeGrid( (float**) &dstGrid ); + LBM_freeGrid((float **)&srcGrid); + LBM_freeGrid((float **)&dstGrid); } #if !defined(SPEC_CPU) /*############################################################################*/ -void MAIN_startClock( MAIN_Time* time ) { - time->timeScale = 1.0 / sysconf( _SC_CLK_TCK ); - time->tickStart = times( &(time->timeStart) ); +void MAIN_startClock(MAIN_Time *time) { + time->timeScale = 1.0 / sysconf(_SC_CLK_TCK); + time->tickStart = times(&(time->timeStart)); } - /*############################################################################*/ -void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param ) { - time->tickStop = times( &(time->timeStop) ); - - printf( "MAIN_stopClock:\n" - "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n", - (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale, - (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale, - (time->timeStop.tms_utime - time->timeStart.tms_utime + - time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale, - (time->tickStop - time->tickStart ) * time->timeScale, - 1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps / - (time->tickStop - time->tickStart ) / 
time->timeScale ); +void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param) { + time->tickStop = times(&(time->timeStop)); + + printf( + "MAIN_stopClock:\n" + "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n", + (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale, + (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale, + (time->timeStop.tms_utime - time->timeStart.tms_utime + + time->timeStop.tms_stime - time->timeStart.tms_stime) * + time->timeScale, + (time->tickStop - time->tickStart) * time->timeScale, + 1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps / + (time->tickStop - time->tickStart) / time->timeScale); } #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h index e207f4158f06a1cdf74ccc4fd0eb982543de0f87..4eb16dd70d0a121488ae657442b7e950a0afd16a 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h @@ -18,34 +18,35 @@ #if !defined(SPEC_CPU) typedef struct { - float timeScale; - clock_t tickStart, tickStop; - struct tms timeStart, timeStop; + float timeScale; + clock_t tickStart, tickStop; + struct tms timeStart, timeStop; } MAIN_Time; #endif -typedef enum {NOTHING = 0, COMPARE, STORE} MAIN_Action; -typedef enum {LDC = 0, CHANNEL} MAIN_SimType; +typedef enum { NOTHING = 0, COMPARE, STORE } MAIN_Action; +typedef enum { LDC = 0, CHANNEL } MAIN_SimType; typedef struct { - int nTimeSteps; - char* resultFilename; - MAIN_Action action; - MAIN_SimType simType; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + MAIN_Action action; + MAIN_SimType simType; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( 
const MAIN_Param* param ); -void MAIN_finalize( const MAIN_Param* param ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param); +void MAIN_finalize(const MAIN_Param *param); #if !defined(SPEC_CPU) -void MAIN_startClock( MAIN_Time* time ); -void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param ); +void MAIN_startClock(MAIN_Time *time); +void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param); #endif /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h index ef75410c43c337651291d2b27655ab26d73485d9..57b6b0875204536ee7cb7a5b12fb9e120e246fec 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h @@ -13,33 +13,33 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) 
-//Flattening function +// Flattening function // This macro will be used to map a 3-D index and element to a value -// The macro below implements the equivalent of a 3-D array of +// The macro below implements the equivalent of a 3-D array of // 20-element structures in C standard layout. -#define CALC_INDEX(x,y,z,e) ( e + N_CELL_ENTRIES*\ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (e + N_CELL_ENTRIES * ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 for SCATTER #if 1 @@ -48,22 +48,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c index 7a7a539232830dce79bb6664ea05eec91be8a4bb..6bc8c020cc457210124c6b21b3dc337239d222a3 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c @@ -10,346 +10,320 @@ // includes, system #include <CL/cl.h> +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> 
-#include <float.h> // includes, project #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "ocl.h" -#include "lbm.h" /******************************************************************************/ -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) { - - cl_int clStatus; +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid) { - clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid); - CHECK_ERROR("clSetKernelArg") + cl_int clStatus; - clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid); - CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); + CHECK_ERROR("clSetKernelArg") - size_t dimBlock[3] = {SIZE_X,1,1}; - size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1}; - clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") + clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid); + CHECK_ERROR("clSetKernelArg") + + size_t dimBlock[3] = {SIZE_X, 1, 1}; + size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, + dimGrid, dimBlock, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(prm->clCommandQueue); + CHECK_ERROR("clFinish") } /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - *ptr = (float*)malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - memset( *ptr, 0, size ); + memset(*ptr, 0, size); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); + + *ptr += MARGIN; } /******************************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; size_t max_alloc_size = 0; - clGetDeviceInfo(prm->clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + clGetDeviceInfo(prm->clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_alloc_size), &max_alloc_size, NULL); if (max_alloc_size < size) { fprintf(stderr, "Can't allocate buffer: max alloc size is %dMB\n", - (int) (max_alloc_size >> 20)); + (int)(max_alloc_size >> 20)); exit(-1); } - *ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") + *ptr = + clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - MARGIN); + *ptr = NULL; } /******************************************************************************/ -void OpenCL_LBM_freeGrid(cl_mem ptr) { - clReleaseMemObject(ptr); -} +void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } 
/*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + 
CHECK_ERROR("clEnqueueWriteBuffer") } -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") } /*############################################################################*/ -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { + cl_mem aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' 
) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( 
grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + 
LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char 
*)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? 
"wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? 
"wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h index 9dcf4639faf25701b015e0d3e6dcf0f9400b1745..64a617feb862bdffdcb0c6aa57b0f1b09c26debb 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h @@ -13,23 +13,26 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void 
LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* OpenCL *********************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ); -void OpenCL_LBM_freeGrid( cl_mem ptr ); -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ); +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); +void OpenCL_LBM_freeGrid(cl_mem ptr); +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h index 24fad43205f11da1c05cc8aa5895e7aa2688d3f4..99c50c048a14bb47bb3659b61f088db95706bb0c 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 (1.0f/ 3.0f) -#define DFL2 (1.0f/18.0f) -#define DFL3 (1.0f/36.0f) +#define DFL1 (1.0f / 3.0f) +#define DFL2 
(1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define 
NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, 
e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) 
+#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define 
DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) 
{FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c index ac972815b190d1f91ba9c78512fbebb503501d14..193dec15418f96d53198c1a07ab3affdee3e956e 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c @@ -9,16 +9,16 @@ /*############################################################################*/ #include 
<CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <sys/stat.h> -#include <parboil.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" -#include "ocl.h" #include "main.h" -#include "lbm.h" +#include "ocl.h" /*############################################################################*/ @@ -27,202 +27,205 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; - - OpenCL_Param prm; - - pb_InitializeTimerSet(&timers); - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); - - - static LBM_GridPtr TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); - - OpenCL_initialize(&prm); - MAIN_initialize( ¶m, &prm ); - - for( t = 1; t <= param.nTimeSteps; t++ ) { - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); - - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; + + OpenCL_Param prm; + + pb_InitializeTimerSet(&timers); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + + static LBM_GridPtr TEMP_srcGrid; + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); + + OpenCL_initialize(&prm); + MAIN_initialize(¶m, &prm); + + for (t = 1; t <= param.nTimeSteps; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_swapGrids(&OpenCL_srcGrid, 
&OpenCL_dstGrid); + + if ((t & 63) == 0) { + printf("timestep: %i\n", t); #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - } - } - - MAIN_finalize( ¶m, &prm ); + } + } - LBM_freeGrid( (float**) &TEMP_srcGrid ); + MAIN_finalize(¶m, &prm); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); - return 0; + LBM_freeGrid((float **)&TEMP_srcGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; - - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } - - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } - } - else param->obstacleFilename = NULL; - - param->resultFilename = params->outFile; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; + + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot 
stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); + } + } else + param->obstacleFilename = NULL; + + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? 
"<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); - - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } + + 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // Setup DEVICE datastructures + OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); + + // Initialize DEVICE datastructures + OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); + + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - 
clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); - + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } -void OpenCL_initialize(OpenCL_Param* prm) -{ - cl_int clStatus; - - clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL); - CHECK_ERROR("clGetPlatformIDs") +void OpenCL_initialize(OpenCL_Param *prm) { + cl_int clStatus; + + clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL); + CHECK_ERROR("clGetPlatformIDs") + + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; + clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1, + &(prm->clDevice), NULL); + CHECK_ERROR("clGetDeviceIDs") - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL); - CHECK_ERROR("clGetDeviceIDs") + prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL, + NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") - prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") + prm->clCommandQueue = clCreateCommandQueue( + prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") - prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") + pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; + prm->clProgram = + clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") - const char* clSource[] = 
{readFile("src/opencl_base/kernel.cl")}; - prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") + char clOptions[100]; + sprintf(clOptions, "-I src/opencl_base"); - char clOptions[100]; - sprintf(clOptions,"-I src/opencl_base"); - - clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") + clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions, + NULL, NULL); + CHECK_ERROR("clBuildProgram") - prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") + prm->clKernel = + clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus); + CHECK_ERROR("clCreateKernel") - free((void*)clSource[0]); + free((void *)clSource[0]); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h index feee4e8768b13f0975481b1e3a5505ad3cdd018f..9d8e145c93b37488a3826e77b964c56699377d2a 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h @@ -12,19 +12,20 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ); -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void 
MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); -void OpenCL_initialize(OpenCL_Param* prm); +void OpenCL_initialize(OpenCL_Param *prm); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c index 292f55728a7b78c9448300637369fb0044fa6f4d..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c @@ -1,40 +1,36 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> -#include "ocl.h" -char* readFile(char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); +char *readFile(char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); - if(fp == NULL) - { - printf("Error 1!\n"); - return NULL; - } + if (fp == NULL) { + printf("Error 1!\n"); + return NULL; + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } + char *buffer = malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + return NULL; + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + return NULL; + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644 --- 
a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h @@ -2,24 +2,22 @@ #define __OCLH__ typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; + cl_platform_id clPlatform; + cl_context_properties clCps[3]; + cl_device_id clDevice; + cl_context clContext; + cl_command_queue clCommandQueue; + cl_program clProgram; + cl_kernel clKernel; } OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -char* readFile(char*); +char *readFile(char *); #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h @@ -13,31 +13,31 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch 
in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) // Flattening function // This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 for SCATTER #if 1 @@ -46,22 +46,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c index 8cae2c1c172ff66c001627cd24389edd74a44472..26d90928b500d0ae5e5630dbe20b2f57e9f202c2 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c @@ -10,345 +10,319 @@ // includes, system #include <CL/cl.h> +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <float.h> // includes, project #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "ocl.h" -#include "lbm.h" /******************************************************************************/ -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) { - - cl_int clStatus; - - size_t bytes = 100; - clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,1,sizeof(size_t),(void*)&bytes); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,2,sizeof(cl_mem),(void*)&dstGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,3,sizeof(size_t),(void*)&bytes); - CHECK_ERROR("clSetKernelArg") - - size_t dimBlock[3] = {SIZE_X,1,1}; - size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1}; - clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid) { + + cl_int clStatus; + + size_t bytes = 100; + clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(size_t), (void *)&bytes); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 2, sizeof(cl_mem), (void *)&dstGrid); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 3, sizeof(size_t), (void *)&bytes); + CHECK_ERROR("clSetKernelArg") + + size_t dimBlock[3] = {SIZE_X, 1, 1}; 
+ size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, + dimGrid, dimBlock, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(prm->clCommandQueue); + CHECK_ERROR("clFinish") } /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - *ptr = (float*)malloc( size ); - if( ! *ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - memset( *ptr, 0, size ); + memset(*ptr, 0, size); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); + + *ptr += MARGIN; } /******************************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - *ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + *ptr = + clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - 
MARGIN); + *ptr = NULL; } /******************************************************************************/ -void OpenCL_LBM_freeGrid(cl_mem ptr) { - clReleaseMemObject(ptr); -} +void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid 
h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") } -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") } /*############################################################################*/ -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { + cl_mem aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' 
) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( 
grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + 
LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char 
*)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? 
"wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? 
"wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h @@ -13,23 +13,26 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ); -void LBM_showGridStatistics( LBM_Grid 
Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* OpenCL *********************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ); -void OpenCL_LBM_freeGrid( cl_mem ptr ); -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ); +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); +void OpenCL_LBM_freeGrid(cl_mem ptr); +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 (1.0f/ 3.0f) -#define DFL2 (1.0f/18.0f) -#define DFL3 (1.0f/36.0f) 
+#define DFL1 (1.0f / 3.0f) +#define DFL2 (1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) 
(GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define 
NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define 
SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) 
(LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) 
((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c index 3bf89f4b8a03dec812196187cc2f4bcbd328de24..59aa8daf9a018348274e20653c9c92f6995a96e4 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c @@ -15,10 
+15,10 @@ #include <sys/stat.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" -#include "ocl.h" #include "main.h" -#include "lbm.h" +#include "ocl.h" /*############################################################################*/ @@ -27,287 +27,296 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; - OpenCL_Param prm; + OpenCL_Param prm; - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + // Setup TEMP datastructures + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); - //Setup TEMP datastructures - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); + /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined - /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - if( param.obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename ); - } + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + if (param.obstacleFilename != NULL) { + 
LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename); + } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_showGridStatistics( TEMP_srcGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_showGridStatistics(TEMP_srcGrid); - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - OpenCL_initialize(&prm); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid ); + OpenCL_initialize(&prm); - //Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid ); + // Setup DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid); + // Initialize DEVICE datastructures + OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - clFinish(prm.clCommandQueue); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + clFinish(prm.clCommandQueue); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for( t = 1; t <= param.nTimeSteps; t++ ) { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); + 
for (t = 1; t <= param.nTimeSteps; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); - /*if( (t & 63) == 0 ) {*/ - /*printf( "timestep: %i\n", t );*/ + /*if( (t & 63) == 0 ) {*/ + /*printf( "timestep: %i\n", t );*/ #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - /*}*/ - } - clFinish(prm.clCommandQueue); - /*MAIN_finalize( ¶m, &prm );*/ // inlined + /*}*/ + } + clFinish(prm.clCommandQueue); + /*MAIN_finalize( ¶m, &prm );*/ // inlined - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm.clProgram); - clReleaseKernel(prm.clKernel); - clReleaseCommandQueue(prm.clCommandQueue); - clReleaseContext(prm.clContext); + clReleaseProgram(prm.clProgram); + clReleaseKernel(prm.clKernel); + clReleaseCommandQueue(prm.clCommandQueue); + clReleaseContext(prm.clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + LBM_showGridStatistics(TEMP_srcGrid); + LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - 
LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); - pb_FreeParameters(params); - return 0; + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); } - else param->obstacleFilename = NULL; + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * 
SIZE_Z); + exit(1); + } + } else + param->obstacleFilename = NULL; - param->resultFilename = params->outFile; + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? 
"<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); + // Setup DEVICE datastructures + OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); - //Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, 
TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); + // Initialize DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm->clProgram); - 
clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } -void OpenCL_initialize(OpenCL_Param* prm) -{ - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - prm->clPlatform = clPlatform[1]; - - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; - - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_CPU,1,&(prm->clDevice),NULL); - CHECK_ERROR("clGetDeviceIDs") - - /*printf("Device id = %p\n", prm->clDevice);*/ - /*cl_device_partition_property props[4];*/ - /*props[0] = CL_DEVICE_PARTITION_BY_COUNTS;*/ - /*props[1] = NUM_CORES;*/ - /*props[1] = 8;*/ - /*props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;*/ - /*props[3] = 0;*/ - /*cl_device_id subdevice_id;*/ - /*cl_uint num_entries = 1;*/ - - /*cl_uint numDevices;*/ - /*clCreateSubDevices(prm->clDevice, clCps, num_entries, &subdevice_id, &numDevices);*/ - /*printf("Num of devices = %d\n", numDevices);*/ - /*for(unsigned i =0 ; i< numDevices; i++)*/ - /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/ - /*prm->clDevice = subdevice_id;*/ - - /*printf("Device id = %p\n", prm->clDevice);*/ - prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - prm->clContext = clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus); - CHECK_ERROR("clCreateContextFromType") - - prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&(prm->clContext), 
&(prm->clCommandQueue)); - - const unsigned char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")}; - - prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); - /*size_t binarySize = 39303;*/ - /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, &binarySize, &clSource[0], NULL,&clStatus);*/ - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[100]; - sprintf(clOptions,"-I src/opencl_nvidia"); - - clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") - - free((void*)clSource[0]); - - /*pb_CreateAndBuildKernelFromBinary("build/opencl_cpu_baseline_default/kernel.ir", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);*/ +void OpenCL_initialize(OpenCL_Param *prm) { + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + prm->clPlatform = clPlatform[1]; + + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; + + clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_CPU, 1, + &(prm->clDevice), NULL); + CHECK_ERROR("clGetDeviceIDs") + + /*printf("Device id = %p\n", prm->clDevice);*/ + /*cl_device_partition_property props[4];*/ + /*props[0] = CL_DEVICE_PARTITION_BY_COUNTS;*/ + /*props[1] = NUM_CORES;*/ + /*props[1] = 8;*/ + /*props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;*/ + /*props[3] = 0;*/ + /*cl_device_id subdevice_id;*/ + /*cl_uint num_entries = 1;*/ + + /*cl_uint numDevices;*/ + /*clCreateSubDevices(prm->clDevice, clCps, num_entries, &subdevice_id, + * &numDevices);*/ + 
/*printf("Num of devices = %d\n", numDevices);*/ + /*for(unsigned i =0 ; i< numDevices; i++)*/ + /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/ + /*prm->clDevice = subdevice_id;*/ + + /*printf("Device id = %p\n", prm->clDevice);*/ + prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_CPU, NULL, + NULL, &clStatus); + prm->clContext = + clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + prm->clCommandQueue = clCreateCommandQueue( + prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + + const unsigned char *clSource[] = { + readFile("src/opencl_cpu_baseline/kernel.cl")}; + + prm->clProgram = + clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus); + /*size_t binarySize = 39303;*/ + /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, + * &binarySize, &clSource[0], NULL,&clStatus);*/ + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[100]; + sprintf(clOptions, "-I src/opencl_nvidia"); + + clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions, + NULL, NULL); + CHECK_ERROR("clBuildProgram") + + prm->clKernel = + clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus); + CHECK_ERROR("clCreateKernel") + + free((void *)clSource[0]); + + /*pb_CreateAndBuildKernelFromBinary("build/opencl_cpu_baseline_default/kernel.ir", + * "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, + * &prm->clProgram, &prm->clKernel);*/ } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h @@ -12,19 +12,20 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ); -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); -void OpenCL_initialize(OpenCL_Param* prm); +void OpenCL_initialize(OpenCL_Param *prm); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c @@ -1,40 +1,36 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> -#include "ocl.h" -char* readFile(char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); +char *readFile(char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); - if(fp == NULL) - { - printf("Error 1!\n"); - return NULL; - } + if (fp == NULL) { + printf("Error 1!\n"); + return NULL; + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + 
fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } + char *buffer = malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + return NULL; + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + return NULL; + } - buffer[size] = 0; - fclose(fp); - return buffer; -} + buffer[size] = 0; + fclose(fp); + return buffer; +} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h index c7a93a636ea59f77e59a61032b68ad8c15477511..d5011fdcf889fb729689b2a9bf08d76e6c828f10 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h @@ -2,24 +2,22 @@ #define __OCLH__ typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; + cl_platform_id clPlatform; + cl_context_properties clCps[3]; + cl_device_id clDevice; + cl_context clContext; + cl_command_queue clCommandQueue; + cl_program clProgram; + cl_kernel clKernel; } OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s! Errcode = %d\n",errorMessage, clStatus); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s! 
Errcode = %d\n", errorMessage, clStatus); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -char* readFile(char*); +char *readFile(char *); #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h @@ -13,31 +13,31 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) // Flattening function // This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 for 
SCATTER #if 1 @@ -46,22 +46,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c @@ -10,338 +10,312 @@ // includes, system #include <CL/cl.h> +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <float.h> // includes, project #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "ocl.h" -#include "lbm.h" /******************************************************************************/ -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) { - - cl_int clStatus; - - clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid); - CHECK_ERROR("clSetKernelArg") - - size_t dimBlock[3] = {SIZE_X,1,1}; - size_t dimGrid[3] = 
{SIZE_X*SIZE_Y,SIZE_Z,1}; - clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid) { + + cl_int clStatus; + + clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid); + CHECK_ERROR("clSetKernelArg") + + size_t dimBlock[3] = {SIZE_X, 1, 1}; + size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, + dimGrid, dimBlock, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(prm->clCommandQueue); + CHECK_ERROR("clFinish") } /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - *ptr = (float*)malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + memset(*ptr, 0, size); - memset( *ptr, 0, size ); + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + *ptr += MARGIN; } /******************************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - *ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + *ptr = + clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - MARGIN); + *ptr = NULL; } /******************************************************************************/ -void OpenCL_LBM_freeGrid(cl_mem ptr) { - clReleaseMemObject(ptr); -} +void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = 
DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") } -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = 
clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") } /*############################################################################*/ -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { + cl_mem aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y 
== SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) 
maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: 
%8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + 
char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? "wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION 
rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? "wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h @@ -13,23 +13,26 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 
); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* OpenCL *********************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ); -void OpenCL_LBM_freeGrid( cl_mem ptr ); -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ); +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); +void OpenCL_LBM_freeGrid(cl_mem ptr); +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 (1.0f/ 3.0f) -#define DFL2 (1.0f/18.0f) 
-#define DFL3 (1.0f/36.0f) +#define DFL1 (1.0f / 3.0f) +#define DFL2 (1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) 
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, 
+1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) 
(LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T 
)) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) - -#define 
TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c index 168cc12fd63245b6a04d3f8468d7b3fe463187db..d93a919df300c520c7105612cc54f9684f052678 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c @@ 
-15,10 +15,10 @@ #include <sys/stat.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" -#include "ocl.h" #include "main.h" -#include "lbm.h" +#include "ocl.h" /*############################################################################*/ @@ -27,286 +27,294 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; - OpenCL_Param prm; + OpenCL_Param prm; - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + // Setup TEMP datastructures + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); - //Setup TEMP datastructures - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); + /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined - /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - if( param.obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename ); - } + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + if (param.obstacleFilename != NULL) { + 
LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename); + } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_showGridStatistics( TEMP_srcGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_showGridStatistics(TEMP_srcGrid); - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - OpenCL_initialize(&prm); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - //Setup DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid ); + OpenCL_initialize(&prm); - //Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid ); + // Setup DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid); + // Initialize DEVICE datastructures + OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for( t = 1; t <= param.nTimeSteps; t++ ) { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); + for (t = 1; t <= param.nTimeSteps; t++) { + 
/*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); - /*if( (t & 63) == 0 ) {*/ - /*printf( "timestep: %i\n", t );*/ + /*if( (t & 63) == 0 ) {*/ + /*printf( "timestep: %i\n", t );*/ #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - /*}*/ - } + /*}*/ + } - /*MAIN_finalize( ¶m, &prm );*/ // inlined + /*MAIN_finalize( ¶m, &prm );*/ // inlined - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm.clProgram); - clReleaseKernel(prm.clKernel); - clReleaseCommandQueue(prm.clCommandQueue); - clReleaseContext(prm.clContext); + clReleaseProgram(prm.clProgram); + clReleaseKernel(prm.clKernel); + clReleaseCommandQueue(prm.clCommandQueue); + clReleaseContext(prm.clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + LBM_showGridStatistics(TEMP_srcGrid); + LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + 
/*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); - pb_FreeParameters(params); - return 0; + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); } - else param->obstacleFilename = NULL; + } else + param->obstacleFilename = NULL; - 
param->resultFilename = params->outFile; + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? 
"<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); - - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + 
LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + // Setup DEVICE datastructures + OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); + + // Initialize DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); + + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - 
OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } -void OpenCL_initialize(OpenCL_Param* prm) -{ - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - prm->clPlatform = clPlatform[1]; - - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; - - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_CPU,1,&(prm->clDevice),NULL); - CHECK_ERROR("clGetDeviceIDs") - - /*printf("Device id = %p\n", prm->clDevice);*/ - /*cl_device_partition_property props[3];*/ - /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/ - /*props[1] = 1;*/ - /*props[2] = 0;*/ - /*cl_device_id subdevice_id[8];*/ - /*cl_uint num_entries = 8;*/ - - /*cl_uint numDevices;*/ - /*clCreateSubDevices(prm->clDevice, props, num_entries, subdevice_id, &numDevices);*/ - /*printf("Num of devices = %d\n", numDevices);*/ - /*for(unsigned i =0 ; i< numDevices; i++)*/ - /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/ - /*prm->clDevice = subdevice_id[0];*/ - - /*printf("Device id = %p\n", prm->clDevice);*/ - /*prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/ - prm->clContext = clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus); - CHECK_ERROR("clCreateContextFromType") - - prm->clCommandQueue = 
clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); - - /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/ - - /*prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/ - /*size_t binarySize = 39303;*/ - /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, &binarySize, &clSource[0], NULL,&clStatus);*/ - /*CHECK_ERROR("clCreateProgramWithSource")*/ - - /*char clOptions[100];*/ - /*sprintf(clOptions,"-I src/opencl_nvidia");*/ - - /*clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/ - /*CHECK_ERROR("clBuildProgram")*/ - - /*prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/ - /*CHECK_ERROR("clCreateKernel")*/ - - /*free((void*)clSource[0]);*/ - - pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel); +void OpenCL_initialize(OpenCL_Param *prm) { + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + prm->clPlatform = clPlatform[1]; + + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; + + clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_CPU, 1, + &(prm->clDevice), NULL); + CHECK_ERROR("clGetDeviceIDs") + + /*printf("Device id = %p\n", prm->clDevice);*/ + /*cl_device_partition_property props[3];*/ + /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/ + /*props[1] = 1;*/ + /*props[2] = 0;*/ + /*cl_device_id subdevice_id[8];*/ + /*cl_uint num_entries = 8;*/ + + /*cl_uint numDevices;*/ + /*clCreateSubDevices(prm->clDevice, 
props, num_entries, subdevice_id, + * &numDevices);*/ + /*printf("Num of devices = %d\n", numDevices);*/ + /*for(unsigned i =0 ; i< numDevices; i++)*/ + /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/ + /*prm->clDevice = subdevice_id[0];*/ + + /*printf("Device id = %p\n", prm->clDevice);*/ + /*prm->clContext = + * clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/ + prm->clContext = + clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + prm->clCommandQueue = clCreateCommandQueue( + prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + + /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/ + + /*prm->clProgram = + * clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/ + /*size_t binarySize = 39303;*/ + /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, + * &binarySize, &clSource[0], NULL,&clStatus);*/ + /*CHECK_ERROR("clCreateProgramWithSource")*/ + + /*char clOptions[100];*/ + /*sprintf(clOptions,"-I src/opencl_nvidia");*/ + + /*clStatus = + * clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/ + /*CHECK_ERROR("clBuildProgram")*/ + + /*prm->clKernel = + * clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/ + /*CHECK_ERROR("clCreateKernel")*/ + + /*free((void*)clSource[0]);*/ + + pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel", + &prm->clContext, &prm->clDevice, + &prm->clProgram, &prm->clKernel); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h @@ -12,19 +12,20 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ); -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); -void OpenCL_initialize(OpenCL_Param* prm); +void OpenCL_initialize(OpenCL_Param *prm); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c @@ -1,40 +1,36 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> -#include "ocl.h" -char* readFile(char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); +char *readFile(char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); - if(fp == NULL) - { - printf("Error 1!\n"); - return NULL; - } + if (fp == NULL) { + printf("Error 1!\n"); + return NULL; + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); 
+ long size = ftell(fp); + rewind(fp); - char* buffer = malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } + char *buffer = malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + return NULL; + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + return NULL; + } - buffer[size] = 0; - fclose(fp); - return buffer; -} + buffer[size] = 0; + fclose(fp); + return buffer; +} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h index c7a93a636ea59f77e59a61032b68ad8c15477511..d5011fdcf889fb729689b2a9bf08d76e6c828f10 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h @@ -2,24 +2,22 @@ #define __OCLH__ typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; + cl_platform_id clPlatform; + cl_context_properties clCps[3]; + cl_device_id clDevice; + cl_context clContext; + cl_command_queue clCommandQueue; + cl_program clProgram; + cl_kernel clKernel; } OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s! Errcode = %d\n",errorMessage, clStatus); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s! 
Errcode = %d\n", errorMessage, clStatus); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -char* readFile(char*); +char *readFile(char *); #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h @@ -13,31 +13,31 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) // Flattening function // This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 
for SCATTER #if 1 @@ -46,22 +46,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c @@ -10,338 +10,312 @@ // includes, system #include <CL/cl.h> +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <float.h> // includes, project #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "ocl.h" -#include "lbm.h" /******************************************************************************/ -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) { - - cl_int clStatus; - - clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid); - CHECK_ERROR("clSetKernelArg") - - size_t dimBlock[3] = {SIZE_X,1,1}; - size_t dimGrid[3] = 
{SIZE_X*SIZE_Y,SIZE_Z,1}; - clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid) { + + cl_int clStatus; + + clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid); + CHECK_ERROR("clSetKernelArg") + + size_t dimBlock[3] = {SIZE_X, 1, 1}; + size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, + dimGrid, dimBlock, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(prm->clCommandQueue); + CHECK_ERROR("clFinish") } /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - *ptr = (float*)malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + memset(*ptr, 0, size); - memset( *ptr, 0, size ); + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + *ptr += MARGIN; } /******************************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - *ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + *ptr = + clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - MARGIN); + *ptr = NULL; } /******************************************************************************/ -void OpenCL_LBM_freeGrid(cl_mem ptr) { - clReleaseMemObject(ptr); -} +void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = 
DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") } -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = 
clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") } /*############################################################################*/ -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { + cl_mem aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y 
== SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) 
maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: 
%8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + 
char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? "wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION 
rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? "wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h @@ -13,23 +13,26 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( cl_mem* grid1, cl_mem* 
grid2 ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* OpenCL *********************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ); -void OpenCL_LBM_freeGrid( cl_mem ptr ); -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ); +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); +void OpenCL_LBM_freeGrid(cl_mem ptr); +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 (1.0f/ 3.0f) -#define DFL2 
(1.0f/18.0f) -#define DFL3 (1.0f/36.0f) +#define DFL1 (1.0f / 3.0f) +#define DFL2 (1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 
+1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) 
(GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) 
+#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define 
DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) 
- -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c index 168cc12fd63245b6a04d3f8468d7b3fe463187db..d93a919df300c520c7105612cc54f9684f052678 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c @@ -15,10 +15,10 @@ #include <sys/stat.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" -#include "ocl.h" #include "main.h" -#include "lbm.h" +#include "ocl.h" /*############################################################################*/ @@ -27,286 +27,294 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; - OpenCL_Param prm; + OpenCL_Param prm; - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + // Setup TEMP datastructures + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); - //Setup TEMP datastructures - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); + /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined - /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - if( param.obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename ); - } + /*pb_SwitchToTimer(&timers, 
pb_TimerID_IO);*/ + if (param.obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename); + } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_showGridStatistics( TEMP_srcGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_showGridStatistics(TEMP_srcGrid); - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - OpenCL_initialize(&prm); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - //Setup DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid ); + OpenCL_initialize(&prm); - //Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid ); + // Setup DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid); + // Initialize DEVICE datastructures + OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for( t = 1; t <= param.nTimeSteps; t++ ) { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); + for 
(t = 1; t <= param.nTimeSteps; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); - /*if( (t & 63) == 0 ) {*/ - /*printf( "timestep: %i\n", t );*/ + /*if( (t & 63) == 0 ) {*/ + /*printf( "timestep: %i\n", t );*/ #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - /*}*/ - } + /*}*/ + } - /*MAIN_finalize( ¶m, &prm );*/ // inlined + /*MAIN_finalize( ¶m, &prm );*/ // inlined - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm.clProgram); - clReleaseKernel(prm.clKernel); - clReleaseCommandQueue(prm.clCommandQueue); - clReleaseContext(prm.clContext); + clReleaseProgram(prm.clProgram); + clReleaseKernel(prm.clKernel); + clReleaseCommandQueue(prm.clCommandQueue); + clReleaseContext(prm.clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + LBM_showGridStatistics(TEMP_srcGrid); + LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) 
&TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); - pb_FreeParameters(params); - return 0; + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); } - else param->obstacleFilename = NULL; + } else + param->obstacleFilename = NULL; 
- param->resultFilename = params->outFile; + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? 
"<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); - - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + 
LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + // Setup DEVICE datastructures + OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); + + // Initialize DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); + + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - 
OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } -void OpenCL_initialize(OpenCL_Param* prm) -{ - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - prm->clPlatform = clPlatform[1]; - - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; - - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_CPU,1,&(prm->clDevice),NULL); - CHECK_ERROR("clGetDeviceIDs") - - /*printf("Device id = %p\n", prm->clDevice);*/ - /*cl_device_partition_property props[3];*/ - /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/ - /*props[1] = 1;*/ - /*props[2] = 0;*/ - /*cl_device_id subdevice_id[8];*/ - /*cl_uint num_entries = 8;*/ - - /*cl_uint numDevices;*/ - /*clCreateSubDevices(prm->clDevice, props, num_entries, subdevice_id, &numDevices);*/ - /*printf("Num of devices = %d\n", numDevices);*/ - /*for(unsigned i =0 ; i< numDevices; i++)*/ - /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/ - /*prm->clDevice = subdevice_id[0];*/ - - /*printf("Device id = %p\n", prm->clDevice);*/ - /*prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/ - prm->clContext = clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus); - CHECK_ERROR("clCreateContextFromType") - - prm->clCommandQueue = 
clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); - - /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/ - - /*prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/ - /*size_t binarySize = 39303;*/ - /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, &binarySize, &clSource[0], NULL,&clStatus);*/ - /*CHECK_ERROR("clCreateProgramWithSource")*/ - - /*char clOptions[100];*/ - /*sprintf(clOptions,"-I src/opencl_nvidia");*/ - - /*clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/ - /*CHECK_ERROR("clBuildProgram")*/ - - /*prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/ - /*CHECK_ERROR("clCreateKernel")*/ - - /*free((void*)clSource[0]);*/ - - pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel); +void OpenCL_initialize(OpenCL_Param *prm) { + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + prm->clPlatform = clPlatform[1]; + + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; + + clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_CPU, 1, + &(prm->clDevice), NULL); + CHECK_ERROR("clGetDeviceIDs") + + /*printf("Device id = %p\n", prm->clDevice);*/ + /*cl_device_partition_property props[3];*/ + /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/ + /*props[1] = 1;*/ + /*props[2] = 0;*/ + /*cl_device_id subdevice_id[8];*/ + /*cl_uint num_entries = 8;*/ + + /*cl_uint numDevices;*/ + /*clCreateSubDevices(prm->clDevice, 
props, num_entries, subdevice_id, + * &numDevices);*/ + /*printf("Num of devices = %d\n", numDevices);*/ + /*for(unsigned i =0 ; i< numDevices; i++)*/ + /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/ + /*prm->clDevice = subdevice_id[0];*/ + + /*printf("Device id = %p\n", prm->clDevice);*/ + /*prm->clContext = + * clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/ + prm->clContext = + clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + prm->clCommandQueue = clCreateCommandQueue( + prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + + /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/ + + /*prm->clProgram = + * clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/ + /*size_t binarySize = 39303;*/ + /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, + * &binarySize, &clSource[0], NULL,&clStatus);*/ + /*CHECK_ERROR("clCreateProgramWithSource")*/ + + /*char clOptions[100];*/ + /*sprintf(clOptions,"-I src/opencl_nvidia");*/ + + /*clStatus = + * clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/ + /*CHECK_ERROR("clBuildProgram")*/ + + /*prm->clKernel = + * clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/ + /*CHECK_ERROR("clCreateKernel")*/ + + /*free((void*)clSource[0]);*/ + + pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel", + &prm->clContext, &prm->clDevice, + &prm->clProgram, &prm->clKernel); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h @@ -12,19 +12,20 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ); -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); -void OpenCL_initialize(OpenCL_Param* prm); +void OpenCL_initialize(OpenCL_Param *prm); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c @@ -1,40 +1,36 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> -#include "ocl.h" -char* readFile(char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); +char *readFile(char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); - if(fp == NULL) - { - printf("Error 1!\n"); - return NULL; - } + if (fp == NULL) { + printf("Error 1!\n"); + return NULL; + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, 
SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } + char *buffer = malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + return NULL; + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + return NULL; + } - buffer[size] = 0; - fclose(fp); - return buffer; -} + buffer[size] = 0; + fclose(fp); + return buffer; +} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h index c7a93a636ea59f77e59a61032b68ad8c15477511..d5011fdcf889fb729689b2a9bf08d76e6c828f10 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h @@ -2,24 +2,22 @@ #define __OCLH__ typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; + cl_platform_id clPlatform; + cl_context_properties clCps[3]; + cl_device_id clDevice; + cl_context clContext; + cl_command_queue clCommandQueue; + cl_program clProgram; + cl_kernel clKernel; } OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s! Errcode = %d\n",errorMessage, clStatus); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s! 
Errcode = %d\n", errorMessage, clStatus); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -char* readFile(char*); +char *readFile(char *); #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h @@ -13,31 +13,31 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) // Flattening function // This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 for SCATTER 
#if 1 @@ -46,22 +46,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c @@ -10,338 +10,312 @@ // includes, system #include <CL/cl.h> +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <float.h> // includes, project #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "ocl.h" -#include "lbm.h" /******************************************************************************/ -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) { - - cl_int clStatus; - - clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid); - CHECK_ERROR("clSetKernelArg") - - size_t dimBlock[3] = {SIZE_X,1,1}; - size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1}; - clStatus = 
clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid) { + + cl_int clStatus; + + clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid); + CHECK_ERROR("clSetKernelArg") + + size_t dimBlock[3] = {SIZE_X, 1, 1}; + size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, + dimGrid, dimBlock, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(prm->clCommandQueue); + CHECK_ERROR("clFinish") } /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - *ptr = (float*)malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + memset(*ptr, 0, size); - memset( *ptr, 0, size ); + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + *ptr += MARGIN; } /******************************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - *ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + *ptr = + clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - MARGIN); + *ptr = NULL; } /******************************************************************************/ -void OpenCL_LBM_freeGrid(cl_mem ptr) { - clReleaseMemObject(ptr); -} +void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = 
DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") } -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = 
clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") } /*############################################################################*/ -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { + cl_mem aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y 
== SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) 
maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: 
%8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + 
char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? "wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION 
rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? "wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h index 9dcf4639faf25701b015e0d3e6dcf0f9400b1745..64a617feb862bdffdcb0c6aa57b0f1b09c26debb 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h @@ -13,23 +13,26 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ); 
-void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* OpenCL *********************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ); -void OpenCL_LBM_freeGrid( cl_mem ptr ); -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ); +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); +void OpenCL_LBM_freeGrid(cl_mem ptr); +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h index 24fad43205f11da1c05cc8aa5895e7aa2688d3f4..99c50c048a14bb47bb3659b61f088db95706bb0c 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 (1.0f/ 3.0f) -#define DFL2 (1.0f/18.0f) -#define 
DFL3 (1.0f/36.0f) +#define DFL1 (1.0f / 3.0f) +#define DFL2 (1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define 
NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 
-1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) 
+#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define 
DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) 
((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c index 41b22f662a69e29a1b67eaf54bfb2de439becd78..18320b7394e5d499339ee820a992b00acd9b368e 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c @@ -15,10 +15,10 @@ #include 
<sys/stat.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" -#include "ocl.h" #include "main.h" -#include "lbm.h" +#include "ocl.h" /*############################################################################*/ @@ -27,258 +27,262 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; - - OpenCL_Param prm; - - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; + OpenCL_Param prm; - //Setup TEMP datastructures - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); - /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined + // Setup TEMP datastructures + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - if( param.obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename ); - } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - 
LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + if (param.obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename); + } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_showGridStatistics( TEMP_srcGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - OpenCL_initialize(&prm); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_showGridStatistics(TEMP_srcGrid); - //Setup DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid ); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid ); + OpenCL_initialize(&prm); + // Setup DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid); + // Initialize DEVICE datastructures + OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - for( t = 1; t <= param.nTimeSteps; t++ ) { - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); + for (t = 1; t <= param.nTimeSteps; t++) { + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); + 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); - /*if( (t & 63) == 0 ) {*/ - /*printf( "timestep: %i\n", t );*/ + /*if( (t & 63) == 0 ) {*/ + /*printf( "timestep: %i\n", t );*/ #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - /*}*/ - } + /*}*/ + } - /*MAIN_finalize( ¶m, &prm );*/ // inlined + /*MAIN_finalize( ¶m, &prm );*/ // inlined - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm.clProgram); - clReleaseKernel(prm.clKernel); - clReleaseCommandQueue(prm.clCommandQueue); - clReleaseContext(prm.clContext); + clReleaseProgram(prm.clProgram); + clReleaseKernel(prm.clKernel); + clReleaseCommandQueue(prm.clCommandQueue); + clReleaseContext(prm.clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + LBM_showGridStatistics(TEMP_srcGrid); + LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); - 
pb_FreeParameters(params); - return 0; + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); } - else param->obstacleFilename = NULL; + } else + param->obstacleFilename = NULL; - param->resultFilename = params->outFile; + param->resultFilename = params->outFile; } 
/*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? "<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + 
LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - //Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); + // Setup DEVICE datastructures + OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + // Initialize DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_Grid 
TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } -void OpenCL_initialize(OpenCL_Param* prm) -{ - cl_int clStatus; +void OpenCL_initialize(OpenCL_Param *prm) { + cl_int clStatus; - clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL); - CHECK_ERROR("clGetPlatformIDs") + clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL); + CHECK_ERROR("clGetPlatformIDs") - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL); - 
CHECK_ERROR("clGetDeviceIDs") + clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1, + &(prm->clDevice), NULL); + CHECK_ERROR("clGetDeviceIDs") - prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") + prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL, + NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") - prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") + prm->clCommandQueue = clCreateCommandQueue( + prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") + const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + prm->clProgram = + clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") - char clOptions[100]; - sprintf(clOptions,"-I src/opencl_nvidia"); + char clOptions[100]; + sprintf(clOptions, "-I src/opencl_nvidia"); - clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") + clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions, + NULL, NULL); + CHECK_ERROR("clBuildProgram") - prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") + prm->clKernel = + clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus); + CHECK_ERROR("clCreateKernel") - free((void*)clSource[0]); + free((void *)clSource[0]); - 
/*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);*/ + /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", + * "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, + * &prm->clProgram, &prm->clKernel);*/ } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h index feee4e8768b13f0975481b1e3a5505ad3cdd018f..9d8e145c93b37488a3826e77b964c56699377d2a 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h @@ -12,19 +12,20 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ); -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); -void OpenCL_initialize(OpenCL_Param* prm); +void OpenCL_initialize(OpenCL_Param *prm); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c index 
78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c @@ -1,40 +1,36 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> -#include "ocl.h" -char* readFile(char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); +char *readFile(char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); - if(fp == NULL) - { - printf("Error 1!\n"); - return NULL; - } + if (fp == NULL) { + printf("Error 1!\n"); + return NULL; + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } + char *buffer = malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + return NULL; + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + return NULL; + } - buffer[size] = 0; - fclose(fp); - return buffer; -} + buffer[size] = 0; + fclose(fp); + return buffer; +} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h @@ -2,24 +2,22 @@ #define __OCLH__ typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; + cl_platform_id clPlatform; + cl_context_properties 
clCps[3]; + cl_device_id clDevice; + cl_context clContext; + cl_command_queue clCommandQueue; + cl_program clProgram; + cl_kernel clKernel; } OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -char* readFile(char*); +char *readFile(char *); #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h @@ -13,31 +13,31 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) // Flattening function // This macro will be used to map a 3-D index and 
element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 for SCATTER #if 1 @@ -46,22 +46,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c @@ -10,338 +10,312 @@ // includes, system #include <CL/cl.h> +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <float.h> // includes, project #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "ocl.h" -#include "lbm.h" /******************************************************************************/ -void 
OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) { - - cl_int clStatus; - - clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid); - CHECK_ERROR("clSetKernelArg") - - size_t dimBlock[3] = {SIZE_X,1,1}; - size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1}; - clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid) { + + cl_int clStatus; + + clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid); + CHECK_ERROR("clSetKernelArg") + + size_t dimBlock[3] = {SIZE_X, 1, 1}; + size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, + dimGrid, dimBlock, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(prm->clCommandQueue); + CHECK_ERROR("clFinish") } /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - *ptr = (float*)malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + memset(*ptr, 0, size); - memset( *ptr, 0, size ); + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + *ptr += MARGIN; } /******************************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - *ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + *ptr = + clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - MARGIN); + *ptr = NULL; } /******************************************************************************/ -void OpenCL_LBM_freeGrid(cl_mem ptr) { - clReleaseMemObject(ptr); -} +void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = 
DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") } -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = 
clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") } /*############################################################################*/ -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { + cl_mem aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y 
== SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) 
maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: 
%8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + 
char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? "wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION 
rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? "wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h @@ -13,23 +13,26 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( cl_mem* grid1, 
cl_mem* grid2 ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* OpenCL *********************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ); -void OpenCL_LBM_freeGrid( cl_mem ptr ); -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ); +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); +void OpenCL_LBM_freeGrid(cl_mem ptr); +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 (1.0f/ 3.0f) 
-#define DFL2 (1.0f/18.0f) -#define DFL3 (1.0f/36.0f) +#define DFL1 (1.0f / 3.0f) +#define DFL2 (1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) 
(GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define 
NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) 
(LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W 
)) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = 
MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c index c95d33e0409daa902ec2bc939a10081b99b259a1..5e43b754279910d3ca3b45d40184df666138f9e5 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c @@ -15,10 +15,10 @@ #include <sys/stat.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" -#include "ocl.h" #include "main.h" -#include "lbm.h" +#include "ocl.h" /*############################################################################*/ @@ -27,259 +27,266 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; - OpenCL_Param prm; + OpenCL_Param prm; - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + // Setup TEMP datastructures + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); - //Setup TEMP datastructures - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); + /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined - /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + if (param.obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename); + } - /*pb_SwitchToTimer(&timers, 
pb_TimerID_IO);*/ - if( param.obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename ); - } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); + + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_showGridStatistics(TEMP_srcGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - OpenCL_initialize(&prm); - - //Setup DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for( int i=0; i < 1; i++) { - for( t = 1; t <= param.nTimeSteps; t++ ) { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); - - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + OpenCL_initialize(&prm); + + // Setup DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid); + + // Initialize DEVICE datastructures + OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + 
OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int i = 0; i < 1; i++) { + for (t = 1; t <= param.nTimeSteps; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); + + if ((t & 63) == 0) { + printf("timestep: %i\n", t); #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - } } } - /*MAIN_finalize( ¶m, &prm );*/ // inlined + } + /*MAIN_finalize( ¶m, &prm );*/ // inlined - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm.clProgram); - clReleaseKernel(prm.clKernel); - clReleaseCommandQueue(prm.clCommandQueue); - clReleaseContext(prm.clContext); + clReleaseProgram(prm.clProgram); + clReleaseKernel(prm.clKernel); + clReleaseCommandQueue(prm.clCommandQueue); + clReleaseContext(prm.clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + LBM_showGridStatistics(TEMP_srcGrid); + LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ 
- LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); - pb_FreeParameters(params); - return 0; + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); } - else 
param->obstacleFilename = NULL; + } else + param->obstacleFilename = NULL; - param->resultFilename = params->outFile; + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? 
"<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); - - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + 
LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + // Setup DEVICE datastructures + OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); + + // Initialize DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); + + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - 
OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } -void OpenCL_initialize(OpenCL_Param* prm) -{ - cl_int clStatus; +void OpenCL_initialize(OpenCL_Param *prm) { + cl_int clStatus; - clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL); - CHECK_ERROR("clGetPlatformIDs") + clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL); + CHECK_ERROR("clGetPlatformIDs") - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL); - CHECK_ERROR("clGetDeviceIDs") + clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1, + &(prm->clDevice), NULL); + CHECK_ERROR("clGetDeviceIDs") - prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") + prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL, + NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") - prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") + prm->clCommandQueue = clCreateCommandQueue( + prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + pb_SetOpenCL(&(prm->clContext), 
&(prm->clCommandQueue)); - //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - //prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + // prm->clProgram = + // clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //char clOptions[100]; - //sprintf(clOptions,"-I src/opencl_nvidia"); + // char clOptions[100]; + // sprintf(clOptions,"-I src/opencl_nvidia"); - //clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // clStatus = + // clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); - //CHECK_ERROR("clCreateKernel") + // prm->clKernel = + // clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); + // CHECK_ERROR("clCreateKernel") - //free((void*)clSource[0]); + // free((void*)clSource[0]); - pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_long_default/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel); + pb_CreateAndBuildKernelFromBinary( + "build/opencl_nvidia_long_default/kernel_offline.nvptx.s", + "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, + &prm->clProgram, &prm->clKernel); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h @@ -12,19 +12,20 @@ 
/*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ); -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); -void OpenCL_initialize(OpenCL_Param* prm); +void OpenCL_initialize(OpenCL_Param *prm); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c @@ -1,40 +1,36 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> -#include "ocl.h" -char* readFile(char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); +char *readFile(char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); - if(fp == NULL) - { - printf("Error 1!\n"); - return NULL; - } + if (fp == NULL) { + printf("Error 1!\n"); + return NULL; + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = 
malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } + char *buffer = malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + return NULL; + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + return NULL; + } - buffer[size] = 0; - fclose(fp); - return buffer; -} + buffer[size] = 0; + fclose(fp); + return buffer; +} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h @@ -2,24 +2,22 @@ #define __OCLH__ typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; + cl_platform_id clPlatform; + cl_context_properties clCps[3]; + cl_device_id clDevice; + cl_context clContext; + cl_command_queue clCommandQueue; + cl_program clProgram; + cl_kernel clKernel; } OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -char* readFile(char*); +char *readFile(char *); #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h index 
467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h @@ -13,31 +13,31 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) // Flattening function // This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 for SCATTER #if 1 @@ -46,22 +46,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, 
W, T, B, - NE, NW, SE, SW, - NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c @@ -10,338 +10,312 @@ // includes, system #include <CL/cl.h> +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <float.h> // includes, project #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "ocl.h" -#include "lbm.h" /******************************************************************************/ -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) { - - cl_int clStatus; - - clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid); - CHECK_ERROR("clSetKernelArg") - - size_t dimBlock[3] = {SIZE_X,1,1}; - size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1}; - clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - clStatus = clFinish(prm->clCommandQueue); - CHECK_ERROR("clFinish") +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, 
cl_mem srcGrid, + cl_mem dstGrid) { + + cl_int clStatus; + + clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid); + CHECK_ERROR("clSetKernelArg") + + size_t dimBlock[3] = {SIZE_X, 1, 1}; + size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL, + dimGrid, dimBlock, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(prm->clCommandQueue); + CHECK_ERROR("clFinish") } /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - *ptr = (float*)malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + memset(*ptr, 0, size); - memset( *ptr, 0, size ); + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + *ptr += MARGIN; } /******************************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - *ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + *ptr = + clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") } /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - MARGIN); + *ptr = NULL; } /******************************************************************************/ -void OpenCL_LBM_freeGrid(cl_mem ptr) { - clReleaseMemObject(ptr); -} +void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); } /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = 
DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") } -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - cl_int clStatus; - clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + cl_int clStatus; + clStatus = 
clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size, + h_grid - MARGIN, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") } /*############################################################################*/ -void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) { - cl_mem aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) { + cl_mem aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y 
== SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) 
maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: 
%8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + 
char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? "wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION 
rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? "wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h @@ -13,23 +13,26 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( cl_mem* 
grid1, cl_mem* grid2 ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /* OpenCL *********************************************************************/ -void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ); -void OpenCL_LBM_freeGrid( cl_mem ptr ); -void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ); -void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ); +void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr); +void OpenCL_LBM_freeGrid(cl_mem ptr); +void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid, + LBM_Grid h_grid); +void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid, + cl_mem dstGrid); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 
(1.0f/ 3.0f) -#define DFL2 (1.0f/18.0f) -#define DFL3 (1.0f/36.0f) +#define DFL1 (1.0f / 3.0f) +#define DFL2 (1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - -#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) 
(GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define 
NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E )) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) 
(LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) #else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W 
)) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) (NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = 
MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c index e07d6946258afd9daca0ec526752c15352620c5c..e66cb2c47cc5bd1f62d774952a7e2397005f1e47 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c @@ -15,10 +15,10 @@ #include <sys/stat.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" -#include "ocl.h" #include "main.h" -#include "lbm.h" +#include "ocl.h" /*############################################################################*/ @@ -27,259 +27,266 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; /*############################################################################*/ struct pb_TimerSet timers; -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; - OpenCL_Param prm; + OpenCL_Param prm; - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + // Setup TEMP datastructures + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); - //Setup TEMP datastructures - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); + /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined - /*MAIN_initialize( ¶m, &prm ); */ // This has been inlined + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + if (param.obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename); + LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename); + } - /*pb_SwitchToTimer(&timers, 
pb_TimerID_IO);*/ - if( param.obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename ); - } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); + + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_showGridStatistics(TEMP_srcGrid); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - OpenCL_initialize(&prm); - - //Setup DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for( int i=0; i < 4; i++) { - for( t = 1; t <= param.nTimeSteps; t++ ) { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); - - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + OpenCL_initialize(&prm); + + // Setup DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid); + + // Initialize DEVICE datastructures + OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + 
OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int i = 0; i < 4; i++) { + for (t = 1; t <= param.nTimeSteps; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); + + if ((t & 63) == 0) { + printf("timestep: %i\n", t); #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - } } } - /*MAIN_finalize( ¶m, &prm );*/ // inlined + } + /*MAIN_finalize( ¶m, &prm );*/ // inlined - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm.clProgram); - clReleaseKernel(prm.clKernel); - clReleaseCommandQueue(prm.clCommandQueue); - clReleaseContext(prm.clContext); + clReleaseProgram(prm.clProgram); + clReleaseKernel(prm.clKernel); + clReleaseCommandQueue(prm.clCommandQueue); + clReleaseContext(prm.clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - LBM_showGridStatistics( TEMP_srcGrid ); - LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE ); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + LBM_showGridStatistics(TEMP_srcGrid); + LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ 
- LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); - pb_FreeParameters(params); - return 0; + pb_FreeParameters(params); + return 0; } /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } + param->nTimeSteps = atoi(arg[1]); + + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); } - else 
param->obstacleFilename = NULL; + } else + param->obstacleFilename = NULL; - param->resultFilename = params->outFile; + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? 
"<none>" + : param->obstacleFilename); } /*############################################################################*/ -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); - - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); + LBM_allocateGrid((float **)&TEMP_dstGrid); + LBM_initializeGrid(TEMP_srcGrid); + LBM_initializeGrid(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename); + 
LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); + LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + // Setup DEVICE datastructures + OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); + OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); + + // Initialize DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); + + LBM_freeGrid((float **)&TEMP_srcGrid); + LBM_freeGrid((float **)&TEMP_dstGrid); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + // Setup TEMP datastructures + LBM_allocateGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics(TEMP_srcGrid); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - 
OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid(OpenCL_srcGrid); + OpenCL_LBM_freeGrid(OpenCL_dstGrid); - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } -void OpenCL_initialize(OpenCL_Param* prm) -{ - cl_int clStatus; +void OpenCL_initialize(OpenCL_Param *prm) { + cl_int clStatus; - clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL); - CHECK_ERROR("clGetPlatformIDs") + clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL); + CHECK_ERROR("clGetPlatformIDs") - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL); - CHECK_ERROR("clGetDeviceIDs") + clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1, + &(prm->clDevice), NULL); + CHECK_ERROR("clGetDeviceIDs") - prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") + prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL, + NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") - prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") + prm->clCommandQueue = clCreateCommandQueue( + prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + pb_SetOpenCL(&(prm->clContext), 
&(prm->clCommandQueue)); - //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - //prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + // prm->clProgram = + // clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //char clOptions[100]; - //sprintf(clOptions,"-I src/opencl_nvidia"); + // char clOptions[100]; + // sprintf(clOptions,"-I src/opencl_nvidia"); - //clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // clStatus = + // clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); - //CHECK_ERROR("clCreateKernel") + // prm->clKernel = + // clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); + // CHECK_ERROR("clCreateKernel") - //free((void*)clSource[0]); + // free((void*)clSource[0]); - pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_short_default/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel); + pb_CreateAndBuildKernelFromBinary( + "build/opencl_nvidia_short_default/kernel_offline.nvptx.s", + "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, + &prm->clProgram, &prm->clKernel); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h @@ -12,19 +12,20 @@ 
/*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ); -void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm); +void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm); -void OpenCL_initialize(OpenCL_Param* prm); +void OpenCL_initialize(OpenCL_Param *prm); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c @@ -1,40 +1,36 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> -#include "ocl.h" -char* readFile(char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); +char *readFile(char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); - if(fp == NULL) - { - printf("Error 1!\n"); - return NULL; - } + if (fp == NULL) { + printf("Error 1!\n"); + return NULL; + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = 
malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - return NULL; - } + char *buffer = malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + return NULL; + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - return NULL; - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + return NULL; + } - buffer[size] = 0; - fclose(fp); - return buffer; -} + buffer[size] = 0; + fclose(fp); + return buffer; +} diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h @@ -2,24 +2,22 @@ #define __OCLH__ typedef struct { - cl_platform_id clPlatform; - cl_context_properties clCps[3]; - cl_device_id clDevice; - cl_context clContext; - cl_command_queue clCommandQueue; - cl_program clProgram; - cl_kernel clKernel; + cl_platform_id clPlatform; + cl_context_properties clCps[3]; + cl_device_id clDevice; + cl_context clContext; + cl_command_queue clCommandQueue; + cl_program clProgram; + cl_kernel clKernel; } OpenCL_Param; - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -char* readFile(char*); +char *readFile(char *); #endif diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h index 
467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h @@ -13,31 +13,31 @@ /*############################################################################*/ -//Unchangeable settings: volume simulation size for the given example +// Unchangeable settings: volume simulation size for the given example #define SIZE_X (120) #define SIZE_Y (120) #define SIZE_Z (150) -//Changeable settings -//Padding in each dimension +// Changeable settings +// Padding in each dimension #define PADDING_X (8) #define PADDING_Y (0) #define PADDING_Z (4) -//Pitch in each dimension -#define PADDED_X (SIZE_X+PADDING_X) -#define PADDED_Y (SIZE_Y+PADDING_Y) -#define PADDED_Z (SIZE_Z+PADDING_Z) +// Pitch in each dimension +#define PADDED_X (SIZE_X + PADDING_X) +#define PADDED_Y (SIZE_Y + PADDING_Y) +#define PADDED_Z (SIZE_Z + PADDING_Z) -#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z) -#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z) +#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z) +#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z) // Flattening function // This macro will be used to map a 3-D index and element to a value -#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \ - ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) ) +#define CALC_INDEX(x, y, z, e) \ + (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y)) -#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0)) +#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0)) // Set this value to 1 for GATHER, or 0 for SCATTER #if 1 @@ -46,22 +46,41 @@ #define SCATTER #endif -//OpenCL block size (not trivially changeable here) +// OpenCL block size (not trivially changeable here) #define BLOCK_SIZE SIZE_X /*############################################################################*/ -typedef enum {C = 0, - N, S, E, W, T, B, - NE, NW, SE, SW, - 
NT, NB, ST, SB, - ET, EB, WT, WB, - FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; +typedef enum { + C = 0, + N, + S, + E, + W, + T, + B, + NE, + NW, + SE, + SW, + NT, + NB, + ST, + SB, + ET, + EB, + WT, + WB, + FLAGS, + N_CELL_ENTRIES +} CELL_ENTRIES; #define N_DISTR_FUNCS FLAGS -typedef enum {OBSTACLE = 1 << 0, - ACCEL = 1 << 1, - IN_OUT_FLOW = 1 << 2} CELL_FLAGS; +typedef enum { + OBSTACLE = 1 << 0, + ACCEL = 1 << 1, + IN_OUT_FLOW = 1 << 2 +} CELL_FLAGS; #endif /* _CONFIG_H_ */ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp index 9b9de7702142f56cfc492aa3d680990a8f707a56..cf00ad76a285f5209ecee541308d5f18ed356249 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp @@ -9,295 +9,263 @@ /*############################################################################*/ // includes, system +#include <float.h> #include <math.h> -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> -#include <float.h> // includes, project #include "layout_config.h" -#include "lbm_macros.h" #include "lbm.h" +#include "lbm_macros.h" /******************************************************************************/ - /*############################################################################*/ -void LBM_allocateGrid( float** ptr ) { - const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); +void LBM_allocateGrid(float **ptr) { + const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - *ptr = (float*)malloc( size ); - if( ! 
*ptr ) { - printf( "LBM_allocateGrid: could not allocate %.1f MByte\n", - size / (1024.0*1024.0) ); - exit( 1 ); - } + *ptr = (float *)malloc(size); + if (!*ptr) { + printf("LBM_allocateGrid: could not allocate %.1f MByte\n", + size / (1024.0 * 1024.0)); + exit(1); + } - memset( *ptr, 0, size ); + memset(*ptr, 0, size); - printf( "LBM_allocateGrid: allocated %.1f MByte\n", - size / (1024.0*1024.0) ); - - *ptr += MARGIN; + printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0)); + + *ptr += MARGIN; } /******************************************************************************/ /*############################################################################*/ -void LBM_freeGrid( float** ptr ) { - free( *ptr-MARGIN ); - *ptr = NULL; +void LBM_freeGrid(float **ptr) { + free(*ptr - MARGIN); + *ptr = NULL; } /******************************************************************************/ /*############################################################################*/ -void LBM_initializeGrid( LBM_Grid grid ) { - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - SRC_C( grid ) = DFL1; - SRC_N( grid ) = DFL2; - SRC_S( grid ) = DFL2; - SRC_E( grid ) = DFL2; - SRC_W( grid ) = DFL2; - SRC_T( grid ) = DFL2; - SRC_B( grid ) = DFL2; - SRC_NE( grid ) = DFL3; - SRC_NW( grid ) = DFL3; - SRC_SE( grid ) = DFL3; - SRC_SW( grid ) = DFL3; - SRC_NT( grid ) = DFL3; - SRC_NB( grid ) = DFL3; - SRC_ST( grid ) = DFL3; - SRC_SB( grid ) = DFL3; - SRC_ET( grid ) = DFL3; - SRC_EB( grid ) = DFL3; - SRC_WT( grid ) = DFL3; - SRC_WB( grid ) = DFL3; - - CLEAR_ALL_FLAGS_SWEEP( grid ); - SWEEP_END +void LBM_initializeGrid(LBM_Grid grid) { + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + SRC_C(grid) = DFL1; + SRC_N(grid) = DFL2; + SRC_S(grid) = DFL2; + SRC_E(grid) = DFL2; + SRC_W(grid) = DFL2; + SRC_T(grid) = DFL2; + SRC_B(grid) = DFL2; + SRC_NE(grid) = DFL3; + SRC_NW(grid) = DFL3; + SRC_SE(grid) = DFL3; + SRC_SW(grid) = DFL3; + SRC_NT(grid) = DFL3; + SRC_NB(grid) = DFL3; + 
SRC_ST(grid) = DFL3; + SRC_SB(grid) = DFL3; + SRC_ET(grid) = DFL3; + SRC_EB(grid) = DFL3; + SRC_WT(grid) = DFL3; + SRC_WB(grid) = DFL3; + + CLEAR_ALL_FLAGS_SWEEP(grid); + SWEEP_END } /******************************************************************************/ /*############################################################################*/ -void LBM_swapGrids( LBM_Grid* grid1, LBM_Grid* grid2 ) { - LBM_Grid aux = *grid1; - *grid1 = *grid2; - *grid2 = aux; +void LBM_swapGrids(LBM_Grid *grid1, LBM_Grid *grid2) { + LBM_Grid aux = *grid1; + *grid1 = *grid2; + *grid2 = aux; } /*############################################################################*/ -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) { - int x, y, z; +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) { + int x, y, z; - FILE* file = fopen( filename, "rb" ); + FILE *file = fopen(filename, "rb"); - for( z = 0; z < SIZE_Z; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( fgetc( file ) != '.' 
) SET_FLAG( grid, x, y, z, OBSTACLE ); - } - fgetc( file ); - } - fgetc( file ); - } + for (z = 0; z < SIZE_Z; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (fgetc(file) != '.') + SET_FLAG(grid, x, y, z, OBSTACLE); + } + fgetc(file); + } + fgetc(file); + } - fclose( file ); + fclose(file); } /*############################################################################*/ -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) { - int x, y, z; - - for( z = -2; z < SIZE_Z+2; z++ ) { - for( y = 0; y < SIZE_Y; y++ ) { - for( x = 0; x < SIZE_X; x++ ) { - if( x == 0 || x == SIZE_X-1 || - y == 0 || y == SIZE_Y-1 || - z == 0 || z == SIZE_Z-1 ) { - SET_FLAG( grid, x, y, z, OBSTACLE ); - } - else { - if( (z == 1 || z == SIZE_Z-2) && - x > 1 && x < SIZE_X-2 && - y > 1 && y < SIZE_Y-2 ) { - SET_FLAG( grid, x, y, z, ACCEL ); - } - } - } - } - } +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) { + int x, y, z; + + for (z = -2; z < SIZE_Z + 2; z++) { + for (y = 0; y < SIZE_Y; y++) { + for (x = 0; x < SIZE_X; x++) { + if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 || + z == SIZE_Z - 1) { + SET_FLAG(grid, x, y, z, OBSTACLE); + } else { + if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 && + y < SIZE_Y - 2) { + SET_FLAG(grid, x, y, z, ACCEL); + } + } + } + } + } } /*############################################################################*/ -void LBM_showGridStatistics( LBM_Grid grid ) { - int nObstacleCells = 0, - nAccelCells = 0, - nFluidCells = 0; - float ux, uy, uz; - float minU2 = 1e+30, maxU2 = -1e+30, u2; - float minRho = 1e+30, maxRho = -1e+30, rho; - float mass = 0; - - SWEEP_VAR - - SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z ) - rho = LOCAL( grid, C ) + LOCAL( grid, N ) - + LOCAL( grid, S ) + LOCAL( grid, E ) - + LOCAL( grid, W ) + LOCAL( grid, T ) - + LOCAL( grid, B ) + LOCAL( grid, NE ) - + LOCAL( grid, NW ) + LOCAL( grid, SE ) - + LOCAL( grid, SW ) + LOCAL( grid, NT ) - + LOCAL( grid, NB ) + LOCAL( 
grid, ST ) - + LOCAL( grid, SB ) + LOCAL( grid, ET ) - + LOCAL( grid, EB ) + LOCAL( grid, WT ) - + LOCAL( grid, WB ); - - if( rho < minRho ) minRho = rho; - if( rho > maxRho ) maxRho = rho; - mass += rho; - - if( TEST_FLAG_SWEEP( grid, OBSTACLE )) { - nObstacleCells++; - } - else { - if( TEST_FLAG_SWEEP( grid, ACCEL )) - nAccelCells++; - else - nFluidCells++; - - ux = + LOCAL( grid, E ) - LOCAL( grid, W ) - + LOCAL( grid, NE ) - LOCAL( grid, NW ) - + LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, ET ) + LOCAL( grid, EB ) - - LOCAL( grid, WT ) - LOCAL( grid, WB ); - uy = + LOCAL( grid, N ) - LOCAL( grid, S ) - + LOCAL( grid, NE ) + LOCAL( grid, NW ) - - LOCAL( grid, SE ) - LOCAL( grid, SW ) - + LOCAL( grid, NT ) + LOCAL( grid, NB ) - - LOCAL( grid, ST ) - LOCAL( grid, SB ); - uz = + LOCAL( grid, T ) - LOCAL( grid, B ) - + LOCAL( grid, NT ) - LOCAL( grid, NB ) - + LOCAL( grid, ST ) - LOCAL( grid, SB ) - + LOCAL( grid, ET ) - LOCAL( grid, EB ) - + LOCAL( grid, WT ) - LOCAL( grid, WB ); - u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho); - if( u2 < minU2 ) minU2 = u2; - if( u2 > maxU2 ) maxU2 = u2; - } - SWEEP_END - - printf( "LBM_showGridStatistics:\n" - "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" - "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" - "\tminU: %e maxU: %e\n\n", - nObstacleCells, nAccelCells, nFluidCells, - minRho, maxRho, mass, - sqrt( minU2 ), sqrt( maxU2 ) ); - +void LBM_showGridStatistics(LBM_Grid grid) { + int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0; + float ux, uy, uz; + float minU2 = 1e+30, maxU2 = -1e+30, u2; + float minRho = 1e+30, maxRho = -1e+30, rho; + float mass = 0; + + SWEEP_VAR + + SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) + rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) + + LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) + + LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) + + LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) + + 
LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); + + if (rho < minRho) + minRho = rho; + if (rho > maxRho) + maxRho = rho; + mass += rho; + + if (TEST_FLAG_SWEEP(grid, OBSTACLE)) { + nObstacleCells++; + } else { + if (TEST_FLAG_SWEEP(grid, ACCEL)) + nAccelCells++; + else + nFluidCells++; + + ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) + + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) - + LOCAL(grid, WT) - LOCAL(grid, WB); + uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) - + LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) - + LOCAL(grid, ST) - LOCAL(grid, SB); + uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) + + LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) + + LOCAL(grid, WT) - LOCAL(grid, WB); + u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); + if (u2 < minU2) + minU2 = u2; + if (u2 > maxU2) + maxU2 = u2; + } + SWEEP_END + + printf("LBM_showGridStatistics:\n" + "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n" + "\tminRho: %8.4f maxRho: %8.4f mass: %e\n" + "\tminU: %e maxU: %e\n\n", + nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass, + sqrt(minU2), sqrt(maxU2)); } /*############################################################################*/ -static void storeValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - const char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1]; - - fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - } - else { /* little endian */ - fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void storeValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char 
*)&litteBigEndianTest)) == 0) { /* big endian */ + const char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; + + fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file); + } else { /* little endian */ + fwrite(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -static void loadValue( FILE* file, OUTPUT_PRECISION* v ) { - const int litteBigEndianTest = 1; - if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) { /* big endian */ - char* vPtr = (char*) v; - char buffer[sizeof( OUTPUT_PRECISION )]; - int i; - - fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ); - - for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++) - vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1]; - } - else { /* little endian */ - fread( v, sizeof( OUTPUT_PRECISION ), 1, file ); - } +static void loadValue(FILE *file, OUTPUT_PRECISION *v) { + const int litteBigEndianTest = 1; + if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */ + char *vPtr = (char *)v; + char buffer[sizeof(OUTPUT_PRECISION)]; + int i; + + fread(buffer, sizeof(OUTPUT_PRECISION), 1, file); + + for (i = 0; i < sizeof(OUTPUT_PRECISION); i++) + vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; + } else { /* little endian */ + fread(v, sizeof(OUTPUT_PRECISION), 1, file); + } } /*############################################################################*/ -void LBM_storeVelocityField( LBM_Grid grid, const char* filename, - const int binary ) { - OUTPUT_PRECISION rho, ux, uy, uz; - - FILE* file = fopen( filename, (binary ? 
"wb" : "w") ); - - SWEEP_VAR - SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) - rho = + SRC_C( grid ) + SRC_N( grid ) - + SRC_S( grid ) + SRC_E( grid ) - + SRC_W( grid ) + SRC_T( grid ) - + SRC_B( grid ) + SRC_NE( grid ) - + SRC_NW( grid ) + SRC_SE( grid ) - + SRC_SW( grid ) + SRC_NT( grid ) - + SRC_NB( grid ) + SRC_ST( grid ) - + SRC_SB( grid ) + SRC_ET( grid ) - + SRC_EB( grid ) + SRC_WT( grid ) - + SRC_WB( grid ); - ux = + SRC_E( grid ) - SRC_W( grid ) - + SRC_NE( grid ) - SRC_NW( grid ) - + SRC_SE( grid ) - SRC_SW( grid ) - + SRC_ET( grid ) + SRC_EB( grid ) - - SRC_WT( grid ) - SRC_WB( grid ); - uy = + SRC_N( grid ) - SRC_S( grid ) - + SRC_NE( grid ) + SRC_NW( grid ) - - SRC_SE( grid ) - SRC_SW( grid ) - + SRC_NT( grid ) + SRC_NB( grid ) - - SRC_ST( grid ) - SRC_SB( grid ); - uz = + SRC_T( grid ) - SRC_B( grid ) - + SRC_NT( grid ) - SRC_NB( grid ) - + SRC_ST( grid ) - SRC_SB( grid ) - + SRC_ET( grid ) - SRC_EB( grid ) - + SRC_WT( grid ) - SRC_WB( grid ); - ux /= rho; - uy /= rho; - uz /= rho; - - if( binary ) { - /* - fwrite( &ux, sizeof( ux ), 1, file ); - fwrite( &uy, sizeof( uy ), 1, file ); - fwrite( &uz, sizeof( uz ), 1, file ); - */ - storeValue( file, &ux ); - storeValue( file, &uy ); - storeValue( file, &uz ); - } else - fprintf( file, "%e %e %e\n", ux, uy, uz ); - - SWEEP_END; - - fclose( file ); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const int binary) { + OUTPUT_PRECISION rho, ux, uy, uz; + + FILE *file = fopen(filename, (binary ? 
"wb" : "w")); + + SWEEP_VAR + SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z) + rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) + + SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) + + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) + + SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) + + SRC_WB(grid); + ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) - + SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid); + uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) - + SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid); + uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) - + SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid); + ux /= rho; + uy /= rho; + uz /= rho; + + if (binary) { + /* + fwrite( &ux, sizeof( ux ), 1, file ); + fwrite( &uy, sizeof( uy ), 1, file ); + fwrite( &uz, sizeof( uz ), 1, file ); + */ + storeValue(file, &ux); + storeValue(file, &uy); + storeValue(file, &uz); + } else + fprintf(file, "%e %e %e\n", ux, uy, uz); + + SWEEP_END; + + fclose(file); } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h index 4768d949a9cae7bf2d118feaa0b2200667200b2f..8f2d5fde2470862aea3efb51bb46262f86008518 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h @@ -13,15 +13,15 @@ /*############################################################################*/ -void LBM_allocateGrid( float** ptr ); -void LBM_freeGrid( float** ptr ); -void LBM_initializeGrid( LBM_Grid grid ); -void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ); -void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ); -void LBM_swapGrids( LBM_Grid* grid1, LBM_Grid* grid2 ); -void LBM_showGridStatistics( LBM_Grid Grid ); -void LBM_storeVelocityField( LBM_Grid grid, 
const char* filename, - const BOOL binary ); +void LBM_allocateGrid(float **ptr); +void LBM_freeGrid(float **ptr); +void LBM_initializeGrid(LBM_Grid grid); +void LBM_initializeSpecialCellsForLDC(LBM_Grid grid); +void LBM_loadObstacleFile(LBM_Grid grid, const char *filename); +void LBM_swapGrids(LBM_Grid *grid1, LBM_Grid *grid2); +void LBM_showGridStatistics(LBM_Grid Grid); +void LBM_storeVelocityField(LBM_Grid grid, const char *filename, + const BOOL binary); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h index d8ceb373dfe0e7a02d374c78a9c3fd68cc8f3085..ae91da7c5c304aac7537de0c68edffcd91a83dbe 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h @@ -17,160 +17,181 @@ #define TRUE (-1) #define FALSE (0) -#define DFL1 (1.0f/ 3.0f) -#define DFL2 (1.0f/18.0f) -#define DFL3 (1.0f/36.0f) +#define DFL1 (1.0f / 3.0f) +#define DFL2 (1.0f / 18.0f) +#define DFL3 (1.0f / 36.0f) /*############################################################################*/ -typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; -typedef LBM_Grid* LBM_GridPtr; +typedef float + *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES]; +typedef LBM_Grid *LBM_GridPtr; /*############################################################################*/ - -#define SWEEP_X __temp_x__ -#define SWEEP_Y __temp_y__ -#define SWEEP_Z __temp_z__ +#define SWEEP_X __temp_x__ +#define SWEEP_Y __temp_y__ +#define SWEEP_Z __temp_z__ #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__; -#define SWEEP_START(x1,y1,z1,x2,y2,z2) \ - for( __temp_z__ = z1; \ - __temp_z__ < z2; \ - __temp_z__++) { \ - for( __temp_y__ = 0; \ - __temp_y__ < SIZE_Y; \ - __temp_y__++) { \ - for(__temp_x__ = 0; \ - __temp_x__ < SIZE_X; \ - __temp_x__++) { \ - 
-#define SWEEP_END }}} - - -#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)]) -#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)]) - -#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e )) -#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e )) -#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e )) -#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e )) -#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e )) -#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e )) -#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e )) -#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e )) -#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e )) -#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e )) -#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e )) -#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e )) -#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e )) -#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e )) -#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e )) -#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e )) -#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e )) -#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e )) -#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e )) - +#define SWEEP_START(x1, y1, z1, x2, y2, z2) \ + for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) { \ + for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) { \ + for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) { + +#define SWEEP_END \ + } \ + } \ + } + +#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) +#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e) \ + ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)]) + +#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_C(g, e) 
(GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) +#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) +#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e)) +#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e)) +#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e)) +#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e)) +#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e)) +#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) +#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e)) +#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e)) +#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e)) +#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e)) +#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e)) +#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e)) +#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e)) +#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e)) +#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e)) +#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e)) +#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e)) #ifdef SCATTER -#define SRC_C(g) (LOCAL( g, C )) -#define SRC_N(g) (LOCAL( g, N )) -#define SRC_S(g) (LOCAL( g, S )) -#define SRC_E(g) (LOCAL( g, E )) -#define SRC_W(g) (LOCAL( g, W )) -#define SRC_T(g) (LOCAL( g, T )) -#define SRC_B(g) (LOCAL( g, B )) -#define SRC_NE(g) (LOCAL( g, NE )) -#define SRC_NW(g) (LOCAL( g, NW )) -#define SRC_SE(g) (LOCAL( g, SE )) -#define SRC_SW(g) (LOCAL( g, SW )) -#define SRC_NT(g) (LOCAL( g, NT )) -#define SRC_NB(g) (LOCAL( g, NB )) -#define SRC_ST(g) (LOCAL( g, ST )) -#define SRC_SB(g) (LOCAL( g, SB )) -#define SRC_ET(g) (LOCAL( g, ET )) -#define SRC_EB(g) (LOCAL( g, EB )) -#define SRC_WT(g) (LOCAL( g, WT )) -#define SRC_WB(g) (LOCAL( g, WB )) - -#define DST_C(g) (NEIGHBOR_C ( g, C )) -#define DST_N(g) (NEIGHBOR_N ( g, N )) -#define DST_S(g) (NEIGHBOR_S ( g, S )) -#define DST_E(g) (NEIGHBOR_E ( g, E 
)) -#define DST_W(g) (NEIGHBOR_W ( g, W )) -#define DST_T(g) (NEIGHBOR_T ( g, T )) -#define DST_B(g) (NEIGHBOR_B ( g, B )) -#define DST_NE(g) (NEIGHBOR_NE( g, NE )) -#define DST_NW(g) (NEIGHBOR_NW( g, NW )) -#define DST_SE(g) (NEIGHBOR_SE( g, SE )) -#define DST_SW(g) (NEIGHBOR_SW( g, SW )) -#define DST_NT(g) (NEIGHBOR_NT( g, NT )) -#define DST_NB(g) (NEIGHBOR_NB( g, NB )) -#define DST_ST(g) (NEIGHBOR_ST( g, ST )) -#define DST_SB(g) (NEIGHBOR_SB( g, SB )) -#define DST_ET(g) (NEIGHBOR_ET( g, ET )) -#define DST_EB(g) (NEIGHBOR_EB( g, EB )) -#define DST_WT(g) (NEIGHBOR_WT( g, WT )) -#define DST_WB(g) (NEIGHBOR_WB( g, WB )) +#define SRC_C(g) (LOCAL(g, C)) +#define SRC_N(g) (LOCAL(g, N)) +#define SRC_S(g) (LOCAL(g, S)) +#define SRC_E(g) (LOCAL(g, E)) +#define SRC_W(g) (LOCAL(g, W)) +#define SRC_T(g) (LOCAL(g, T)) +#define SRC_B(g) (LOCAL(g, B)) +#define SRC_NE(g) (LOCAL(g, NE)) +#define SRC_NW(g) (LOCAL(g, NW)) +#define SRC_SE(g) (LOCAL(g, SE)) +#define SRC_SW(g) (LOCAL(g, SW)) +#define SRC_NT(g) (LOCAL(g, NT)) +#define SRC_NB(g) (LOCAL(g, NB)) +#define SRC_ST(g) (LOCAL(g, ST)) +#define SRC_SB(g) (LOCAL(g, SB)) +#define SRC_ET(g) (LOCAL(g, ET)) +#define SRC_EB(g) (LOCAL(g, EB)) +#define SRC_WT(g) (LOCAL(g, WT)) +#define SRC_WB(g) (LOCAL(g, WB)) + +#define DST_C(g) (NEIGHBOR_C(g, C)) +#define DST_N(g) (NEIGHBOR_N(g, N)) +#define DST_S(g) (NEIGHBOR_S(g, S)) +#define DST_E(g) (NEIGHBOR_E(g, E)) +#define DST_W(g) (NEIGHBOR_W(g, W)) +#define DST_T(g) (NEIGHBOR_T(g, T)) +#define DST_B(g) (NEIGHBOR_B(g, B)) +#define DST_NE(g) (NEIGHBOR_NE(g, NE)) +#define DST_NW(g) (NEIGHBOR_NW(g, NW)) +#define DST_SE(g) (NEIGHBOR_SE(g, SE)) +#define DST_SW(g) (NEIGHBOR_SW(g, SW)) +#define DST_NT(g) (NEIGHBOR_NT(g, NT)) +#define DST_NB(g) (NEIGHBOR_NB(g, NB)) +#define DST_ST(g) (NEIGHBOR_ST(g, ST)) +#define DST_SB(g) (NEIGHBOR_SB(g, SB)) +#define DST_ET(g) (NEIGHBOR_ET(g, ET)) +#define DST_EB(g) (NEIGHBOR_EB(g, EB)) +#define DST_WT(g) (NEIGHBOR_WT(g, WT)) +#define DST_WB(g) (NEIGHBOR_WB(g, WB)) 
#else /* GATHER */ -#define SRC_C(g) (NEIGHBOR_C ( g, C )) -#define SRC_N(g) (NEIGHBOR_S ( g, N )) -#define SRC_S(g) (NEIGHBOR_N ( g, S )) -#define SRC_E(g) (NEIGHBOR_W ( g, E )) -#define SRC_W(g) (NEIGHBOR_E ( g, W )) -#define SRC_T(g) (NEIGHBOR_B ( g, T )) -#define SRC_B(g) (NEIGHBOR_T ( g, B )) -#define SRC_NE(g) (NEIGHBOR_SW( g, NE )) -#define SRC_NW(g) (NEIGHBOR_SE( g, NW )) -#define SRC_SE(g) (NEIGHBOR_NW( g, SE )) -#define SRC_SW(g) (NEIGHBOR_NE( g, SW )) -#define SRC_NT(g) (NEIGHBOR_SB( g, NT )) -#define SRC_NB(g) (NEIGHBOR_ST( g, NB )) -#define SRC_ST(g) (NEIGHBOR_NB( g, ST )) -#define SRC_SB(g) (NEIGHBOR_NT( g, SB )) -#define SRC_ET(g) (NEIGHBOR_WB( g, ET )) -#define SRC_EB(g) (NEIGHBOR_WT( g, EB )) -#define SRC_WT(g) (NEIGHBOR_EB( g, WT )) -#define SRC_WB(g) (NEIGHBOR_ET( g, WB )) - -#define DST_C(g) (LOCAL( g, C )) -#define DST_N(g) (LOCAL( g, N )) -#define DST_S(g) (LOCAL( g, S )) -#define DST_E(g) (LOCAL( g, E )) -#define DST_W(g) (LOCAL( g, W )) -#define DST_T(g) (LOCAL( g, T )) -#define DST_B(g) (LOCAL( g, B )) -#define DST_NE(g) (LOCAL( g, NE )) -#define DST_NW(g) (LOCAL( g, NW )) -#define DST_SE(g) (LOCAL( g, SE )) -#define DST_SW(g) (LOCAL( g, SW )) -#define DST_NT(g) (LOCAL( g, NT )) -#define DST_NB(g) (LOCAL( g, NB )) -#define DST_ST(g) (LOCAL( g, ST )) -#define DST_SB(g) (LOCAL( g, SB )) -#define DST_ET(g) (LOCAL( g, ET )) -#define DST_EB(g) (LOCAL( g, EB )) -#define DST_WT(g) (LOCAL( g, WT )) -#define DST_WB(g) (LOCAL( g, WB )) +#define SRC_C(g) (NEIGHBOR_C(g, C)) +#define SRC_N(g) (NEIGHBOR_S(g, N)) +#define SRC_S(g) (NEIGHBOR_N(g, S)) +#define SRC_E(g) (NEIGHBOR_W(g, E)) +#define SRC_W(g) (NEIGHBOR_E(g, W)) +#define SRC_T(g) (NEIGHBOR_B(g, T)) +#define SRC_B(g) (NEIGHBOR_T(g, B)) +#define SRC_NE(g) (NEIGHBOR_SW(g, NE)) +#define SRC_NW(g) (NEIGHBOR_SE(g, NW)) +#define SRC_SE(g) (NEIGHBOR_NW(g, SE)) +#define SRC_SW(g) (NEIGHBOR_NE(g, SW)) +#define SRC_NT(g) (NEIGHBOR_SB(g, NT)) +#define SRC_NB(g) (NEIGHBOR_ST(g, NB)) +#define SRC_ST(g) 
(NEIGHBOR_NB(g, ST)) +#define SRC_SB(g) (NEIGHBOR_NT(g, SB)) +#define SRC_ET(g) (NEIGHBOR_WB(g, ET)) +#define SRC_EB(g) (NEIGHBOR_WT(g, EB)) +#define SRC_WT(g) (NEIGHBOR_EB(g, WT)) +#define SRC_WB(g) (NEIGHBOR_ET(g, WB)) + +#define DST_C(g) (LOCAL(g, C)) +#define DST_N(g) (LOCAL(g, N)) +#define DST_S(g) (LOCAL(g, S)) +#define DST_E(g) (LOCAL(g, E)) +#define DST_W(g) (LOCAL(g, W)) +#define DST_T(g) (LOCAL(g, T)) +#define DST_B(g) (LOCAL(g, B)) +#define DST_NE(g) (LOCAL(g, NE)) +#define DST_NW(g) (LOCAL(g, NW)) +#define DST_SE(g) (LOCAL(g, SE)) +#define DST_SW(g) (LOCAL(g, SW)) +#define DST_NT(g) (LOCAL(g, NT)) +#define DST_NB(g) (LOCAL(g, NB)) +#define DST_ST(g) (LOCAL(g, ST)) +#define DST_SB(g) (LOCAL(g, SB)) +#define DST_ET(g) (LOCAL(g, ET)) +#define DST_EB(g) (LOCAL(g, EB)) +#define DST_WT(g) (LOCAL(g, WT)) +#define DST_WB(g) (LOCAL(g, WB)) #endif /* GATHER */ -#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v)))) -#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v) - -#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) -#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;} - -#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) -#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);} -#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);} -#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;} +#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) +#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v) + +#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) +#define SET_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG_SWEEP(g, f) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ 
+ (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS_SWEEP(g) \ + { \ + FLAG_VAR(LOCAL(g, FLAGS)); \ + (*_aux_) = 0; \ + } + +#define TEST_FLAG(g, x, y, z, f) \ + ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f)) +#define SET_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) |= (f); \ + } +#define CLEAR_FLAG(g, x, y, z, f) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) &= ~(f); \ + } +#define CLEAR_ALL_FLAGS(g, x, y, z) \ + { \ + FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); \ + (*_aux_) = 0; \ + } /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp index 91e9722353a40ba21911003b298616bde879b497..b51864366b500fc796d9073fe1893be2f402797f 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp @@ -15,11 +15,11 @@ #include <visc.h> #include "layout_config.h" +#include "lbm.h" #include "lbm_macros.h" #include "main.h" -#include "lbm.h" -#define AS_UINT(x) (*((unsigned*)&(x))) +#define AS_UINT(x) (*((unsigned *)&(x))) /*############################################################################*/ @@ -29,404 +29,396 @@ static LBM_Grid srcGrid, dstGrid; struct pb_TimerSet timers; - /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct pb_Parameters *params) { + struct stat fileStat; - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } + if (nArgs < 2) { + printf("syntax: lbm <time steps>\n"); + exit(1); + } + + param->nTimeSteps = atoi(arg[1]); - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = 
params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } + if (params->inpFiles[0] != NULL) { + param->obstacleFilename = params->inpFiles[0]; + + if (stat(param->obstacleFilename, &fileStat) != 0) { + printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename); + exit(1); + } + if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) { + printf("MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int)fileStat.st_size, + SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z); + exit(1); } - else param->obstacleFilename = NULL; + } else + param->obstacleFilename = NULL; - param->resultFilename = params->outFile; + param->resultFilename = params->outFile; } /*############################################################################*/ -void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? 
"<none>" : - param->obstacleFilename ); +void MAIN_printInfo(const MAIN_Param *param) { + printf("MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z, + param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? "<none>" + : param->obstacleFilename); } /*############################################################################*/ typedef struct __attribute__((__packed__)) { - float* srcG; size_t bytes_srcG; - float* dstG; size_t bytes_dstG; - size_t dim_X1, dim_X2, dim_Y2; + float *srcG; + size_t bytes_srcG; + float *dstG; + size_t bytes_dstG; + size_t dim_X1, dim_X2, dim_Y2; } RootIn; -void performStreamCollide_kernel( float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG ) -{ - __visc__hint(visc::DEVICE); - __visc__attributes(2, srcG, dstG, 1, dstG); - - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); - - srcG += MARGIN; - dstG += MARGIN; - - int lx = __visc__getNodeInstanceID_x(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); - - //Using some predefined macros here. 
Consider this the declaration - // and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z - - SWEEP_VAR - SWEEP_X = lx; // get_local_id(0) - SWEEP_Y = gx; // get_group_id(0) - SWEEP_Z = gy; // get_group_id(1) - - float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB; - float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ; - float tempSB, tempET, tempEB, tempWT, tempWB ; - - //Load all of the input fields - //This is a gather operation of the SCATTER preprocessor variable - // is undefined in layout_config.h, or a "local" read otherwise - tempC = SRC_C(srcG); - - tempN = SRC_N(srcG); - tempS = SRC_S(srcG); - tempE = SRC_E(srcG); - tempW = SRC_W(srcG); - tempT = SRC_T(srcG); - tempB = SRC_B(srcG); - - tempNE = SRC_NE(srcG); - tempNW = SRC_NW(srcG); - tempSE = SRC_SE(srcG); - tempSW = SRC_SW(srcG); - tempNT = SRC_NT(srcG); - tempNB = SRC_NB(srcG); - tempST = SRC_ST(srcG); - tempSB = SRC_SB(srcG); - tempET = SRC_ET(srcG); - tempEB = SRC_EB(srcG); - tempWT = SRC_WT(srcG); - tempWB = SRC_WB(srcG); - - //Test whether the cell is fluid or obstacle - if(AS_UINT(LOCAL(srcG,FLAGS)) & (OBSTACLE)) { - - //Swizzle the inputs: reflect any fluid coming into this cell - // back to where it came from - temp_swp = tempN ; - tempN = tempS ; - tempS = temp_swp ; - temp_swp = tempE ; - tempE = tempW ; - tempW = temp_swp; - temp_swp = tempT ; - tempT = tempB ; - tempB = temp_swp; - temp_swp = tempNE; - tempNE = tempSW ; - tempSW = temp_swp; - temp_swp = tempNW; - tempNW = tempSE ; - tempSE = temp_swp; - temp_swp = tempNT ; - tempNT = tempSB ; - tempSB = temp_swp; - temp_swp = tempNB ; - tempNB = tempST ; - tempST = temp_swp; - temp_swp = tempET ; - tempET= tempWB ; - tempWB = temp_swp; - temp_swp = tempEB ; - tempEB = tempWT ; - tempWT = temp_swp; - } - else { - - //The math meat of LBM: ignore for optimization - float ux, uy, uz, rho, u2; - float temp1, temp2, temp_base; - rho = tempC + tempN - + tempS + tempE - + tempW + tempT - + tempB + tempNE - + tempNW 
+ tempSE - + tempSW + tempNT - + tempNB + tempST - + tempSB + tempET - + tempEB + tempWT - + tempWB; - - ux = + tempE - tempW - + tempNE - tempNW - + tempSE - tempSW - + tempET + tempEB - - tempWT - tempWB; - - uy = + tempN - tempS - + tempNE + tempNW - - tempSE - tempSW - + tempNT + tempNB - - tempST - tempSB; - - uz = + tempT - tempB - + tempNT - tempNB - + tempST - tempSB - + tempET - tempEB - + tempWT - tempWB; - - ux /= rho; - uy /= rho; - uz /= rho; - - if(AS_UINT(LOCAL(srcG,FLAGS)) & (ACCEL)) { - - ux = 0.005f; - uy = 0.002f; - uz = 0.000f; - } - - u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f; - temp_base = OMEGA*rho; - temp1 = DFL1*temp_base; - - //Put the output values for this cell in the shared memory - temp_base = OMEGA*rho; - temp1 = DFL1*temp_base; - temp2 = 1.0f-OMEGA; - tempC = temp2*tempC + temp1*( - u2); - temp1 = DFL2*temp_base; - tempN = temp2*tempN + temp1*( uy*(4.5f*uy + 3.0f) - u2); - tempS = temp2*tempS + temp1*( uy*(4.5f*uy - 3.0f) - u2); - tempT = temp2*tempT + temp1*( uz*(4.5f*uz + 3.0f) - u2); - tempB = temp2*tempB + temp1*( uz*(4.5f*uz - 3.0f) - u2); - tempE = temp2*tempE + temp1*( ux*(4.5f*ux + 3.0f) - u2); - tempW = temp2*tempW + temp1*( ux*(4.5f*ux - 3.0f) - u2); - temp1 = DFL3*temp_base; - tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2); - tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2); - tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2); - tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2); - tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2); - tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2); - tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2); - tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2); - tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2); - tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2); - tempWT = temp2*tempWT + 
temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2); - tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2); +void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG, + size_t bytes_dstG) { + __visc__hint(visc::DEVICE); + __visc__attributes(2, srcG, dstG, 1, dstG); + + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + + srcG += MARGIN; + dstG += MARGIN; + + int lx = __visc__getNodeInstanceID_x(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); + + // Using some predefined macros here. Consider this the declaration + // and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z + + SWEEP_VAR + SWEEP_X = lx; // get_local_id(0) + SWEEP_Y = gx; // get_group_id(0) + SWEEP_Z = gy; // get_group_id(1) + + float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB; + float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST; + float tempSB, tempET, tempEB, tempWT, tempWB; + + // Load all of the input fields + // This is a gather operation of the SCATTER preprocessor variable + // is undefined in layout_config.h, or a "local" read otherwise + tempC = SRC_C(srcG); + + tempN = SRC_N(srcG); + tempS = SRC_S(srcG); + tempE = SRC_E(srcG); + tempW = SRC_W(srcG); + tempT = SRC_T(srcG); + tempB = SRC_B(srcG); + + tempNE = SRC_NE(srcG); + tempNW = SRC_NW(srcG); + tempSE = SRC_SE(srcG); + tempSW = SRC_SW(srcG); + tempNT = SRC_NT(srcG); + tempNB = SRC_NB(srcG); + tempST = SRC_ST(srcG); + tempSB = SRC_SB(srcG); + tempET = SRC_ET(srcG); + tempEB = SRC_EB(srcG); + tempWT = SRC_WT(srcG); + tempWB = SRC_WB(srcG); + + // Test whether the cell is fluid or obstacle + if (AS_UINT(LOCAL(srcG, FLAGS)) & (OBSTACLE)) { + + // Swizzle the inputs: reflect any fluid coming into this cell + // back to where it came from + temp_swp = tempN; + tempN = tempS; + tempS = temp_swp; + temp_swp = tempE; + tempE = tempW; + tempW = temp_swp; + temp_swp = tempT; 
+ tempT = tempB; + tempB = temp_swp; + temp_swp = tempNE; + tempNE = tempSW; + tempSW = temp_swp; + temp_swp = tempNW; + tempNW = tempSE; + tempSE = temp_swp; + temp_swp = tempNT; + tempNT = tempSB; + tempSB = temp_swp; + temp_swp = tempNB; + tempNB = tempST; + tempST = temp_swp; + temp_swp = tempET; + tempET = tempWB; + tempWB = temp_swp; + temp_swp = tempEB; + tempEB = tempWT; + tempWT = temp_swp; + } else { + + // The math meat of LBM: ignore for optimization + float ux, uy, uz, rho, u2; + float temp1, temp2, temp_base; + rho = tempC + tempN + tempS + tempE + tempW + tempT + tempB + tempNE + + tempNW + tempSE + tempSW + tempNT + tempNB + tempST + tempSB + + tempET + tempEB + tempWT + tempWB; + + ux = +tempE - tempW + tempNE - tempNW + tempSE - tempSW + tempET + tempEB - + tempWT - tempWB; + + uy = +tempN - tempS + tempNE + tempNW - tempSE - tempSW + tempNT + tempNB - + tempST - tempSB; + + uz = +tempT - tempB + tempNT - tempNB + tempST - tempSB + tempET - tempEB + + tempWT - tempWB; + + ux /= rho; + uy /= rho; + uz /= rho; + + if (AS_UINT(LOCAL(srcG, FLAGS)) & (ACCEL)) { + + ux = 0.005f; + uy = 0.002f; + uz = 0.000f; } - //Write the results computed above - //This is a scatter operation of the SCATTER preprocessor variable - // is defined in layout_config.h, or a "local" write otherwise - DST_C ( dstG ) = tempC; - - DST_N ( dstG ) = tempN; - DST_S ( dstG ) = tempS; - DST_E ( dstG ) = tempE; - DST_W ( dstG ) = tempW; - DST_T ( dstG ) = tempT; - DST_B ( dstG ) = tempB; - - DST_NE( dstG ) = tempNE; - DST_NW( dstG ) = tempNW; - DST_SE( dstG ) = tempSE; - DST_SW( dstG ) = tempSW; - DST_NT( dstG ) = tempNT; - DST_NB( dstG ) = tempNB; - DST_ST( dstG ) = tempST; - DST_SB( dstG ) = tempSB; - DST_ET( dstG ) = tempET; - DST_EB( dstG ) = tempEB; - DST_WT( dstG ) = tempWT; - DST_WB( dstG ) = tempWB; + u2 = 1.5f * (ux * ux + uy * uy + uz * uz) - 1.0f; + temp_base = OMEGA * rho; + temp1 = DFL1 * temp_base; + + // Put the output values for this cell in the shared memory + 
temp_base = OMEGA * rho; + temp1 = DFL1 * temp_base; + temp2 = 1.0f - OMEGA; + tempC = temp2 * tempC + temp1 * (-u2); + temp1 = DFL2 * temp_base; + tempN = temp2 * tempN + temp1 * (uy * (4.5f * uy + 3.0f) - u2); + tempS = temp2 * tempS + temp1 * (uy * (4.5f * uy - 3.0f) - u2); + tempT = temp2 * tempT + temp1 * (uz * (4.5f * uz + 3.0f) - u2); + tempB = temp2 * tempB + temp1 * (uz * (4.5f * uz - 3.0f) - u2); + tempE = temp2 * tempE + temp1 * (ux * (4.5f * ux + 3.0f) - u2); + tempW = temp2 * tempW + temp1 * (ux * (4.5f * ux - 3.0f) - u2); + temp1 = DFL3 * temp_base; + tempNT = + temp2 * tempNT + temp1 * ((+uy + uz) * (4.5f * (+uy + uz) + 3.0f) - u2); + tempNB = + temp2 * tempNB + temp1 * ((+uy - uz) * (4.5f * (+uy - uz) + 3.0f) - u2); + tempST = + temp2 * tempST + temp1 * ((-uy + uz) * (4.5f * (-uy + uz) + 3.0f) - u2); + tempSB = + temp2 * tempSB + temp1 * ((-uy - uz) * (4.5f * (-uy - uz) + 3.0f) - u2); + tempNE = + temp2 * tempNE + temp1 * ((+ux + uy) * (4.5f * (+ux + uy) + 3.0f) - u2); + tempSE = + temp2 * tempSE + temp1 * ((+ux - uy) * (4.5f * (+ux - uy) + 3.0f) - u2); + tempET = + temp2 * tempET + temp1 * ((+ux + uz) * (4.5f * (+ux + uz) + 3.0f) - u2); + tempEB = + temp2 * tempEB + temp1 * ((+ux - uz) * (4.5f * (+ux - uz) + 3.0f) - u2); + tempNW = + temp2 * tempNW + temp1 * ((-ux + uy) * (4.5f * (-ux + uy) + 3.0f) - u2); + tempSW = + temp2 * tempSW + temp1 * ((-ux - uy) * (4.5f * (-ux - uy) + 3.0f) - u2); + tempWT = + temp2 * tempWT + temp1 * ((-ux + uz) * (4.5f * (-ux + uz) + 3.0f) - u2); + tempWB = + temp2 * tempWB + temp1 * ((-ux - uz) * (4.5f * (-ux - uz) + 3.0f) - u2); + } + + // Write the results computed above + // This is a scatter operation of the SCATTER preprocessor variable + // is defined in layout_config.h, or a "local" write otherwise + DST_C(dstG) = tempC; + + DST_N(dstG) = tempN; + DST_S(dstG) = tempS; + DST_E(dstG) = tempE; + DST_W(dstG) = tempW; + DST_T(dstG) = tempT; + DST_B(dstG) = tempB; + + DST_NE(dstG) = tempNE; + DST_NW(dstG) = tempNW; + 
DST_SE(dstG) = tempSE; + DST_SW(dstG) = tempSW; + DST_NT(dstG) = tempNT; + DST_NB(dstG) = tempNB; + DST_ST(dstG) = tempST; + DST_SB(dstG) = tempSB; + DST_ET(dstG) = tempET; + DST_EB(dstG) = tempEB; + DST_WT(dstG) = tempWT; + DST_WB(dstG) = tempWB; } -void lbmLvl1(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1) -{ - __visc__hint(visc::DEVICE); - __visc__attributes(2, srcG, dstG, 1, dstG); - void* lbm_node = __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1); - __visc__bindIn(lbm_node, 0, 0, 0); - __visc__bindIn(lbm_node, 1, 1, 0); - __visc__bindIn(lbm_node, 2, 2, 0); - __visc__bindIn(lbm_node, 3, 3, 0); +void lbmLvl1(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, + size_t dim_X1) { + __visc__hint(visc::DEVICE); + __visc__attributes(2, srcG, dstG, 1, dstG); + void *lbm_node = + __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1); + __visc__bindIn(lbm_node, 0, 0, 0); + __visc__bindIn(lbm_node, 1, 1, 0); + __visc__bindIn(lbm_node, 2, 2, 0); + __visc__bindIn(lbm_node, 3, 3, 0); } -void lbmLvl2(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2) -{ - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, srcG, dstG, 1, dstG); - void* lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2); - __visc__bindIn(lbm_node, 0, 0, 0); - __visc__bindIn(lbm_node, 1, 1, 0); - __visc__bindIn(lbm_node, 2, 2, 0); - __visc__bindIn(lbm_node, 3, 3, 0); - __visc__bindIn(lbm_node, 4, 4, 0); +void lbmLvl2(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, + size_t dim_X1, size_t dim_X2, size_t dim_Y2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, srcG, dstG, 1, dstG); + void *lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2); + __visc__bindIn(lbm_node, 0, 0, 0); + __visc__bindIn(lbm_node, 1, 1, 0); + __visc__bindIn(lbm_node, 2, 2, 0); + __visc__bindIn(lbm_node, 3, 3, 0); + __visc__bindIn(lbm_node, 4, 4, 0); 
} -void lbmLvl3(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2) -{ - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, srcG, dstG, 1, dstG); - void* lbm_node = __visc__createNodeND(0, lbmLvl2); - __visc__bindIn(lbm_node, 0, 0, 0); - __visc__bindIn(lbm_node, 1, 1, 0); - __visc__bindIn(lbm_node, 2, 2, 0); - __visc__bindIn(lbm_node, 3, 3, 0); - __visc__bindIn(lbm_node, 4, 4, 0); - __visc__bindIn(lbm_node, 5, 5, 0); - __visc__bindIn(lbm_node, 6, 6, 0); +void lbmLvl3(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, + size_t dim_X1, size_t dim_X2, size_t dim_Y2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, srcG, dstG, 1, dstG); + void *lbm_node = __visc__createNodeND(0, lbmLvl2); + __visc__bindIn(lbm_node, 0, 0, 0); + __visc__bindIn(lbm_node, 1, 1, 0); + __visc__bindIn(lbm_node, 2, 2, 0); + __visc__bindIn(lbm_node, 3, 3, 0); + __visc__bindIn(lbm_node, 4, 4, 0); + __visc__bindIn(lbm_node, 5, 5, 0); + __visc__bindIn(lbm_node, 6, 6, 0); } -__attribute__((noinline)) void MAIN_performStreamCollide( LBM_Grid src, LBM_Grid dst ) { - - long dimBlock[3] = {SIZE_X,1,1}; - long dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1}; - size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - - void* root_in = malloc(sizeof(RootIn)); - RootIn root_in_local = { - src - MARGIN, size, - dst - MARGIN, size, - SIZE_X, SIZE_Y, SIZE_Z - }; - *(RootIn*)root_in = root_in_local; - void* lbmDFG = __visc__launch(0, lbmLvl3, root_in); - - __visc__wait(lbmDFG); +__attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src, + LBM_Grid dst) { + + long dimBlock[3] = {SIZE_X, 1, 1}; + long dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1}; + size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + + void *root_in = malloc(sizeof(RootIn)); + RootIn root_in_local = {src - MARGIN, size, dst - MARGIN, size, + SIZE_X, SIZE_Y, SIZE_Z}; + *(RootIn *)root_in = root_in_local; + void *lbmDFG = 
__visc__launch(0, lbmLvl3, root_in); + __visc__wait(lbmDFG); } -void MAIN_initialize( const MAIN_Param* param ) { +void MAIN_initialize(const MAIN_Param *param) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup datastructures - LBM_allocateGrid( (float**) &srcGrid ); - LBM_allocateGrid( (float**) &dstGrid ); - LBM_initializeGrid( srcGrid ); - LBM_initializeGrid( dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( dstGrid, param->obstacleFilename ); - } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // Setup datastructures + LBM_allocateGrid((float **)&srcGrid); + LBM_allocateGrid((float **)&dstGrid); + LBM_initializeGrid(srcGrid); + LBM_initializeGrid(dstGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( srcGrid ); - LBM_initializeSpecialCellsForLDC( dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if (param->obstacleFilename != NULL) { + LBM_loadObstacleFile(srcGrid, param->obstacleFilename); + LBM_loadObstacleFile(dstGrid, param->obstacleFilename); + } - LBM_showGridStatistics( srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC(srcGrid); + LBM_initializeSpecialCellsForLDC(dstGrid); - //LBM_freeGrid( (float**) &srcGrid ); - //LBM_freeGrid( (float**) &dstGrid ); + LBM_showGridStatistics(srcGrid); + + // LBM_freeGrid( (float**) &srcGrid ); + // LBM_freeGrid( (float**) &dstGrid ); } /*############################################################################*/ -void MAIN_finalize( const MAIN_Param* param ) { - - //Setup TEMP datastructures +void MAIN_finalize(const MAIN_Param *param) { - LBM_showGridStatistics( srcGrid ); + // Setup TEMP datastructures - LBM_storeVelocityField( srcGrid, param->resultFilename, TRUE ); + LBM_showGridStatistics(srcGrid); - LBM_freeGrid( (float**) &srcGrid ); - LBM_freeGrid( (float**) &dstGrid ); + 
LBM_storeVelocityField(srcGrid, param->resultFilename, TRUE); + LBM_freeGrid((float **)&srcGrid); + LBM_freeGrid((float **)&dstGrid); } -int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; +int main(int nArgs, char *arg[]) { + MAIN_Param param; + int t; - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); + struct pb_Parameters *params; + params = pb_ReadParameters(&nArgs, arg); + // Setup TEMP datastructures + MAIN_parseCommandLine(nArgs, arg, ¶m, params); + MAIN_printInfo(¶m); - //Setup TEMP datastructures - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); + MAIN_initialize(¶m); - MAIN_initialize( ¶m ); + pb_InitializeTimerSet(&timers); + __visc__init(); - pb_InitializeTimerSet(&timers); - __visc__init(); + size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(srcGrid - MARGIN, size); + llvm_visc_track_mem(dstGrid - MARGIN, size); - size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(srcGrid-MARGIN, size); - llvm_visc_track_mem(dstGrid-MARGIN, size); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + for (t = 1; t <= param.nTimeSteps; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + MAIN_performStreamCollide(srcGrid, dstGrid); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - for( t = 1; t <= param.nTimeSteps; t++ ) { - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - MAIN_performStreamCollide( srcGrid, dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_swapGrids( &srcGrid, &dstGrid ); + LBM_swapGrids(&srcGrid, &dstGrid); - /*if( (t & 63) == 0 ) {*/ - /*printf( "timestep: %i\n", t );*/ + /*if( (t & 63) == 0 ) {*/ + /*printf( "timestep: %i\n", t );*/ #if 0 CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); LBM_showGridStatistics( *TEMP_srcGrid ); #endif - /*}*/ - } + /*}*/ + } - 
pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(srcGrid-MARGIN, size); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(srcGrid - MARGIN, size); - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_visc_untrack_mem(srcGrid-MARGIN); - llvm_visc_untrack_mem(dstGrid-MARGIN); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(srcGrid - MARGIN); + llvm_visc_untrack_mem(dstGrid - MARGIN); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - __visc__cleanup(); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + __visc__cleanup(); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - MAIN_finalize( ¶m ); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + MAIN_finalize(¶m); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - pb_FreeParameters(params); - return 0; + pb_FreeParameters(params); + return 0; } diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h index 17728ccb8652dda18f4e08e16c567ec0d4abe4b5..e7d1de926379246587e72c53a3bed3eff4444f0a 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h @@ -12,17 +12,18 @@ /*############################################################################*/ typedef struct { - int nTimeSteps; - char* resultFilename; - char* obstacleFilename; + int nTimeSteps; + char *resultFilename; + char *obstacleFilename; } MAIN_Param; /*############################################################################*/ -void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* ); -void MAIN_printInfo( const MAIN_Param* param ); -void MAIN_initialize( const MAIN_Param* param ); -void MAIN_finalize( const MAIN_Param* param ); +void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param, + struct 
pb_Parameters *); +void MAIN_printInfo(const MAIN_Param *param); +void MAIN_initialize(const MAIN_Param *param); +void MAIN_finalize(const MAIN_Param *param); void MAIN_performStreamCollide(LBM_Grid src, LBM_Grid dst); /*############################################################################*/ diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc @@ -10,15 +10,15 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc index e6fb3a580d0947bd4a7ba9c1e4e7634f5a6fffae..f856cbe45d18d02ce08563b296e0aef9eed4c178 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc @@ -6,27 +6,28 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include "sgemm_kernel.cc" +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> -#include "sgemm_kernel.cc" // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -37,49 +38,44 @@ main (int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. 
Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } - + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } + /* Read in data */ pb_SwitchToTimer(&timers, pb_TimerID_IO); // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate space for C - std::vector<float> matC(matArow*matBcol); + std::vector<float> matC(matArow * matBcol); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, - &matA.front(), matArow, &matBT.front(), matBcol, 0.0f, &matC.front(), - matArow); + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), matArow, + &matBT.front(), matBcol, 0.0f, &matC.front(), matArow); if (params->outFile) { /* Write C to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } pb_SwitchToTimer(&timers, pb_TimerID_NONE); double CPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_COMPUTE])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/CPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / CPUtime / 1e9 + << std::endl; pb_PrintTimerSet(&timers); pb_FreeParameters(params); return 0; diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc b/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc index e46fac0c88a53d54df310c42695d06391b4a9c3e..b38116ced427728b4b3d8e90a9591cdeeda8c967 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc @@ -6,33 +6,32 @@ *cr ***************************************************************************/ -/* +/* * Base C implementation of MM */ - - -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - + for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { float c = 0.0f; for (int i = 0; i < k; ++i) { - float a = A[mm + i * lda]; + float a = A[mm + i * lda]; float b = B[nn + i * ldb]; c += a * b; } - C[mm+nn*ldc] = C[mm+nn*ldc] * beta + alpha * c; + C[mm + nn * ldc] = C[mm + nn * ldc] * beta + alpha * c; } } } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc @@ -10,15 +10,15 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include 
<iostream> +#include <vector> -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc @@ -10,15 +10,15 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." 
<< std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc @@ -10,15 +10,15 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - 
std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc index 20b377a74a4662dbf1d53a0ca92e3c2e7a64a0e0..20140a1ada56ffd36bd367023c5023305b14ae2e 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc @@ -6,28 +6,31 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> -extern void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ); +extern void basicSgemm(char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, const float *B, + int ldb, float beta, float *C, int ldc); // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); +extern 
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -38,49 +41,44 @@ main (int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } - + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } + /* Read in data */ pb_SwitchToTimer(&timers, pb_TimerID_IO); // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate space for C - std::vector<float> matC(matArow*matBcol); + std::vector<float> matC(matArow * matBcol); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, - &matA.front(), matArow, &matBT.front(), matBcol, 0.0f, &matC.front(), - matArow); + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), matArow, + &matBT.front(), matBcol, 0.0f, &matC.front(), matArow); if (params->outFile) { /* Write C to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); - 
writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } pb_SwitchToTimer(&timers, pb_TimerID_NONE); double CPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_COMPUTE])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/CPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / CPUtime / 1e9 + << std::endl; pb_PrintTimerSet(&timers); pb_FreeParameters(params); return 0; diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc index 53b3835c0eee1c251c015e1dcf3655184e8393e0..b9896c17312967ba7a431becb1b3bda4b315ba4a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc @@ -6,34 +6,34 @@ *cr ***************************************************************************/ -/* +/* * Base C implementation of MM */ #include <iostream> - -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - #pragma omp parallel for collapse (2) +#pragma omp parallel for collapse(2) for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { float c = 0.0f; for (int i = 0; i < k; ++i) { - float a = A[mm + i * lda]; + float a = A[mm + i * lda]; float b = B[nn + i * ldb]; c += a * b; } - C[mm+nn*ldc] = C[mm+nn*ldc] * beta + alpha * c; + C[mm + nn * ldc] = C[mm + nn 
* ldc] * beta + alpha * c; } } } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix 
dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc index 21c9e2301099395daf2a95e43c6365f93e1d7859..5489f6a55ce6e8ba3676b0c98ad4b37ac7f4a7fd 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc @@ -6,79 +6,83 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <CL/cl.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <CL/cl.h> -#include <parboil.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool 
writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_SZ 16 -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << ": " << clStatus << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - + // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_SZ) || (n%TILE_SZ)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - << "; n should be multiple of " << TILE_SZ << std::endl; + if ((m % TILE_SZ) || (n % TILE_SZ)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_SZ << "; n should be multiple of " << TILE_SZ + << std::endl; } - size_t db[2] = {TILE_SZ,TILE_SZ}; - size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + size_t db[2] = {TILE_SZ, TILE_SZ}; + size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; cl_int clStatus; - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - clStatus = clFinish(clCommandQueue); + clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct 
pb_TimerSet timers; @@ -88,146 +92,151 @@ int main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue 
clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")}; - //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")}; + // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); cl_kernel clKernel; cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - //sprintf(clOptions,""); + pb_CreateAndBuildKernelFromBinary( + "build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", + &clContext, &clDevice, &clProgram, &clKernel); + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // char clOptions[50]; + // sprintf(clOptions,""); - //size_t binarySizes = 0; - //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySizes, NULL); - //CHECK_ERROR("clGetProgramInfo") + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //std::cout << "Binary Size = " << binarySizes << "\n"; + // size_t binarySizes = 0; + // 
clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, + // sizeof(size_t), &binarySizes, NULL); CHECK_ERROR("clGetProgramInfo") - //unsigned char* binaries = (unsigned char*) malloc(binarySizes); - //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySizes, &binaries, NULL); - //CHECK_ERROR("clGetProgramInfo") + // std::cout << "Binary Size = " << binarySizes << "\n"; - //std::cout << "Binary = \n" << binaries << "\n"; + // unsigned char* binaries = (unsigned char*) malloc(binarySizes); + // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySizes, + // &binaries, NULL); CHECK_ERROR("clGetProgramInfo") + // std::cout << "Binary = \n" << binaries << "\n"; - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = 
clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); clStatus = clReleaseMemObject(dB); clStatus = clReleaseMemObject(dC); clStatus = 
clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - + clStatus = clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" 
<< std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc index c8118d34bb10e229ce604e8bfab05b9923f2e315..105baf590da13dd2ffc3cb803d63291daef0854d 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc @@ -6,42 +6,45 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> #include <CL/cl.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 16 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error! 
Errorcode = "<< clStatus <<"\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, + cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; exit(1); @@ -53,38 +56,38 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c } // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; exit(1); } - - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; - //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); + + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; + // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - + clStatus = clFinish(clCommandQueue); 
CHECK_ERROR("clFinish") } -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -94,136 +97,142 @@ main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - printf("%s\n",params->inpFiles[0]); - printf("%s\n",params->inpFiles[1]); - printf("%s\n",params->inpFiles[2]); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + printf("%s\n", params->inpFiles[0]); + printf("%s\n", params->inpFiles[1]); + printf("%s\n", params->inpFiles[2]); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + 
cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") - + pb_SetOpenCL(&clContext, &clCommandQueue); - //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //char clOptions[50]; - //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); + // char clOptions[50]; + // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D + // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") cl_kernel clKernel; cl_program clProgram; - 
pb_CreateAndBuildKernelFromBinary("build/opencl_base_opt_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + pb_CreateAndBuildKernelFromBinary( + "build/opencl_base_opt_default/kernel.nvptx.s", "mysgemmNT", &clContext, + &clDevice, &clProgram, &clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - + // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 
0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue); + regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); - - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); clReleaseMemObject(dB); clReleaseMemObject(dC); clReleaseCommandQueue(clCommandQueue); - clReleaseContext(clContext); - + clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 
+54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc index bfcd70395e74a4cdcd39d3d7f609cf2c7a2d702f..f72c18c293c52e322a35814b13c000f9b64548b0 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc @@ -6,82 +6,86 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <CL/cl.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <CL/cl.h> 
-#include <parboil.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_SZ 16 #define VEC_SZ 8 -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << ": " << clStatus << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - + // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_SZ) || (n%TILE_SZ)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - << "; n should be multiple of " << TILE_SZ << std::endl; + if ((m % TILE_SZ) || (n % TILE_SZ)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_SZ << "; n should be multiple of " << TILE_SZ + << std::endl; } - size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; - size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + size_t db[2] = {TILE_SZ / VEC_SZ, TILE_SZ}; + size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; std::cout << "(" << dg[0] << ", " << dg[1] << ")\n"; std::cout << "(" << db[0] << ", " << db[1] << ")\n"; cl_int clStatus; - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - clStatus = clFinish(clCommandQueue); + clStatus = clFinish(clCommandQueue); 
CHECK_ERROR("clFinish") } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -91,134 +95,141 @@ int main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + 
clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - // const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")}; - // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // const char* clSource[] = + // {readFile("src/opencl_base/kernel_offline.nvptx.s")}; cl_program clProgram + // = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); cl_kernel clKernel; cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("build/opencl_base_vec_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + pb_CreateAndBuildKernelFromBinary( + "build/opencl_base_vec_default/kernel_offline.nvptx.s", "mysgemmNT", + &clContext, &clDevice, &clProgram, &clKernel); + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //char clOptions[50]; - //sprintf(clOptions,""); + // char clOptions[50]; + // sprintf(clOptions,""); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); 
+ pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = 
clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); clStatus = clReleaseMemObject(dB); clStatus = clReleaseMemObject(dC); clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - + clStatus = clReleaseContext(clContext); + pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } pb_SwitchToTimer(&timers, pb_TimerID_NONE); double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool 
readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc index 5c75ac7367a284f8acd797393c13bc8384856bcb..744ee4096664e2f11620fae388a0a848a8cd49ac 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc @@ -6,80 +6,84 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <CL/cl.h> +#include <fstream> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <fstream> -#include 
<CL/cl.h> -#include <parboil.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_SZ 16 -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << ": " << clStatus << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - + // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_SZ) || (n%TILE_SZ)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - << "; n should be multiple of " << TILE_SZ << std::endl; + if ((m % TILE_SZ) || (n % TILE_SZ)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_SZ << "; n should be multiple of " << TILE_SZ + << std::endl; } - size_t db[2] = {TILE_SZ,TILE_SZ}; - size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + size_t db[2] = {TILE_SZ, TILE_SZ}; + size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; cl_int clStatus; - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - clStatus = clFinish(clCommandQueue); + clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct 
pb_TimerSet timers; @@ -89,33 +93,27 @@ int main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; - cl_uint numPlatforms; + cl_uint numPlatforms; clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; @@ -124,151 +122,163 @@ int main (int argc, char *argv[]) { char buffer[1000]; size_t bytes; - clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, buffer, &bytes); + clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, + buffer, &bytes); CHECK_ERROR("clGetPlatformInfo") printf("\nExtensions: %s\n", buffer); cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - //printf("Device id = %p\n", clDevice); - //cl_device_partition_property props[3]; - //props[0] = 
CL_DEVICE_PARTITION_EQUALLY; - //props[1] = 1; - //props[2] = 0; - //cl_device_id subdevice_id[8]; - //cl_uint num_entries = 8; - - //cl_uint numDevices; - //clCreateSubDevices(clDevice, props, num_entries, subdevice_id, &numDevices); - //printf("Num of devices = %d\n", numDevices); - //for(unsigned i =0 ; i< numDevices; i++) { - //printf("Subdevice id %d = %p\n", i, subdevice_id[i]); + // printf("Device id = %p\n", clDevice); + // cl_device_partition_property props[3]; + // props[0] = CL_DEVICE_PARTITION_EQUALLY; + // props[1] = 1; + // props[2] = 0; + // cl_device_id subdevice_id[8]; + // cl_uint num_entries = 8; + + // cl_uint numDevices; + // clCreateSubDevices(clDevice, props, num_entries, subdevice_id, + // &numDevices); printf("Num of devices = %d\n", numDevices); for(unsigned i + // =0 ; i< numDevices; i++) { printf("Subdevice id %d = %p\n", i, + // subdevice_id[i]); //} - //clDevice = subdevice_id[0]; - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - //cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); + // clDevice = subdevice_id[0]; + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + // cl_context clContext = + // clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); + cl_context clContext = + clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")}; - //const char* clSource[] = 
{readFile("kernel-spir-64-2.bc")}; - //size_t binarySize = 1112; - //std::cout << "Size of binary = " << binarySize << "\n"; - //cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, &binarySize, (const unsigned char**)clSource, NULL, &clStatus); - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //cl_kernel clKernel; - //cl_program clProgram; - //pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + const char *clSource[] = {readFile("src/opencl_base/kernel_offline.cl")}; + // const char* clSource[] = {readFile("kernel-spir-64-2.bc")}; + // size_t binarySize = 1112; + // std::cout << "Size of binary = " << binarySize << "\n"; + // cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, + // &binarySize, (const unsigned char**)clSource, NULL, &clStatus); + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + // cl_kernel clKernel; + // cl_program clProgram; + // pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", + // "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); cl_program + // clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,""); + sprintf(clOptions, ""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - //size_t binarySize; - //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, NULL); - //CHECK_ERROR("clGetProgramInfo") - //std::cout << "Binary Size = " << binarySize << "\n"; - //unsigned 
char* binary = (unsigned char*) malloc(binarySize*sizeof(unsigned char)); - //size_t returnSize = 0; - //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySize*sizeof(unsigned char) , &binary, &returnSize); - //CHECK_ERROR("clGetProgramInfo") - - //std::ofstream kernelfile; - //kernelfile.open ("kernel.o", std::ios::out | std::ios::binary); - //for(unsigned i=0; i<binarySize; i++) - //kernelfile << binary[i]; - //kernelfile.close(); - - //free(binary); - //std::cout << "Output binary\n"; - - - cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // size_t binarySize; + // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, + // sizeof(size_t), &binarySize, NULL); CHECK_ERROR("clGetProgramInfo") + // std::cout << "Binary Size = " << binarySize << "\n"; + // unsigned char* binary = (unsigned char*) malloc(binarySize*sizeof(unsigned + // char)); size_t returnSize = 0; clStatus = clGetProgramInfo(clProgram, + // CL_PROGRAM_BINARIES, binarySize*sizeof(unsigned char) , &binary, + // &returnSize); CHECK_ERROR("clGetProgramInfo") + + // std::ofstream kernelfile; + // kernelfile.open ("kernel.o", std::ios::out | std::ios::binary); + // for(unsigned i=0; i<binarySize; i++) + // kernelfile << binary[i]; + // kernelfile.close(); + + // free(binary); + // std::cout << "Output binary\n"; + + cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus); CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = 
clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + basicSgemm('N', 'T', 
matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); clStatus = clReleaseMemObject(dB); clStatus = clReleaseMemObject(dC); clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - + clStatus = clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ 
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc index 898295a6bf1f313279820dd5c8366a2a66f03ef9..45ed8e942a1a69475b75a63a24b70655f1ffa2aa 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc @@ -6,26 +6,28 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <CL/cl.h> +#include <fstream> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include 
<fstream> -#include <CL/cl.h> -#include <parboil.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes //#define TILE_SZ 16 @@ -33,65 +35,68 @@ extern char* readFile(const char*); #define TILE_N 8 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << ": " << clStatus << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, + pb_TimerSet &timers) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - + // In this code we assume the matrix sizes are multiple of tile size - //if ((m%TILE_SZ) || (n%TILE_SZ)) { - //std::cerr << "unsupported 
size of matrix. m should be multiple of " << TILE_SZ - //<< "; n should be multiple of " << TILE_SZ << std::endl; + // if ((m%TILE_SZ) || (n%TILE_SZ)) { + // std::cerr << "unsupported size of matrix. m should be multiple of " << + // TILE_SZ + //<< "; n should be multiple of " << TILE_SZ << std::endl; //} - //size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; - //size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + // size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; + // size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; cl_int clStatus; - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") 
pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int i=0; i<1; i++) { - - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + for (int i = 0; i < 1; i++) { + + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - clStatus = clFinish(clCommandQueue); + clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } pb_SwitchToTimer(&timers, pb_TimerID_NONE); } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -101,168 +106,172 @@ int main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; - cl_uint numPlatforms; + cl_uint numPlatforms; clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") 
- //char buffer[1000]; - //size_t bytes; - //clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, buffer, &bytes); - //CHECK_ERROR("clGetPlatformInfo") + // char buffer[1000]; + // size_t bytes; + // clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, + // buffer, &bytes); CHECK_ERROR("clGetPlatformInfo") - //printf("\nExtensions: %s\n", buffer); + // printf("\nExtensions: %s\n", buffer); cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - //printf("Device id = %p\n", clDevice); - //cl_device_partition_property props[3]; - //props[0] = CL_DEVICE_PARTITION_EQUALLY; - //props[1] = 1; - //props[2] = 0; - //cl_device_id subdevice_id[8]; - //cl_uint num_entries = 8; - - //cl_uint numDevices; - //clCreateSubDevices(clDevice, props, num_entries, subdevice_id, &numDevices); - //printf("Num of devices = %d\n", numDevices); - //for(unsigned i =0 ; i< numDevices; i++) { - //printf("Subdevice id %d = %p\n", i, subdevice_id[i]); + // printf("Device id = %p\n", clDevice); + // cl_device_partition_property props[3]; + // props[0] = CL_DEVICE_PARTITION_EQUALLY; + // props[1] = 1; + // props[2] = 0; + // cl_device_id subdevice_id[8]; + // cl_uint num_entries = 8; + + // cl_uint numDevices; + // clCreateSubDevices(clDevice, props, num_entries, subdevice_id, + // &numDevices); printf("Num of devices = %d\n", numDevices); for(unsigned i + // =0 ; i< numDevices; i++) { printf("Subdevice id %d = %p\n", i, + // subdevice_id[i]); //} - //clDevice = subdevice_id[0]; - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - //cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); + // clDevice = subdevice_id[0]; + + 
cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + // cl_context clContext = + // clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); + cl_context clContext = + clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - //size_t binarySize = 1112; - //std::cout << "Size of binary = " << binarySize << "\n"; - //cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, &binarySize, (const unsigned char**)clSource, NULL, &clStatus); - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // size_t binarySize = 1112; + // std::cout << "Size of binary = " << binarySize << "\n"; + // cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, + // &binarySize, (const unsigned char**)clSource, NULL, &clStatus); cl_program + // clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); cl_kernel clKernel; cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext, + &clDevice, &clProgram, &clKernel); CHECK_ERROR("Binary") - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - //sprintf(clOptions,""); - - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // 
CHECK_ERROR("clCreateProgramWithSource") + // char clOptions[50]; + // sprintf(clOptions,""); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus 
= clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); clStatus = clReleaseMemObject(dB); clStatus = clReleaseMemObject(dC); clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - + clStatus = clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, 
matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" 
<< std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc index 1d05e00de1b594071a3c58e816c6e31124140854..d8275be777079f1a57e585b3057685f737f38ed3 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc @@ -6,26 +6,28 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <CL/cl.h> +#include <fstream> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <fstream> -#include <CL/cl.h> -#include <parboil.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes //#define TILE_SZ 16 @@ -33,65 +35,68 @@ extern char* readFile(const char*); #define TILE_N 16 #define TILE_TB_HEIGHT 8 -#define TILE_M 
(TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << ": " << clStatus << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, + pb_TimerSet &timers) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - + // In this code we assume the matrix sizes are multiple of tile size - //if ((m%TILE_SZ) || (n%TILE_SZ)) { - //std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - //<< "; n should be multiple of " << TILE_SZ << std::endl; + // if ((m%TILE_SZ) || (n%TILE_SZ)) { + // std::cerr << "unsupported size of matrix. 
m should be multiple of " << + // TILE_SZ + //<< "; n should be multiple of " << TILE_SZ << std::endl; //} - //size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; - //size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + // size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; + // size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; cl_int clStatus; - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - //for(int i=0; i<15; i++) { - - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); - 
CHECK_ERROR("clEnqueueNDRangeKernel") + // pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + // for(int i=0; i<15; i++) { - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") //} - //pb_SwitchToTimer(&timers, pb_TimerID_NONE); + // pb_SwitchToTimer(&timers, pb_TimerID_NONE); } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -101,33 +106,27 @@ int main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; - cl_uint numPlatforms; + cl_uint numPlatforms; clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; @@ -135,102 +134,109 @@ int main (int argc, char *argv[]) { 
CHECK_ERROR("clGetPlatformIDs") cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + cl_context clContext = + clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,""); + sprintf(clOptions, ""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - - - cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus); CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + A_sz = 
matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + 
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); clStatus = clReleaseMemObject(dB); clStatus = clReleaseMemObject(dC); clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - + clStatus = clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" 
<< std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc index 6bd1d9b3e0b16ddb4054f4fdf14dc4aa7d544b19..b4e561ded6b82bf2b84aa4dbab2f5f4b5bceab7b 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc @@ -6,26 +6,28 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <CL/cl.h> +#include <fstream> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <fstream> -#include <CL/cl.h> -#include <parboil.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes //#define TILE_SZ 16 @@ -33,65 +35,68 @@ extern char* readFile(const char*); #define TILE_N 8 #define TILE_TB_HEIGHT 8 -#define TILE_M 
(TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << ": " << clStatus << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers ) -{ +void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, + pb_TimerSet &timers) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } - + if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } - + // In this code we assume the matrix sizes are multiple of tile size - //if ((m%TILE_SZ) || (n%TILE_SZ)) { - //std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - //<< "; n should be multiple of " << TILE_SZ << std::endl; + // if ((m%TILE_SZ) || (n%TILE_SZ)) { + // std::cerr << "unsupported size of matrix. 
m should be multiple of " << + // TILE_SZ + //<< "; n should be multiple of " << TILE_SZ << std::endl; //} - //size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; - //size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + // size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; + // size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; cl_int clStatus; - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - //for(int i=0; i<15; i++) { - - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); - 
CHECK_ERROR("clEnqueueNDRangeKernel") + // for(int i=0; i<15; i++) { - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") //} pb_SwitchToTimer(&timers, pb_TimerID_NONE); } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -101,168 +106,172 @@ int main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; - cl_uint numPlatforms; + cl_uint numPlatforms; clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - //char buffer[1000]; - //size_t bytes; - //clStatus = clGetPlatformInfo(clPlatform[1], 
CL_PLATFORM_EXTENSIONS, 1000, buffer, &bytes); - //CHECK_ERROR("clGetPlatformInfo") + // char buffer[1000]; + // size_t bytes; + // clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, + // buffer, &bytes); CHECK_ERROR("clGetPlatformInfo") - //printf("\nExtensions: %s\n", buffer); + // printf("\nExtensions: %s\n", buffer); cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - //printf("Device id = %p\n", clDevice); - //cl_device_partition_property props[3]; - //props[0] = CL_DEVICE_PARTITION_EQUALLY; - //props[1] = 1; - //props[2] = 0; - //cl_device_id subdevice_id[8]; - //cl_uint num_entries = 8; - - //cl_uint numDevices; - //clCreateSubDevices(clDevice, props, num_entries, subdevice_id, &numDevices); - //printf("Num of devices = %d\n", numDevices); - //for(unsigned i =0 ; i< numDevices; i++) { - //printf("Subdevice id %d = %p\n", i, subdevice_id[i]); + // printf("Device id = %p\n", clDevice); + // cl_device_partition_property props[3]; + // props[0] = CL_DEVICE_PARTITION_EQUALLY; + // props[1] = 1; + // props[2] = 0; + // cl_device_id subdevice_id[8]; + // cl_uint num_entries = 8; + + // cl_uint numDevices; + // clCreateSubDevices(clDevice, props, num_entries, subdevice_id, + // &numDevices); printf("Num of devices = %d\n", numDevices); for(unsigned i + // =0 ; i< numDevices; i++) { printf("Subdevice id %d = %p\n", i, + // subdevice_id[i]); //} - //clDevice = subdevice_id[0]; - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - //cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); + // clDevice = subdevice_id[0]; + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + 
(cl_context_properties)clPlatform[1], 0}; + // cl_context clContext = + // clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); + cl_context clContext = + clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - //size_t binarySize = 1112; - //std::cout << "Size of binary = " << binarySize << "\n"; - //cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, &binarySize, (const unsigned char**)clSource, NULL, &clStatus); - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // size_t binarySize = 1112; + // std::cout << "Size of binary = " << binarySize << "\n"; + // cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, + // &binarySize, (const unsigned char**)clSource, NULL, &clStatus); cl_program + // clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); cl_kernel clKernel; cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext, + &clDevice, &clProgram, &clKernel); CHECK_ERROR("Binary") - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - //sprintf(clOptions,""); - - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") + // char clOptions[50]; + // 
sprintf(clOptions,""); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); clStatus = clReleaseMemObject(dB); clStatus = clReleaseMemObject(dC); clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - + clStatus = clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, 
matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" 
<< std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc index 06d92225dc53780988fd2412cd8423ec0b9c1795..8de437a4f8935d5746dbcfbbe5345e0e66ae484a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc @@ -6,42 +6,45 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> #include <CL/cl.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 16 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: 
"<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, + cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; exit(1); @@ -53,37 +56,37 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c } // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; exit(1); } - - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; + + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - + clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct 
pb_TimerSet timers; @@ -93,133 +96,139 @@ main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - printf("%s\n",params->inpFiles[0]); - printf("%s\n",params->inpFiles[1]); - printf("%s\n",params->inpFiles[2]); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + printf("%s\n", params->inpFiles[0]); + printf("%s\n", params->inpFiles[1]); + printf("%s\n", params->inpFiles[2]); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); cl_int clStatus; - cl_uint numPlatforms; + cl_uint numPlatforms; clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL); + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_context_properties clCps[3] = 
{CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + cl_context clContext = + clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") - + pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_cpu_sm/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_cpu_sm/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); + sprintf(clOptions, "-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d", TILE_N, + TILE_TB_HEIGHT, TILE_M); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus); CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = 
matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + std::vector<float> matC(matArow * matBcol); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - + // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); // Use standard sgemm interface - regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, 
matArow,clKernel,clCommandQueue); + regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); clReleaseMemObject(dB); clReleaseMemObject(dC); clReleaseCommandQueue(clCommandQueue); - clReleaseContext(clContext); - - pb_SwitchToTimer( &timers, pb_TimerID_NONE ); + clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); if (params->outFile) { /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - free((void*)clSource[0]); + free((void *)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ 
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc index fce77fed80666703863adc1a0e62ee9df6520e94..06f5da5c319811ebfc5aa8937559219b2feed625 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc @@ -6,42 +6,45 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> #include <CL/cl.h> // I/O routines 
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 16 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, + cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; exit(1); @@ -53,37 +56,37 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c } // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; exit(1); } - - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; + + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - + clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct 
pb_TimerSet timers; @@ -93,120 +96,126 @@ main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - printf("%s\n",params->inpFiles[0]); - printf("%s\n",params->inpFiles[1]); - printf("%s\n",params->inpFiles[2]); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + printf("%s\n", params->inpFiles[0]); + printf("%s\n", params->inpFiles[1]); + printf("%s\n", params->inpFiles[2]); + exit(-1); + } cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, 
CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") - + pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); + sprintf(clOptions, "-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d", TILE_N, + TILE_TB_HEIGHT, TILE_M); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus); CHECK_ERROR("clCreateKernel") /* Read in data */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); 
- cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - + // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); // Use standard sgemm interface - regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue); + regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + + 
pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); if (params->outFile) { /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } pb_FreeParameters(params); double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; - free((void*)clSource[0]); + free((void *)clSource[0]); clReleaseKernel(clKernel); clReleaseProgram(clProgram); @@ -214,7 +223,7 @@ main (int argc, char *argv[]) { clReleaseMemObject(dB); clReleaseMemObject(dC); clReleaseCommandQueue(clCommandQueue); - clReleaseContext(clContext); - + clReleaseContext(clContext); + return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if 
(!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc index 6949f86cafa3995db296b4e4486b4709020fb3e4..b22ebd8804bdb1204c42e2859aab69209dc77e4c 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc @@ -6,42 +6,45 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> #include <CL/cl.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 16 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error! 
Errorcode = "<< clStatus <<"\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, + cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; exit(1); @@ -53,38 +56,38 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c } // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; exit(1); } - - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; - //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); + + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; + // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - + clStatus = clFinish(clCommandQueue); 
CHECK_ERROR("clFinish") } -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -94,142 +97,145 @@ main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - printf("%s\n",params->inpFiles[0]); - printf("%s\n",params->inpFiles[1]); - printf("%s\n",params->inpFiles[2]); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + printf("%s\n", params->inpFiles[0]); + printf("%s\n", params->inpFiles[1]); + printf("%s\n", params->inpFiles[2]); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // copy A to device memory // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + 
(cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") - + pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_opt_8/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_opt_8/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); + sprintf(clOptions, "-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d", TILE_N, + TILE_TB_HEIGHT, TILE_M); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus); CHECK_ERROR("clCreateKernel") - //cl_kernel clKernel; - //cl_program clProgram; - //pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + // cl_kernel clKernel; + // cl_program clProgram; + // pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_default/kernel.nvptx.s", + // 
"mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - for(size_t i=0;i<matC.size();i++) - matC[i] = 0.0f; - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + for (size_t i = 0; i < matC.size(); i++) + matC[i] = 0.0f; + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - + // Copy A and B^T into device memory - //std::cout << "Copying " << A_sz << " bytes of data to device\n"; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + // std::cout << "Copying " << A_sz << " bytes of data to device\n"; + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - //std::cout << "Copying " << B_sz << " bytes of data to device\n"; - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + // std::cout << "Copying " << B_sz << " bytes of data to device\n"; + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - - //std::cout << "Copying " << C_sz << " bytes of data to device\n"; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + // std::cout << "Copying " << C_sz << " bytes of data to device\n"; + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue); + regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); clReleaseMemObject(dB); clReleaseMemObject(dC); clReleaseCommandQueue(clCommandQueue); - clReleaseContext(clContext); - - + clReleaseContext(clContext); + if (params->outFile) { - + /* Write C to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime 
= pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" 
<< std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc index 4b385c7b57c87016500d63c5045f63894f7be347..a7cb9793e8c1ec991d5a3f3cd1676f7a88ff8e26 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc @@ -6,42 +6,45 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> #include <CL/cl.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 8 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error! 
Errorcode = "<< clStatus <<"\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers ) -{ +void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, + pb_TimerSet &timers) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; exit(1); @@ -53,42 +56,41 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c } // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; exit(1); } - - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; - //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); + + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; + // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int i=0; i<4; i++) { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + for (int i = 0; i < 4; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, 
dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } pb_SwitchToTimer(&timers, pb_TimerID_NONE); - } -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -98,136 +100,142 @@ main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - printf("%s\n",params->inpFiles[0]); - printf("%s\n",params->inpFiles[1]); - printf("%s\n",params->inpFiles[2]); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + printf("%s\n", params->inpFiles[0]); + printf("%s\n", params->inpFiles[1]); + printf("%s\n", params->inpFiles[2]); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = 
{CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") - + pb_SetOpenCL(&clContext, &clCommandQueue); - //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //char clOptions[50]; - //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); + // char clOptions[50]; + // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D + // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel = 
clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") cl_kernel clKernel; cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_4K_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + pb_CreateAndBuildKernelFromBinary( + "build/opencl_opt_8_4K_default/kernel.nvptx.s", "mysgemmNT", &clContext, + &clDevice, &clProgram, &clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - + // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); 
CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue, timers); + regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); - - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); clReleaseMemObject(dB); clReleaseMemObject(dC); clReleaseCommandQueue(clCommandQueue); - clReleaseContext(clContext); - + clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" 
<< std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc index f8fc432c7515d3c4051ad3a0c95915e013fffb32..713fd9e88966f885919bfba7df3bb0386c815f9a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc @@ -6,42 +6,45 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> #include <CL/cl.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 8 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error! 
Errorcode = "<< clStatus <<"\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers ) -{ +void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, + pb_TimerSet &timers) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; exit(1); @@ -53,42 +56,41 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c } // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; exit(1); } - - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; - //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); + + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; + // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int i=0; i<200; i++) { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + for (int i = 0; i < 200; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, 
NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } pb_SwitchToTimer(&timers, pb_TimerID_NONE); - } -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -98,136 +100,142 @@ main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - printf("%s\n",params->inpFiles[0]); - printf("%s\n",params->inpFiles[1]); - printf("%s\n",params->inpFiles[2]); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + printf("%s\n", params->inpFiles[0]); + printf("%s\n", params->inpFiles[1]); + printf("%s\n", params->inpFiles[2]); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = 
{CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") - + pb_SetOpenCL(&clContext, &clCommandQueue); - //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //char clOptions[50]; - //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); + // char clOptions[50]; + // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D + // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel = 
clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") cl_kernel clKernel; cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_medium_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + pb_CreateAndBuildKernelFromBinary( + "build/opencl_opt_8_medium_default/kernel.nvptx.s", "mysgemmNT", + &clContext, &clDevice, &clProgram, &clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - + // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); 
CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue, timers); + regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); - - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); clReleaseMemObject(dB); clReleaseMemObject(dC); clReleaseCommandQueue(clCommandQueue); - clReleaseContext(clContext); - + clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } - -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ 
-59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc index dfab43744f12bf754473b14569c7019c22b55888..7d5d75c53341060d5d61e21ffdd4d8123aa019a9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc @@ -6,42 +6,45 @@ *cr ***************************************************************************/ -/* +/* * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <parboil.h> -#include <iostream> 
#include <CL/cl.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 8 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error! Errorcode = "<< clStatus <<"\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) -{ +void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, + cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, + int ldc, cl_kernel clKernel, + cl_command_queue clCommandQueue) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; exit(1); @@ -53,38 +56,38 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c } // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. 
m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; exit(1); } - - size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - size_t db[2] = {TILE_N,TILE_TB_HEIGHT}; - //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); + + size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; + size_t db[2] = {TILE_N, TILE_TB_HEIGHT}; + // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]); cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); - clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); - clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); - clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A); + clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k); + clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha); + clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + 
clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, + 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") - + clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int -main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { struct pb_Parameters *params; struct pb_TimerSet timers; @@ -94,136 +97,142 @@ main (int argc, char *argv[]) { int matBrow, matBcol; std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T + /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - printf("%s\n",params->inpFiles[0]); - printf("%s\n",params->inpFiles[1]); - printf("%s\n",params->inpFiles[2]); - exit(-1); - } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + printf("%s\n", params->inpFiles[0]); + printf("%s\n", params->inpFiles[1]); + printf("%s\n", params->inpFiles[2]); + exit(-1); + } /* Read in data */ // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); + A_sz = matArow * matAcol * sizeof(float); // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - 
cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") - + pb_SetOpenCL(&clContext, &clCommandQueue); - //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") + // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") - //char clOptions[50]; - //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); + // char clOptions[50]; + // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D + // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") - //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); - //CHECK_ERROR("clCreateKernel") + // cl_kernel clKernel 
= clCreateKernel(clProgram,"mysgemmNT",&clStatus); + // CHECK_ERROR("clCreateKernel") cl_kernel clKernel; cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_vec_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + pb_CreateAndBuildKernelFromBinary( + "build/opencl_opt_8_vec_default/kernel.nvptx.s", "mysgemmNT", &clContext, + &clDevice, &clProgram, &clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + B_sz = matBrow * matBcol * sizeof(float); // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + C_sz = matArow * matBcol * sizeof(float); // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + std::vector<float> matC(matArow * matBcol); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + cl_mem dA = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + cl_mem dB = + clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + cl_mem dC = + clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - + // Copy A and B^T into device memory - clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz, + &matA.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz, + &matBT.front(), 0, NULL, NULL); 
CHECK_ERROR("clEnqueueWriteBuffer") - for(int i=0;i<matC.size();i++) - matC[i] = 0.0f; + for (int i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, + &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); // Use standard sgemm interface - regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue); + regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, + matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); - - pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, + NULL, NULL); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); clReleaseMemObject(dB); clReleaseMemObject(dC); clReleaseCommandQueue(clCommandQueue); - clReleaseContext(clContext); - + clReleaseContext(clContext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - + if (params->outFile) { - + /* Write C to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; pb_FreeParameters(params); - //free((void*)clSource[0]); + // free((void*)clSource[0]); - return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const 
char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc index af9ee76e0fed3ced9e2666193afbd7c0631f1ce8..627f5a82412374cff4a9061620ce1f27ea3c14a6 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc @@ -10,272 +10,281 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <parboil.h> #include <visc.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, 
int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_SZ 16 -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } typedef struct __attribute__((__packed__)) { - float* A; size_t bytes_A; - int lda; - float* B; size_t bytes_B; - int ldb; - float* C; size_t bytes_C; - int ldc; - int k; float alpha; float beta; - size_t dim_X1, dim_Y1, dim_X2, dim_Y2; + float *A; + size_t bytes_A; + int lda; + float *B; + size_t bytes_B; + int ldb; + float *C; + size_t bytes_C; + int ldc; + int k; + float alpha; + float beta; + size_t dim_X1, dim_Y1, dim_X2, dim_Y2; } RootIn; -void mysgemmNT( - float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C, - int ldc, int k, float alpha, float beta -) { - __visc__hint(visc::DEVICE); - __visc__attributes(3, A, B, C, 1, C); - - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int ly = __visc__getNodeInstanceID_y(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); - int gridx = __visc__getNumNodeInstances_x(thisNode); - int gridy = __visc__getNumNodeInstances_y(thisNode); - int m = gx * gridx + lx; - int n = gy * gridy + ly; - - float c = 0.0f; - for (int i = 0; i < k; ++i) { - float a = A[m + i * lda]; - float b = B[n + i * ldb]; - c += a * b; - } - C[m+n*ldc] = C[m+n*ldc] * beta + 
alpha * c; +void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, + int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, + float beta) { + __visc__hint(visc::DEVICE); + __visc__attributes(3, A, B, C, 1, C); + + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int ly = __visc__getNodeInstanceID_y(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); + int gridx = __visc__getNumNodeInstances_x(thisNode); + int gridy = __visc__getNumNodeInstances_y(thisNode); + int m = gx * gridx + lx; + int n = gy * gridy + ly; + + float c = 0.0f; + for (int i = 0; i < k; ++i) { + float a = A[m + i * lda]; + float b = B[n + i * ldb]; + c += a * b; + } + C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c; } -void basicSgemmLvl1( - float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C, int ldc, - int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1 -) { - __visc__hint(visc::DEVICE); - __visc__attributes(3, A, B, C, 1, C); - void* sgemm_node = __visc__createNodeND(2, mysgemmNT, (size_t) dim_X1, (size_t) dim_Y1); - __visc__bindIn(sgemm_node, 0, 0, 0); - __visc__bindIn(sgemm_node, 1, 1, 0); - __visc__bindIn(sgemm_node, 2, 2, 0); - __visc__bindIn(sgemm_node, 3, 3, 0); - __visc__bindIn(sgemm_node, 4, 4, 0); - __visc__bindIn(sgemm_node, 5, 5, 0); - __visc__bindIn(sgemm_node, 6, 6, 0); - __visc__bindIn(sgemm_node, 7, 7, 0); - __visc__bindIn(sgemm_node, 8, 8, 0); - __visc__bindIn(sgemm_node, 9, 9, 0); - __visc__bindIn(sgemm_node, 10, 10, 0); - __visc__bindIn(sgemm_node, 11, 11, 0); +void basicSgemmLvl1(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, + int ldb, float *C, size_t bytes_C, int ldc, int k, + float alpha, float beta, size_t dim_X1, size_t dim_Y1) { + __visc__hint(visc::DEVICE); + __visc__attributes(3, A, B, C, 1, C); + void *sgemm_node = 
+ __visc__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1); + __visc__bindIn(sgemm_node, 0, 0, 0); + __visc__bindIn(sgemm_node, 1, 1, 0); + __visc__bindIn(sgemm_node, 2, 2, 0); + __visc__bindIn(sgemm_node, 3, 3, 0); + __visc__bindIn(sgemm_node, 4, 4, 0); + __visc__bindIn(sgemm_node, 5, 5, 0); + __visc__bindIn(sgemm_node, 6, 6, 0); + __visc__bindIn(sgemm_node, 7, 7, 0); + __visc__bindIn(sgemm_node, 8, 8, 0); + __visc__bindIn(sgemm_node, 9, 9, 0); + __visc__bindIn(sgemm_node, 10, 10, 0); + __visc__bindIn(sgemm_node, 11, 11, 0); } -void basicSgemmLvl2( - float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C, int ldc, - int k, float alpha, float beta, - size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2 -) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void* sgemm_node = __visc__createNodeND(2, basicSgemmLvl1, (size_t) dim_X2, (size_t) dim_Y2); - __visc__bindIn(sgemm_node, 0, 0, 0); - __visc__bindIn(sgemm_node, 1, 1, 0); - __visc__bindIn(sgemm_node, 2, 2, 0); - __visc__bindIn(sgemm_node, 3, 3, 0); - __visc__bindIn(sgemm_node, 4, 4, 0); - __visc__bindIn(sgemm_node, 5, 5, 0); - __visc__bindIn(sgemm_node, 6, 6, 0); - __visc__bindIn(sgemm_node, 7, 7, 0); - __visc__bindIn(sgemm_node, 8, 8, 0); - __visc__bindIn(sgemm_node, 9, 9, 0); - __visc__bindIn(sgemm_node, 10, 10, 0); - __visc__bindIn(sgemm_node, 11, 11, 0); - __visc__bindIn(sgemm_node, 12, 12, 0); - __visc__bindIn(sgemm_node, 13, 13, 0); +void basicSgemmLvl2(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, + int ldb, float *C, size_t bytes_C, int ldc, int k, + float alpha, float beta, size_t dim_X1, size_t dim_Y1, + size_t dim_X2, size_t dim_Y2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *sgemm_node = + __visc__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2); + __visc__bindIn(sgemm_node, 0, 0, 0); + __visc__bindIn(sgemm_node, 1, 1, 0); + __visc__bindIn(sgemm_node, 
2, 2, 0); + __visc__bindIn(sgemm_node, 3, 3, 0); + __visc__bindIn(sgemm_node, 4, 4, 0); + __visc__bindIn(sgemm_node, 5, 5, 0); + __visc__bindIn(sgemm_node, 6, 6, 0); + __visc__bindIn(sgemm_node, 7, 7, 0); + __visc__bindIn(sgemm_node, 8, 8, 0); + __visc__bindIn(sgemm_node, 9, 9, 0); + __visc__bindIn(sgemm_node, 10, 10, 0); + __visc__bindIn(sgemm_node, 11, 11, 0); + __visc__bindIn(sgemm_node, 12, 12, 0); + __visc__bindIn(sgemm_node, 13, 13, 0); } // A wrapper level used in codegen for some backends -void basicSgemmLvl3( - float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C, int ldc, - int k, float alpha, float beta, - size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2 -) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void* sgemm_node = __visc__createNodeND(0, basicSgemmLvl2); - __visc__bindIn(sgemm_node, 0, 0, 0); - __visc__bindIn(sgemm_node, 1, 1, 0); - __visc__bindIn(sgemm_node, 2, 2, 0); - __visc__bindIn(sgemm_node, 3, 3, 0); - __visc__bindIn(sgemm_node, 4, 4, 0); - __visc__bindIn(sgemm_node, 5, 5, 0); - __visc__bindIn(sgemm_node, 6, 6, 0); - __visc__bindIn(sgemm_node, 7, 7, 0); - __visc__bindIn(sgemm_node, 8, 8, 0); - __visc__bindIn(sgemm_node, 9, 9, 0); - __visc__bindIn(sgemm_node, 10, 10, 0); - __visc__bindIn(sgemm_node, 11, 11, 0); - __visc__bindIn(sgemm_node, 12, 12, 0); - __visc__bindIn(sgemm_node, 13, 13, 0); - __visc__bindIn(sgemm_node, 14, 14, 0); - __visc__bindIn(sgemm_node, 15, 15, 0); +void basicSgemmLvl3(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, + int ldb, float *C, size_t bytes_C, int ldc, int k, + float alpha, float beta, size_t dim_X1, size_t dim_Y1, + size_t dim_X2, size_t dim_Y2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *sgemm_node = __visc__createNodeND(0, basicSgemmLvl2); + __visc__bindIn(sgemm_node, 0, 0, 0); + __visc__bindIn(sgemm_node, 1, 1, 0); + __visc__bindIn(sgemm_node, 2, 2, 0); + 
__visc__bindIn(sgemm_node, 3, 3, 0); + __visc__bindIn(sgemm_node, 4, 4, 0); + __visc__bindIn(sgemm_node, 5, 5, 0); + __visc__bindIn(sgemm_node, 6, 6, 0); + __visc__bindIn(sgemm_node, 7, 7, 0); + __visc__bindIn(sgemm_node, 8, 8, 0); + __visc__bindIn(sgemm_node, 9, 9, 0); + __visc__bindIn(sgemm_node, 10, 10, 0); + __visc__bindIn(sgemm_node, 11, 11, 0); + __visc__bindIn(sgemm_node, 12, 12, 0); + __visc__bindIn(sgemm_node, 13, 13, 0); + __visc__bindIn(sgemm_node, 14, 14, 0); + __visc__bindIn(sgemm_node, 15, 15, 0); } -__attribute__((noinline)) void basicSgemm( - char transa, char transb, int m, int n, int k, float alpha, - float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, - float* C, size_t bytesC, int ldc -) { - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; - return; - } - - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } - - // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_SZ) || (n%TILE_SZ)) { - std::cerr << "unsupported size of matrix. 
m should be multiple of " << TILE_SZ - << "; n should be multiple of " << TILE_SZ << std::endl; - } - - size_t db[2] = {TILE_SZ,TILE_SZ}, dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - - void *root_in = malloc(sizeof(RootIn)); - RootIn root_in_local = { - A, bytesA, lda, - B, bytesB, ldb, - C, bytesC, ldc, - k, alpha, beta, - db[0], db[1], dg[0]/db[0], dg[1]/db[1] - }; - *(RootIn *)root_in = root_in_local; - void* sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in); - __visc__wait(sgemmDFG); -} +__attribute__((noinline)) void basicSgemm(char transa, char transb, int m, + int n, int k, float alpha, float *A, + size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float beta, + float *C, size_t bytesC, int ldc) { + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } -int main (int argc, char *argv[]) { + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } - struct pb_Parameters *params; - struct pb_TimerSet timers; + // In this code we assume the matrix sizes are multiple of tile size + if ((m % TILE_SZ) || (n % TILE_SZ)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_SZ << "; n should be multiple of " << TILE_SZ + << std::endl; + } + + size_t db[2] = {TILE_SZ, TILE_SZ}, + dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; + + void *root_in = malloc(sizeof(RootIn)); + RootIn root_in_local = {A, + bytesA, + lda, + B, + bytesB, + ldb, + C, + bytesC, + ldc, + k, + alpha, + beta, + db[0], + db[1], + dg[0] / db[0], + dg[1] / db[1]}; + *(RootIn *)root_in = root_in_local; + void *sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in); + __visc__wait(sgemmDFG); +} - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; +int main(int argc, char *argv[]) { + struct pb_Parameters *params; + struct pb_TimerSet timers; - /* Read command line. Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; - unsigned iter = 0; - while(params->inpFiles[iter] != NULL) { - printf("Found input file %d - %s\n", iter, params->inpFiles[iter]); - iter++; - } - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - printf("Expecting three input filenames\n"); - exit(-1); - return 0; - } + /* Read command line. 
Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); - /* Read in data */ - // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + unsigned iter = 0; + while (params->inpFiles[iter] != NULL) { + printf("Found input file %d - %s\n", iter, params->inpFiles[iter]); + iter++; + } + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + printf("Expecting three input filenames\n"); + exit(-1); + return 0; + } - printf("This is in between two reads\n"); - // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + /* Read in data */ + // load A + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - pb_InitializeTimerSet(&timers); - __visc__init(); + printf("This is in between two reads\n"); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + pb_InitializeTimerSet(&timers); + __visc__init(); - // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // copy A to device memory + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); - // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + // allocate space for C + C_sz = matArow * matBcol * sizeof(float); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); - // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // OpenCL memory allocation + std::vector<float> matC(matArow * matBcol); - for(size_t i=0; i<matC.size(); i++) - matC[i] = 0.0f; + llvm_visc_track_mem(&matA.front(), A_sz); + 
llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer( &timers, pb_TimerID_NONE ); + for (size_t i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - llvm_visc_request_mem(&matC.front(), C_sz); + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz, + matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, + matArow); - pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - __visc__cleanup(); + pb_PrintTimerSet(&timers); + __visc__cleanup(); - if (params->outFile) { + if (params->outFile) { - /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); - } + /* Write C to file */ + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; - pb_FreeParameters(params); + double GPUtime = 
pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; + pb_FreeParameters(params); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" 
<< std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." 
<< std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc index 96ffcaddddbcddcd3b75903a23dbfc6c944a8cbf..62f9285e8a8054e5597fe45adc5257470b147622 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc @@ -10,178 +10,177 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <parboil.h> #include <visc.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 16 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ 
+ if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta ) -{ - __visc__hint(visc::GPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); +void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, + float alpha, float beta) { + __visc__hint(visc::GPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); - float c[TILE_N]; - for (int i=0; i < TILE_N; i++) - c[i] = 0.0f; - - int mid = get_local_id(1)*get_local_size(0)+get_local_id(0); - int m = get_group_id(0) * TILE_M + mid; + float c[TILE_N]; + for (int i = 0; i < TILE_N; i++) + c[i] = 0.0f; - int b_base = 0; + int mid = get_local_id(1) * get_local_size(0) + get_local_id(0); + int m = get_group_id(0) * TILE_M + mid; - for (int i = 0; i < k; i+=TILE_TB_HEIGHT) { - float a; - b_base = get_group_id(1) * TILE_N + i * ldb; + int b_base = 0; - for (int j = 0; j < TILE_TB_HEIGHT; j++) { - a = A[m + (i+j)*lda]; - for (int kk = 0; kk < TILE_N; kk++) - c[kk] += a * B[b_base + j * ldb + kk]; + for (int i = 0; i < k; i += TILE_TB_HEIGHT) { + float a; + b_base = get_group_id(1) * TILE_N + i * ldb; - } - } - int t = ldc * get_group_id(1) * TILE_N + m; - for (int i = 0; i < TILE_N; i++) { - C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i]; + for (int j = 0; j < TILE_TB_HEIGHT; j++) { + a = A[m + (i + j) * lda]; + for (int kk = 0; kk < TILE_N; kk++) + c[kk] += a * B[b_base + j * ldb + kk]; } + } + int t = ldc * get_group_id(1) * TILE_N + m; + for (int i = 0; i < TILE_N; i++) { + C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i]; + } } -__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) -{ - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value 
of 'transa' in regtileSgemm()" << std::endl; - return; - } +__attribute__((noinline)) void basicSgemm(char transa, char transb, int m, + int n, int k, float alpha, float *A, + size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float beta, + float *C, size_t bytesC, int ldc) { + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } - // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; - return; - } + // In this code we assume the matrix sizes are multiple of tile size + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; + return; + } - unsigned db[2] = {TILE_N,TILE_TB_HEIGHT}; -// unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N}; + unsigned db[2] = {TILE_N, TILE_TB_HEIGHT}; + // unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; + unsigned dg[2] = {m * db[0] / TILE_M, n * db[1] / TILE_N}; - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, + ldb, C, bytesC, ldc, k, alpha, beta, 0); + __visc__wait(sgemmDFG); } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { - struct pb_Parameters *params; - struct pb_TimerSet timers; + struct pb_Parameters *params; + struct pb_TimerSet timers; - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; + /* Read command line. Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } - /* Read command line. 
Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } - - /* Read in data */ - // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); - - // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + /* Read in data */ + // load A + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - pb_InitializeTimerSet(&timers); - __visc__init(); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + pb_InitializeTimerSet(&timers); + __visc__init(); - // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // copy A to device memory + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); - // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + // allocate space for C + C_sz = matArow * matBcol * sizeof(float); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); - // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // OpenCL memory allocation + std::vector<float> matC(matArow * matBcol); - for(size_t i=0; i<matC.size(); i++) - matC[i] = 0.0f; + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer( &timers, pb_TimerID_NONE ); + for (size_t i = 0; i < 
matC.size(); i++) + matC[i] = 0.0f; - // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - llvm_visc_request_mem(&matC.front(), C_sz); + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz, + matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, + matArow); - pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - __visc__cleanup(); + pb_PrintTimerSet(&timers); + __visc__cleanup(); - if (params->outFile) { + if (params->outFile) { - /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); - } + /* Write C to file */ + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; - pb_FreeParameters(params); + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; + pb_FreeParameters(params); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, 
std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc index 16f2341a2203e3510b9c00a91eedd3ac53d296d4..05d143b5884164926213ca060da341a254399bf3 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc @@ -10,377 +10,341 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <parboil.h> #include <visc.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, 
std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 16 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define TILE_M (TILE_N * TILE_TB_HEIGHT) + +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } typedef struct __attribute__((__packed__)) { - float *A; - size_t bytesA; - int lda; - float *B; - size_t bytesB; - int ldb; - float *C; - size_t bytesC; - int ldc; - int k; - float alpha; - float beta; - long block_x; - long block_y; - long grid_x; - long grid_y; -} -RootIn; - -void packData(RootIn* args, - float *A, size_t bytesA, - int lda, - float *B, size_t bytesB, - int ldb, - float *C, size_t bytesC, - int ldc, - int k, - float alpha, - float beta, - long block_x, - long block_y, - long grid_x, + float *A; + size_t bytesA; + int lda; + float *B; + size_t bytesB; + int ldb; + float *C; + size_t bytesC; + int ldc; + int k; + float alpha; + float beta; + long block_x; + long block_y; + long grid_x; + long grid_y; +} RootIn; + +void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k, + float alpha, float beta, long block_x, long block_y, long grid_x, long grid_y) { - args->A = A; - args->bytesA = bytesA; - args->lda = lda; - args->B = B; - args->bytesB = bytesB; - args->ldb = ldb; - args->C = C; - args->bytesC = bytesC; - args->ldc = ldc; - args->k = k; - args->alpha = alpha; - args->beta = beta; - 
args->block_x = block_x; - args->block_y = block_y; - args->grid_x = grid_x; - args->grid_y = grid_y; + args->A = A; + args->bytesA = bytesA; + args->lda = lda; + args->B = B; + args->bytesB = bytesB; + args->ldb = ldb; + args->C = C; + args->bytesC = bytesC; + args->ldc = ldc; + args->k = k; + args->alpha = alpha; + args->beta = beta; + args->block_x = block_x; + args->block_y = block_y; + args->grid_x = grid_x; + args->grid_y = grid_y; } void Allocation(long block_x, long block_y) { - void* shB = __visc__malloc(block_x*block_y*sizeof(float)); - __visc__return(2, shB, block_x*block_y*sizeof(float)); + void *shB = __visc__malloc(block_x * block_y * sizeof(float)); + __visc__return(2, shB, block_x * block_y * sizeof(float)); } +void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, + int ldb, float *C, size_t bytesC, int ldc, int k, float alpha, + float beta, float *shB, size_t bytesshB) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::SPIR_TARGET); + //__visc__hint(visc::GPU_TARGET); -void SgemmLeaf( float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float* C, size_t bytesC, int ldc, int k, float alpha, float beta, float* shB, size_t bytesshB) { - __visc__hint(visc::DEVICE); - //__visc__hint(visc::SPIR_TARGET); - //__visc__hint(visc::GPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); - __visc__attributes(3, A, B, C, 1, C); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); + long lx = __visc__getNodeInstanceID_x(thisNode); + long ly = __visc__getNodeInstanceID_y(thisNode); - long lx = __visc__getNodeInstanceID_x(thisNode); - long ly = __visc__getNodeInstanceID_y(thisNode); + long gx = __visc__getNodeInstanceID_x(parentNode); + long gy = __visc__getNodeInstanceID_y(parentNode); - long gx = __visc__getNodeInstanceID_x(parentNode); - long gy = __visc__getNodeInstanceID_y(parentNode); + long 
dimx = __visc__getNumNodeInstances_x(thisNode); - long dimx = __visc__getNumNodeInstances_x(thisNode); + float c[TILE_N]; + for (int i = 0; i < TILE_N; i++) + c[i] = 0.0f; - float c[TILE_N]; - for (int i=0; i < TILE_N; i++) - c[i] = 0.0f; + int mid = ly * dimx + lx; + int m = gx * TILE_M + mid; + int n = gy * TILE_N + lx; - int mid = ly*dimx+lx; - int m = gx * TILE_M + mid; - int n = gy * TILE_N + lx; + for (int i = 0; i < k; i += TILE_TB_HEIGHT) { + float a; + // shB[ly][lx] = B[n+(i+ly)*ldb]; + shB[ly * dimx + lx] = B[n + (i + ly) * ldb]; - for (int i = 0; i < k; i+=TILE_TB_HEIGHT) { - float a; - //shB[ly][lx] = B[n+(i+ly)*ldb]; - shB[ly*dimx+lx] = B[n+(i+ly)*ldb]; - - __visc__barrier(); - for (int j = 0; j < TILE_TB_HEIGHT; j++) { - a = A[m + (i+j)*lda]; - for (int kk = 0; kk < TILE_N; kk++) { - //c[kk] += a * shB[j][kk]; - c[kk] += a * shB[j*dimx+kk]; - } - } - __visc__barrier(); + __visc__barrier(); + for (int j = 0; j < TILE_TB_HEIGHT; j++) { + a = A[m + (i + j) * lda]; + for (int kk = 0; kk < TILE_N; kk++) { + // c[kk] += a * shB[j][kk]; + c[kk] += a * shB[j * dimx + kk]; + } } + __visc__barrier(); + } - int t = ldc * gy * TILE_N + m; - for (int i = 0; i < TILE_N; i++) { - C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i]; - } + int t = ldc * gy * TILE_N + m; + for (int i = 0; i < TILE_N; i++) { + C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i]; + } } // Work group node for sgemm - Creates allocation node and leaf (work item) node -void SgemmTB(float *A, size_t bytesA, - int lda, - float *B, size_t bytesB, - int ldb, - float *C, size_t bytesC, - int ldc, - int k, - float alpha, - float beta, - long block_x, - long block_y) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void* AllocationNode = __visc__createNodeND(0, Allocation); - void* SgemmLeafNode = __visc__createNodeND(2, SgemmLeaf, block_x, block_y); - - // Bind edges - __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind 
bytesA - __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc - __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k - __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha - __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta - - __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x - __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y - - - // Create Edges between AllocationNode and BFSLeafNodeNode - __visc__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B - __visc__edge(AllocationNode, SgemmLeafNode, 1, 1, 13, 0); // Edge bytes_local_B - +void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, + float *C, size_t bytesC, int ldc, int k, float alpha, float beta, + long block_x, long block_y) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *AllocationNode = __visc__createNodeND(0, Allocation); + void *SgemmLeafNode = __visc__createNodeND(2, SgemmLeaf, block_x, block_y); + + // Bind edges + __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta + + 
__visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x + __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y + + // Create Edges between AllocationNode and BFSLeafNodeNode + __visc__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B + __visc__edge(AllocationNode, SgemmLeafNode, 1, 1, 13, + 0); // Edge bytes_local_B } // Root node for sgemm - Creates work group node -void SgemmRoot( - float *A, size_t bytesA, int lda, // 0-2 - float *B, size_t bytesB, int ldb, // 3-5 - float *C, size_t bytesC, int ldc, // 6-8 - int k, float alpha, float beta, // 9-11 - long block_x, long block_y, long grid_x, long grid_y // 12-15 +void SgemmRoot(float *A, size_t bytesA, int lda, // 0-2 + float *B, size_t bytesB, int ldb, // 3-5 + float *C, size_t bytesC, int ldc, // 6-8 + int k, float alpha, float beta, // 9-11 + long block_x, long block_y, long grid_x, long grid_y // 12-15 ) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void* SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y); - - // Bind edges - __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA - __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc - __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k - __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha - __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta - __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x - __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y - + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y); + + // Bind edges + 
__visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta + __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x + __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y } -void SgemmWrapper( - float *A, size_t bytesA, int lda, // 0-2 - float *B, size_t bytesB, int ldb, // 3-5 - float *C, size_t bytesC, int ldc, // 6-8 - int k, float alpha, float beta, // 9-11 - long block_x, long block_y, long grid_x, long grid_y // 12-15 +void SgemmWrapper(float *A, size_t bytesA, int lda, // 0-2 + float *B, size_t bytesB, int ldb, // 3-5 + float *C, size_t bytesC, int ldc, // 6-8 + int k, float alpha, float beta, // 9-11 + long block_x, long block_y, long grid_x, long grid_y // 12-15 ) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void* SgemmRootNode = __visc__createNodeND(0, SgemmRoot); - - // Bind edges - __visc__bindIn(SgemmRootNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA - __visc__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmRootNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmRootNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc - __visc__bindIn(SgemmRootNode, 9, 9, 0); // Bind k - __visc__bindIn(SgemmRootNode, 10, 10, 0); // 
Bind alpha - __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta - __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x - __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y - __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x - __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *SgemmRootNode = __visc__createNodeND(0, SgemmRoot); + + // Bind edges + __visc__bindIn(SgemmRootNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmRootNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmRootNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmRootNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta + __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x + __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y + __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x + __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y } // Creates root node for sgemm -__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) -{ - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; - return; - } +__attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers, + char transa, char transb, int m, + int n, int k, float alpha, float *A, + size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float beta, + float 
*C, size_t bytesC, int ldc) { + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } - // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; - return; - } + // In this code we assume the matrix sizes are multiple of tile size + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; + return; + } + + // unsigned db[2] = {TILE_N,TILE_TB_HEIGHT}; + // unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; + + long block_x = TILE_N; + long block_y = TILE_TB_HEIGHT; + long grid_x = m / TILE_M; + long grid_y = n / TILE_N; -// unsigned db[2] = {TILE_N,TILE_TB_HEIGHT}; -// unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; - - long block_x = TILE_N; - long block_y = TILE_TB_HEIGHT; - long grid_x = m/TILE_M; - long grid_y = n/TILE_N; - - // Pack data in struct - RootIn* args = (RootIn*) malloc(sizeof(RootIn)); - packData(args, - A, bytesA, - lda, - B, bytesB, - ldb, - C, bytesC, - ldc, - k, - alpha, - beta, - block_x, - block_y, - grid_x, - grid_y - ); - - pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION ); - void* sgemmDFG = __visc__launch(0, SgemmWrapper, (void*) args); - - __visc__wait(sgemmDFG); - pb_SwitchToTimer( timers, pb_TimerID_COMPUTE ); + // Pack data in struct + RootIn *args = (RootIn *)malloc(sizeof(RootIn)); + packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, + 
block_x, block_y, grid_x, grid_y); + + pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); + void *sgemmDFG = __visc__launch(0, SgemmWrapper, (void *)args); + + __visc__wait(sgemmDFG); + pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); } -int main (int argc, char *argv[]) { - - struct pb_Parameters *params; - struct pb_TimerSet timers; - - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; - - /* Read command line. Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } +int main(int argc, char *argv[]) { - /* Read in data */ - // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + struct pb_Parameters *params; + struct pb_TimerSet timers; - // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; - pb_InitializeTimerSet(&timers); - __visc__init(); + /* Read command line. 
Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + /* Read in data */ + // load A + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + pb_InitializeTimerSet(&timers); + __visc__init(); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); - // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // copy A to device memory + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); - for(size_t i=0; i<matC.size(); i++) - matC[i] = 0.0f; + // allocate space for C + C_sz = matArow * matBcol * sizeof(float); + // OpenCL memory allocation + std::vector<float> matC(matArow * matBcol); - // Use standard sgemm interface - basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \ - &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, - 0.0f, &matC.front(), C_sz, matArow); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - llvm_visc_request_mem(&matC.front(), C_sz); + for (size_t i = 0; i < 
matC.size(); i++) + matC[i] = 0.0f; - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + // Use standard sgemm interface + basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), + A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), + C_sz, matArow); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(&matC.front(), C_sz); - pb_PrintTimerSet(&timers); - __visc__cleanup(); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - if (params->outFile) { - /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); - } + pb_PrintTimerSet(&timers); + __visc__cleanup(); + + if (params->outFile) { + /* Write C to file */ + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; - pb_FreeParameters(params); + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; + pb_FreeParameters(params); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, 
std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc index 71a615026f979a70ffb7d99341e3e5a1ba23e8b2..0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc @@ -10,168 +10,171 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <parboil.h> #include <visc.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool writeColMajorMatrixFile(const char *fn, int, int, 
std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_SZ 16 -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta ) -{ - __visc__attributes(3, A, B, C, 1, C); - float c0, c1, c2, c3; - c0 = c1 = c2 = c3 = 0.0f; - int m = 4 * get_global_id(0); - int n = get_global_id(1); - - for (int i = 0; i < k; ++i) { - float a0 = A[m + i * lda]; - float a1 = A[m + 1 + i * lda]; - float a2 = A[m + 2 + i * lda]; - float a3 = A[m + 3 + i * lda]; - - float b = B[n + i * ldb]; - - c0 += a0 * b; - c1 += a1 * b; - c2 += a2 * b; - c3 += a3 * b; - } - C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0; - C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1; - C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2; - C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3; +void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, + float alpha, float beta) { + __visc__attributes(3, A, B, C, 1, C); + float c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0.0f; + int m = 4 * get_global_id(0); + int n = get_global_id(1); + + for (int i = 0; i < k; ++i) { + float a0 = A[m + i * lda]; + float a1 = A[m + 1 + i * lda]; + float a2 = A[m + 2 + i * lda]; + float a3 = A[m + 3 + i * lda]; + + float b = B[n + i * ldb]; + + c0 += a0 * b; + c1 += a1 * b; + c2 += a2 * b; + c3 += a3 * b; + } + C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c0; + C[m + 1 + 
n * ldc] = C[m + 1 + n * ldc] * beta + alpha * c1; + C[m + 2 + n * ldc] = C[m + 2 + n * ldc] * beta + alpha * c2; + C[m + 3 + n * ldc] = C[m + 3 + n * ldc] * beta + alpha * c3; } -__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) -{ - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; - return; - } - - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } - - // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_SZ) || (n%TILE_SZ)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - << "; n should be multiple of " << TILE_SZ << std::endl; - } - - unsigned db[2] = {TILE_SZ/4,TILE_SZ}; - unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); +__attribute__((noinline)) void basicSgemm(char transa, char transb, int m, + int n, int k, float alpha, float *A, + size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float beta, + float *C, size_t bytesC, int ldc) { + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } + + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } + + // In this code we assume the matrix sizes are multiple of tile size + if ((m % TILE_SZ) || (n % TILE_SZ)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_SZ << "; n should be multiple of " << TILE_SZ + << std::endl; + } + + unsigned db[2] = {TILE_SZ / 4, TILE_SZ}; + unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; + + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, + ldb, C, bytesC, ldc, k, alpha, beta, 0); + __visc__wait(sgemmDFG); } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { - struct pb_Parameters *params; - struct pb_TimerSet timers; + struct pb_Parameters *params; + struct pb_TimerSet timers; - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; - pb_InitializeTimerSet(&timers); - __visc__init(); + pb_InitializeTimerSet(&timers); + __visc__init(); - /* Read command line. Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + /* Read command line. 
Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } - /* Read in data */ - pb_SwitchToTimer(&timers, pb_TimerID_IO); + /* Read in data */ + pb_SwitchToTimer(&timers, pb_TimerID_IO); - // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + // load A + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // copy A to device memory + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); - // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + // allocate space for C + C_sz = matArow * matBcol * sizeof(float); - // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + // OpenCL memory allocation + std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); - // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - for(size_t i=0; i<matC.size(); i++) - matC[i] = 0.0f; + for (size_t i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - pb_SwitchToTimer( &timers, 
pb_TimerID_NONE ); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz, + matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, + matArow); - if (params->outFile) { - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + if (params->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_COPY); - /* Write C to file */ - llvm_visc_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); - } + /* Write C to file */ + llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + } - pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; - pb_PrintTimerSet(&timers); - __visc__cleanup(); - pb_FreeParameters(params); + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; + pb_PrintTimerSet(&timers); + __visc__cleanup(); + pb_FreeParameters(params); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool 
readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc index 71a615026f979a70ffb7d99341e3e5a1ba23e8b2..0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc @@ -10,168 +10,171 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <parboil.h> #include <visc.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); 
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_SZ 16 -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta ) -{ - __visc__attributes(3, A, B, C, 1, C); - float c0, c1, c2, c3; - c0 = c1 = c2 = c3 = 0.0f; - int m = 4 * get_global_id(0); - int n = get_global_id(1); - - for (int i = 0; i < k; ++i) { - float a0 = A[m + i * lda]; - float a1 = A[m + 1 + i * lda]; - float a2 = A[m + 2 + i * lda]; - float a3 = A[m + 3 + i * lda]; - - float b = B[n + i * ldb]; - - c0 += a0 * b; - c1 += a1 * b; - c2 += a2 * b; - c3 += a3 * b; - } - C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0; - C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1; - C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2; - C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3; +void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, + float alpha, float beta) { + __visc__attributes(3, A, B, C, 1, C); + float c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0.0f; + int m = 4 * get_global_id(0); + int n = get_global_id(1); + + for (int i = 0; i < k; ++i) { + float a0 = A[m + i * lda]; + float a1 = A[m + 1 + i * lda]; + float a2 = A[m + 2 + i * lda]; + float a3 = A[m + 3 + i * lda]; + + float b = B[n + i * ldb]; + + c0 += a0 * b; + c1 += a1 * b; + c2 += a2 * b; + c3 += a3 * b; + } + 
C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c0; + C[m + 1 + n * ldc] = C[m + 1 + n * ldc] * beta + alpha * c1; + C[m + 2 + n * ldc] = C[m + 2 + n * ldc] * beta + alpha * c2; + C[m + 3 + n * ldc] = C[m + 3 + n * ldc] * beta + alpha * c3; } -__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) -{ - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; - return; - } - - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } - - // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_SZ) || (n%TILE_SZ)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - << "; n should be multiple of " << TILE_SZ << std::endl; - } - - unsigned db[2] = {TILE_SZ/4,TILE_SZ}; - unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); +__attribute__((noinline)) void basicSgemm(char transa, char transb, int m, + int n, int k, float alpha, float *A, + size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float beta, + float *C, size_t bytesC, int ldc) { + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } + + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } + + // In this code we assume the matrix sizes are multiple of tile size + if ((m % TILE_SZ) || (n % TILE_SZ)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_SZ << "; n should be multiple of " << TILE_SZ + << std::endl; + } + + unsigned db[2] = {TILE_SZ / 4, TILE_SZ}; + unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; + + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, + ldb, C, bytesC, ldc, k, alpha, beta, 0); + __visc__wait(sgemmDFG); } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { - struct pb_Parameters *params; - struct pb_TimerSet timers; + struct pb_Parameters *params; + struct pb_TimerSet timers; - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; - pb_InitializeTimerSet(&timers); - __visc__init(); + pb_InitializeTimerSet(&timers); + __visc__init(); - /* Read command line. Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + /* Read command line. 
Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } - /* Read in data */ - pb_SwitchToTimer(&timers, pb_TimerID_IO); + /* Read in data */ + pb_SwitchToTimer(&timers, pb_TimerID_IO); - // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + // load A + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // copy A to device memory + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); - // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + // allocate space for C + C_sz = matArow * matBcol * sizeof(float); - // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + // OpenCL memory allocation + std::vector<float> matC(matArow * matBcol); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); - // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - for(size_t i=0; i<matC.size(); i++) - matC[i] = 0.0f; + for (size_t i = 0; i < matC.size(); i++) + matC[i] = 0.0f; - pb_SwitchToTimer( &timers, 
pb_TimerID_NONE ); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz, + matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, + matArow); - if (params->outFile) { - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + if (params->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_COPY); - /* Write C to file */ - llvm_visc_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); - } + /* Write C to file */ + llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + } - pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; - pb_PrintTimerSet(&timers); - __visc__cleanup(); - pb_FreeParameters(params); + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; + pb_PrintTimerSet(&timers); + __visc__cleanup(); + pb_FreeParameters(params); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool 
readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc index 82e3cadcb56c7c942c5d359ffe33c6bb133af870..76d0cefc817ea28f2ffb15cd48d8dd5c7a97d0e0 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc @@ -10,179 +10,180 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <parboil.h> #include <visc.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern 
bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_SZ 16 #define VEC_SZ 8 -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ } -void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta ) -{ - __visc__hint(visc::GPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - - float c = 0.0f; - int m = get_global_id(0); - int n = get_global_id(1); - - for (int i = 0; i < k; ++i) { - float a = A[m + i * lda]; - float b = B[n + i * ldb]; - c += a * b; - } - C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c; -/* - Will be substituted by this kernel at the llvm level - // Partial results - float8 cp = (float8)(0.0f); - - int m = get_global_id(0) * 8; - int n = get_global_id(1); - - for (int i = 0; i < k; ++i) { - float8 a = vload8(0, A + (m + i * lda)); - float8 b = (float8)(B[n + i * ldb]); - cp += a * b; - } - - float8 c = vload8(0, C + (m+n*ldc)); - c = c * beta + alpha * cp; - vstore8(c, 0, C + (m+n*ldc)); -*/ -} +void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, + float alpha, float beta) { + __visc__hint(visc::GPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + + float c = 0.0f; + int m = get_global_id(0); + int n = get_global_id(1); -__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, 
size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) -{ - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; - return; - } - - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } - - // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_SZ) || (n%TILE_SZ)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ - << "; n should be multiple of " << TILE_SZ << std::endl; - } - - unsigned db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; - unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - - unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + for (int i = 0; i < k; ++i) { + float a = A[m + i * lda]; + float b = B[n + i * ldb]; + c += a * b; + } + C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c; + /* + Will be substituted by this kernel at the llvm level + // Partial results + float8 cp = (float8)(0.0f); + + int m = get_global_id(0) * 8; + int n = get_global_id(1); + + for (int i = 0; i < k; ++i) { + float8 a = vload8(0, A + (m + i * lda)); + float8 b = (float8)(B[n + i * ldb]); + cp += a * b; + } + + float8 c = vload8(0, C + (m+n*ldc)); + c = c * beta + alpha * cp; + vstore8(c, 0, C + (m+n*ldc)); + */ } -int main (int argc, char *argv[]) { +__attribute__((noinline)) void basicSgemm(char transa, char transb, int m, + int n, int k, float alpha, float *A, + size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float beta, + float *C, size_t bytesC, int ldc) { + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } + + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; 
+ return; + } + + // In this code we assume the matrix sizes are multiple of tile size + if ((m % TILE_SZ) || (n % TILE_SZ)) { + std::cerr << "unsupported size of matrix. m should be multiple of " + << TILE_SZ << "; n should be multiple of " << TILE_SZ + << std::endl; + } - struct pb_Parameters *params; - struct pb_TimerSet timers; + unsigned db[2] = {TILE_SZ / VEC_SZ, TILE_SZ}; + unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, + ldb, C, bytesC, ldc, k, alpha, beta, 0); + __visc__wait(sgemmDFG); +} +int main(int argc, char *argv[]) { - /* Read command line. Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + struct pb_Parameters *params; + struct pb_TimerSet timers; - /* Read in data */ - // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; - // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + /* Read command line. 
Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } - pb_InitializeTimerSet(&timers); - __visc__init(); + /* Read in data */ + // load A + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + pb_InitializeTimerSet(&timers); + __visc__init(); - // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // copy A to device memory + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); - // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // allocate space for C + C_sz = matArow * matBcol * sizeof(float); - for(size_t i=0; i<matC.size(); i++) - matC[i] = 0.0f; + // OpenCL memory allocation + std::vector<float> matC(matArow * matBcol); - pb_SwitchToTimer( &timers, pb_TimerID_NONE ); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + for (size_t i = 0; i < matC.size(); 
i++) + matC[i] = 0.0f; - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz, + matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, + matArow); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(&matC.front(), C_sz); - pb_PrintTimerSet(&timers); - __visc__cleanup(); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - if (params->outFile) { + pb_PrintTimerSet(&timers); + __visc__cleanup(); - /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); - } + if (params->outFile) { + + /* Write C to file */ + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; - pb_FreeParameters(params); + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; + pb_FreeParameters(params); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int 
&nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc index 19f72c9dff35885b9e2c1f8c38502ac59fb6ab6b..a4c252d8f183e76f91349d97872dbca0b3766acf 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc @@ -10,219 +10,218 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <stdio.h> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> #include <vector> -#include <iostream> -#include <parboil.h> #include <visc.h> // I/O routines -extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); -extern bool 
writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); -extern char* readFile(const char*); +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, + std::vector<float> &); +extern char *readFile(const char *); // Parameters of tile sizes #define TILE_N 8 #define TILE_TB_HEIGHT 8 -#define TILE_M (TILE_N*TILE_TB_HEIGHT) - -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - std::cout<<errorMessage<<" Error!\n"; \ - std::cout<<"Line: "<<__LINE__<<"\n"; \ - exit(1); \ - } +#define TILE_M (TILE_N * TILE_TB_HEIGHT) -void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta ) -{ - __visc__hint(visc::SPIR_TARGET); - __visc__attributes(3, A, B, C, 1, C); - - float c[TILE_N]; - for (int i=0; i < TILE_N; i++) - c[i] = 0.0f; - - int mid = get_local_id(1)*get_local_size(0)+get_local_id(0); - int m = get_group_id(0) * TILE_M + mid; - - int b_base = 0; - - for (int i = 0; i < k; i+=TILE_TB_HEIGHT) { - float a; - b_base = get_group_id(1) * TILE_N + i * ldb; - - for (int j = 0; j < TILE_TB_HEIGHT; j++) { - a = A[m + (i+j)*lda]; - for (int kk = 0; kk < TILE_N; kk++) - c[kk] += a * B[b_base + j * ldb + kk]; - - } - } - int t = ldc * get_group_id(1) * TILE_N + m; - for (int i = 0; i < TILE_N; i++) { - C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i]; - } -/* - Will be substituted by this kernel at the llvm level +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + std::cout << errorMessage << " Error!\n"; \ + std::cout << "Line: " << __LINE__ << "\n"; \ + exit(1); \ + } - // Partial results - floatn cp = (floatn)(0.0f); +void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, + float alpha, float beta) { + __visc__hint(visc::SPIR_TARGET); + __visc__attributes(3, A, B, C, 1, C); - int mid = get_local_id(1)*get_local_size(0)+get_local_id(0); - int m = 
get_group_id(0) * TILE_M + mid; + float c[TILE_N]; + for (int i = 0; i < TILE_N; i++) + c[i] = 0.0f; - int b_base = 0; + int mid = get_local_id(1) * get_local_size(0) + get_local_id(0); + int m = get_group_id(0) * TILE_M + mid; - for (int i = 0; i < k; i+=TILE_TB_HEIGHT) { - float a; - b_base = get_group_id(1) * TILE_N + i * ldb; + int b_base = 0; - for (int j = 0; j < TILE_TB_HEIGHT; j++) { - a = A[m + (i+j)*lda]; - cp += a * vloadn(0, B + b_base + j * ldb); - } - } + for (int i = 0; i < k; i += TILE_TB_HEIGHT) { + float a; + b_base = get_group_id(1) * TILE_N + i * ldb; - cp = alpha * cp; - float c[TILE_N]; - c[0] = cp.s0; - c[1] = cp.s1; - c[2] = cp.s2; - c[3] = cp.s3; - c[4] = cp.s4; - c[5] = cp.s5; - c[6] = cp.s6; - c[7] = cp.s7; - - int t = ldc * get_group_id(1) * TILE_N + m; - for (int i = 0; i < TILE_N; i++) { - C[t+i*ldc] = C[t+i*ldc] * beta + c[i]; + for (int j = 0; j < TILE_TB_HEIGHT; j++) { + a = A[m + (i + j) * lda]; + for (int kk = 0; kk < TILE_N; kk++) + c[kk] += a * B[b_base + j * ldb + kk]; } - -*/ + } + int t = ldc * get_group_id(1) * TILE_N + m; + for (int i = 0; i < TILE_N; i++) { + C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i]; + } + /* + Will be substituted by this kernel at the llvm level + + // Partial results + floatn cp = (floatn)(0.0f); + + int mid = get_local_id(1)*get_local_size(0)+get_local_id(0); + int m = get_group_id(0) * TILE_M + mid; + + int b_base = 0; + + for (int i = 0; i < k; i+=TILE_TB_HEIGHT) { + float a; + b_base = get_group_id(1) * TILE_N + i * ldb; + + for (int j = 0; j < TILE_TB_HEIGHT; j++) { + a = A[m + (i+j)*lda]; + cp += a * vloadn(0, B + b_base + j * ldb); + } + } + + cp = alpha * cp; + float c[TILE_N]; + c[0] = cp.s0; + c[1] = cp.s1; + c[2] = cp.s2; + c[3] = cp.s3; + c[4] = cp.s4; + c[5] = cp.s5; + c[6] = cp.s6; + c[7] = cp.s7; + + int t = ldc * get_group_id(1) * TILE_N + m; + for (int i = 0; i < TILE_N; i++) { + C[t+i*ldc] = C[t+i*ldc] * beta + c[i]; + } + + */ } -__attribute__((noinline)) void basicSgemm( 
char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) -{ - if ((transa != 'N') && (transa != 'n')) { - std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; - return; - } +__attribute__((noinline)) void basicSgemm(char transa, char transb, int m, + int n, int k, float alpha, float *A, + size_t bytesA, int lda, float *B, + size_t bytesB, int ldb, float beta, + float *C, size_t bytesC, int ldc) { + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } - if ((transb != 'T') && (transb != 't')) { - std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; - return; - } + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } - // In this code we assume the matrix sizes are multiple of tile size - if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; - return; - } + // In this code we assume the matrix sizes are multiple of tile size + if ((m % TILE_M) || (n % TILE_N)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " + << TILE_M << "; n should be multiple of " << TILE_N << std::endl; + return; + } -// unsigned db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; -// unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; - unsigned db[2] = {TILE_N,TILE_TB_HEIGHT}; - unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; + // unsigned db[2] = {TILE_SZ/VEC_SZ,TILE_SZ}; + // unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + unsigned db[2] = {TILE_N, TILE_TB_HEIGHT}; + unsigned dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; - void* sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __visc__wait(sgemmDFG); + void *sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, + ldb, C, bytesC, ldc, k, alpha, beta, 0); + __visc__wait(sgemmDFG); } -int main (int argc, char *argv[]) { - - struct pb_Parameters *params; - struct pb_TimerSet timers; - - size_t A_sz, B_sz, C_sz; - int matArow, matAcol; - int matBrow, matBcol; - std::vector<float> matA, matBT; +int main(int argc, char *argv[]) { + struct pb_Parameters *params; + struct pb_TimerSet timers; - /* Read command line. Expect 3 inputs: A, B and B^T - in column-major layout*/ - params = pb_ReadParameters(&argc, argv); - if ((params->inpFiles[0] == NULL) - || (params->inpFiles[1] == NULL) - || (params->inpFiles[2] == NULL) - || (params->inpFiles[3] != NULL)) - { - fprintf(stderr, "Expecting three input filenames\n"); - exit(-1); - } + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; - /* Read in data */ - // load A - readColMajorMatrixFile(params->inpFiles[0], - matArow, matAcol, matA); + /* Read command line. 
Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) || + (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } - // load B^T - readColMajorMatrixFile(params->inpFiles[2], - matBcol, matBrow, matBT); + /* Read in data */ + // load A + readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA); - pb_InitializeTimerSet(&timers); - __visc__init(); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - // copy A to device memory - A_sz = matArow*matAcol*sizeof(float); - B_sz = matBrow*matBcol*sizeof(float); + pb_InitializeTimerSet(&timers); + __visc__init(); - // allocate space for C - C_sz = matArow*matBcol*sizeof(float); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // copy A to device memory + A_sz = matArow * matAcol * sizeof(float); + B_sz = matBrow * matBcol * sizeof(float); - // OpenCL memory allocation - std::vector<float> matC(matArow*matBcol); + // allocate space for C + C_sz = matArow * matBcol * sizeof(float); - pb_SwitchToTimer( &timers, visc_TimerID_MEM_TRACK ); - llvm_visc_track_mem(&matA.front(), A_sz); - llvm_visc_track_mem(&matBT.front(), B_sz); - llvm_visc_track_mem(&matC.front(), C_sz); + // OpenCL memory allocation + std::vector<float> matC(matArow * matBcol); - // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); - for(size_t i=0; i<matC.size(); i++) - matC[i] = 0.0f; + // Copy A and B^T into device memory + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer( &timers, pb_TimerID_NONE ); + for (size_t i = 0; i < matC.size(); 
i++) + matC[i] = 0.0f; - // Use standard sgemm interface - basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ - &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - llvm_visc_request_mem(&matC.front(), C_sz); + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz, + matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, + matArow); - pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); - llvm_visc_untrack_mem(&matA.front()); - llvm_visc_untrack_mem(&matBT.front()); - llvm_visc_untrack_mem(&matC.front()); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - __visc__cleanup(); + pb_PrintTimerSet(&timers); + __visc__cleanup(); - if (params->outFile) { + if (params->outFile) { - /* Write C to file */ - //pb_SwitchToTimer(&timers, pb_TimerID_IO); - writeColMajorMatrixFile(params->outFile, - matArow, matBcol, matC); - } + /* Write C to file */ + // pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); + } - double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); - std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; - pb_FreeParameters(params); + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 + << std::endl; + pb_FreeParameters(params); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c index 7e7672e8079edc6c40b77933b80a83b1fd4c71c3..bf4c1fe9553fd5399c076d91d0bc758734bc2d01 100644 --- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c +++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c @@ -1,46 +1,44 @@ /* -* NOTES: -* -* 1) Matrix Market files are always 1-based, i.e. the index of the first -* element of a matrix is (1,1), not (0,0) as in C. ADJUST THESE -* OFFSETS ACCORDINGLY when reading and writing -* to files. -* -* 2) ANSI C requires one to use the "l" format modifier when reading -* double precision floating point numbers in scanf() and -* its variants. For example, use "%lf", "%lg", or "%le" -* when reading doubles, otherwise errors will occur. -*/ + * NOTES: + * + * 1) Matrix Market files are always 1-based, i.e. the index of the first + * element of a matrix is (1,1), not (0,0) as in C. ADJUST THESE + * OFFSETS ACCORDINGLY when reading and writing + * to files. + * + * 2) ANSI C requires one to use the "l" format modifier when reading + * double precision floating point numbers in scanf() and + * its variants. For example, use "%lf", "%lg", or "%le" + * when reading doubles, otherwise errors will occur. 
+ */ +#include "convert_dataset.h" +#include "mmio.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> -#include <math.h> -#include "mmio.h" -#include "convert_dataset.h" - - typedef struct _mat_entry { - int row, col; /* i,j */ - float val; + int row, col; /* i,j */ + float val; } mat_entry; typedef struct _row_stats { // stats on each row - int index; - int size; - int start; - int padding; + int index; + int size; + int start; + int padding; } row_stats; -int sort_rows(const void* a, const void* b) { - return (((mat_entry*)a)->row - ((mat_entry*)b)->row); +int sort_rows(const void *a, const void *b) { + return (((mat_entry *)a)->row - ((mat_entry *)b)->row); } -int sort_cols(const void* a, const void* b) { - return (((mat_entry*)a)->col - ((mat_entry*)b)->col); +int sort_cols(const void *a, const void *b) { + return (((mat_entry *)a)->col - ((mat_entry *)b)->col); } /* sorts largest by size first */ -int sort_stats(const void* a, const void* b) { - return(((row_stats*)b)->size - ((row_stats*)a)->size); +int sort_stats(const void *a, const void *b) { + return (((row_stats *)b)->size - ((row_stats *)a)->size); } /* @@ -75,262 +73,279 @@ int sort_stats(const void* a, const void* b) { * dim - dimensions of the input matrix * data_ptr_len - size of data_row_ptr (maps to original `depth` var) */ -int coo_to_jds(char* mtx_filename, int pad_rows, int warp_size, int pack_size, - int mirrored, int binary, int debug_level, - float** data, int** data_row_ptr, int** nz_count, int** data_col_index, - int** data_row_map, int* data_cols, int* dim, int* len, int* nz_count_len, - int* data_ptr_len) { - int ret_code; - MM_typecode matcode; - FILE *f; - int nz; - int i; - float *val; - mat_entry* entries; - row_stats* stats; - int rows, cols; - - if ((f = fopen(mtx_filename, "r")) == NULL) - exit(1); +int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size, + int mirrored, int binary, int debug_level, float **data, + int **data_row_ptr, int 
**nz_count, int **data_col_index, + int **data_row_map, int *data_cols, int *dim, int *len, + int *nz_count_len, int *data_ptr_len) { + int ret_code; + MM_typecode matcode; + FILE *f; + int nz; + int i; + float *val; + mat_entry *entries; + row_stats *stats; + int rows, cols; + if ((f = fopen(mtx_filename, "r")) == NULL) + exit(1); - if (mm_read_banner(f, &matcode) != 0) - { - printf("Could not process Matrix Market banner.\n"); - exit(1); - } + if (mm_read_banner(f, &matcode) != 0) { + printf("Could not process Matrix Market banner.\n"); + exit(1); + } + /* This is how one can screen matrix types if their application */ + /* only supports a subset of the Matrix Market data types. */ - /* This is how one can screen matrix types if their application */ - /* only supports a subset of the Matrix Market data types. */ + if (mm_is_complex(matcode) && mm_is_matrix(matcode) && + mm_is_sparse(matcode)) { + printf("Sorry, this application does not support "); + printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode)); + exit(1); + } - if (mm_is_complex(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode) ) - { - printf("Sorry, this application does not support "); - printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode)); - exit(1); - } + /* find out size of sparse matrix .... */ - /* find out size of sparse matrix .... */ + if ((ret_code = mm_read_mtx_crd_size(f, &rows, &cols, &nz)) != 0) + exit(1); + *dim = rows; - if ((ret_code = mm_read_mtx_crd_size(f, &rows, &cols, &nz)) !=0) - exit(1); - *dim = rows; - - if (mirrored) { - // max possible size, might be less because diagonal values aren't doubled - entries = (mat_entry*) malloc(2 * nz * sizeof(mat_entry)); - } else { - entries = (mat_entry*) malloc(nz * sizeof(mat_entry)); - } - - /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ - /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ - /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ - int cur_i=0; // to account for mirrored diagonal entries + if (mirrored) { + // max possible size, might be less because diagonal values aren't doubled + entries = (mat_entry *)malloc(2 * nz * sizeof(mat_entry)); + } else { + entries = (mat_entry *)malloc(nz * sizeof(mat_entry)); + } - for (i=0; i<nz; i++, cur_i++) - { - if (!binary) { - fscanf(f, "%d %d %f\n", &entries[cur_i].row, &entries[cur_i].col, &entries[cur_i].val); - } else { - fscanf(f, "%d %d\n", &entries[cur_i].row, &entries[cur_i].col); - entries[cur_i].val = 1.0; - } - entries[cur_i].row--; - entries[cur_i].col--; - //printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col, entries[cur_i].val); - if (mirrored) { - // fill in mirrored diagonal - if (entries[cur_i].row != entries[cur_i].col) { // not a diagonal value - cur_i++; - entries[cur_i].val = entries[cur_i-1].val; - entries[cur_i].col = entries[cur_i-1].row; - entries[cur_i].row = entries[cur_i-1].col; - //printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col, entries[cur_i].val); - } - } + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ + /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ + /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ + int cur_i = 0; // to account for mirrored diagonal entries + + for (i = 0; i < nz; i++, cur_i++) { + if (!binary) { + fscanf(f, "%d %d %f\n", &entries[cur_i].row, &entries[cur_i].col, + &entries[cur_i].val); + } else { + fscanf(f, "%d %d\n", &entries[cur_i].row, &entries[cur_i].col); + entries[cur_i].val = 1.0; } - // set new non-zero count - nz = cur_i; - if (debug_level >= 1) { - printf("Converting COO to JDS format (%dx%d)\n%d matrix entries, warp size = %d, " - "row padding align = %d, pack size = %d\n\n", rows, cols, nz, warp_size, pad_rows, pack_size); + entries[cur_i].row--; + entries[cur_i].col--; + // printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col, + // entries[cur_i].val); + if (mirrored) { + // fill in mirrored diagonal + if (entries[cur_i].row != entries[cur_i].col) { // not a diagonal value + cur_i++; + entries[cur_i].val = entries[cur_i - 1].val; + entries[cur_i].col = entries[cur_i - 1].row; + entries[cur_i].row = entries[cur_i - 1].col; + // printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col, + // entries[cur_i].val); + } } - if (f !=stdin) fclose(f); + } + // set new non-zero count + nz = cur_i; + if (debug_level >= 1) { + printf("Converting COO to JDS format (%dx%d)\n%d matrix entries, warp size " + "= %d, " + "row padding align = %d, pack size = %d\n\n", + rows, cols, nz, warp_size, pad_rows, pack_size); + } + if (f != stdin) + fclose(f); - /* - * Now we have an array of values in entries - * Transform to padded JDS format - sort by rows, then fubini - */ + /* + * Now we have an array of values in entries + * Transform to padded JDS format - sort by rows, then fubini + */ - int irow, icol=0, istart=0; - int total_size=0; + int irow, icol = 0, istart = 0; + int total_size = 0; - /* Loop through each entry to figure out padding, grouping that determine - * final data array size - * - * First calculate stats for each row - * - * Collect stats using the major_stats typedef - */ - - - qsort(entries, nz, 
sizeof(mat_entry), sort_rows); // sort by row number - rows = entries[nz-1].row+1; // last item is greatest row (zero indexed) - if (rows%warp_size) { // pad group number to warp_size here - rows += warp_size - rows%warp_size; - } - stats = (row_stats*) calloc(rows, sizeof(row_stats)); // set to 0 - *data_row_map = (int*) calloc(rows, sizeof(int)); - irow = entries[0].row; // set first row - - //printf("First row %d\n", irow); - for (i=0; i<nz; i++) { // loop through each sorted entry - if (entries[i].row != irow || i == nz-1) { // new row - //printf("%d != %d\n", entries[i].row, irow); - if (i == nz-1) { - // last item, add it to current row - //printf("Last item i=%d, row=%d, irow=%d\n", i, entries[i].row, irow); - icol++; - } - // hit a new row, record stats for the last row (i-1) - stats[irow].size = icol; // record # cols in previous row - stats[irow].index = entries[i-1].row; // row # for previous stat item - //printf("Row %d, i=%d, irow=%d\n", entries[i].row, i, irow); - stats[irow].start = istart; // starting location in entries array - // set stats for the next row until this break again - icol=0; // reset row items - irow = entries[i].row; - istart = i; - } - icol++; // keep track of number of items in this row + /* Loop through each entry to figure out padding, grouping that determine + * final data array size + * + * First calculate stats for each row + * + * Collect stats using the major_stats typedef + */ + + qsort(entries, nz, sizeof(mat_entry), sort_rows); // sort by row number + rows = entries[nz - 1].row + 1; // last item is greatest row (zero indexed) + if (rows % warp_size) { // pad group number to warp_size here + rows += warp_size - rows % warp_size; + } + stats = (row_stats *)calloc(rows, sizeof(row_stats)); // set to 0 + *data_row_map = (int *)calloc(rows, sizeof(int)); + irow = entries[0].row; // set first row + + // printf("First row %d\n", irow); + for (i = 0; i < nz; i++) { // loop through each sorted entry + if (entries[i].row != irow 
|| i == nz - 1) { // new row + // printf("%d != %d\n", entries[i].row, irow); + if (i == nz - 1) { + // last item, add it to current row + // printf("Last item i=%d, row=%d, irow=%d\n", i, entries[i].row, irow); + icol++; + } + // hit a new row, record stats for the last row (i-1) + stats[irow].size = icol; // record # cols in previous row + stats[irow].index = entries[i - 1].row; // row # for previous stat item + // printf("Row %d, i=%d, irow=%d\n", entries[i].row, i, irow); + stats[irow].start = istart; // starting location in entries array + // set stats for the next row until this break again + icol = 0; // reset row items + irow = entries[i].row; + istart = i; } - - - *nz_count_len = rows/warp_size + rows%warp_size; - *nz_count = (int*) malloc(*nz_count_len * sizeof(int)); // only one value per group - - /* sort based upon row size, greatest first */ - qsort(stats, rows, sizeof(row_stats), sort_stats); - /* figure out padding and grouping */ - if (debug_level >= 1) { - printf("Padding data....%d rows, %d groups\n", rows, *nz_count_len); + icol++; // keep track of number of items in this row + } + + *nz_count_len = rows / warp_size + rows % warp_size; + *nz_count = + (int *)malloc(*nz_count_len * sizeof(int)); // only one value per group + + /* sort based upon row size, greatest first */ + qsort(stats, rows, sizeof(row_stats), sort_stats); + /* figure out padding and grouping */ + if (debug_level >= 1) { + printf("Padding data....%d rows, %d groups\n", rows, *nz_count_len); + } + int pad_to, total_padding = 0, pack_to; + pad_rows *= pack_size; // change padding to account for packed items + for (i = 0; i < rows; i++) { + // record JDS to real row number + (*data_row_map)[i] = stats[i].index; + if (i < rows - 1) { + // (*data_row_map)[i]--; // ???? 
no idea why this is off by 1 } - int pad_to, total_padding = 0, pack_to; - pad_rows *= pack_size; // change padding to account for packed items - for (i=0; i<rows; i++) { - // record JDS to real row number - (*data_row_map)[i] = stats[i].index; - if (i<rows-1) { - // (*data_row_map)[i]--; // ???? no idea why this is off by 1 - } - // each row is padded so the number of packed groups % pad_rows == 0 - if (i % warp_size == 0) { // on a group boundary with the largest number of items - // find padding in individual items - if (stats[i].size % pad_rows) { - stats[i].padding = pad_rows - (stats[i].size % pad_rows); // find padding - } else { - stats[i].padding = 0; // no padding necessary, already at pad multiple - } - if (stats[i].size % pack_size) { - pack_to = ceil((float)stats[i].size/pack_size); - } else { - pack_to = stats[i].size/pack_size; - } - //pack_to = stats[i].size + (!stats[i].size%pack_size) ? 0 : (pack_size - stats[i].size%pack_size); - pad_to = stats[i].size + stats[i].padding; // total size of this row, with padding - // TODO: change this to reflect the real number of nonzero packed items, not the padded - // value - (*nz_count)[i/warp_size] = pack_to; // number of packed items in this group - total_size += pad_to * warp_size; // allocate size for this padded group - if (debug_level >= 2) - printf("Padding warp group %d to %d items, zn = %d\n", i/warp_size, pad_to, pack_to); - } else { - stats[i].padding = pad_to - stats[i].size; - } - total_padding += stats[i].padding; - //if (debug_level >= 2) - // printf("Row %d, %d items, %d padding\n", stats[i].index, stats[i].size, stats[i].padding); + // each row is padded so the number of packed groups % pad_rows == 0 + if (i % warp_size == + 0) { // on a group boundary with the largest number of items + // find padding in individual items + if (stats[i].size % pad_rows) { + stats[i].padding = + pad_rows - (stats[i].size % pad_rows); // find padding + } else { + stats[i].padding = 0; // no padding necessary, 
already at pad multiple + } + if (stats[i].size % pack_size) { + pack_to = ceil((float)stats[i].size / pack_size); + } else { + pack_to = stats[i].size / pack_size; + } + // pack_to = stats[i].size + (!stats[i].size%pack_size) ? 0 : (pack_size - + // stats[i].size%pack_size); + pad_to = stats[i].size + + stats[i].padding; // total size of this row, with padding + // TODO: change this to reflect the real number of nonzero packed items, + // not the padded value + (*nz_count)[i / warp_size] = + pack_to; // number of packed items in this group + total_size += pad_to * warp_size; // allocate size for this padded group + if (debug_level >= 2) + printf("Padding warp group %d to %d items, zn = %d\n", i / warp_size, + pad_to, pack_to); + } else { + stats[i].padding = pad_to - stats[i].size; } - - /* allocate data and data_row_index */ - if (debug_level >= 1) - printf("Allocating data space: %d entries (%f%% padding)\n", total_size, (float)100*total_padding/total_size); - *data = (float*) calloc(total_size, sizeof(float)); // set to 0 so padded values are set - *data_col_index = (int*) calloc(total_size, sizeof(int)); // any unset indexes point to 0 - *data_row_ptr = (int*) calloc(rows, sizeof(int)); - *len = total_size; - i = 0; // data index, including padding - - /* - * Keep looping through each row, writing data a group at a time - * to the output array. 
Increment `irow` each time, and use it as - * an index into entries along with stats.start to get the next - * data item - */ - irow = 0; // keep track of which row we are in inside the fubini-ed array - int idata = 0; // position within final data array - int entry_index, j; - int ipack; // used in internal loop for writing packed values - mat_entry entry; - while (1) { - /* record data_row_ptr */ - (*data_row_ptr)[irow] = idata; - - /* End condtion: the size of the greatest row is smaller than the current - Fubini-ed row */ - if (stats[0].size+stats[0].padding <= irow*pack_size) break; + total_padding += stats[i].padding; + // if (debug_level >= 2) + // printf("Row %d, %d items, %d padding\n", stats[i].index, + // stats[i].size, stats[i].padding); + } + + /* allocate data and data_row_index */ + if (debug_level >= 1) + printf("Allocating data space: %d entries (%f%% padding)\n", total_size, + (float)100 * total_padding / total_size); + *data = (float *)calloc(total_size, + sizeof(float)); // set to 0 so padded values are set + *data_col_index = + (int *)calloc(total_size, sizeof(int)); // any unset indexes point to 0 + *data_row_ptr = (int *)calloc(rows, sizeof(int)); + *len = total_size; + i = 0; // data index, including padding + + /* + * Keep looping through each row, writing data a group at a time + * to the output array. 
Increment `irow` each time, and use it as + * an index into entries along with stats.start to get the next + * data item + */ + irow = 0; // keep track of which row we are in inside the fubini-ed array + int idata = 0; // position within final data array + int entry_index, j; + int ipack; // used in internal loop for writing packed values + mat_entry entry; + while (1) { + /* record data_row_ptr */ + (*data_row_ptr)[irow] = idata; + + /* End condtion: the size of the greatest row is smaller than the current + Fubini-ed row */ + if (stats[0].size + stats[0].padding <= irow * pack_size) + break; - //printf("Data row pointer for row %d is %d\n", irow, idata); - for (i=0; i<rows; i++) { - /* take one packed group from each original row */ - //printf("Output irow %d icol %d (real %d,%d size %d)\n", irow, i, entry.col, i, stats[i].size); - /* Watch out for little vs big endian, and how opencl interprets vector casting from pointers */ - for (ipack=0; ipack<pack_size; ipack++) { - if (stats[i].size > irow*pack_size+ipack) { - // copy value - entry_index = stats[i].start + irow*pack_size+ipack; - entry = entries[entry_index]; - /* record index and value */ - (*data)[idata] = entry.val; - /* each data item will get its row index from the thread, col from here */ - (*data_col_index)[idata] = entry.col; + // printf("Data row pointer for row %d is %d\n", irow, idata); + for (i = 0; i < rows; i++) { + /* take one packed group from each original row */ + // printf("Output irow %d icol %d (real %d,%d size %d)\n", irow, i, + // entry.col, i, stats[i].size); + /* Watch out for little vs big endian, and how opencl interprets vector + * casting from pointers */ + for (ipack = 0; ipack < pack_size; ipack++) { + if (stats[i].size > irow * pack_size + ipack) { + // copy value + entry_index = stats[i].start + irow * pack_size + ipack; + entry = entries[entry_index]; + /* record index and value */ + (*data)[idata] = entry.val; + /* each data item will get its row index from the thread, 
col from + * here */ + (*data_col_index)[idata] = entry.col; - if (debug_level >= 2) { - if (i < 3) { - // first row debugging - printf("[%d row%d=%.3f]", ipack+1, i, entry.val); - } else { - printf("%d", ipack+1); - } - } - } else if (stats[i].size+stats[i].padding > irow*pack_size+ipack) { - /* add padding to the end of each row here - this assumes padding is factored into allocated size */ - if (debug_level >= 2) printf("0"); - (*data_col_index)[idata] = -1; - } else { - goto endwrite; // no data written this pass, so don't increment idata - } - idata += 1; - } - } - endwrite: - if (debug_level >= 2) { - printf("\n"); - } - irow += 1; + if (debug_level >= 2) { + if (i < 3) { + // first row debugging + printf("[%d row%d=%.3f]", ipack + 1, i, entry.val); + } else { + printf("%d", ipack + 1); + } + } + } else if (stats[i].size + stats[i].padding > + irow * pack_size + ipack) { + /* add padding to the end of each row here - this assumes padding is + * factored into allocated size */ + if (debug_level >= 2) + printf("0"); + (*data_col_index)[idata] = -1; + } else { + goto endwrite; // no data written this pass, so don't increment idata + } + idata += 1; + } } - - if (debug_level >= 1) - printf("Finished converting.\nJDS format has %d columns, %d rows.\n", rows, irow); - free(entries); - free(stats); - printf("nz_count_len = %d\n", *nz_count_len); - - *data_cols = rows; - *data_ptr_len = irow+1; - return 0; -} + endwrite: + if (debug_level >= 2) { + printf("\n"); + } + irow += 1; + } + if (debug_level >= 1) + printf("Finished converting.\nJDS format has %d columns, %d rows.\n", rows, + irow); + free(entries); + free(stats); + printf("nz_count_len = %d\n", *nz_count_len); + + *data_cols = rows; + *data_ptr_len = irow + 1; + return 0; +} diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h index 
a495ffa68821594189f8de61a1e6a74536cd31b9..6713a9ed3e8e37a7089694c36bb81aead4c61122 100644 --- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h +++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h @@ -4,11 +4,11 @@ #ifdef __cplusplus extern "C" { #endif -int coo_to_jds(char* mtx_filename, int pad_rows, int warp_size, int pack_size, - int mirrored, int binary, int debug_level, - float** data, int** data_row_ptr, int** nz_count, int** data_col_index, - int** data_row_map, int* data_cols, int* dim, int* len, int* nz_count_len, - int* data_ptr_len); +int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size, + int mirrored, int binary, int debug_level, float **data, + int **data_row_ptr, int **nz_count, int **data_col_index, + int **data_row_map, int *data_cols, int *dim, int *len, + int *nz_count_len, int *data_ptr_len); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c index c250ff2aed998fe65248537a8b19a359206187ce..1429b087c20888102f13d9296cbdacc108965e27 100644 --- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c +++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c @@ -1,261 +1,234 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. -* -* -*/ - +/* + * Matrix Market I/O library for ANSI C + * + * See http://math.nist.gov/MatrixMarket for details. 
+ * + * + */ +#include <ctype.h> #include <stdio.h> -#include <string.h> #include <stdlib.h> -#include <ctype.h> +#include <string.h> #include "mmio.h" int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_) -{ - FILE *f; - MM_typecode matcode; - int M, N, nz; - int i; - double *val; - int *I, *J; - - if ((f = fopen(fname, "r")) == NULL) - return -1; - - - if (mm_read_banner(f, &matcode) != 0) - { - printf("mm_read_unsymetric: Could not process Matrix Market banner "); - printf(" in file [%s]\n", fname); - return -1; - } - - - - if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode))) - { - fprintf(stderr, "Sorry, this application does not support "); - fprintf(stderr, "Market Market type: [%s]\n", - mm_typecode_to_str(matcode)); - return -1; - } - - /* find out size of sparse matrix: M, N, nz .... */ - - if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) - { - fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); - return -1; - } - - *M_ = M; - *N_ = N; - *nz_ = nz; - - /* reseve memory for matrices */ - - I = (int *) malloc(nz * sizeof(int)); - J = (int *) malloc(nz * sizeof(int)); - val = (double *) malloc(nz * sizeof(double)); - - *val_ = val; - *I_ = I; - *J_ = J; - - /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ - /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ - /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ - - for (i=0; i<nz; i++) - { - fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]); - I[i]--; /* adjust from 1-based to 0-based */ - J[i]--; - } - fclose(f); - + double **val_, int **I_, int **J_) { + FILE *f; + MM_typecode matcode; + int M, N, nz; + int i; + double *val; + int *I, *J; + + if ((f = fopen(fname, "r")) == NULL) + return -1; + + if (mm_read_banner(f, &matcode) != 0) { + printf("mm_read_unsymetric: Could not process Matrix Market banner "); + printf(" in file [%s]\n", fname); + return -1; + } + + if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && + mm_is_sparse(matcode))) { + fprintf(stderr, "Sorry, this application does not support "); + fprintf(stderr, "Market Market type: [%s]\n", mm_typecode_to_str(matcode)); + return -1; + } + + /* find out size of sparse matrix: M, N, nz .... */ + + if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) { + fprintf(stderr, + "read_unsymmetric_sparse(): could not parse matrix size.\n"); + return -1; + } + + *M_ = M; + *N_ = N; + *nz_ = nz; + + /* reseve memory for matrices */ + + I = (int *)malloc(nz * sizeof(int)); + J = (int *)malloc(nz * sizeof(int)); + val = (double *)malloc(nz * sizeof(double)); + + *val_ = val; + *I_ = I; + *J_ = J; + + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ + /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ + /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ + + for (i = 0; i < nz; i++) { + fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]); + I[i]--; /* adjust from 1-based to 0-based */ + J[i]--; + } + fclose(f); + + return 0; +} + +int mm_is_valid(MM_typecode matcode) { + if (!mm_is_matrix(matcode)) return 0; + if (mm_is_dense(matcode) && mm_is_pattern(matcode)) + return 0; + if (mm_is_real(matcode) && mm_is_hermitian(matcode)) + return 0; + if (mm_is_pattern(matcode) && + (mm_is_hermitian(matcode) || mm_is_skew(matcode))) + return 0; + return 1; +} + +int mm_read_banner(FILE *f, MM_typecode *matcode) { + char line[MM_MAX_LINE_LENGTH]; + char banner[MM_MAX_TOKEN_LENGTH]; + char mtx[MM_MAX_TOKEN_LENGTH]; + char crd[MM_MAX_TOKEN_LENGTH]; + char data_type[MM_MAX_TOKEN_LENGTH]; + char storage_scheme[MM_MAX_TOKEN_LENGTH]; + char *p; + + mm_clear_typecode(matcode); + + if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) + return MM_PREMATURE_EOF; + + if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, + storage_scheme) != 5) + return MM_PREMATURE_EOF; + + for (p = mtx; *p != '\0'; *p = tolower(*p), p++) + ; /* convert to lower case */ + for (p = crd; *p != '\0'; *p = tolower(*p), p++) + ; + for (p = data_type; *p != '\0'; *p = tolower(*p), p++) + ; + for (p = storage_scheme; *p != '\0'; *p = tolower(*p), p++) + ; + + /* check for banner */ + if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0) + return MM_NO_HEADER; + + /* first field should be "mtx" */ + if (strcmp(mtx, MM_MTX_STR) != 0) + return MM_UNSUPPORTED_TYPE; + mm_set_matrix(matcode); + + /* second field describes whether this is a sparse matrix (in coordinate + storgae) or a dense array */ + + if (strcmp(crd, MM_SPARSE_STR) == 0) + mm_set_sparse(matcode); + else if (strcmp(crd, MM_DENSE_STR) == 0) + mm_set_dense(matcode); + else + return MM_UNSUPPORTED_TYPE; + + /* third field */ + + if (strcmp(data_type, MM_REAL_STR) == 0) + mm_set_real(matcode); + else if (strcmp(data_type, MM_COMPLEX_STR) == 0) + 
mm_set_complex(matcode); + else if (strcmp(data_type, MM_PATTERN_STR) == 0) + mm_set_pattern(matcode); + else if (strcmp(data_type, MM_INT_STR) == 0) + mm_set_integer(matcode); + else + return MM_UNSUPPORTED_TYPE; + + /* fourth field */ + + if (strcmp(storage_scheme, MM_GENERAL_STR) == 0) + mm_set_general(matcode); + else if (strcmp(storage_scheme, MM_SYMM_STR) == 0) + mm_set_symmetric(matcode); + else if (strcmp(storage_scheme, MM_HERM_STR) == 0) + mm_set_hermitian(matcode); + else if (strcmp(storage_scheme, MM_SKEW_STR) == 0) + mm_set_skew(matcode); + else + return MM_UNSUPPORTED_TYPE; + + return 0; } -int mm_is_valid(MM_typecode matcode) -{ - if (!mm_is_matrix(matcode)) return 0; - if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0; - if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0; - if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) || - mm_is_skew(matcode))) return 0; - return 1; +int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz) { + if (fprintf(f, "%d %d %d\n", M, N, nz) != 3) + return MM_COULD_NOT_WRITE_FILE; + else + return 0; } -int mm_read_banner(FILE *f, MM_typecode *matcode) -{ - char line[MM_MAX_LINE_LENGTH]; - char banner[MM_MAX_TOKEN_LENGTH]; - char mtx[MM_MAX_TOKEN_LENGTH]; - char crd[MM_MAX_TOKEN_LENGTH]; - char data_type[MM_MAX_TOKEN_LENGTH]; - char storage_scheme[MM_MAX_TOKEN_LENGTH]; - char *p; +int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz) { + char line[MM_MAX_LINE_LENGTH]; + int num_items_read; + /* set return null parameter values, in case we exit with errors */ + *M = *N = *nz = 0; - mm_clear_typecode(matcode); + /* now continue scanning until you reach the end-of-comments */ + do { + if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) + return MM_PREMATURE_EOF; + } while (line[0] == '%'); - if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) - return MM_PREMATURE_EOF; + /* line[] is either blank or has M,N, nz */ + if (sscanf(line, "%d %d %d", M, N, nz) == 3) + return 0; - if (sscanf(line, "%s %s %s 
%s %s", banner, mtx, crd, data_type, - storage_scheme) != 5) + else + do { + num_items_read = fscanf(f, "%d %d %d", M, N, nz); + if (num_items_read == EOF) return MM_PREMATURE_EOF; + } while (num_items_read != 3); - for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */ - for (p=crd; *p!='\0'; *p=tolower(*p),p++); - for (p=data_type; *p!='\0'; *p=tolower(*p),p++); - for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++); - - /* check for banner */ - if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0) - return MM_NO_HEADER; - - /* first field should be "mtx" */ - if (strcmp(mtx, MM_MTX_STR) != 0) - return MM_UNSUPPORTED_TYPE; - mm_set_matrix(matcode); - - - /* second field describes whether this is a sparse matrix (in coordinate - storgae) or a dense array */ - - - if (strcmp(crd, MM_SPARSE_STR) == 0) - mm_set_sparse(matcode); - else - if (strcmp(crd, MM_DENSE_STR) == 0) - mm_set_dense(matcode); - else - return MM_UNSUPPORTED_TYPE; - - - /* third field */ - - if (strcmp(data_type, MM_REAL_STR) == 0) - mm_set_real(matcode); - else - if (strcmp(data_type, MM_COMPLEX_STR) == 0) - mm_set_complex(matcode); - else - if (strcmp(data_type, MM_PATTERN_STR) == 0) - mm_set_pattern(matcode); - else - if (strcmp(data_type, MM_INT_STR) == 0) - mm_set_integer(matcode); - else - return MM_UNSUPPORTED_TYPE; - - - /* fourth field */ - - if (strcmp(storage_scheme, MM_GENERAL_STR) == 0) - mm_set_general(matcode); - else - if (strcmp(storage_scheme, MM_SYMM_STR) == 0) - mm_set_symmetric(matcode); - else - if (strcmp(storage_scheme, MM_HERM_STR) == 0) - mm_set_hermitian(matcode); - else - if (strcmp(storage_scheme, MM_SKEW_STR) == 0) - mm_set_skew(matcode); - else - return MM_UNSUPPORTED_TYPE; - - - return 0; + return 0; } -int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz) -{ - if (fprintf(f, "%d %d %d\n", M, N, nz) != 3) - return MM_COULD_NOT_WRITE_FILE; - else - return 0; -} +int mm_read_mtx_array_size(FILE *f, int *M, int *N) { + char 
line[MM_MAX_LINE_LENGTH]; + int num_items_read; + /* set return null parameter values, in case we exit with errors */ + *M = *N = 0; -int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz ) -{ - char line[MM_MAX_LINE_LENGTH]; - int num_items_read; - - /* set return null parameter values, in case we exit with errors */ - *M = *N = *nz = 0; - - /* now continue scanning until you reach the end-of-comments */ - do - { - if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) - return MM_PREMATURE_EOF; - }while (line[0] == '%'); - - /* line[] is either blank or has M,N, nz */ - if (sscanf(line, "%d %d %d", M, N, nz) == 3) - return 0; - - else - do - { - num_items_read = fscanf(f, "%d %d %d", M, N, nz); - if (num_items_read == EOF) return MM_PREMATURE_EOF; - } - while (num_items_read != 3); + /* now continue scanning until you reach the end-of-comments */ + do { + if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) + return MM_PREMATURE_EOF; + } while (line[0] == '%'); + /* line[] is either blank or has M,N, nz */ + if (sscanf(line, "%d %d", M, N) == 2) return 0; -} - -int mm_read_mtx_array_size(FILE *f, int *M, int *N) -{ - char line[MM_MAX_LINE_LENGTH]; - int num_items_read; - /* set return null parameter values, in case we exit with errors */ - *M = *N = 0; - - /* now continue scanning until you reach the end-of-comments */ - do - { - if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) - return MM_PREMATURE_EOF; - }while (line[0] == '%'); - - /* line[] is either blank or has M,N, nz */ - if (sscanf(line, "%d %d", M, N) == 2) - return 0; - - else /* we have a blank line */ - do - { - num_items_read = fscanf(f, "%d %d", M, N); - if (num_items_read == EOF) return MM_PREMATURE_EOF; - } - while (num_items_read != 2); + else /* we have a blank line */ + do { + num_items_read = fscanf(f, "%d %d", M, N); + if (num_items_read == EOF) + return MM_PREMATURE_EOF; + } while (num_items_read != 2); - return 0; + return 0; } -int mm_write_mtx_array_size(FILE *f, int M, int N) -{ - if (fprintf(f, "%d 
%d\n", M, N) != 2) - return MM_COULD_NOT_WRITE_FILE; - else - return 0; +int mm_write_mtx_array_size(FILE *f, int M, int N) { + if (fprintf(f, "%d %d\n", M, N) != 2) + return MM_COULD_NOT_WRITE_FILE; + else + return 0; } - - /*-------------------------------------------------------------------------*/ /******************************************************************/ @@ -263,65 +236,50 @@ int mm_write_mtx_array_size(FILE *f, int M, int N) /******************************************************************/ int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode) -{ - int i; - if (mm_is_complex(matcode)) - { - for (i=0; i<nz; i++) - if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1]) - != 4) return MM_PREMATURE_EOF; - } - else if (mm_is_real(matcode)) - { - for (i=0; i<nz; i++) - { - if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]) - != 3) return MM_PREMATURE_EOF; - - } + double val[], MM_typecode matcode) { + int i; + if (mm_is_complex(matcode)) { + for (i = 0; i < nz; i++) + if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2 * i], + &val[2 * i + 1]) != 4) + return MM_PREMATURE_EOF; + } else if (mm_is_real(matcode)) { + for (i = 0; i < nz; i++) { + if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]) != 3) + return MM_PREMATURE_EOF; } + } - else if (mm_is_pattern(matcode)) - { - for (i=0; i<nz; i++) - if (fscanf(f, "%d %d", &I[i], &J[i]) - != 2) return MM_PREMATURE_EOF; - } - else - return MM_UNSUPPORTED_TYPE; + else if (mm_is_pattern(matcode)) { + for (i = 0; i < nz; i++) + if (fscanf(f, "%d %d", &I[i], &J[i]) != 2) + return MM_PREMATURE_EOF; + } else + return MM_UNSUPPORTED_TYPE; - return 0; - + return 0; } -int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, - double *real, double *imag, MM_typecode matcode) -{ - if (mm_is_complex(matcode)) - { - if (fscanf(f, "%d %d %lg %lg", I, J, real, imag) - != 4) return MM_PREMATURE_EOF; - } - else if (mm_is_real(matcode)) - { - if (fscanf(f, "%d %d 
%lg\n", I, J, real) - != 3) return MM_PREMATURE_EOF; +int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *imag, + MM_typecode matcode) { + if (mm_is_complex(matcode)) { + if (fscanf(f, "%d %d %lg %lg", I, J, real, imag) != 4) + return MM_PREMATURE_EOF; + } else if (mm_is_real(matcode)) { + if (fscanf(f, "%d %d %lg\n", I, J, real) != 3) + return MM_PREMATURE_EOF; - } + } - else if (mm_is_pattern(matcode)) - { - if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF; - } - else - return MM_UNSUPPORTED_TYPE; + else if (mm_is_pattern(matcode)) { + if (fscanf(f, "%d %d", I, J) != 2) + return MM_PREMATURE_EOF; + } else + return MM_UNSUPPORTED_TYPE; - return 0; - + return 0; } - /************************************************************************ mm_read_mtx_crd() fills M, N, nz, array of values, and return type code, e.g. 'MCRS' @@ -330,182 +288,160 @@ int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, (nz pairs of real/imaginary values) ************************************************************************/ -int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, - double **val, MM_typecode *matcode) -{ - int ret_code; - FILE *f; - - if (strcmp(fname, "stdin") == 0) f=stdin; - else - if ((f = fopen(fname, "r")) == NULL) - return MM_COULD_NOT_READ_FILE; - - - if ((ret_code = mm_read_banner(f, matcode)) != 0) - return ret_code; - - if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && - mm_is_matrix(*matcode))) - return MM_UNSUPPORTED_TYPE; - - if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0) - return ret_code; - - - *I = (int *) malloc(*nz * sizeof(int)); - *J = (int *) malloc(*nz * sizeof(int)); - *val = NULL; - - if (mm_is_complex(*matcode)) - { - *val = (double *) malloc(*nz * 2 * sizeof(double)); - ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, - *matcode); - if (ret_code != 0) return ret_code; - } - else if (mm_is_real(*matcode)) - { - *val = (double *) malloc(*nz * sizeof(double)); - 
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, - *matcode); - if (ret_code != 0) return ret_code; - } +int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, + double **val, MM_typecode *matcode) { + int ret_code; + FILE *f; + + if (strcmp(fname, "stdin") == 0) + f = stdin; + else if ((f = fopen(fname, "r")) == NULL) + return MM_COULD_NOT_READ_FILE; + + if ((ret_code = mm_read_banner(f, matcode)) != 0) + return ret_code; + + if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && + mm_is_matrix(*matcode))) + return MM_UNSUPPORTED_TYPE; + + if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0) + return ret_code; + + *I = (int *)malloc(*nz * sizeof(int)); + *J = (int *)malloc(*nz * sizeof(int)); + *val = NULL; + + if (mm_is_complex(*matcode)) { + *val = (double *)malloc(*nz * 2 * sizeof(double)); + ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, *matcode); + if (ret_code != 0) + return ret_code; + } else if (mm_is_real(*matcode)) { + *val = (double *)malloc(*nz * sizeof(double)); + ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, *matcode); + if (ret_code != 0) + return ret_code; + } + + else if (mm_is_pattern(*matcode)) { + ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, *matcode); + if (ret_code != 0) + return ret_code; + } + + if (f != stdin) + fclose(f); + return 0; +} - else if (mm_is_pattern(*matcode)) - { - ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, - *matcode); - if (ret_code != 0) return ret_code; - } +int mm_write_banner(FILE *f, MM_typecode matcode) { + char *str = mm_typecode_to_str(matcode); + int ret_code; - if (f != stdin) fclose(f); + ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str); + free(str); + if (ret_code != 2) + return MM_COULD_NOT_WRITE_FILE; + else return 0; } -int mm_write_banner(FILE *f, MM_typecode matcode) -{ - char *str = mm_typecode_to_str(matcode); - int ret_code; - - ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str); - 
free(str); - if (ret_code !=2 ) - return MM_COULD_NOT_WRITE_FILE; - else - return 0; -} - int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode) -{ - FILE *f; - int i; - - if (strcmp(fname, "stdout") == 0) - f = stdout; - else - if ((f = fopen(fname, "w")) == NULL) - return MM_COULD_NOT_WRITE_FILE; - - /* print banner followed by typecode */ - fprintf(f, "%s ", MatrixMarketBanner); - fprintf(f, "%s\n", mm_typecode_to_str(matcode)); - - /* print matrix sizes and nonzeros */ - fprintf(f, "%d %d %d\n", M, N, nz); - - /* print values */ - if (mm_is_pattern(matcode)) - for (i=0; i<nz; i++) - fprintf(f, "%d %d\n", I[i], J[i]); - else - if (mm_is_real(matcode)) - for (i=0; i<nz; i++) - fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]); - else - if (mm_is_complex(matcode)) - for (i=0; i<nz; i++) - fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i], - val[2*i+1]); - else - { - if (f != stdout) fclose(f); - return MM_UNSUPPORTED_TYPE; - } - - if (f !=stdout) fclose(f); + double val[], MM_typecode matcode) { + FILE *f; + int i; + + if (strcmp(fname, "stdout") == 0) + f = stdout; + else if ((f = fopen(fname, "w")) == NULL) + return MM_COULD_NOT_WRITE_FILE; + + /* print banner followed by typecode */ + fprintf(f, "%s ", MatrixMarketBanner); + fprintf(f, "%s\n", mm_typecode_to_str(matcode)); + + /* print matrix sizes and nonzeros */ + fprintf(f, "%d %d %d\n", M, N, nz); + + /* print values */ + if (mm_is_pattern(matcode)) + for (i = 0; i < nz; i++) + fprintf(f, "%d %d\n", I[i], J[i]); + else if (mm_is_real(matcode)) + for (i = 0; i < nz; i++) + fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]); + else if (mm_is_complex(matcode)) + for (i = 0; i < nz; i++) + fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2 * i], + val[2 * i + 1]); + else { + if (f != stdout) + fclose(f); + return MM_UNSUPPORTED_TYPE; + } + + if (f != stdout) + fclose(f); - return 0; + return 0; } - /** -* Create a new copy of a string s. 
mm_strdup() is a common routine, but -* not part of ANSI C, so it is included here. Used by mm_typecode_to_str(). -* -*/ -char *mm_strdup(const char *s) -{ - int len = strlen(s); - char *s2 = (char *) malloc((len+1)*sizeof(char)); - return strcpy(s2, s); + * Create a new copy of a string s. mm_strdup() is a common routine, but + * not part of ANSI C, so it is included here. Used by mm_typecode_to_str(). + * + */ +char *mm_strdup(const char *s) { + int len = strlen(s); + char *s2 = (char *)malloc((len + 1) * sizeof(char)); + return strcpy(s2, s); } -char *mm_typecode_to_str(MM_typecode matcode) -{ - char buffer[MM_MAX_LINE_LENGTH]; - char *types[4]; - char *mm_strdup(const char *); - int error =0; - - /* check for MTX type */ - if (mm_is_matrix(matcode)) - types[0] = MM_MTX_STR; - else - error=1; - - /* check for CRD or ARR matrix */ - if (mm_is_sparse(matcode)) - types[1] = MM_SPARSE_STR; - else - if (mm_is_dense(matcode)) - types[1] = MM_DENSE_STR; - else - return NULL; - - /* check for element data type */ - if (mm_is_real(matcode)) - types[2] = MM_REAL_STR; - else - if (mm_is_complex(matcode)) - types[2] = MM_COMPLEX_STR; - else - if (mm_is_pattern(matcode)) - types[2] = MM_PATTERN_STR; - else - if (mm_is_integer(matcode)) - types[2] = MM_INT_STR; - else - return NULL; - - - /* check for symmetry type */ - if (mm_is_general(matcode)) - types[3] = MM_GENERAL_STR; - else - if (mm_is_symmetric(matcode)) - types[3] = MM_SYMM_STR; - else - if (mm_is_hermitian(matcode)) - types[3] = MM_HERM_STR; - else - if (mm_is_skew(matcode)) - types[3] = MM_SKEW_STR; - else - return NULL; - - sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]); - return mm_strdup(buffer); - +char *mm_typecode_to_str(MM_typecode matcode) { + char buffer[MM_MAX_LINE_LENGTH]; + char *types[4]; + char *mm_strdup(const char *); + int error = 0; + + /* check for MTX type */ + if (mm_is_matrix(matcode)) + types[0] = MM_MTX_STR; + else + error = 1; + + /* check for CRD or ARR matrix */ + 
if (mm_is_sparse(matcode)) + types[1] = MM_SPARSE_STR; + else if (mm_is_dense(matcode)) + types[1] = MM_DENSE_STR; + else + return NULL; + + /* check for element data type */ + if (mm_is_real(matcode)) + types[2] = MM_REAL_STR; + else if (mm_is_complex(matcode)) + types[2] = MM_COMPLEX_STR; + else if (mm_is_pattern(matcode)) + types[2] = MM_PATTERN_STR; + else if (mm_is_integer(matcode)) + types[2] = MM_INT_STR; + else + return NULL; + + /* check for symmetry type */ + if (mm_is_general(matcode)) + types[3] = MM_GENERAL_STR; + else if (mm_is_symmetric(matcode)) + types[3] = MM_SYMM_STR; + else if (mm_is_hermitian(matcode)) + types[3] = MM_HERM_STR; + else if (mm_is_skew(matcode)) + types[3] = MM_SKEW_STR; + else + return NULL; + + sprintf(buffer, "%s %s %s %s", types[0], types[1], types[2], types[3]); + return mm_strdup(buffer); } diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h index 7cfd0a1b7ae7572e85b8c10bcb2fd0b3333ad0b6..ffb80cab0cc94b05a5b97d1d3b28b1b53d8c0d52 100644 --- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h +++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h @@ -1,10 +1,10 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. -* -* -*/ +/* + * Matrix Market I/O library for ANSI C + * + * See http://math.nist.gov/MatrixMarket for details. 
+ * + * + */ #ifndef MM_IO_H #define MM_IO_H @@ -25,109 +25,99 @@ int mm_write_banner(FILE *f, MM_typecode matcode); int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); int mm_write_mtx_array_size(FILE *f, int M, int N); - /********************* MM_typecode query fucntions ***************************/ -#define mm_is_matrix(typecode) ((typecode)[0]=='M') - -#define mm_is_sparse(typecode) ((typecode)[1]=='C') -#define mm_is_coordinate(typecode)((typecode)[1]=='C') -#define mm_is_dense(typecode) ((typecode)[1]=='A') -#define mm_is_array(typecode) ((typecode)[1]=='A') +#define mm_is_matrix(typecode) ((typecode)[0] == 'M') -#define mm_is_complex(typecode) ((typecode)[2]=='C') -#define mm_is_real(typecode) ((typecode)[2]=='R') -#define mm_is_pattern(typecode) ((typecode)[2]=='P') -#define mm_is_integer(typecode) ((typecode)[2]=='I') +#define mm_is_sparse(typecode) ((typecode)[1] == 'C') +#define mm_is_coordinate(typecode) ((typecode)[1] == 'C') +#define mm_is_dense(typecode) ((typecode)[1] == 'A') +#define mm_is_array(typecode) ((typecode)[1] == 'A') -#define mm_is_symmetric(typecode)((typecode)[3]=='S') -#define mm_is_general(typecode) ((typecode)[3]=='G') -#define mm_is_skew(typecode) ((typecode)[3]=='K') -#define mm_is_hermitian(typecode)((typecode)[3]=='H') +#define mm_is_complex(typecode) ((typecode)[2] == 'C') +#define mm_is_real(typecode) ((typecode)[2] == 'R') +#define mm_is_pattern(typecode) ((typecode)[2] == 'P') +#define mm_is_integer(typecode) ((typecode)[2] == 'I') -int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ +#define mm_is_symmetric(typecode) ((typecode)[3] == 'S') +#define mm_is_general(typecode) ((typecode)[3] == 'G') +#define mm_is_skew(typecode) ((typecode)[3] == 'K') +#define mm_is_hermitian(typecode) ((typecode)[3] == 'H') +int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ /********************* MM_typecode modify fucntions ***************************/ -#define mm_set_matrix(typecode) 
((*typecode)[0]='M') -#define mm_set_coordinate(typecode) ((*typecode)[1]='C') -#define mm_set_array(typecode) ((*typecode)[1]='A') -#define mm_set_dense(typecode) mm_set_array(typecode) -#define mm_set_sparse(typecode) mm_set_coordinate(typecode) - -#define mm_set_complex(typecode)((*typecode)[2]='C') -#define mm_set_real(typecode) ((*typecode)[2]='R') -#define mm_set_pattern(typecode)((*typecode)[2]='P') -#define mm_set_integer(typecode)((*typecode)[2]='I') +#define mm_set_matrix(typecode) ((*typecode)[0] = 'M') +#define mm_set_coordinate(typecode) ((*typecode)[1] = 'C') +#define mm_set_array(typecode) ((*typecode)[1] = 'A') +#define mm_set_dense(typecode) mm_set_array(typecode) +#define mm_set_sparse(typecode) mm_set_coordinate(typecode) +#define mm_set_complex(typecode) ((*typecode)[2] = 'C') +#define mm_set_real(typecode) ((*typecode)[2] = 'R') +#define mm_set_pattern(typecode) ((*typecode)[2] = 'P') +#define mm_set_integer(typecode) ((*typecode)[2] = 'I') -#define mm_set_symmetric(typecode)((*typecode)[3]='S') -#define mm_set_general(typecode)((*typecode)[3]='G') -#define mm_set_skew(typecode) ((*typecode)[3]='K') -#define mm_set_hermitian(typecode)((*typecode)[3]='H') +#define mm_set_symmetric(typecode) ((*typecode)[3] = 'S') +#define mm_set_general(typecode) ((*typecode)[3] = 'G') +#define mm_set_skew(typecode) ((*typecode)[3] = 'K') +#define mm_set_hermitian(typecode) ((*typecode)[3] = 'H') -#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \ - (*typecode)[2]=' ',(*typecode)[3]='G') +#define mm_clear_typecode(typecode) \ + ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G') #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) - /********************* Matrix Market error codes ***************************/ - -#define MM_COULD_NOT_READ_FILE 11 -#define MM_PREMATURE_EOF 12 -#define MM_NOT_MTX 13 -#define MM_NO_HEADER 14 -#define MM_UNSUPPORTED_TYPE 15 -#define MM_LINE_TOO_LONG 16 -#define 
MM_COULD_NOT_WRITE_FILE 17 - +#define MM_COULD_NOT_READ_FILE 11 +#define MM_PREMATURE_EOF 12 +#define MM_NOT_MTX 13 +#define MM_NO_HEADER 14 +#define MM_UNSUPPORTED_TYPE 15 +#define MM_LINE_TOO_LONG 16 +#define MM_COULD_NOT_WRITE_FILE 17 /******************** Matrix Market internal definitions ******************** MM_matrix_typecode: 4-character sequence - ojbect sparse/ data storage - dense type scheme + ojbect sparse/ data + storage dense type scheme string position: [0] [1] [2] [3] Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) - A(array) C(omplex) H(ermitian) - P(attern) S(ymmetric) - I(nteger) K(kew) + A(array) + C(omplex) H(ermitian) P(attern) S(ymmetric) I(nteger) K(kew) ***********************************************************************/ -#define MM_MTX_STR "matrix" -#define MM_ARRAY_STR "array" -#define MM_DENSE_STR "array" -#define MM_COORDINATE_STR "coordinate" -#define MM_SPARSE_STR "coordinate" -#define MM_COMPLEX_STR "complex" -#define MM_REAL_STR "real" -#define MM_INT_STR "integer" -#define MM_GENERAL_STR "general" -#define MM_SYMM_STR "symmetric" -#define MM_HERM_STR "hermitian" -#define MM_SKEW_STR "skew-symmetric" -#define MM_PATTERN_STR "pattern" - +#define MM_MTX_STR "matrix" +#define MM_ARRAY_STR "array" +#define MM_DENSE_STR "array" +#define MM_COORDINATE_STR "coordinate" +#define MM_SPARSE_STR "coordinate" +#define MM_COMPLEX_STR "complex" +#define MM_REAL_STR "real" +#define MM_INT_STR "integer" +#define MM_GENERAL_STR "general" +#define MM_SYMM_STR "symmetric" +#define MM_HERM_STR "hermitian" +#define MM_SKEW_STR "skew-symmetric" +#define MM_PATTERN_STR "pattern" /* high level routines */ int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); + double val[], MM_typecode matcode); int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); + double val[], MM_typecode matcode); int mm_read_mtx_crd_entry(FILE *f, int *I, int 
*J, double *real, double *img, - MM_typecode matcode); + MM_typecode matcode); int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_); - - + double **val_, int **I_, int **J_); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c index 0dc09ff3709ebbbbabc11909673ed699474fea25..ab82ed41118e7a5d596cd51273e897d944e34be7 100644 --- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c +++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c @@ -1,19 +1,17 @@ #include "convert_dataset.h" int main() { - float* data; - int* data_row_ptr, *nz_count, *data_col_index; - int *rows, cols, dim, nz_count_len, len; - - coo_to_jds( - "fidapm05.mtx", // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 4, // row padding - 4, // warp size - 2, // pack size - 1, // is mirrored? - 0, // binary matrix - 3, // debug level [0:2] - &data, &data_row_ptr, &nz_count, &data_col_index, - &rows, &cols, &dim, &len, &nz_count_len - ); + float *data; + int *data_row_ptr, *nz_count, *data_col_index; + int *rows, cols, dim, nz_count_len, len; + + coo_to_jds("fidapm05.mtx", // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 4, // row padding + 4, // warp size + 2, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 3, // debug level [0:2] + &data, &data_row_ptr, &nz_count, &data_col_index, &rows, &cols, + &dim, &len, &nz_count_len); } \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * 
sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h index 560c32f4e992657920956c49c8c48deae8f9428c..abc849f930fb63d231d4453d2b9c07183e5758bd 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h @@ -6,9 +6,9 
@@ *cr ***************************************************************************/ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c b/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c index 8fa9e339b89b50a26c5cb205e80849875d75e4c8..0528323fd945bedb7d756deb61079c7fda9ce3a6 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c @@ -10,118 +10,103 @@ #include <stdio.h> #include <stdlib.h> -#include "file.h" #include "convert_dataset.h" +#include "file.h" +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + printf("CPU-based sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // parameters declaration + int len; + int depth; + int dim; + int pad = 1; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + // 
load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - - - - - printf("CPU-based sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting two input filenames\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters declaration - int len; - int depth; - int dim; - int pad=1; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // 
pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1], h_x_vector,dim); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - - int p, i, k; - //main execution - for(p=0;p<50;p++) - { - for (i = 0; i < dim; i++) { - float sum = 0.0f; - //int bound = h_nzcnt[i / 32]; - int bound = h_nzcnt[i]; - for(k=0;k<bound;k++ ) { - int j = h_ptr[k] + i; - int in = h_indices[j]; - - float d = h_data[j]; - float t = h_x_vector[in]; - - sum += d*t; - } - h_Ax_vector[h_perm[i]] = sum; - } - } - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + // main execution + for (p = 0; p < 50; p++) { + for (i = 0; i < dim; i++) { + float sum = 0.0f; + // int bound = h_nzcnt[i / 32]; + int bound = h_nzcnt[i]; + for (k = 0; k < bound; k++) { + int j = h_ptr[k] + i; + int in = h_indices[j]; + + float d = h_data[j]; + float t = h_x_vector[in]; + + sum += d * t; + } + h_Ax_vector[h_perm[i]] = sum; + } + } + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + return 0; } diff --git 
a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = 
(int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** 
h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc @@ -1,61 +1,43 @@ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> + +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm) { + int max_thread; + int max_warp; + int max_block = 8; + if (major == 1) { + if (minor >= 2) { + max_thread = 1024; + max_warp = 32; + } else { + max_thread = 768; + max_warp = 24; + } + } else if (major == 2) { + max_thread = 1536; + max_warp = 48; + } else { + // newer GPU //keep using 2.0 + max_thread = 1536; + max_warp = 48; + } -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm) -{ - int max_thread; - int max_warp; - int max_block=8; - if(major==1) - { - if(minor>=2) - { - max_thread=1024; - max_warp=32; - } - else - { - max_thread=768; - max_warp=24; - } - } - else if(major==2) - { - max_thread=1536; - max_warp=48; - } - else - { - //newer GPU //keep using 2.0 - max_thread=1536; - max_warp=48; - } - - int _grid; - int _thread; - int threads_per_sm=0; - 
if(task*pad>sm*max_thread) - { - //_grid=sm*max_block; - _thread=max_thread/max_block; - _grid=(task*pad+_thread-1)/_thread; - } - else - { - _thread=pad; - _grid=task; - } - thread[0]=_thread; - grid[0]=_grid; - + int _grid; + int _thread; + int threads_per_sm = 0; + if (task * pad > sm * max_thread) { + //_grid=sm*max_block; + _thread = max_thread / max_block; + _grid = (task * pad + _thread - 1) / _thread; + } else { + _thread = pad; + _grid = task; + } + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h index 6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h @@ -1,8 +1,3 @@ -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm); \ No newline at end of file +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm); \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h index 075c239463c463a2b091b9d9b29647b4655ff13d..975057d6a91fc8f239ad0aa8f9b2cee4cd811e07 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h @@ -1,22 +1,21 @@ +#define CUERR \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } +// TEXTURE memory +texture<float, 1> tex_x_float; - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} - - 
-//TEXTURE memory -texture<float,1> tex_x_float; - -//constant memory +// constant memory __constant__ int jds_ptr_int[5000]; __constant__ int sh_zcnt_int[5000]; -__global__ void spmv_jds(float *dst_vector, - const float *d_data,const int *d_index, const int *d_perm, - const float *x_vec,const int *d_nzcnt,const int dem); - +__global__ void spmv_jds(float *dst_vector, const float *d_data, + const int *d_index, const int *d_perm, + const float *x_vec, const int *d_nzcnt, const int dem); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc @@ -1,61 +1,43 @@ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> + +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm) { + int max_thread; + int max_warp; + int max_block = 8; + if (major == 1) { + if (minor >= 2) { + max_thread = 1024; + max_warp = 32; + } else { + max_thread = 768; + max_warp = 24; + } + } else if (major == 2) { + max_thread = 1536; + max_warp = 48; + } else { + // newer GPU //keep using 2.0 + max_thread = 1536; + max_warp = 48; + } -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm) -{ - int max_thread; - int max_warp; - int max_block=8; - if(major==1) - { - if(minor>=2) - { - max_thread=1024; - max_warp=32; - } - else - { - max_thread=768; - max_warp=24; - } - } - else if(major==2) - { - max_thread=1536; - max_warp=48; - } - else - { - //newer GPU //keep using 2.0 - max_thread=1536; - max_warp=48; - } - - int _grid; - int _thread; - int threads_per_sm=0; - if(task*pad>sm*max_thread) - { - //_grid=sm*max_block; - _thread=max_thread/max_block; - _grid=(task*pad+_thread-1)/_thread; - } - else - { - _thread=pad; - _grid=task; - } - thread[0]=_thread; - grid[0]=_grid; - + int _grid; + int _thread; + int threads_per_sm = 0; + if (task * pad > sm * max_thread) { + //_grid=sm*max_block; + _thread = max_thread / max_block; + _grid = (task * pad + _thread - 1) / _thread; + } else { + _thread = pad; + _grid = task; + } + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h index 
6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h @@ -1,8 +1,3 @@ -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm); \ No newline at end of file +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm); \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h index c9d3062a28dbbcc3bf073884205e1d5054d8bd1f..56ed524240725ae252af41fa43c0a43b208abdd6 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h @@ -1,22 +1,22 @@ +#define CUERR \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } +// TEXTURE memory +texture<float, 1> tex_x_float; - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} - - -//TEXTURE memory -texture<float,1> tex_x_float; - -//constant memory +// constant memory __constant__ int jds_ptr_int[5000]; __constant__ int sh_zcnt_int[5000]; -__global__ void spmv_jds_naive(float *dst_vector, - const float *d_data,const int *d_index, const int *d_perm, - const float *x_vec,const int *d_nzcnt,const int dem); - +__global__ void spmv_jds_naive(float *dst_vector, const float *d_data, + const int *d_index, const int *d_perm, + const float *x_vec, const int *d_nzcnt, + const int dem); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc 
b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * 
sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, 
int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc @@ -1,61 +1,43 @@ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> + +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm) { + int max_thread; + int max_warp; + int max_block = 8; + if (major == 1) { + if (minor >= 2) { + max_thread = 1024; + max_warp = 32; + } else { + max_thread = 768; + max_warp = 24; + } + } else if (major == 2) { + max_thread = 1536; + max_warp = 48; + } else { + // newer GPU //keep using 2.0 + max_thread = 1536; + max_warp = 48; + } -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm) -{ - int max_thread; - int max_warp; - int max_block=8; - if(major==1) - { - if(minor>=2) - { - max_thread=1024; - max_warp=32; - } - else - { - max_thread=768; - max_warp=24; - } - } - else if(major==2) - { - max_thread=1536; - max_warp=48; - } - else - { - //newer GPU //keep using 2.0 - max_thread=1536; - max_warp=48; - } - - int _grid; - int _thread; - 
int threads_per_sm=0; - if(task*pad>sm*max_thread) - { - //_grid=sm*max_block; - _thread=max_thread/max_block; - _grid=(task*pad+_thread-1)/_thread; - } - else - { - _thread=pad; - _grid=task; - } - thread[0]=_thread; - grid[0]=_grid; - + int _grid; + int _thread; + int threads_per_sm = 0; + if (task * pad > sm * max_thread) { + //_grid=sm*max_block; + _thread = max_thread / max_block; + _grid = (task * pad + _thread - 1) / _thread; + } else { + _thread = pad; + _grid = task; + } + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h index 6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h @@ -1,8 +1,3 @@ -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm); \ No newline at end of file +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm); \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h index 8eef25854e60a98b14bbc0925d2d4886afafdaaf..70f7a33ae9bb6f0af9e59c153bba308c842273f7 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h @@ -1,22 +1,22 @@ +#define CUERR \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } +// TEXTURE memory +texture<float, 1> tex_x_float; - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - 
printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} - - -//TEXTURE memory -texture<float,1> tex_x_float; - -//constant memory +// constant memory __constant__ int jds_ptr_int[5000]; __constant__ int sh_zcnt_int[5000]; -__global__ void spmv_jds_texture(float *dst_vector, - const float *d_data,const int *d_index, const int *d_perm, - const float *x_vec,const int *d_nzcnt,const int dem); - +__global__ void spmv_jds_texture(float *dst_vector, const float *d_data, + const int *d_index, const int *d_perm, + const float *x_vec, const int *d_nzcnt, + const int dem); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc @@ -1,61 +1,43 @@ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> + +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm) { + int max_thread; + int max_warp; + int max_block = 8; + if (major == 1) { + if (minor >= 2) { + max_thread = 1024; + max_warp = 32; + } else { + max_thread = 768; + max_warp = 24; + } + } else if (major == 2) { + max_thread = 1536; + max_warp = 48; + } else { + // newer GPU //keep using 2.0 + max_thread = 1536; + max_warp = 48; + } -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm) -{ - int max_thread; - int max_warp; - int max_block=8; - if(major==1) - { - if(minor>=2) - { - max_thread=1024; - max_warp=32; - } - else - { - max_thread=768; - max_warp=24; - } - } - else if(major==2) - { - max_thread=1536; - max_warp=48; - } - else - { - //newer GPU //keep using 2.0 - max_thread=1536; - max_warp=48; - } - - int _grid; - int _thread; - int threads_per_sm=0; - if(task*pad>sm*max_thread) - { - //_grid=sm*max_block; - _thread=max_thread/max_block; - _grid=(task*pad+_thread-1)/_thread; - } - else - { - _thread=pad; - _grid=task; - } - thread[0]=_thread; - grid[0]=_grid; - + int _grid; + int _thread; + int threads_per_sm = 0; + if (task * pad > sm * max_thread) { + //_grid=sm*max_block; + _thread = max_thread / max_block; + _grid = (task * pad + _thread - 1) / _thread; + } else { + _thread = pad; + _grid = task; + } + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h index 
6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h @@ -1,8 +1,3 @@ -void compute_active_thread(unsigned int *thread, - unsigned int *grid, - int task, - int pad, - int major, - int minor, - int warp_size, - int sm); \ No newline at end of file +void compute_active_thread(unsigned int *thread, unsigned int *grid, int task, + int pad, int major, int minor, int warp_size, + int sm); \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h index 8eef25854e60a98b14bbc0925d2d4886afafdaaf..70f7a33ae9bb6f0af9e59c153bba308c842273f7 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h @@ -1,22 +1,22 @@ +#define CUERR \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } +// TEXTURE memory +texture<float, 1> tex_x_float; - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} - - -//TEXTURE memory -texture<float,1> tex_x_float; - -//constant memory +// constant memory __constant__ int jds_ptr_int[5000]; __constant__ int sh_zcnt_int[5000]; -__global__ void spmv_jds_texture(float *dst_vector, - const float *d_data,const int *d_index, const int *d_perm, - const float *x_vec,const int *d_nzcnt,const int dem); - +__global__ void spmv_jds_texture(float *dst_vector, const float *d_data, + const int *d_index, const int *d_perm, + const float *x_vec, const int *d_nzcnt, + const int dem); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c 
b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - 
fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h index 560c32f4e992657920956c49c8c48deae8f9428c..abc849f930fb63d231d4453d2b9c07183e5758bd 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h @@ -6,9 +6,9 @@ *cr ***************************************************************************/ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** 
h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c index 8046b490c4091a102b385013940a45198adf81a0..aa4c36e3e4f8962d81d97c074498f7b44ad06224 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c @@ -10,26 +10,21 @@ #include <stdio.h> #include <stdlib.h> -#include "file.h" #include "convert_dataset.h" +#include "file.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); +static int generate_vector(float *x_vector, int dim) { + srand(54321); int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } /* -void jdsmv(int height, int len, float* value, int* perm, int* jds_ptr, int* col_index, float* vector, - float* result){ - int i; - int col,row; - int row_index =0; +void jdsmv(int height, int len, float* value, int* perm, int* jds_ptr, int* +col_index, float* vector, float* result){ int i; int col,row; int row_index =0; int prem_indicator=0; for (i=0; i<len; i++){ if (i>=jds_ptr[prem_indicator+1]){ @@ -47,120 +42,105 @@ void jdsmv(int height, int len, float* value, int* perm, int* jds_ptr, int* col_ return; } */ -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - - - - - printf("CPU-based sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang 
<lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting two input filenames\n"); - exit(-1); - } +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CPU-based sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // parameters declaration + int len; + int depth; + int dim; + int pad = 1; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + + int col_count; + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + // generate_vector(h_x_vector, dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters declaration - int len; - int depth; - int dim; - int pad=1; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - - - - int col_count; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? 
- 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); -// generate_vector(h_x_vector, dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - int p, i; - //main execution - for(p=0;p<50;p++) - { - #pragma omp parallel for - for (i = 0; i < dim; i++) { + // main execution + for (p = 0; p < 50; p++) { +#pragma omp parallel for + for (i = 0; i < dim; i++) { int k; - float sum = 0.0f; - //int bound = h_nzcnt[i / 32]; - int bound = h_nzcnt[i]; - for(k=0;k<bound;k++ ) { - int j = h_ptr[k] + i; - int in = h_indices[j]; - - float d = h_data[j]; - float t = h_x_vector[in]; - - sum += d*t; - } - // #pragma omp critical - h_Ax_vector[h_perm[i]] = sum; - } - } - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; - + float sum = 0.0f; + // int bound = h_nzcnt[i / 32]; + int bound = h_nzcnt[i]; + for (k = 0; k < bound; k++) { + int j = h_ptr[k] + i; + int in = h_indices[j]; + + float d = h_data[j]; + float t = h_x_vector[in]; + + sum += d * t; + } + // #pragma omp critical + h_Ax_vector[h_perm[i]] = sum; + } + } + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_SwitchToTimer(&timers, 
pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof 
(float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h @@ -8,11 +8,11 @@ #ifndef 
__FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c index 6a9f404808baf25bf985524f0b90ad0eafc8cda0..4bc1b3f79a52e77a7c0524fedc9cd3c8c5137b7a 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c @@ -6,10 +6,10 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" @@ -17,27 +17,20 @@ * Workgroup is multiple of 64 threads * Max threads 265 */ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad) -{ - int max_thread=496*64; - int max_block=256; - int _grid; - int _thread; - - if(task*pad>max_thread) - { - _thread= max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad) { + int max_thread = 496 * 64; + int max_block = 256; + int _grid; + int _thread; + + if (task * pad > max_thread) { + _thread = max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; 
- grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h index 4a061ca31c6af45d5940d9b221fb188408127367..fe1c5cb6c23e3ef6a08da51320d6c565fd28d5d7 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h @@ -9,9 +9,6 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c index 6b5d046583944217fe3a72767944f45a89e73e23..22c1b51753fd8591300d7fd5200dd346e0e8e058 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c @@ -8,272 +8,292 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> #include <string.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - //x_vector[0] = 1.0; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - - //x_vector[i] = 1.0; - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + // x_vector[0] = 1.0; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + + // x_vector[i] = 1.0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL base sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - 
printf("Optimized for ATI 5000 series by Ian Wetherbee <wetherb1@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting two input filenames\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters declaration - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_ati/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-Werror"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - if (clStatus != CL_SUCCESS) { - size_t paramSize = 1024*1024, paramRet; - char* paramValue; - paramValue = (char*) calloc(paramSize, sizeof(char)); - clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize, paramValue, ¶mRet); - printf(paramValue); - return -1; - } - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_vec",&clStatus); - CHECK_ERROR("clCreateKernel") - - int len; - int depth; - int dim; - int pad=64; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - 
int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - cl_mem d_nzcnt; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - // HACK: remove the .bin from the end of data, remove later - //parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00; - printf("Input file %s\n", parameters->inpFiles[1]); - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - int col_count; - int warp_size=64; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - warp_size, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - printf("Executing kernel...\n"); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); - //generate_vector(h_x_vector,dim) ; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - - /* - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - */ - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - 
CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = 
clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,8,sizeof(int),&warp_size); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int i; - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_nzcnt); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - printf("Output has %d entries\n", dim); - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //int temp = ((dim + 31)/32)*32; - outputData(parameters->outFile,h_Ax_vector,dim); - } - - 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL base sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("Optimized for ATI 5000 series by Ian Wetherbee " + "<wetherb1@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // parameters declaration + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_ati/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, 
"-Werror"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + if (clStatus != CL_SUCCESS) { + size_t paramSize = 1024 * 1024, paramRet; + char *paramValue; + paramValue = (char *)calloc(paramSize, sizeof(char)); + clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize, + paramValue, ¶mRet); + printf(paramValue); + return -1; + } + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_vec", &clStatus); + CHECK_ERROR("clCreateKernel") + + int len; + int depth; + int dim; + int pad = 64; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + cl_mem d_nzcnt; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // HACK: remove the .bin from the end of data, remove later + // parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00; + printf("Input file %s\n", parameters->inpFiles[1]); + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + int col_count; + int warp_size = 64; + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + warp_size, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + printf("Executing kernel...\n"); + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + // generate_vector(h_x_vector,dim) ; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + /* + OpenCLDeviceProp clDeviceProp; + clStatus = + clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); + CHECK_ERROR("clGetDeviceInfo") + */ + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 
+ nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = 
clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &warp_size); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_nzcnt); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + printf("Output has %d entries\n", dim); + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // int temp = ((dim + 31)/32)*32; + outputData(parameters->outFile, h_Ax_vector, dim); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c @@ -1,48 +1,45 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - fclose(fp); - return buffer; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h index 
8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c index d59d91d0a4310d165aaf925ced2d768cd6a74a12..4bc1b3f79a52e77a7c0524fedc9cd3c8c5137b7a 
100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c @@ -6,10 +6,10 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" @@ -17,27 +17,20 @@ * Workgroup is multiple of 64 threads * Max threads 265 */ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad) -{ - int max_thread=496*64; - int max_block=256; - int _grid; - int _thread; - - if(task*pad>max_thread) - { - _thread=max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad) { + int max_thread = 496 * 64; + int max_block = 256; + int _grid; + int _thread; + + if (task * pad > max_thread) { + _thread = max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h index 4a061ca31c6af45d5940d9b221fb188408127367..fe1c5cb6c23e3ef6a08da51320d6c565fd28d5d7 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h @@ -9,9 +9,6 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c index 
960d0406ca7ad3ec0460cd74ce35f477d2aab78e..ff6dd4138e4f7ab2421dc25c15bf9453f8691e83 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c @@ -8,273 +8,292 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> #include <string.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - //x_vector[0] = 1.0; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - - //x_vector[i] = 1.0; - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + // x_vector[0] = 1.0; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + + // x_vector[i] = 1.0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("Optimized for ATI 5000 series by Ian Wetherbee <wetherb1@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting two input filenames\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters declaration - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - 
cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_ati_vec/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-Werror"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - if (clStatus != CL_SUCCESS) { - size_t paramSize = 1024*1024, paramRet; - char* paramValue; - paramValue = (char*) calloc(paramSize, sizeof(char)); - clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize, paramValue, ¶mRet); - printf(paramValue); - return -1; - } - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_vec",&clStatus); - CHECK_ERROR("clCreateKernel") - - int len; - int depth; - int dim; - int pad=64; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - cl_mem d_nzcnt; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - // HACK: remove the .bin from the end of data, remove later - //parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00; - printf("Input file %s\n", parameters->inpFiles[0]); - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - int col_count; - int warp_size=64; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - warp_size, // warp size, IMPORTANT: change in kernel as well 
- 4, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - printf("Executing kernel...\n"); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - - /* - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - */ - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - 
CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad); - - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,8,sizeof(int),&warp_size); - 
CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int i; - for (i=0; i<50; i++) - { - - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_nzcnt); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - printf("Output has %d entries\n", dim); - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //int temp = ((dim + 31)/32)*32; - outputData(parameters->outFile,h_Ax_vector,dim); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("Optimized for ATI 5000 series by Ian Wetherbee " + "<wetherb1@illinois.edu>\n"); + parameters = 
pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // parameters declaration + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_ati_vec/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, "-Werror"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + if (clStatus != CL_SUCCESS) { + size_t paramSize = 1024 * 1024, paramRet; + char *paramValue; + paramValue = (char *)calloc(paramSize, sizeof(char)); + clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize, + paramValue, ¶mRet); + printf(paramValue); + return -1; + } + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_vec", &clStatus); + CHECK_ERROR("clCreateKernel") + + int len; + int depth; + int dim; + int pad = 64; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + 
float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + cl_mem d_nzcnt; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // HACK: remove the .bin from the end of data, remove later + // parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00; + printf("Input file %s\n", parameters->inpFiles[0]); + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + int col_count; + int warp_size = 64; + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + warp_size, // warp size, IMPORTANT: change in kernel as well + 4, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + printf("Executing kernel...\n"); + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + /* + OpenCLDeviceProp clDeviceProp; + clStatus = + clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); + CHECK_ERROR("clGetDeviceInfo") + */ + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + 
CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, 
nzcnt_len, pad); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &warp_size); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int i; + for (i = 0; i < 50; i++) { + + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_nzcnt); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + printf("Output has %d 
entries\n", dim); + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // int temp = ((dim + 31)/32)*32; + outputData(parameters->outFile, h_Ax_vector, dim); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c @@ -1,48 +1,45 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - fclose(fp); - return buffer; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue 
clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c @@ -7,72 +7,66 @@ 
***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * 
sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float 
*h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c index 964dab2864a63f675db9614eea856cab46f6d6cc..1bc75a9bd092f0b454eca1e52d691c9f99ba49cb 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c @@ -8,256 +8,275 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> - - +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("CUDA accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = 
pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters declaration - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus); - CHECK_ERROR("clCreateKernel") - - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - cl_mem d_nzcnt; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, 
pb_TimerID_IO); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - -// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - OpenCLDeviceProp clDeviceProp; -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - //CHECK_ERROR("clGetDeviceInfo") -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CUDA accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // parameters declaration + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = 
clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_naive", &clStatus); + CHECK_ERROR("clCreateKernel") + + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + cl_mem d_nzcnt; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + // pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + OpenCLDeviceProp clDeviceProp; + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + // CHECK_ERROR("clGetDeviceInfo") + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); // CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); -// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block); -// printf("!!! 
dim is %d\n",dim); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int i; - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_nzcnt); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - 
- free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + 
clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is + // %d=\n",grid,block); printf("!!! dim is %d\n",dim); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + 
pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_nzcnt); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, 
SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != 
CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ 
-void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c index a471cb53938d231012a340633dfa3d3ae8845739..a19184a9659eaa91223da57e1b926ac6bff54b4e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c @@ -8,277 +8,294 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> - - +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("CUDA accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CUDA accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao 
Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + // pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // parameters declaration + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, 
CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = + clCreateCommandQueue(clContext, clDevice, 0, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ + /*cl_program clProgram = + * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ + /*CHECK_ERROR("clCreateProgramWithSource")*/ + + /*char clOptions[50];*/ + /*sprintf(clOptions,"");*/ + /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ + /*CHECK_ERROR("clBuildProgram")*/ + + /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/ + /*CHECK_ERROR("clCreateKernel")*/ + + cl_kernel clKernel; + cl_program clProgram; + pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds_naive", &clContext, + &clDevice, &clProgram, &clKernel); + + OpenCLDeviceProp clDeviceProp; + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + // CHECK_ERROR("clGetDeviceInfo") + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + // CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = 
clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + 
clDeviceProp.minor, clDeviceProp.multiProcessorCount); + // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is + // %d=\n",grid,block); printf("!!! dim is %d\n",dim); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + int i; + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int j = 0; j < 20; j++) { + for (i = 0; i < 50; i++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, + &grid, &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } - - //load matrix from files + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + } + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = 
clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(sh_zcnt_int); + clStatus = clReleaseMemObject(jds_ptr_int); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? 
- 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - -// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //parameters declaration - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ - /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ - /*CHECK_ERROR("clCreateProgramWithSource")*/ - - /*char clOptions[50];*/ - /*sprintf(clOptions,"");*/ - /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ - /*CHECK_ERROR("clBuildProgram")*/ - 
- /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/ - /*CHECK_ERROR("clCreateKernel")*/ - - cl_kernel clKernel; - cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds_naive", &clContext, &clDevice, &clProgram, &clKernel); - - OpenCLDeviceProp clDeviceProp; -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - //CHECK_ERROR("clGetDeviceInfo") -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - // CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); -// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block); -// printf("!!! 
dim is %d\n",dim); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int j=0; j<20; j++) { - for (i=0; i<50; i++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - } - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - } - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = 
clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); - } + outputData(parameters->outFile, h_Ax_vector, dim); + } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - /*free((void*)clSource[0]);*/ + /*free((void*)clSource[0]);*/ - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_FreeParameters(parameters); + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size 
+ 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: 
%d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - 
*h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h @@ -8,11 +8,11 
@@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 
1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c index 1812bc12b7563259eb9797c00ac93a7a5e9210d9..d4fc026b73894e47c94dd7f2c9ef8f31e366eec6 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c @@ -8,271 +8,288 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> - - +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; 
+static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("CUDA accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - //load matrix from files +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CUDA accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + // pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // parameters declaration + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = + clCreateCommandQueue(clContext, clDevice, 0, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + const char *clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_naive", 
&clStatus); + CHECK_ERROR("clCreateKernel") + + /*cl_kernel clKernel;*/ + /*cl_program clProgram;*/ + /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, + * &clDevice, &clProgram, &clKernel);*/ + + OpenCLDeviceProp clDeviceProp; + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + // CHECK_ERROR("clGetDeviceInfo") + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + // CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = 
clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is + // %d=\n",grid,block); printf("!!! 
dim is %d\n",dim); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + + int i; + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + /*for(int j=0; j<20; j++) {*/ + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + /*}*/ + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(sh_zcnt_int); + clStatus = clReleaseMemObject(jds_ptr_int); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = 
clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - -// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters declaration - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - 
pb_SetOpenCL(&clContext, &clCommandQueue); - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - - const char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus); - CHECK_ERROR("clCreateKernel") - - /*cl_kernel clKernel;*/ - /*cl_program clProgram;*/ - /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);*/ - - OpenCLDeviceProp clDeviceProp; -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - //CHECK_ERROR("clGetDeviceInfo") -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - // CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = 
clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); -// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block); -// printf("!!! 
dim is %d\n",dim); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - - int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - /*for(int j=0; j<20; j++) {*/ - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - /*}*/ - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - pb_SwitchToTimer(&timers, 
pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); - } - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - /*free((void*)clSource[0]);*/ - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_FreeParameters(parameters); - - return 0; + outputData(parameters->outFile, h_Ax_vector, dim); + } + + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + /*free((void*)clSource[0]);*/ + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) 
- { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, 
sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* 
depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + 
max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c index 7b26d543aab0b480975111ff2a06a8cbd103de34..42ffab597d028eacba7f9975473908bdf812524e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c @@ -8,277 +8,294 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> - - +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / 
(float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("CUDA accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CUDA accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + // pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // parameters declaration + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = + clCreateCommandQueue(clContext, clDevice, 0, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ + /*cl_program clProgram = + * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ + /*CHECK_ERROR("clCreateProgramWithSource")*/ + + /*char clOptions[50];*/ + /*sprintf(clOptions,"");*/ + /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ + 
/*CHECK_ERROR("clBuildProgram")*/ + + /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/ + /*CHECK_ERROR("clCreateKernel")*/ + + cl_kernel clKernel; + cl_program clProgram; + pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, + &clDevice, &clProgram, &clKernel); + + OpenCLDeviceProp clDeviceProp; + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + // CHECK_ERROR("clGetDeviceInfo") + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + // CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * 
sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is + // %d=\n",grid,block); printf("!!! 
dim is %d\n",dim); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + int i; + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int j = 0; j < 1; j++) { + for (i = 0; i < 50; i++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, + &grid, &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } - - //load matrix from files + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + } + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(sh_zcnt_int); + clStatus = 
clReleaseMemObject(jds_ptr_int); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - -// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //parameters declaration - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = 
clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ - /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ - /*CHECK_ERROR("clCreateProgramWithSource")*/ - - /*char clOptions[50];*/ - /*sprintf(clOptions,"");*/ - /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ - /*CHECK_ERROR("clBuildProgram")*/ - - /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/ - /*CHECK_ERROR("clCreateKernel")*/ - - cl_kernel clKernel; - cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel); - - OpenCLDeviceProp clDeviceProp; -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - //CHECK_ERROR("clGetDeviceInfo") -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - // CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = 
clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); -// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is 
%d=\n",grid,block); -// printf("!!! dim is %d\n",dim); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int j=0; j<1; j++) { - for (i=0; i<50; i++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - } - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - } - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = 
clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); - } + outputData(parameters->outFile, h_Ax_vector, dim); + } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - /*free((void*)clSource[0]);*/ + /*free((void*)clSource[0]);*/ - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_FreeParameters(parameters); + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - 
exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if 
(clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) 
malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h +++ 
b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int 
minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c index 2f1bff5bb31a0ac0ab980998b681247e3c94d7f2..fbd272b32f7f60fbd0c651b0f329550b47e4db27 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c @@ -8,277 +8,294 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> - - +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = 
(rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("CUDA accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CUDA accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + // pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // parameters declaration + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = + clCreateCommandQueue(clContext, clDevice, 0, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ + /*cl_program clProgram = + * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ + /*CHECK_ERROR("clCreateProgramWithSource")*/ + + /*char clOptions[50];*/ + /*sprintf(clOptions,"");*/ + /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ + 
/*CHECK_ERROR("clBuildProgram")*/ + + /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/ + /*CHECK_ERROR("clCreateKernel")*/ + + cl_kernel clKernel; + cl_program clProgram; + pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, + &clDevice, &clProgram, &clKernel); + + OpenCLDeviceProp clDeviceProp; + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + // CHECK_ERROR("clGetDeviceInfo") + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + // CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * 
sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is + // %d=\n",grid,block); printf("!!! 
dim is %d\n",dim); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + int i; + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int j = 0; j < 20; j++) { + for (i = 0; i < 50; i++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, + &grid, &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } - - //load matrix from files + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + } + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(sh_zcnt_int); + clStatus = 
clReleaseMemObject(jds_ptr_int); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - -// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //parameters declaration - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = 
clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ - /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ - /*CHECK_ERROR("clCreateProgramWithSource")*/ - - /*char clOptions[50];*/ - /*sprintf(clOptions,"");*/ - /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ - /*CHECK_ERROR("clBuildProgram")*/ - - /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/ - /*CHECK_ERROR("clCreateKernel")*/ - - cl_kernel clKernel; - cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel); - - OpenCLDeviceProp clDeviceProp; -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - //CHECK_ERROR("clGetDeviceInfo") -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - // CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = 
clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); -// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is 
%d=\n",grid,block); -// printf("!!! dim is %d\n",dim); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - int i; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int j=0; j<20; j++) { - for (i=0; i<50; i++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - } - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - } - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = 
clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); - } + outputData(parameters->outFile, h_Ax_vector, dim); + } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - /*free((void*)clSource[0]);*/ + /*free((void*)clSource[0]);*/ - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_FreeParameters(parameters); + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - 
fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define 
CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data 
= (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h +++ 
b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + 
int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c index c2a43eb0d171640ac33b7e711ab21990b5462af0..343814149aa74139930380c2178e2f447c64e806 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c @@ -8,262 +8,276 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" - -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; 
+static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one two filenames\n"); + exit(-1); + } + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds", &clStatus); + CHECK_ERROR("clCreateKernel") + /*cl_kernel clKernel;*/ + /*cl_program clProgram;*/ + /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", + * "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);*/ + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_Ax_vector; + cl_mem d_x_vector; 
+ + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &(clDeviceProp.major), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &(clDeviceProp.minor), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = 
clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + 
pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(sh_zcnt_int); + clStatus = clReleaseMemObject(jds_ptr_int); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Ax_vector, dim); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + // free((void*)clSource[0]); - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one two filenames\n"); - exit(-1); - } + pb_FreeParameters(parameters); - //load matrix from files - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - 
int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); - CHECK_ERROR("clCreateKernel") - /*cl_kernel clKernel;*/ - /*cl_program clProgram;*/ - /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, 
&clProgram, &clKernel);*/ - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_perm; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - int i; - for(i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = 
clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); - - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - //free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - - pb_FreeParameters(parameters); - - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); 
+ if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void 
clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c index 
c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c index 7e1c15d72919e3ba2cee94ad1fd4254b3325f1a8..4600a3e6b8d580ad6fc3986d24a712ad592e25eb 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c @@ -8,279 +8,293 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" - -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one two filenames\n"); - exit(-1); - } - - //load matrix from files - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - 
//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - printf("Col count = %d, dim = %d\n", col_count, dim); - - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - 
//sprintf(clOptions,""); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") - - //cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); - //CHECK_ERROR("clCreateKernel") - cl_kernel clKernel; - cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_huge_default/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_perm; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth, nzcnt_len); - long totalmem = 8*(len*sizeof(float) - +len*sizeof(int) - +dim*sizeof(int) - +2*dim*sizeof(float) - +depth*sizeof(int) - +nzcnt_len*sizeof(int)); - printf("total mem = %f MB\n", totalmem/(1024.0*1024)); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = 
clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - 
CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - int i; - for (int j=0; j<5; j++) { - for(i=0; i<50; i++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - } - - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one two filenames\n"); + exit(-1); + } + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + + 
coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + printf("Col count = %d, dim = %d\n", col_count, dim); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") + + // char clOptions[50]; + // sprintf(clOptions,""); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") + + // cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); + // CHECK_ERROR("clCreateKernel") + cl_kernel clKernel; + cl_program clProgram; + pb_CreateAndBuildKernelFromBinary( + "build/opencl_nvidia_huge_default/kernel_offline.nvptx.s", "spmv_jds", + 
&clContext, &clDevice, &clProgram, &clKernel); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &(clDeviceProp.major), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &(clDeviceProp.minor), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth, + nzcnt_len); + long totalmem = 8 * (len * sizeof(float) + len * sizeof(int) + + dim * sizeof(int) + 2 * dim * sizeof(float) + + depth * sizeof(int) + nzcnt_len * sizeof(int)); + printf("total mem = %f MB\n", totalmem / (1024.0 * 1024)); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + 
CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + 
clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + int i; + for (int j = 0; j < 5; j++) { + for (i = 0; i < 50; i++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, + &grid, &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } - pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + } + pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = 
clReleaseMemObject(sh_zcnt_int); + clStatus = clReleaseMemObject(jds_ptr_int); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); - - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_PrintTimerSet(&timers); + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Ax_vector, dim); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); - pb_FreeParameters(parameters); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); 
+ if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h index 9f082b2fb607495f9d527acbc6727134d3a8d353..9c4f12027d979367d53cc378450d100ebc51780f 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void 
clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s! ErrorCode = %d\n",errorMessage, clStatus); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s! ErrorCode = %d\n", errorMessage, clStatus); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c index 
c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c index 5e43b4da93a08719a1bde55f426f5ac3f10cc49d..d2375af91dd8d4812fcb82b78b856e85feda376f 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c @@ -8,279 +8,293 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" - -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one two filenames\n"); - exit(-1); - } - - //load matrix from files - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - 
//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - printf("Col count = %d, dim = %d\n", col_count, dim); - - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - 
//sprintf(clOptions,""); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") - - //cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); - //CHECK_ERROR("clCreateKernel") - cl_kernel clKernel; - cl_program clProgram; - pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_large_default/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_perm; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth, nzcnt_len); - long totalmem = 8*(len*sizeof(float) - +len*sizeof(int) - +dim*sizeof(int) - +2*dim*sizeof(float) - +depth*sizeof(int) - +nzcnt_len*sizeof(int)); - printf("total mem = %f MB\n", totalmem/(1024.0*1024)); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = 
clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - 
CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - int i; - for (int j=0; j<100; j++) { - for(i=0; i<50; i++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - } - - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one two filenames\n"); + exit(-1); + } + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + 
+ coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + printf("Col count = %d, dim = %d\n", col_count, dim); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") + + // char clOptions[50]; + // sprintf(clOptions,""); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") + + // cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); + // CHECK_ERROR("clCreateKernel") + cl_kernel clKernel; + cl_program clProgram; + pb_CreateAndBuildKernelFromBinary( + "build/opencl_nvidia_large_default/kernel_offline.nvptx.s", "spmv_jds", + 
&clContext, &clDevice, &clProgram, &clKernel); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &(clDeviceProp.major), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &(clDeviceProp.minor), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth, + nzcnt_len); + long totalmem = 8 * (len * sizeof(float) + len * sizeof(int) + + dim * sizeof(int) + 2 * dim * sizeof(float) + + depth * sizeof(int) + nzcnt_len * sizeof(int)); + printf("total mem = %f MB\n", totalmem / (1024.0 * 1024)); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int), + NULL, &clStatus); + 
CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + nzcnt_len * sizeof(int), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + 
clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + int i; + for (int j = 0; j < 100; j++) { + for (i = 0; i < 50; i++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, + &grid, &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } - pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + } + pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(sh_zcnt_int); - clStatus = clReleaseMemObject(jds_ptr_int); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = 
clReleaseMemObject(sh_zcnt_int); + clStatus = clReleaseMemObject(jds_ptr_int); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); - - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_PrintTimerSet(&timers); + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Ax_vector, dim); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); - pb_FreeParameters(parameters); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, 
"r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h index 9f082b2fb607495f9d527acbc6727134d3a8d353..9c4f12027d979367d53cc378450d100ebc51780f 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } 
OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s! ErrorCode = %d\n",errorMessage, clStatus); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s! ErrorCode = %d\n", errorMessage, clStatus); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ 
-void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c index 1736545153135192f84735bb2888f148870df143..a18ed997526039c2292bb31255f8ac2fbe47915d 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c @@ -8,273 +8,292 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting two input filenames\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - 
clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_tex/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_texture",&clStatus); - CHECK_ERROR("clCreateKernel") - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_perm; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? 
- 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_tex/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = 
clCreateKernel(clProgram, "spmv_jds_texture", &clStatus); + CHECK_ERROR("clCreateKernel") + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - OpenCLDeviceProp clDeviceProp; -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); -// CHECK_ERROR("clGetDeviceInfo") -// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + + OpenCLDeviceProp clDeviceProp; + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + // CHECK_ERROR("clGetDeviceInfo") + // clStatus = + // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); // CHECK_ERROR("clGetDeviceInfo") - clStatus = 
clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_IMAGE2D_MAX_WIDTH,sizeof(size_t),&(clDeviceProp.maxImgWidth),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - cl_image_format clImgFmt = {CL_R,CL_FLOAT}; - - size_t clImgWidth; - size_t clImgHeight; - if(dim<=clDeviceProp.maxImgWidth) - { - clImgWidth = dim; - clImgHeight = 1; - } - else - { - clImgWidth = clDeviceProp.maxImgWidth; - clImgHeight = (dim+clDeviceProp.maxImgWidth-1)/clDeviceProp.maxImgWidth; - } - - d_x_vector = clCreateImage2D(clContext,CL_MEM_READ_ONLY,&clImgFmt,clImgWidth,clImgHeight,0,NULL,&clStatus); - CHECK_ERROR("clCreateImage2D") - - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - size_t clOrigin[3] = {0,0,0}; - size_t clRegion[3] = {clImgWidth,clImgHeight,1}; - size_t clRowPitch = clImgWidth*sizeof(cl_float); - size_t clSlicePitch = 0; - clStatus = clEnqueueWriteImage(clCommandQueue,d_x_vector,CL_FALSE,clOrigin,clRegion,clRowPitch,clSlicePitch,h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteImage") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int i; - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - 
CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(size_t), &(clDeviceProp.maxImgWidth), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + 
CHECK_ERROR("clCreateBuffer") + + cl_image_format clImgFmt = {CL_R, CL_FLOAT}; + + size_t clImgWidth; + size_t clImgHeight; + if (dim <= clDeviceProp.maxImgWidth) { + clImgWidth = dim; + clImgHeight = 1; + } else { + clImgWidth = clDeviceProp.maxImgWidth; + clImgHeight = + (dim + clDeviceProp.maxImgWidth - 1) / clDeviceProp.maxImgWidth; + } + + d_x_vector = clCreateImage2D(clContext, CL_MEM_READ_ONLY, &clImgFmt, + clImgWidth, clImgHeight, 0, NULL, &clStatus); + CHECK_ERROR("clCreateImage2D") + + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + size_t clOrigin[3] = {0, 0, 0}; + size_t clRegion[3] = {clImgWidth, clImgHeight, 1}; + size_t clRowPitch = clImgWidth * sizeof(cl_float); + size_t clSlicePitch = 0; + clStatus = clEnqueueWriteImage(clCommandQueue, d_x_vector, CL_FALSE, clOrigin, + clRegion, clRowPitch, clSlicePitch, h_x_vector, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteImage") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + 
clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_x_vector); 
+ clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + 
exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h index b34cb1b2494bd346c335eb9ce5b306d53ff9c8ae..bbdb88fbe9818887f0af522bb7456231603275db 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h @@ -2,21 +2,20 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; - size_t maxImgWidth; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; + size_t maxImgWidth; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c index 
43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - 
*h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float *h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); 
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + 
if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c index 4eebc0ad100e1a7e51c47268e62b134fc74c5d93..04559ee3ae2beaebf28067b346f5a93ddcbc282c 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c @@ -8,276 +8,295 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" #include "ocl.h" -#include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - 
struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting two input filenames\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_tex_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_texture",&clStatus); - CHECK_ERROR("clCreateKernel") - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - 
cl_mem d_indices; - cl_mem d_perm; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_IMAGE2D_MAX_WIDTH,sizeof(size_t),&(clDeviceProp.maxImgWidth),NULL); - 
CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - cl_image_format clImgFmt = {CL_R,CL_FLOAT}; - - size_t clImgWidth; - size_t clImgHeight; - if(dim<=clDeviceProp.maxImgWidth) - { - clImgWidth = dim; - clImgHeight = 1; - } - else - { - clImgWidth = clDeviceProp.maxImgWidth; - clImgHeight = (dim+clDeviceProp.maxImgWidth-1)/clDeviceProp.maxImgWidth; - } - - d_x_vector = clCreateImage2D(clContext,CL_MEM_READ_ONLY,&clImgFmt,clImgWidth,clImgHeight,0,NULL,&clStatus); - CHECK_ERROR("clCreateImage2D") - - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - size_t clOrigin[3] = {0,0,0}; - size_t clRegion[3] = {clImgWidth,clImgHeight,1}; - size_t clRowPitch = clImgWidth*sizeof(cl_float); - size_t clSlicePitch = 0; - clStatus = 
clEnqueueWriteImage(clCommandQueue,d_x_vector,CL_FALSE,clOrigin,clRegion,clRowPitch,clSlicePitch,h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteImage") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - -// grid.x=nzcnt_len; -// block.x=pad; - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int i; - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - 
CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_tex_nvidia/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, 
NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_texture", &clStatus); + CHECK_ERROR("clCreateKernel") + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &(clDeviceProp.major), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &(clDeviceProp.minor), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(size_t), &(clDeviceProp.maxImgWidth), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + + cl_image_format clImgFmt = {CL_R, CL_FLOAT}; + + size_t clImgWidth; + size_t clImgHeight; + if (dim <= clDeviceProp.maxImgWidth) { + clImgWidth = dim; + clImgHeight = 1; + } else { + clImgWidth = clDeviceProp.maxImgWidth; + clImgHeight = + (dim + clDeviceProp.maxImgWidth - 1) / clDeviceProp.maxImgWidth; + } + + d_x_vector = clCreateImage2D(clContext, CL_MEM_READ_ONLY, &clImgFmt, + clImgWidth, clImgHeight, 0, NULL, &clStatus); + CHECK_ERROR("clCreateImage2D") + + d_Ax_vector = 
clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + size_t clOrigin[3] = {0, 0, 0}; + size_t clRegion[3] = {clImgWidth, clImgHeight, 1}; + size_t clRowPitch = clImgWidth * sizeof(cl_float); + size_t clSlicePitch = 0; + clStatus = clEnqueueWriteImage(clCommandQueue, d_x_vector, CL_FALSE, clOrigin, + clRegion, clRowPitch, clSlicePitch, h_x_vector, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteImage") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + // grid.x=nzcnt_len; + // block.x=pad; + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + 
clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + 
free(h_Ax_vector); + free(h_x_vector); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c @@ -1,49 +1,46 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void 
clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h index b34cb1b2494bd346c335eb9ce5b306d53ff9c8ae..bbdb88fbe9818887f0af522bb7456231603275db 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h @@ -2,21 +2,20 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; - size_t maxImgWidth; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; + size_t maxImgWidth; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc index 58f9a3a358fcad5da79b375f5d0ec45854317bc8..a15137259e9963e43bfaa56ddeda89399e2d38d6 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc @@ -7,66 +7,61 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include 
<stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -extern "C" -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +extern "C" void inputData(char *fName, int *len, int *depth, int *dim, + int *nzcnt_len, int *pad, float **h_data, + int **h_indices, int **h_ptr, int **h_perm, + int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; + + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); + + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); + + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int 
*) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); - fclose (fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h index 64ef34a091ae149773ab9c5e8dfa352fbaa8d11e..e86d2ef8b66a60ae4bf5b1171ae23411dcf332d9 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h @@ -13,12 +13,11 @@ extern "C" { #endif -void inputData(char* fName, int* len, int* depth, int* dim, - int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void outputData(char* fName, float *h_Ax_vector,int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h +++ 
b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c index 2b5c4b3a875ecd25954845381daf1060d03fe6b1..96f4ecc2997b33000e91abecfee8c85664cb4a12 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c @@ -8,237 +8,259 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> #include "file.h" #include "gpu_info.h" #include "ocl.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("CUDA accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters 
declaration - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,0,NULL,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus); - CHECK_ERROR("clCreateKernel") - - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - cl_mem d_nzcnt; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - &h_data, &h_indices, &h_ptr, - &h_perm, &h_nzcnt); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); 
- generate_vector(h_x_vector, dim); - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_GPU); - - int i; - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = 
clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_nzcnt); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CUDA accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and " + "Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // parameters declaration + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = 
+ clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 0, NULL, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_naive", &clStatus); + CHECK_ERROR("clCreateKernel") + + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + // vector + float *h_Ax_vector; + float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + cl_mem d_nzcnt; + + // vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + inputData(parameters->inpFiles[0], &len, &depth, &dim, &nzcnt_len, &pad, + &h_data, &h_indices, &h_ptr, &h_perm, &h_nzcnt); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + generate_vector(h_x_vector, dim); + + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &(clDeviceProp.major), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &(clDeviceProp.minor), NULL); + 
CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * 
sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_GPU); + + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); 
+ clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_nzcnt); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c @@ -1,48 +1,45 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = 
fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - fclose(fp); - return buffer; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git 
a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc index 58f9a3a358fcad5da79b375f5d0ec45854317bc8..a15137259e9963e43bfaa56ddeda89399e2d38d6 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc @@ -7,66 +7,61 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -extern "C" -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +extern "C" void inputData(char *fName, int *len, int *depth, int *dim, + int *nzcnt_len, int *pad, float **h_data, + int **h_indices, int **h_ptr, int **h_perm, + int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; + + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); + + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); + + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, 
sizeof(int), _dim, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); - fclose (fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h index 64ef34a091ae149773ab9c5e8dfa352fbaa8d11e..e86d2ef8b66a60ae4bf5b1171ae23411dcf332d9 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h @@ -13,12 +13,11 @@ extern "C" { #endif -void inputData(char* fName, int* len, int* depth, int* dim, - int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** 
h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void outputData(char* fName, float *h_Ax_vector,int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = 
max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c index 483281d18b32a85ceb49ee1ca34d5811a2eb7b80..218c658394428a0cb96fb702f3a08d7cede040f1 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c @@ -8,238 +8,259 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> #include "file.h" #include "gpu_info.h" #include "ocl.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet 
timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,0,NULL,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); - CHECK_ERROR("clCreateKernel") - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem 
d_indices; - cl_mem d_perm; - cl_mem d_nzcnt; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - &h_data, &h_indices, &h_ptr, - &h_perm, &h_nzcnt); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - generate_vector(h_x_vector, dim); - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - 
clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - 
pb_SwitchToTimer(&timers, pb_TimerID_GPU); - - int i; - for(i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_nzcnt); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE); + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 0, NULL, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds", &clStatus); + CHECK_ERROR("clCreateKernel") + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_nzcnt; + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + inputData(parameters->inpFiles[0], &len, &depth, &dim, &nzcnt_len, &pad, + &h_data, &h_indices, &h_ptr, &h_perm, &h_nzcnt); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) 
* dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + generate_vector(h_x_vector, dim); + + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &(clDeviceProp.major), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &(clDeviceProp.minor), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + 
CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0, + dim * sizeof(int), h_x_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_GPU); + + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = 
clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_nzcnt); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c @@ -1,48 +1,45 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if 
(fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - fclose(fp); - return buffer; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h @@ -2,20 +2,19 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; } OpenCLDeviceProp; void 
clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc index 58f9a3a358fcad5da79b375f5d0ec45854317bc8..a15137259e9963e43bfaa56ddeda89399e2d38d6 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc @@ -7,66 +7,61 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +extern "C" void inputData(char *fName, int *len, int *depth, int *dim, + int *nzcnt_len, int *pad, float **h_data, + int **h_indices, int **h_ptr, int **h_perm, + int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; + + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); + + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); + + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); - fclose (fid); + fclose(fid); } -extern "C" 
-void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h index 64ef34a091ae149773ab9c5e8dfa352fbaa8d11e..e86d2ef8b66a60ae4bf5b1171ae23411dcf332d9 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h @@ -13,12 +13,11 @@ extern "C" { #endif -void inputData(char* fName, int* len, int* depth, int* dim, - int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void outputData(char* fName, float *h_Ax_vector,int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include 
<endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, 
size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c index 34938ab1f214896fa4d4d17f99e9625bd08acf0e..f65916d62a083facf41eca5ad7a38eb62c6ea8e9 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c @@ -8,259 +8,280 @@ #include <CL/cl.h> #include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> -#include <parboil.h> #include "file.h" #include "gpu_info.h" #include "ocl.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = 
clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_tex/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,0,NULL,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_texture",&clStatus); - CHECK_ERROR("clCreateKernel") - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_perm; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - &h_data, &h_indices, &h_ptr, - &h_perm, &h_nzcnt); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - generate_vector(h_x_vector, dim); - - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = 
clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_IMAGE2D_MAX_WIDTH,sizeof(size_t),&(clDeviceProp.maxImgWidth),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - cl_image_format clImgFmt = {CL_R,CL_FLOAT}; - - size_t clImgWidth; - size_t clImgHeight; - if(dim<=clDeviceProp.maxImgWidth) - { - clImgWidth = dim; - clImgHeight = 1; - } - else - { - clImgWidth = clDeviceProp.maxImgWidth; - clImgHeight = (dim+clDeviceProp.maxImgWidth-1)/clDeviceProp.maxImgWidth; - } - - d_x_vector = clCreateImage2D(clContext,CL_MEM_READ_ONLY,&clImgFmt,clImgWidth,clImgHeight,0,NULL,&clStatus); - CHECK_ERROR("clCreateImage2D") - - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - size_t clOrigin[3] = {0,0,0}; - size_t clRegion[3] = {clImgWidth,clImgHeight,1}; - size_t clRowPitch = clImgWidth*sizeof(cl_float); - size_t clSlicePitch = 0; - clStatus = clEnqueueWriteImage(clCommandQueue,d_x_vector,CL_FALSE,clOrigin,clRegion,clRowPitch,clSlicePitch,h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteImage") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_GPU); - - int i; - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - 
CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + 
(cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_tex/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, ""); + clStatus = clBuildProgram(clProgram, 0, NULL, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_texture", &clStatus); + CHECK_ERROR("clCreateKernel") + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + + // device memory allocation + // matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + // load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + inputData(parameters->inpFiles[0], &len, &depth, &dim, &nzcnt_len, &pad, + &h_data, &h_indices, &h_ptr, &h_perm, &h_nzcnt); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + generate_vector(h_x_vector, dim); + + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), 
&(clDeviceProp.major), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &(clDeviceProp.minor), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), + &(clDeviceProp.multiProcessorCount), NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(size_t), &(clDeviceProp.maxImgWidth), NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL, + &clStatus); + CHECK_ERROR("clCreateBuffer") + + cl_image_format clImgFmt = {CL_R, CL_FLOAT}; + + size_t clImgWidth; + size_t clImgHeight; + if (dim <= clDeviceProp.maxImgWidth) { + clImgWidth = dim; + clImgHeight = 1; + } else { + clImgWidth = clDeviceProp.maxImgWidth; + clImgHeight = + (dim + clDeviceProp.maxImgWidth - 1) / clDeviceProp.maxImgWidth; + } + + d_x_vector = clCreateImage2D(clContext, CL_MEM_READ_ONLY, &clImgFmt, + clImgWidth, clImgHeight, 0, NULL, &clStatus); + CHECK_ERROR("clCreateImage2D") + + d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + dim * sizeof(float), NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float)); + + // memory copy + clStatus = 
clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0, + len * sizeof(float), h_data, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0, + len * sizeof(int), h_indices, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0, + dim * sizeof(int), h_perm, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + size_t clOrigin[3] = {0, 0, 0}; + size_t clRegion[3] = {clImgWidth, clImgHeight, 1}; + size_t clRowPitch = clImgWidth * sizeof(cl_float); + size_t clSlicePitch = 0; + clStatus = clEnqueueWriteImage(clCommandQueue, d_x_vector, CL_FALSE, clOrigin, + clRegion, clRowPitch, clSlicePitch, h_x_vector, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteImage") + + clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0, + depth * sizeof(int), h_ptr, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = + clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0, + nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, + clDeviceProp.minor, clDeviceProp.multiProcessorCount); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 
6, sizeof(cl_mem), &jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_GPU); + + int i; + for (i = 0; i < 50; i++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, + &block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // HtoD memory copy + clStatus = + clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0, + dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Ax_vector, dim); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c +++ 
b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c @@ -1,48 +1,45 @@ +#include "ocl.h" #include <CL/cl.h> #include <stdio.h> #include <string.h> -#include "ocl.h" -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*size); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * size); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - fclose(fp); - return buffer; + fclose(fp); + return buffer; } -void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) -{ - cl_int clStatus; - char* temp = (char*)malloc(size); - memset(temp,val,size); - clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - free(temp); +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, + size_t size) { + cl_int clStatus; + char *temp = (char *)malloc(size); + memset(temp, val, size); + clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp, + 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h index 
b34cb1b2494bd346c335eb9ce5b306d53ff9c8ae..bbdb88fbe9818887f0af522bb7456231603275db 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h @@ -2,21 +2,20 @@ #define __OCLH__ typedef struct { - cl_uint major; - cl_uint minor; - cl_uint multiProcessorCount; - size_t maxImgWidth; + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; + size_t maxImgWidth; } OpenCLDeviceProp; void clMemSet(cl_command_queue, cl_mem, int, size_t); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp @@ -7,72 +7,66 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt) -{ - FILE* fid = fopen(fName, "rb"); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt) { + FILE *fid = fopen(fName, "rb"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad); + int _len = len[0]; + int _depth = depth[0]; + int _dim = dim[0]; + int _pad = pad[0]; + int _nzcnt_len = nzcnt_len[0]; - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } + *h_data = (float *)malloc(_len * sizeof(float)); + fread(*h_data, sizeof(float), _len, fid); - fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); - int _len=len[0]; - int _depth=depth[0]; - int _dim=dim[0]; - int _pad=pad[0]; - int _nzcnt_len=nzcnt_len[0]; - - *h_data = (float *) malloc(_len * sizeof (float)); - fread (*h_data, sizeof (float), _len, fid); - - *h_indices = (int *) malloc(_len * sizeof (int)); - fread (*h_indices, sizeof (int), _len, fid); - - *h_ptr = (int *) malloc(_depth * sizeof (int)); - fread (*h_ptr, sizeof (int), _depth, fid); - - *h_perm = (int *) malloc(_dim * sizeof (int)); - fread (*h_perm, sizeof (int), _dim, fid); - - *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); - fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + *h_indices = (int *)malloc(_len * sizeof(int)); + fread(*h_indices, sizeof(int), _len, fid); - fclose (fid); + *h_ptr = (int *)malloc(_depth * sizeof(int)); + fread(*h_ptr, sizeof(int), _depth, fid); + + *h_perm = (int *)malloc(_dim * sizeof(int)); + fread(*h_perm, sizeof(int), _dim, fid); + + *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int)); + fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid); + + fclose(fid); } -void input_vec(char *fName,float 
*h_vec,int dim) -{ - FILE* fid = fopen(fName, "rb"); - fread (h_vec, sizeof (float), dim, fid); +void input_vec(char *fName, float *h_vec, int dim) { + FILE *fid = fopen(fName, "rb"); + fread(h_vec, sizeof(float), dim, fid); fclose(fid); - } -void outputData(char* fName, float *h_Ax_vector,int dim) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_Ax_vector, int dim) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } tmp32 = dim; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_Ax_vector, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h @@ -8,11 +8,11 @@ #ifndef __FILEH__ #define __FILEH__ -void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, - float** h_data, int** h_indices, int** h_ptr, - int** h_perm, int** h_nzcnt); +void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len, + int *pad, float **h_data, int **h_indices, int **h_ptr, + int **h_perm, int **h_nzcnt); -void input_vec(char* fNanme, float *h_vec,int dim); -void outputData(char* fName, float *h_Ax_vector,int dim); +void input_vec(char *fNanme, float *h_vec, int dim); +void outputData(char *fName, float *h_Ax_vector, int dim); #endif \ No newline at end of file diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644 --- 
a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp @@ -6,50 +6,39 @@ *cr ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #include "gpu_info.h" -void compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm) -{ - int max_thread; - int max_block=8; - if(major==1) - { - if(minor>=2) - max_thread=1024; - else - max_thread=768; - } - else if(major==2) - max_thread=1536; - else - //newer GPU //keep using 2.0 - max_thread=1536; - - int _grid; - int _thread; - - if(task*pad>sm*max_thread) - { - _thread=max_thread/max_block; - _grid = ((task*pad+_thread-1)/_thread)*_thread; - } - else - { - _thread=pad; - _grid=task*pad; - } +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm) { + int max_thread; + int max_block = 8; + if (major == 1) { + if (minor >= 2) + max_thread = 1024; + else + max_thread = 768; + } else if (major == 2) + max_thread = 1536; + else + // newer GPU //keep using 2.0 + max_thread = 1536; + + int _grid; + int _thread; + + if (task * pad > sm * max_thread) { + _thread = max_thread / max_block; + _grid = ((task * pad + _thread - 1) / _thread) * _thread; + } else { + _thread = pad; + _grid = task * pad; + } - thread[0]=_thread; - grid[0]=_grid; + thread[0] = _thread; + grid[0] = _grid; } diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h @@ -9,12 +9,7 @@ #ifndef __GPUINFOH__ #define __GPUINFOH__ -void 
compute_active_thread(size_t *thread, - size_t *grid, - int task, - int pad, - int major, - int minor, - int sm); +void compute_active_thread(size_t *thread, size_t *grid, int task, int pad, + int major, int minor, int sm); #endif diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp index eb4d4d7dbed65c13340578f8023afab1473fe962..f6ce5ccfb2412036f4eadcdab419ceca0a6c8f30 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp @@ -8,366 +8,354 @@ //#include <CL/cl.h> //#include <CL/cl_ext.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> #include <visc.h> +#include "convert_dataset.h" #include "file.h" #include "gpu_info.h" -#include "convert_dataset.h" #define WARP_BITS 5 -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } typedef struct __attribute__((__packed__)) { - float* dst_vector; size_t bytes_dst_vector; - float* d_data; size_t bytes_d_data; - int* d_index; size_t bytes_d_index; - int* d_perm; size_t bytes_d_perm; - float* x_vec; size_t bytes_x_vec; - int dim; - int* jds_ptr_int; size_t bytes_jds_ptr_int; - int* sh_zcnt_int; size_t bytes_sh_zcnt_int; - size_t dim_X1, dim_X2; + float *dst_vector; + size_t bytes_dst_vector; + float *d_data; + size_t bytes_d_data; + int *d_index; + size_t bytes_d_index; + int *d_perm; + size_t bytes_d_perm; + float *x_vec; + size_t bytes_x_vec; + int dim; + int *jds_ptr_int; + size_t bytes_jds_ptr_int; + int *sh_zcnt_int; + size_t bytes_sh_zcnt_int; + size_t dim_X1, dim_X2; } RootIn; -void spmv_jds(float* dst_vector, size_t bytes_dst_vector, - float* 
d_data, size_t bytes_d_data, - int* d_index, size_t bytes_d_index, - int* d_perm, size_t bytes_d_perm, - float* x_vec, size_t bytes_x_vec, - int dim, - int* jds_ptr_int, size_t bytes_jds_ptr_int, - int* sh_zcnt_int, size_t bytes_sh_zcnt_int) -{ - __visc__hint(visc::DEVICE); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, - 1, dst_vector); - - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); - int lx = __visc__getNodeInstanceID_x(thisNode); - int gx = __visc__getNodeInstanceID_x(parentNode); - int gridx = __visc__getNumNodeInstances_x(thisNode); - - int ix = gx * gridx + lx; - int warp_id=ix>>WARP_BITS; - - if(ix<dim) +void spmv_jds(float *dst_vector, size_t bytes_dst_vector, float *d_data, + size_t bytes_d_data, int *d_index, size_t bytes_d_index, + int *d_perm, size_t bytes_d_perm, float *x_vec, + size_t bytes_x_vec, int dim, int *jds_ptr_int, + size_t bytes_jds_ptr_int, int *sh_zcnt_int, + size_t bytes_sh_zcnt_int) { + __visc__hint(visc::DEVICE); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + sh_zcnt_int, 1, dst_vector); + + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gridx = __visc__getNumNodeInstances_x(thisNode); + + int ix = gx * gridx + lx; + int warp_id = ix >> WARP_BITS; + + if (ix < dim) { + float sum = 0.0f; + int bound = sh_zcnt_int[warp_id]; + // prefetch 0 + int j = jds_ptr_int[0] + ix; + float d = d_data[j]; + int i = d_index[j]; + float t = x_vec[i]; + + if (bound > 1) // bound >=2 { - float sum=0.0f; - int bound=sh_zcnt_int[warp_id]; - //prefetch 0 - int j=jds_ptr_int[0]+ix; - float d = d_data[j]; - int i = d_index[j]; - float t = x_vec[i]; - - if (bound>1) //bound >=2 - { - //prefetch 1 - j=jds_ptr_int[1]+ix; - i = d_index[j]; - int in; - float dn; - float tn; - for(int k=2; 
k<bound; k++ ) - { - //prefetch k-1 - dn = d_data[j]; - //prefetch k - j=jds_ptr_int[k]+ix; - in = d_index[j]; - //prefetch k-1 - tn = x_vec[i]; - - //compute k-2 - sum += d*t; - //sweep to k - i = in; - //sweep to k-1 - d = dn; - t =tn; - } - - //fetch last - dn = d_data[j]; - tn = x_vec[i]; - - //compute last-1 - sum += d*t; - //sweep to last - d=dn; - t=tn; - } - //compute last - sum += d*t; // 3 3 - - //write out data - dst_vector[d_perm[ix]]=sum; + // prefetch 1 + j = jds_ptr_int[1] + ix; + i = d_index[j]; + int in; + float dn; + float tn; + for (int k = 2; k < bound; k++) { + // prefetch k-1 + dn = d_data[j]; + // prefetch k + j = jds_ptr_int[k] + ix; + in = d_index[j]; + // prefetch k-1 + tn = x_vec[i]; + + // compute k-2 + sum += d * t; + // sweep to k + i = in; + // sweep to k-1 + d = dn; + t = tn; + } + + // fetch last + dn = d_data[j]; + tn = x_vec[i]; + + // compute last-1 + sum += d * t; + // sweep to last + d = dn; + t = tn; } -} + // compute last + sum += d * t; // 3 3 -void spmvLvl1(float* dst_vector, size_t bytes_dst_vector, - float* d_data, size_t bytes_d_data, - int* d_index, size_t bytes_d_index, - int* d_perm, size_t bytes_d_perm, - float* x_vec, size_t bytes_x_vec, - int dim, - int* jds_ptr_int, size_t bytes_jds_ptr_int, - int* sh_zcnt_int, size_t bytes_sh_zcnt_int, - size_t dim_X1) -{ - __visc__hint(visc::DEVICE); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, - 1, dst_vector); - void* spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1); - __visc__bindIn(spmv_node, 0, 0, 0); - __visc__bindIn(spmv_node, 1, 1, 0); - __visc__bindIn(spmv_node, 2, 2, 0); - __visc__bindIn(spmv_node, 3, 3, 0); - __visc__bindIn(spmv_node, 4, 4, 0); - __visc__bindIn(spmv_node, 5, 5, 0); - __visc__bindIn(spmv_node, 6, 6, 0); - __visc__bindIn(spmv_node, 7, 7, 0); - __visc__bindIn(spmv_node, 8, 8, 0); - __visc__bindIn(spmv_node, 9, 9, 0); - __visc__bindIn(spmv_node, 10, 10, 0); - __visc__bindIn(spmv_node, 11, 11, 0); - 
__visc__bindIn(spmv_node, 12, 12, 0); - __visc__bindIn(spmv_node, 13, 13, 0); - __visc__bindIn(spmv_node, 14, 14, 0); + // write out data + dst_vector[d_perm[ix]] = sum; + } } -void spmvLvl2(float* dst_vector, size_t bytes_dst_vector, - float* d_data, size_t bytes_d_data, - int* d_index, size_t bytes_d_index, - int* d_perm, size_t bytes_d_perm, - float* x_vec, size_t bytes_x_vec, - int dim, - int* jds_ptr_int, size_t bytes_jds_ptr_int, - int* sh_zcnt_int, size_t bytes_sh_zcnt_int, - size_t dim_X1, size_t dim_X2) -{ - __visc__hint(visc::CPU_TARGET); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, - 1, dst_vector); - void* spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2); - __visc__bindIn(spmv_node, 0, 0, 0); - __visc__bindIn(spmv_node, 1, 1, 0); - __visc__bindIn(spmv_node, 2, 2, 0); - __visc__bindIn(spmv_node, 3, 3, 0); - __visc__bindIn(spmv_node, 4, 4, 0); - __visc__bindIn(spmv_node, 5, 5, 0); - __visc__bindIn(spmv_node, 6, 6, 0); - __visc__bindIn(spmv_node, 7, 7, 0); - __visc__bindIn(spmv_node, 8, 8, 0); - __visc__bindIn(spmv_node, 9, 9, 0); - __visc__bindIn(spmv_node, 10, 10, 0); - __visc__bindIn(spmv_node, 11, 11, 0); - __visc__bindIn(spmv_node, 12, 12, 0); - __visc__bindIn(spmv_node, 13, 13, 0); - __visc__bindIn(spmv_node, 14, 14, 0); - __visc__bindIn(spmv_node, 15, 15, 0); +void spmvLvl1(float *dst_vector, size_t bytes_dst_vector, float *d_data, + size_t bytes_d_data, int *d_index, size_t bytes_d_index, + int *d_perm, size_t bytes_d_perm, float *x_vec, + size_t bytes_x_vec, int dim, int *jds_ptr_int, + size_t bytes_jds_ptr_int, int *sh_zcnt_int, + size_t bytes_sh_zcnt_int, size_t dim_X1) { + __visc__hint(visc::DEVICE); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + sh_zcnt_int, 1, dst_vector); + void *spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1); + __visc__bindIn(spmv_node, 0, 0, 0); + __visc__bindIn(spmv_node, 1, 1, 0); + __visc__bindIn(spmv_node, 2, 2, 0); + 
__visc__bindIn(spmv_node, 3, 3, 0); + __visc__bindIn(spmv_node, 4, 4, 0); + __visc__bindIn(spmv_node, 5, 5, 0); + __visc__bindIn(spmv_node, 6, 6, 0); + __visc__bindIn(spmv_node, 7, 7, 0); + __visc__bindIn(spmv_node, 8, 8, 0); + __visc__bindIn(spmv_node, 9, 9, 0); + __visc__bindIn(spmv_node, 10, 10, 0); + __visc__bindIn(spmv_node, 11, 11, 0); + __visc__bindIn(spmv_node, 12, 12, 0); + __visc__bindIn(spmv_node, 13, 13, 0); + __visc__bindIn(spmv_node, 14, 14, 0); } -void spmvLvl3(float* dst_vector, size_t bytes_dst_vector, - float* d_data, size_t bytes_d_data, - int* d_index, size_t bytes_d_index, - int* d_perm, size_t bytes_d_perm, - float* x_vec, size_t bytes_x_vec, - int dim, - int* jds_ptr_int, size_t bytes_jds_ptr_int, - int* sh_zcnt_int, size_t bytes_sh_zcnt_int, - size_t dim_X1, size_t dim_X2) -{ - __visc__hint(visc::CPU_TARGET); - __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, - 1, dst_vector); - void* spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2); - __visc__bindIn(spmv_node, 0, 0, 0); - __visc__bindIn(spmv_node, 1, 1, 0); - __visc__bindIn(spmv_node, 2, 2, 0); - __visc__bindIn(spmv_node, 3, 3, 0); - __visc__bindIn(spmv_node, 4, 4, 0); - __visc__bindIn(spmv_node, 5, 5, 0); - __visc__bindIn(spmv_node, 6, 6, 0); - __visc__bindIn(spmv_node, 7, 7, 0); - __visc__bindIn(spmv_node, 8, 8, 0); - __visc__bindIn(spmv_node, 9, 9, 0); - __visc__bindIn(spmv_node, 10, 10, 0); - __visc__bindIn(spmv_node, 11, 11, 0); - __visc__bindIn(spmv_node, 12, 12, 0); - __visc__bindIn(spmv_node, 13, 13, 0); - __visc__bindIn(spmv_node, 14, 14, 0); - __visc__bindIn(spmv_node, 15, 15, 0); - __visc__bindIn(spmv_node, 16, 16, 0); +void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data, + size_t bytes_d_data, int *d_index, size_t bytes_d_index, + int *d_perm, size_t bytes_d_perm, float *x_vec, + size_t bytes_x_vec, int dim, int *jds_ptr_int, + size_t bytes_jds_ptr_int, int *sh_zcnt_int, + size_t bytes_sh_zcnt_int, size_t 
dim_X1, size_t dim_X2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + sh_zcnt_int, 1, dst_vector); + void *spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2); + __visc__bindIn(spmv_node, 0, 0, 0); + __visc__bindIn(spmv_node, 1, 1, 0); + __visc__bindIn(spmv_node, 2, 2, 0); + __visc__bindIn(spmv_node, 3, 3, 0); + __visc__bindIn(spmv_node, 4, 4, 0); + __visc__bindIn(spmv_node, 5, 5, 0); + __visc__bindIn(spmv_node, 6, 6, 0); + __visc__bindIn(spmv_node, 7, 7, 0); + __visc__bindIn(spmv_node, 8, 8, 0); + __visc__bindIn(spmv_node, 9, 9, 0); + __visc__bindIn(spmv_node, 10, 10, 0); + __visc__bindIn(spmv_node, 11, 11, 0); + __visc__bindIn(spmv_node, 12, 12, 0); + __visc__bindIn(spmv_node, 13, 13, 0); + __visc__bindIn(spmv_node, 14, 14, 0); + __visc__bindIn(spmv_node, 15, 15, 0); } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one two filenames\n"); - exit(-1); - } - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - //load matrix from files - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 
1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - input_vec( parameters->inpFiles[1],h_x_vector,dim); - - pb_InitializeTimerSet(&timers); - __visc__init(); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memset(h_Ax_vector, 0, dim*sizeof(float)); - - size_t grid; - size_t block; - - compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8); - - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(h_Ax_vector, dim*sizeof(float)); - llvm_visc_track_mem(h_data, len*sizeof(float)); - llvm_visc_track_mem(h_indices, len*sizeof(int)); - llvm_visc_track_mem(h_perm, dim*sizeof(int)); - llvm_visc_track_mem(h_x_vector, dim*sizeof(float)); - llvm_visc_track_mem(h_ptr, depth*sizeof(int)); - llvm_visc_track_mem(h_nzcnt, nzcnt_len*sizeof(int)); +void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data, + size_t bytes_d_data, int *d_index, size_t bytes_d_index, + int *d_perm, size_t bytes_d_perm, float *x_vec, + size_t bytes_x_vec, int dim, int *jds_ptr_int, + size_t bytes_jds_ptr_int, int *sh_zcnt_int, + size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + sh_zcnt_int, 1, dst_vector); + void *spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2); + __visc__bindIn(spmv_node, 0, 0, 0); + __visc__bindIn(spmv_node, 1, 1, 0); + __visc__bindIn(spmv_node, 2, 2, 0); + __visc__bindIn(spmv_node, 3, 3, 0); + __visc__bindIn(spmv_node, 4, 4, 0); + __visc__bindIn(spmv_node, 5, 5, 0); + __visc__bindIn(spmv_node, 6, 6, 0); + __visc__bindIn(spmv_node, 7, 7, 0); + __visc__bindIn(spmv_node, 8, 8, 0); + __visc__bindIn(spmv_node, 9, 9, 0); + __visc__bindIn(spmv_node, 10, 10, 0); + __visc__bindIn(spmv_node, 
11, 11, 0); + __visc__bindIn(spmv_node, 12, 12, 0); + __visc__bindIn(spmv_node, 13, 13, 0); + __visc__bindIn(spmv_node, 14, 14, 0); + __visc__bindIn(spmv_node, 15, 15, 0); + __visc__bindIn(spmv_node, 16, 16, 0); +} +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao " + "Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { + fprintf(stderr, "Expecting one two filenames\n"); + exit(-1); + } + + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + // parameters declaration + int len; + int depth; + int dim; + int pad = 32; + int nzcnt_len; + + // host memory allocation + // matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + // vector + float *h_Ax_vector; + float *h_x_vector; + + // load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? 
+ 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, + &len, &nzcnt_len, &depth); + + h_Ax_vector = (float *)malloc(sizeof(float) * dim); + h_x_vector = (float *)malloc(sizeof(float) * dim); + input_vec(parameters->inpFiles[1], h_x_vector, dim); + + pb_InitializeTimerSet(&timers); + __visc__init(); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memset(h_Ax_vector, 0, dim * sizeof(float)); + + size_t grid; + size_t block; + + compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8); + + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(h_Ax_vector, dim * sizeof(float)); + llvm_visc_track_mem(h_data, len * sizeof(float)); + llvm_visc_track_mem(h_indices, len * sizeof(int)); + llvm_visc_track_mem(h_perm, dim * sizeof(int)); + llvm_visc_track_mem(h_x_vector, dim * sizeof(float)); + llvm_visc_track_mem(h_ptr, depth * sizeof(int)); + llvm_visc_track_mem(h_nzcnt, nzcnt_len * sizeof(int)); + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + int i; + for (i = 0; i < 50; i++) { + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - //main execution + void *root_in = malloc(sizeof(RootIn)); + RootIn root_in_local = {h_Ax_vector, + dim * sizeof(float), + h_data, + len * sizeof(float), + h_indices, + len * sizeof(int), + h_perm, + dim * sizeof(int), + h_x_vector, + dim * sizeof(float), + dim, + h_ptr, + depth * sizeof(int), + h_nzcnt, + nzcnt_len * sizeof(int), + block, + (grid / block)}; + *(RootIn *)root_in = root_in_local; + void *spmvDFG = __visc__launch(0, spmvLvl3, root_in); + + __visc__wait(spmvDFG); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int i; - for(i=0; i<50; i++) - { - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - void* root_in = malloc(sizeof(RootIn)); - RootIn root_in_local = { - h_Ax_vector, dim * sizeof(float), - h_data, len * sizeof(float), - h_indices, len * sizeof(int), - h_perm, dim * sizeof(int), - h_x_vector, dim * sizeof(float), - 
dim, - h_ptr, depth * sizeof(int), - h_nzcnt, nzcnt_len * sizeof(int), - block, (grid/block) - }; - *(RootIn*)root_in = root_in_local; - void* spmvDFG = __visc__launch(0, spmvLvl3, root_in); - - __visc__wait(spmvDFG); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - /******************************* Issues ******************************* - * 1. Using OpenCL to compute grid and block dimensions - * (getting device info) - * We need to check the GPU version (major number) where this kernel - * executes to compare against opencl_nvidia version - * 2. Type of cl_mem buffer for d_x_vector is created with size of float, - but copied in through size of int. - Due to type of h_x_vector, I chose to use float - * (Minor) - * 3. Kernel initially used constant memory for last two arguments - removed - */ - } - - //HtoD memory copy - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(h_Ax_vector, dim*sizeof(float)); - - - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - - llvm_visc_untrack_mem(h_Ax_vector); - llvm_visc_untrack_mem(h_data); - llvm_visc_untrack_mem(h_indices); - llvm_visc_untrack_mem(h_perm); - llvm_visc_untrack_mem(h_x_vector); - llvm_visc_untrack_mem(h_ptr); - llvm_visc_untrack_mem(h_nzcnt); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + /******************************* Issues ******************************* + * 1. Using OpenCL to compute grid and block dimensions + * (getting device info) + * We need to check the GPU version (major number) where this kernel + * executes to compare against opencl_nvidia version + * 2. Type of cl_mem buffer for d_x_vector is created with size of float, + but copied in through size of int. + Due to type of h_x_vector, I chose to use float + * (Minor) + * 3. 
Kernel initially used constant memory for last two arguments - removed + */ + } + + // HtoD memory copy + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(h_Ax_vector, dim * sizeof(float)); + + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + + llvm_visc_untrack_mem(h_Ax_vector); + llvm_visc_untrack_mem(h_data); + llvm_visc_untrack_mem(h_indices); + llvm_visc_untrack_mem(h_perm); + llvm_visc_untrack_mem(h_x_vector); + llvm_visc_untrack_mem(h_ptr); + llvm_visc_untrack_mem(h_nzcnt); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + __visc__cleanup(); + + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Ax_vector, dim); + } - pb_PrintTimerSet(&timers); - __visc__cleanup(); + free(h_data); + free(h_indices); + free(h_ptr); + free(h_perm); + free(h_nzcnt); + free(h_Ax_vector); + free(h_x_vector); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Ax_vector,dim); + pb_FreeParameters(parameters); - } - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - - pb_FreeParameters(parameters); - - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc b/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc index bf6b6dc5b877951cf6da9b27cc0c2bf4007009a8..e94700924dba991140c6db55994f3e7805f90a29 100644 --- a/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc +++ b/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc @@ -1,43 +1,39 @@ #include <endian.h> -#include <stdlib.h> -#include <stdio.h> #include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0; i<dim; i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) { + srand(54321); + int i; + for (i = 0; i < dim; i++) { + x_vector[i] = (rand() / (float)RAND_MAX); + } + return 0; } -void outputData(char* fName, float *A0, int dim) -{ - FILE* fid = fopen(fName, "w"); - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - +void outputData(char *fName, float *A0, int dim) { + FILE *fid = fopen(fName, "w"); + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + fwrite(A0, sizeof(float), dim, fid); - fclose (fid); + fclose(fid); } -int main(int argc, char** argv) { +int main(int argc, char **argv) { int dim; dim = atoi(argv[1]); - char * writefn = argv[2]; - float *outV = (float*) malloc(dim * sizeof(float)); + char *writefn = argv[2]; + float *outV = (float *)malloc(dim * sizeof(float)); generate_vector(outV, dim); outputData(writefn, outV, dim); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h b/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c @@ -7,28 +7,25 @@ 
***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif - -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h index 91240cd5e45d4ed14f5d0e6e4d27818a1e5cf7bc..5b09962e164174e16f9bd0294b2fb61a42b2762f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h @@ -6,6 +6,6 @@ *cr ***************************************************************************/ -void inputData(char* fName, int* nx, int* ny, int* nz); +void inputData(char *fName, int *nx, int *ny, int *nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c index 00b3be9005b1cbb86aee7512d604a44a79191229..af00833058939f92ee5fb7e890240e47b025fb60 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c @@ -8,28 +8,22 
@@ #include "common.h" -void cpu_stencil(float c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz) -{ +void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx, + const int ny, const int nz) { int i, j, k; - for(i=1;i<nx-1;i++) - { - for(j=1;j<ny-1;j++) - { - for(k=1;k<nz-1;k++) - { - Anext[Index3D (nx, ny, i, j, k)] = - (A0[Index3D (nx, ny, i, j, k + 1)] + - A0[Index3D (nx, ny, i, j, k - 1)] + - A0[Index3D (nx, ny, i, j + 1, k)] + - A0[Index3D (nx, ny, i, j - 1, k)] + - A0[Index3D (nx, ny, i + 1, j, k)] + - A0[Index3D (nx, ny, i - 1, j, k)])*c1 - - A0[Index3D (nx, ny, i, j, k)]*c0; - } - } - } - + for (i = 1; i < nx - 1; i++) { + for (j = 1; j < ny - 1; j++) { + for (k = 1; k < nz - 1; k++) { + Anext[Index3D(nx, ny, i, j, k)] = (A0[Index3D(nx, ny, i, j, k + 1)] + + A0[Index3D(nx, ny, i, j, k - 1)] + + A0[Index3D(nx, ny, i, j + 1, k)] + + A0[Index3D(nx, ny, i, j - 1, k)] + + A0[Index3D(nx, ny, i + 1, j, k)] + + A0[Index3D(nx, ny, i - 1, j, k)]) * + c1 - + A0[Index3D(nx, ny, i, j, k)] * c0; + } + } + } } - - diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h index b6735126ac8bf905d9f89b846a580e247cef4cfa..68fb021719c17d6bab318472d9be6c83665d3ab4 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h @@ -6,6 +6,5 @@ *cr ***************************************************************************/ - - -void cpu_stencil(float c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz); +void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx, + const int ny, const int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c b/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c index 9aa7fa57ab7f73694320aacb4f80d2dc6ed3f333..583b65251a6dd4050c9dcd14166869fc6c219fa6 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c 
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c @@ -11,119 +11,103 @@ #include <stdlib.h> #include <string.h> -#include "file.h" #include "common.h" +#include "file.h" #include "kernels.h" - -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i, j, k; - for(i=0;i<nz;i++) - { - for(j=0;j<ny;j++) - { - for(k=0;k<nx;k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } - } - return 0; -} - -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - - - printf("CPU-based 7 points stencil codes****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and I-Jui Sung<sung10@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //declaration - int nx,ny,nz; - int size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz tx ty t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } + } + return 0; +} - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - - //host data - float *h_A0; - float *h_Anext; - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CPU-based 7 points stencil codes****\n"); + printf("Original version 
by Li-Wen Chang <lchang20@illinois.edu> and I-Jui " + "Sung<sung10@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // declaration + int nx, ny, nz; + int size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz tx ty t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } + + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); + read_data(h_A0, nx, ny, nz, fp); fclose(fp); - memcpy (h_Anext,h_A0 ,sizeof(float)*size); - - - + memcpy(h_Anext, h_A0, sizeof(float) * size); int t; - for(t=0;t<iteration;t++) - { - cpu_stencil(c0,c1, h_A0, h_Anext, nx, ny, nz); - float *temp=h_A0; + for (t = 0; t < iteration; t++) { + cpu_stencil(c0, c1, h_A0, h_Anext, nx, ny, nz); + float *temp = h_A0; h_A0 = h_Anext; h_Anext = temp; + } - } - - float *temp=h_A0; + float *temp = h_A0; h_A0 = h_Anext; h_Anext = temp; - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Anext,nx,ny,nz); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free (h_A0); - free (h_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - 
pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - return 0; + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h index d04cada2ace5b85770a16f26f2f639ebd9eb5248..465de41bc02d805fe9a3819bd305c94bb7c9f337 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h @@ -6,10 +6,11 @@ *cr ***************************************************************************/ - - - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} +#define CUERR \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc @@ -7,28 +7,25 @@ 
***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif - -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h index 903d42610b27bfe57703e4c032c5f508d9eb9cb3..5c7731cb348e5b3bbbe1aa1214d4f0061651e178 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h @@ -6,4 +6,4 @@ *cr ***************************************************************************/ -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h index d548f9da3219535d6ca72b0d3a702401e61416b9..57253d774a03e40a41b1b9d587e518fe13af265e 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h @@ -6,13 +6,14 @@ *cr ***************************************************************************/ +#define CUERR \ + { \ 
+ cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } - - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} - - -__global__ void block2D_hybrid_coarsen_x(float fac,float *A0,float *Anext, int nx, int ny, int nz); +__global__ void block2D_hybrid_coarsen_x(float fac, float *A0, float *Anext, + int nx, int ny, int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h index d04cada2ace5b85770a16f26f2f639ebd9eb5248..465de41bc02d805fe9a3819bd305c94bb7c9f337 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h @@ -6,10 +6,11 @@ *cr ***************************************************************************/ - - - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} +#define CUERR \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } diff --git 
a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc @@ -7,28 +7,25 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif - -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h index 903d42610b27bfe57703e4c032c5f508d9eb9cb3..5c7731cb348e5b3bbbe1aa1214d4f0061651e178 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h @@ -6,4 +6,4 @@ *cr ***************************************************************************/ -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h 
b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h index d04cada2ace5b85770a16f26f2f639ebd9eb5248..465de41bc02d805fe9a3819bd305c94bb7c9f337 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h @@ -6,10 +6,11 @@ *cr ***************************************************************************/ - - - -#define CUERR { cudaError_t err; \ - if ((err = cudaGetLastError()) != cudaSuccess) { \ - printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ - return -1; }} +#define CUERR \ + { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \ + return -1; \ + } \ + } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc @@ -7,28 +7,25 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is 
not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif - -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h index 903d42610b27bfe57703e4c032c5f508d9eb9cb3..5c7731cb348e5b3bbbe1aa1214d4f0061651e178 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h @@ -6,4 +6,4 @@ *cr ***************************************************************************/ -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c index 
91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c @@ -7,28 +7,25 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif - -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h index 91240cd5e45d4ed14f5d0e6e4d27818a1e5cf7bc..5b09962e164174e16f9bd0294b2fb61a42b2762f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h @@ -6,6 +6,6 @@ *cr ***************************************************************************/ -void inputData(char* fName, int* nx, int* ny, int* nz); +void inputData(char *fName, int *nx, int *ny, int *nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c 
b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c index 6619c8e1a8ee7e942e0937e7b2494fa5fc8fbf73..15ad898fb1d98bc7b02718ce268874c4c3a3c683 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c @@ -8,31 +8,25 @@ #include "common.h" -void cpu_stencil(float c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz) -{ - - int i; - #pragma omp parallel for - for(i=1;i<nx-1;i++) - { - int j,k; - for(j=1;j<ny-1;j++) - { - for(k=1;k<nz-1;k++) - { - //i #pragma omp critical - Anext[Index3D (nx, ny, i, j, k)] = - (A0[Index3D (nx, ny, i, j, k + 1)] + - A0[Index3D (nx, ny, i, j, k - 1)] + - A0[Index3D (nx, ny, i, j + 1, k)] + - A0[Index3D (nx, ny, i, j - 1, k)] + - A0[Index3D (nx, ny, i + 1, j, k)] + - A0[Index3D (nx, ny, i - 1, j, k)])*c1 - - A0[Index3D (nx, ny, i, j, k)]*c0; - } - } - } +void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx, + const int ny, const int nz) { + int i; +#pragma omp parallel for + for (i = 1; i < nx - 1; i++) { + int j, k; + for (j = 1; j < ny - 1; j++) { + for (k = 1; k < nz - 1; k++) { + // i #pragma omp critical + Anext[Index3D(nx, ny, i, j, k)] = (A0[Index3D(nx, ny, i, j, k + 1)] + + A0[Index3D(nx, ny, i, j, k - 1)] + + A0[Index3D(nx, ny, i, j + 1, k)] + + A0[Index3D(nx, ny, i, j - 1, k)] + + A0[Index3D(nx, ny, i + 1, j, k)] + + A0[Index3D(nx, ny, i - 1, j, k)]) * + c1 - + A0[Index3D(nx, ny, i, j, k)] * c0; + } + } + } } - - diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h index b6735126ac8bf905d9f89b846a580e247cef4cfa..68fb021719c17d6bab318472d9be6c83665d3ab4 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h @@ -6,6 +6,5 @@ *cr ***************************************************************************/ - - -void cpu_stencil(float 
c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz); +void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx, + const int ny, const int nz); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c index c363d8ce732fa4d1c343f1874e5e9a283c32344a..583b65251a6dd4050c9dcd14166869fc6c219fa6 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c @@ -11,116 +11,103 @@ #include <stdlib.h> #include <string.h> -#include "file.h" #include "common.h" +#include "file.h" #include "kernels.h" - -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i, j, k; - for(i=0;i<nz;i++) - { - for(j=0;j<ny;j++) - { - for(k=0;k<nx;k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } - } - return 0; -} - -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - - - printf("CPU-based 7 points stencil codes****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and I-Jui Sung<sung10@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //declaration - int nx,ny,nz; - int size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz tx ty t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } + } + return 0; +} - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = 
atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - - //host data - float *h_A0; - float *h_Anext; - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CPU-based 7 points stencil codes****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and I-Jui " + "Sung<sung10@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // declaration + int nx, ny, nz; + int size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz tx ty t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } + + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); + read_data(h_A0, nx, ny, nz, fp); fclose(fp); - memcpy (h_Anext,h_A0 ,sizeof(float)*size); + memcpy(h_Anext, h_A0, sizeof(float) * size); int t; - for(t=0;t<iteration;t++) - { - cpu_stencil(c0,c1, h_A0, h_Anext, nx, ny, nz); - float *temp=h_A0; + for (t = 0; t < iteration; t++) { + cpu_stencil(c0, c1, h_A0, h_Anext, nx, ny, nz); + float *temp = h_A0; h_A0 = h_Anext; h_Anext = temp; + } - } - - float *temp=h_A0; + float *temp = h_A0; 
h_A0 = h_Anext; h_Anext = temp; - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Anext,nx,ny,nz); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free (h_A0); - free (h_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - return 0; + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." 
+#error "File I/O is not implemented for this system: wrong endianness." #endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = 
fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c index 2fd7bb7b68cc3a61ad7e75c567b9998e6b151d96..ec47c22227648df094cbf03ea1b667943207207e 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c @@ -8,226 +8,224 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: 
%d\n", __LINE__); \ + exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = 
clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - /*cl_program clProgram;*/ - /*cl_kernel clKernel;*/ - - /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/ - const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") - - - //device - cl_mem d_A0; - cl_mem d_Anext; - - memcpy (h_Anext,h_A0,sizeof(float)*size); - - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = 
{(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); - - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - int t; - for(t=0; t<iteration; t++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - - } +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) 
+ return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + /*cl_program clProgram;*/ + /*cl_kernel clKernel;*/ + + /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", + * "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/ + const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, "-I src/opencl_base"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus); + CHECK_ERROR("clCreateKernel") + + // device + cl_mem d_A0; + cl_mem d_Anext; + + memcpy(h_Anext, h_A0, sizeof(float) * size); + + // memory 
allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + int t; + for (t = 0; t < iteration; t++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + clStatus 
= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + } + + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_PrintTimerSet(&timers); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + 
free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - 
exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c index c2cdd2d6264165b057b9620b4827eb23dff92b46..61382182d1c8b406a2e2ba9dee250327914dbac4 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c @@ -8,239 +8,241 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + 
exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = 
clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - cl_program clProgram; - cl_kernel clKernel; - - pb_CreateAndBuildKernelFromBinary("build/opencl_base_default_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); - //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - //sprintf(clOptions,"-I src/opencl_base"); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") - - //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - //CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //device - cl_mem d_A0; - cl_mem d_Anext; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memcpy (h_Anext,h_A0,sizeof(float)*size); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - 
CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int i =0;i <10; i++) { - int t; - for(t=0; t<iteration; t++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + 
unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - } - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + cl_program clProgram; + cl_kernel clKernel; + + pb_CreateAndBuildKernelFromBinary( + "build/opencl_base_default_default/kernel_offline.nvptx.s", + "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); + // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; + 
// cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") + + // char clOptions[50]; + // sprintf(clOptions,"-I src/opencl_base"); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") + + // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); + // CHECK_ERROR("clCreateKernel") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // device + cl_mem d_A0; + cl_mem d_Anext; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void 
*)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int i = 0; i < 10; i++) { + int t; + for (t = 0; t < iteration; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; + + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") + } - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + clStatus = clFinish(clCommandQueue); + 
CHECK_ERROR("clFinish") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc 
b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - 
fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c index 30b391963ce60b92cae0db3f2670ea49898f5196..217352e036b0d03bcc578286fd62c4339dedfe94 100644 --- 
a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c @@ -8,239 +8,241 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - 
return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - cl_program clProgram; - cl_kernel clKernel; - - pb_CreateAndBuildKernelFromBinary("build/opencl_base_large_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); - //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - //sprintf(clOptions,"-I src/opencl_base"); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") - - //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - //CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //device - cl_mem d_A0; - cl_mem d_Anext; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memcpy 
(h_Anext,h_A0,sizeof(float)*size); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int i =0;i <1; i++) { - int t; - for(t=0; t<iteration; t++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) 
- CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - } - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + 
CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + cl_program clProgram; + cl_kernel clKernel; + + pb_CreateAndBuildKernelFromBinary( + "build/opencl_base_large_default/kernel_offline.nvptx.s", "naive_kernel", + &clContext, &clDevice, &clProgram, &clKernel); + // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") + + // char clOptions[50]; + // sprintf(clOptions,"-I src/opencl_base"); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") + + // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); + // CHECK_ERROR("clCreateKernel") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // device + cl_mem d_A0; + cl_mem d_Anext; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + 
CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int i = 0; i < 1; i++) { + int t; + for (t = 0; t < iteration; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; + + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") + } - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - 
clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git 
a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - 
exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h index 85c998198e9ad26c4ac912439c533ec9ca4d7ada..0d2e87b0f14004d71ecedc86e822b0fdde8d6252 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h @@ -13,9 +13,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c index d62bf7fc6f2f21753fe2587d2c87e1ec2af6c47e..8456c3c5b8d0133b98df6150362068f704da5e1a 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c @@ -8,224 +8,227 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int 
read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0;i<nz;i++) - { - for(j=0;j<ny;j++) - { - for(k=0;k<nx;k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } - } - return 0; +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } + } + } + return 0; } +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // declaration + int nx, ny, nz; + int size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //declaration - int nx,ny,nz; - int size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - cl_int 
clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - /*cl_program clProgram; */ - /*cl_kernel clKernel; */ - - /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_opt_default/kernel.nvptx.s", "block2D_hybrid_coarsen_x", &clContext, &clDevice, &clProgram, &clKernel); */ - - const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base_opt"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"block2D_hybrid_coarsen_x",&clStatus); - CHECK_ERROR("clCreateKernel") - - //host data - float *h_A0; - float *h_Anext; - - //device - cl_mem d_A0; - cl_mem d_Anext; - - //load data from files - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - pb_SwitchToTimer(&timers, pb_TimerID_IO); + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, 
&clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + /*cl_program clProgram; */ + /*cl_kernel clKernel; */ + + /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_opt_default/kernel.nvptx.s", + * "block2D_hybrid_coarsen_x", &clContext, &clDevice, &clProgram, &clKernel); + */ + + const char *clSource[] = {readFile("src/opencl_base_opt/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, "-I src/opencl_base_opt"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = + clCreateKernel(clProgram, "block2D_hybrid_coarsen_x", &clStatus); + CHECK_ERROR("clCreateKernel") + + // host data + float *h_A0; + float *h_Anext; + + // device + cl_mem d_A0; + cl_mem d_Anext; + + // load data from files + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + pb_SwitchToTimer(&timers, pb_TimerID_IO); FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); + read_data(h_A0, nx, ny, nz, fp); fclose(fp); - memcpy (h_Anext,h_A0,sizeof(float)*size); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_A0 = 
clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use tx by ty threads - int tx = 32; - int ty = 4; - size_t block[3] = {tx,ty,1}; - - //also change threads size maping from tx by ty to 2tx x ty - size_t grid[3] = {(nx+tx*2-1)/(tx*2)*tx,(ny+ty-1)/ty*ty,1}; - -// int sh_size = tx*2*ty*sizeof(float); - - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); -// clStatus = clSetKernelArg(clKernel,7,sh_size,NULL); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int t; - for(t=0;t<iteration;t++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - cl_mem d_temp = d_A0; + memcpy(h_Anext, h_A0, sizeof(float) * size); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), 
+ NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use tx by ty threads + int tx = 32; + int ty = 4; + size_t block[3] = {tx, ty, 1}; + + // also change threads size maping from tx by ty to 2tx x ty + size_t grid[3] = {(nx + tx * 2 - 1) / (tx * 2) * tx, (ny + ty - 1) / ty * ty, + 1}; + + // int sh_size = tx*2*ty*sizeof(float); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + // clStatus = clSetKernelArg(clKernel,7,sh_size,NULL); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int t; + for (t = 0; t < iteration; t++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - - } - + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + } - cl_mem d_temp = d_A0; + cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext 
= d_temp; - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Anext,nx,ny,nz); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //free((void*)clSource[0]); - - free(h_A0); - free(h_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // free((void*)clSource[0]); + + free(h_A0); + free(h_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h 
b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h index 3b3a473143ce69a8da5a9bc303371f074781415d..9729c0b9dd70b4958c6e6c45469c4030ea427bd5 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h @@ -8,6 +8,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #define TCF 4 #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - 
exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c index f134bdc93350dd2c2abecb7e7f6d36c412d239b2..28c0e5fd7bf24ac79857b3488dc28f12b3c354df 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c @@ -8,234 +8,235 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + 
exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = 
clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - cl_program clProgram; - cl_kernel clKernel; - - pb_CreateAndBuildKernelFromBinary("build/opencl_base_strided_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); - //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - //sprintf(clOptions,"-I src/opencl_base"); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") - - //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - //CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //device - cl_mem d_A0; - cl_mem d_Anext; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memcpy (h_Anext,h_A0,sizeof(float)*size); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - 
CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256/TCF; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+TCF*tx-1)/(TCF*tx)*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + 
/*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + cl_program clProgram; + cl_kernel clKernel; + + pb_CreateAndBuildKernelFromBinary( + "build/opencl_base_strided_default/kernel_offline.nvptx.s", + "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); + // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") + + // char clOptions[50]; + // sprintf(clOptions,"-I src/opencl_base"); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") + + // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); + // CHECK_ERROR("clCreateKernel") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // device + cl_mem d_A0; + cl_mem d_Anext; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // memory allocation + d_A0 = clCreateBuffer(clContext, 
CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256 / TCF; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + TCF * tx - 1) / (TCF * tx) * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + int t; + for (t = 0; t < iteration; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int t; - for(t=0; 
t<iteration; t++) - { - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - } - cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + } + + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseMemObject(d_A0); + clStatus = 
clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_PrintTimerSet(&timers); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h index 042bd64a23d897959a4145e6d2b42df76053e74c..12a6d131c29067073fa79f09c4e6f91b8662969c 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h @@ -10,6 +10,6 @@ #define _COMMON_H_ //#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) // +3 for padding -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))+3) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)) + 3) #define TCF 4 #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc @@ -7,81 +7,70 
@@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + 
long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c index f387827795b21c00d382eea794f1ba823df05c20..f767f6a9d29094623296e012a6b2671954b0546a 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c @@ -8,234 +8,235 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include 
<parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - 
pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - cl_program clProgram; - cl_kernel clKernel; - - pb_CreateAndBuildKernelFromBinary("build/opencl_base_vec_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); - //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - //CHECK_ERROR("clCreateProgramWithSource") - - //char clOptions[50]; - //sprintf(clOptions,"-I src/opencl_base"); - //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - //CHECK_ERROR("clBuildProgram") - - //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - //CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //device - cl_mem d_A0; - cl_mem d_Anext; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memcpy (h_Anext,h_A0,sizeof(float)*size); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - 
- pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256/TCF; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+TCF*tx-1)/(TCF*tx)*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = 
atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + cl_program clProgram; + cl_kernel clKernel; + + pb_CreateAndBuildKernelFromBinary( + "build/opencl_base_vec_default/kernel_offline.nvptx.s", "naive_kernel", + &clContext, &clDevice, &clProgram, &clKernel); + // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; + // cl_program clProgram = + // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + // CHECK_ERROR("clCreateProgramWithSource") + + // char clOptions[50]; + // sprintf(clOptions,"-I src/opencl_base"); + // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + // CHECK_ERROR("clBuildProgram") + + // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); + // CHECK_ERROR("clCreateKernel") + + 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // device + cl_mem d_A0; + cl_mem d_Anext; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256 / TCF; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + TCF * tx - 1) / (TCF * tx) * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + int t; + for (t = 0; t < 
iteration; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int t; - for(t=0; t<iteration; t++) - { - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - } - cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + } + + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = 
clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_PrintTimerSet(&timers); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc index 
4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } 
+extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c index 3456ba2a64d942ec20116ade4513e6f9abe888c5..10626bed59111d3ded3429626463966914218a5c 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c @@ -8,239 
+8,240 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - 
h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlaformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(2,clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - /*cl_program clProgram;*/ - /*cl_kernel clKernel;*/ - - /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/ - const char* clSource[] = {readFile("src/opencl_cpu/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-I src/opencl_cpu"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //device - cl_mem d_A0; - cl_mem d_Anext; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memcpy (h_Anext,h_A0,sizeof(float)*size); - - 
pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = 
clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlaformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(2, clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + /*cl_program clProgram;*/ + /*cl_kernel clKernel;*/ + + /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", + * "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/ + const char *clSource[] = 
{readFile("src/opencl_cpu/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, "-I src/opencl_cpu"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus); + CHECK_ERROR("clCreateKernel") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // device + cl_mem d_A0; + cl_mem d_Anext; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 
2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + int t; + for (t = 0; t < iteration; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int t; - for(t=0; t<iteration; t++) - { - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - } - cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + } - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, 
visc_TimerID_SETUP); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_PrintTimerSet(&timers); + + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - //free((void*)clSource[0]); + // free((void*)clSource[0]); - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h +++ 
b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - 
exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c index 9d7457c39388f7652d53d7cd1b4872676b2831ae..1d03111f209173dfc2462cb274e1bb0ac56e9c8c 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c @@ -8,237 +8,238 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + 
exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlaformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties 
clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - /*cl_program clProgram;*/ - /*cl_kernel clKernel;*/ - - /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/ - const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") - - - //device - cl_mem d_A0; - cl_mem d_Anext; - - memcpy (h_Anext,h_A0,sizeof(float)*size); - - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread 
block - unsigned tx =256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); - /*printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);*/ - - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int t; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - /*for(int i=0; i<1; i++) {*/ - for(t=0; t<iteration; t++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - - } - /*}*/ +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid 
size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - /*clStatus = clFinish(clCommandQueue);*/ - /*pb_SwitchToTimer(&timers, pb_TimerID_NONE);*/ + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlaformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + /*cl_program clProgram;*/ + /*cl_kernel clKernel;*/ + + /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, + * &clDevice, &clProgram, &clKernel);*/ + const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; + cl_program clProgram = + 
clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, "-I src/opencl_base"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus); + CHECK_ERROR("clCreateKernel") + + // device + cl_mem d_A0; + cl_mem d_Anext; + + memcpy(h_Anext, h_A0, sizeof(float) * size); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + /*printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], + * grid[2], block[0], block[1], block[2]);*/ + + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = 
clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + int t; + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + /*for(int i=0; i<1; i++) {*/ + for (t = 0; t < iteration; t++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + } + /*}*/ - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + /*clStatus = clFinish(clCommandQueue);*/ + /*pb_SwitchToTimer(&timers, pb_TimerID_NONE);*/ - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - 
outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - /*free((void*)clSource[0]);*/ + /*free((void*)clSource[0]);*/ - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc @@ -7,81 +7,70 @@ 
***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." #endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + 
long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c index c9868918043b6cf815a823bd99ac12b62447209e..cf86734a8639ce38eb2b1ac8280582e7bde4531c 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c @@ -8,245 +8,248 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include 
<string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - 
fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlaformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - cl_program clProgram; - cl_kernel clKernel; - - pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); - /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/ - /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ - /*CHECK_ERROR("clCreateProgramWithSource")*/ - - /*char clOptions[50];*/ - /*sprintf(clOptions,"-I src/opencl_base");*/ - /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ - /*CHECK_ERROR("clBuildProgram")*/ - - /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/ - /*CHECK_ERROR("clCreateKernel")*/ - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //device - cl_mem d_A0; - cl_mem d_Anext; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memcpy (h_Anext,h_A0,sizeof(float)*size); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - 
CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); - printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int t; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int i=0; i<2; i++) { - for(t=0; t<iteration; t++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE);*/ - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - } + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlaformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + 
clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + cl_program clProgram; + cl_kernel clKernel; + + pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, + &clDevice, &clProgram, &clKernel); + /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/ + /*cl_program clProgram = + * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ + /*CHECK_ERROR("clCreateProgramWithSource")*/ + + /*char clOptions[50];*/ + /*sprintf(clOptions,"-I src/opencl_base");*/ + /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ + /*CHECK_ERROR("clBuildProgram")*/ + + /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/ + /*CHECK_ERROR("clCreateKernel")*/ + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // device + cl_mem d_A0; + cl_mem d_Anext; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 
0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z \n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], + grid[2], block[0], block[1], block[2]); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + int t; + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int i = 0; i < 2; i++) { + for (t = 0; t < iteration; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; + + /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } + } - clStatus = clFinish(clCommandQueue); - 
pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; + clStatus = clFinish(clCommandQueue); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - /*clStatus = clFinish(clCommandQueue);*/ - /*CHECK_ERROR("clFinish")*/ + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + /*clStatus = clFinish(clCommandQueue);*/ + /*CHECK_ERROR("clFinish")*/ - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_PrintTimerSet(&timers); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ 
- /*free((void*)clSource[0]);*/ + /*free((void*)clSource[0]);*/ - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - 
exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c index 88fc557e6e7f7e9796e081d1dd63e52aea102ac8..3b009e370e284a5b5b705bcc3a8122547a83c177 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c @@ -8,245 +8,248 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int 
read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - - //declaration - unsigned nx,ny,nz; - unsigned size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlaformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = 
{CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - cl_program clProgram; - cl_kernel clKernel; - - pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel); - /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/ - /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ - /*CHECK_ERROR("clCreateProgramWithSource")*/ - - /*char clOptions[50];*/ - /*sprintf(clOptions,"-I src/opencl_base");*/ - /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ - /*CHECK_ERROR("clBuildProgram")*/ - - /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/ - /*CHECK_ERROR("clCreateKernel")*/ - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //device - cl_mem d_A0; - cl_mem d_Anext; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - memcpy (h_Anext,h_A0,sizeof(float)*size); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use 1D thread block - unsigned tx =256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; -// printf("block x is %d and y is %d z \n",block[0],block[1]); -// printf("grid x is %d and y is %d\n",grid[0],grid[1]); - printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - int t; - pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - for(int i=0; i<1; i++) { - for(t=0; t<iteration; t++) - { - /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - //printf("iteration %d\n",t) - CHECK_ERROR("clEnqueueNDRangeKernel") - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; - - /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ +int main(int argc, char **argv) { + struct pb_TimerSet 
timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + // declaration + unsigned nx, ny, nz; + unsigned size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } - } + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlaformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + + cl_device_id clDevice; + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + 
CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + cl_program clProgram; + cl_kernel clKernel; + + pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, + &clDevice, &clProgram, &clKernel); + /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/ + /*cl_program clProgram = + * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ + /*CHECK_ERROR("clCreateProgramWithSource")*/ + + /*char clOptions[50];*/ + /*sprintf(clOptions,"-I src/opencl_base");*/ + /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/ + /*CHECK_ERROR("clBuildProgram")*/ + + /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/ + /*CHECK_ERROR("clCreateKernel")*/ + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // device + cl_mem d_A0; + cl_mem d_Anext; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use 1D thread block + unsigned tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + // printf("block x is %d and y is %d z 
\n",block[0],block[1]); + // printf("grid x is %d and y is %d\n",grid[0],grid[1]); + printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], + grid[2], block[0], block[1], block[2]); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + int t; + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + for (int i = 0; i < 1; i++) { + for (t = 0; t < iteration; t++) { + /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + // printf("iteration %d\n",t) + CHECK_ERROR("clEnqueueNDRangeKernel") + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; + + /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ } + } - clStatus = clFinish(clCommandQueue); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - cl_mem d_temp = d_A0; - d_A0 = d_Anext; - d_Anext = d_temp; + clStatus = clFinish(clCommandQueue); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - /*clStatus = clFinish(clCommandQueue);*/ - /*CHECK_ERROR("clFinish")*/ + cl_mem d_temp = d_A0; + d_A0 = d_Anext; + d_Anext = d_temp; - pb_SwitchToTimer(&timers, pb_TimerID_COPY); 
- clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + /*clStatus = clFinish(clCommandQueue);*/ + /*CHECK_ERROR("clFinish")*/ - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_PrintTimerSet(&timers); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - /*free((void*)clSource[0]);*/ + /*free((void*)clSource[0]);*/ - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h index 
33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc index c690d13171488dbb05c3e707639373d2e89bf18d..4f8b0e2660704b90733827d55f835d408ef2b0da 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc @@ -7,64 +7,56 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } - - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); - - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } - - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - exit(1); - } - - buffer[size] = 0; - fclose(fp); - return buffer; +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } + + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); + + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } + + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } + + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h index 124fef8e655943e5e135fc4f189e9b70c552ce40..daf3b5e161194f2e2fda4c336651cbde7d1dee27 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h +++ 
b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h @@ -13,7 +13,8 @@ extern "C" #endif -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); + void + outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c index 76d651402d873ab0f9be9fcd28138024f64e3e3c..7116ca362087b6f95f99f8e2e0a9af1fbe0ddd24 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c @@ -8,215 +8,215 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0;i<nz;i++) - { - for(j=0;j<ny;j++) - { - for(k=0;k<nx;k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } - } - return 0; +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } + } + } + return 0; } +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + 
parameters = pb_ReadParameters(&argc, argv); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // declaration + int nx, ny, nz; + int size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time"); + return -1; + } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //declaration - int nx,ny,nz; - int size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = 
atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_fermi/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-I src/opencl_fermi"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"block2D_reg_tiling",&clStatus); - CHECK_ERROR("clCreateKernel") - - //host data - float *h_A0; - float *h_Anext; - - //device - cl_mem d_A0; - cl_mem d_Anext; - - //load data from files - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - pb_SwitchToTimer(&timers, pb_TimerID_IO); + const char *clSource[] = {readFile("src/opencl_fermi/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, "-I src/opencl_fermi"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + 
CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = + clCreateKernel(clProgram, "block2D_reg_tiling", &clStatus); + CHECK_ERROR("clCreateKernel") + + // host data + float *h_A0; + float *h_Anext; + + // device + cl_mem d_A0; + cl_mem d_Anext; + + // load data from files + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + pb_SwitchToTimer(&timers, pb_TimerID_IO); FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); + read_data(h_A0, nx, ny, nz, fp); fclose(fp); - memcpy (h_Anext,h_A0,sizeof(float)*size); - - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use tx-by-ty threads - size_t tx = 512; - size_t ty = 2; - size_t block[3] = {tx,ty,1}; - size_t grid[3] = {(nx+tx-1)/tx*block[0],(ny+ty-1)/ty*block[1],1}; - - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - CHECK_ERROR("clSetKernelArg") - - //main execution - 
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int t; - for(t=0;t<iteration;t++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") + memcpy(h_Anext, h_A0, sizeof(float) * size); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use tx-by-ty threads + size_t tx = 512; + size_t ty = 2; + size_t block[3] = {tx, ty, 1}; + size_t grid[3] = {(nx + tx - 1) / tx * block[0], + (ny + ty - 1) / ty * block[1], 1}; + + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int t; + for (t = 0; t < iteration; t++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") cl_mem d_temp = d_A0; d_A0 = d_Anext; 
d_Anext = d_temp; - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - - } + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + } cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_A0); + free(h_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Anext,nx,ny,nz); - - } - 
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free(h_A0); - free(h_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - 
exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h index 85c998198e9ad26c4ac912439c533ec9ca4d7ada..0d2e87b0f14004d71ecedc86e822b0fdde8d6252 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h @@ -13,9 +13,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c index d5ff1e913a6f4a098b563e8d24fa1cf2550ecf8c..526666c45c10a077407e6162498fcef8fd4159c2 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c @@ -8,225 +8,227 @@ ***************************************************************************/ #include <CL/cl.h> +#include <parboil.h> #include <stdio.h> #include <stdlib.h>i #include <string.h> -#include <parboil.h> -#include "file.h" #include "common.h" +#include "file.h" -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } -static int read_data(float *A0, 
int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0;i<nz;i++) - { - for(j=0;j<ny;j++) - { - for(k=0;k<nx;k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } - } - return 0; +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } + } + } + return 0; } +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // declaration + int nx, ny, nz; + int size; + int iteration; + float c0 = 1.0f / 6.0f; + float c1 = 1.0f / 6.0f / 6.0f; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //declaration - int nx,ny,nz; - int size; - int iteration; - float c0=1.0f/6.0f; - float c1=1.0f/6.0f/6.0f; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - cl_int clStatus; - 
cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlaformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,"-I src/opencl_nvidia"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"block2D_hybrid_coarsen_x",&clStatus); - CHECK_ERROR("clCreateKernel") - - // get local memory size [can be removed] - cl_ulong local_mem_size; - clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0); - printf("Scratchpad Size = %lu\n", local_mem_size); - - //host data - float *h_A0; - float *h_Anext; - - //device - cl_mem d_A0; - cl_mem d_Anext; - - //load data from files - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - pb_SwitchToTimer(&timers, pb_TimerID_IO); + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); + 
CHECK_ERROR("clGetPlaformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions, "-I src/opencl_nvidia"); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = + clCreateKernel(clProgram, "block2D_hybrid_coarsen_x", &clStatus); + CHECK_ERROR("clCreateKernel") + + // get local memory size [can be removed] + cl_ulong local_mem_size; + clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), + &local_mem_size, 0); + printf("Scratchpad Size = %lu\n", local_mem_size); + + // host data + float *h_A0; + float *h_Anext; + + // device + cl_mem d_A0; + cl_mem d_Anext; + + // load data from files + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + pb_SwitchToTimer(&timers, pb_TimerID_IO); FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); + read_data(h_A0, nx, ny, nz, fp); fclose(fp); - memcpy (h_Anext,h_A0,sizeof(float)*size); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - 
CHECK_ERROR("clCreateBuffer") - d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //only use tx by ty threads - int tx = 32; - int ty = 4; - size_t block[3] = {tx,ty,1}; - - //also change threads size maping from tx by ty to 2tx x ty - size_t grid[3] = {(nx+tx*2-1)/(tx*2)*tx,(ny+ty-1)/ty*ty,1}; - - int sh_size = tx*2*ty*sizeof(float); - printf("Scratchpad Size Required = %d\n", sh_size); - - clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); - clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); - clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); - clStatus = clSetKernelArg(clKernel,7,sh_size,NULL); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int t; - for(t=0;t<iteration;t++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - - cl_mem d_temp = d_A0; + memcpy(h_Anext, h_A0, sizeof(float) * size); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + // memory allocation + d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), + NULL, &clStatus); + 
CHECK_ERROR("clCreateBuffer") + + // memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0, + size * sizeof(float), h_A0, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // only use tx by ty threads + int tx = 32; + int ty = 4; + size_t block[3] = {tx, ty, 1}; + + // also change threads size maping from tx by ty to 2tx x ty + size_t grid[3] = {(nx + tx * 2 - 1) / (tx * 2) * tx, (ny + ty - 1) / ty * ty, + 1}; + + int sh_size = tx * 2 * ty * sizeof(float); + printf("Scratchpad Size Required = %d\n", sh_size); + + clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); + clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx); + clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz); + clStatus = clSetKernelArg(clKernel, 7, sh_size, NULL); + CHECK_ERROR("clSetKernelArg") + + // main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int t; + for (t = 0; t < iteration; t++) { + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, + block, 0, NULL, NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + cl_mem d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); - - } - + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); + clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); + } - cl_mem d_temp = d_A0; + cl_mem 
d_temp = d_A0; d_A0 = d_Anext; d_Anext = d_temp; - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseMemObject(d_A0); - clStatus = clReleaseMemObject(d_Anext); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - CHECK_ERROR("clReleaseContext") - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Anext,nx,ny,nz); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free(h_A0); - free(h_Anext); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0, + size * sizeof(float), h_Anext, 0, NULL, NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + clStatus = clReleaseMemObject(d_A0); + clStatus = clReleaseMemObject(d_Anext); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + CHECK_ERROR("clReleaseContext") + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + free((void *)clSource[0]); + + free(h_A0); + free(h_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h 
b/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h @@ -8,5 +8,5 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k))) #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc @@ -7,81 +7,70 @@ ***************************************************************************/ #include <endian.h> -#include <stdlib.h> +#include <inttypes.h> #include <malloc.h> #include <stdio.h> -#include <inttypes.h> +#include <stdlib.h> #if __BYTE_ORDER != __LITTLE_ENDIAN -# error "File I/O is not implemented for this system: wrong endianness." +#error "File I/O is not implemented for this system: wrong endianness." 
#endif -extern "C" -void inputData(char* fName, int* nx, int* ny, int* nz) -{ - FILE* fid = fopen(fName, "r"); +extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) { + FILE *fid = fopen(fName, "r"); + + if (fid == NULL) { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } - if (fid == NULL) - { - fprintf(stderr, "Cannot open input file\n"); - exit(-1); - } - - fread(nx, sizeof(int ),1,fid); - fread(ny, sizeof(int ),1,fid); - fread(nz, sizeof(int ),1,fid); - fclose (fid); + fread(nx, sizeof(int), 1, fid); + fread(ny, sizeof(int), 1, fid); + fread(nz, sizeof(int), 1, fid); + fclose(fid); } -extern "C" -void outputData(char* fName, float *h_A0,int nx,int ny,int nz) -{ - FILE* fid = fopen(fName, "w"); +extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) { + FILE *fid = fopen(fName, "w"); uint32_t tmp32; - if (fid == NULL) - { - fprintf(stderr, "Cannot open output file\n"); - exit(-1); - } - tmp32 = nx*ny*nz; + if (fid == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = nx * ny * nz; fwrite(&tmp32, sizeof(uint32_t), 1, fid); fwrite(h_A0, sizeof(float), tmp32, fid); - fclose (fid); + fclose(fid); } -extern "C" -char* readFile(const char* fileName) - { - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error 1!\n"); - exit(1); - } +extern "C" char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error 1!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error 2!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error 3!\n"); - fclose(fp); - 
exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h @@ -12,9 +12,9 @@ extern "C" { #endif -void inputData(char* fName, int* nx, int* ny, int* nz); -void outputData(char* fName, float *h_A0,int nx,int ny,int nz); -char* readFile(const char* fileName); +void inputData(char *fName, int *nx, int *ny, int *nz); +void outputData(char *fName, float *h_A0, int nx, int ny, int nz); +char *readFile(const char *fileName); #ifdef __cplusplus } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp index 9ecba96aed5642a4babaea8667576c25c1e4fb1f..5672a3ee490917d1374783eae5ab0ba1956ef441 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp @@ -7,283 +7,276 @@ *cr ***************************************************************************/ +#include "common.h" +#include "file.h" +#include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> #include <visc.h> -#include "file.h" -#include "common.h" -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k 
< nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } typedef struct __attribute__((__packed__)) { - float c0, c1; - float* A0; size_t bytes_A0; - float* Anext; size_t bytes_Anext; - int nx, ny, nz; - size_t dim_X1, dim_Y1, dim_Z1; - size_t dim_X2, dim_Y2, dim_Z2; + float c0, c1; + float *A0; + size_t bytes_A0; + float *Anext; + size_t bytes_Anext; + int nx, ny, nz; + size_t dim_X1, dim_Y1, dim_Z1; + size_t dim_X2, dim_Y2, dim_Z2; } RootIn; -void naive_kernel(float c0, float c1, - float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, - int nx, int ny, int nz) -{ - __visc__hint(visc::DEVICE); - __visc__attributes(2, A0, Anext, 1, Anext); - - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); - - int lx = __visc__getNodeInstanceID_x(thisNode); - int ly = __visc__getNodeInstanceID_y(thisNode); - int lz = __visc__getNodeInstanceID_z(thisNode); - - int gx = __visc__getNodeInstanceID_x(parentNode); - int gy = __visc__getNodeInstanceID_y(parentNode); - int gz = __visc__getNodeInstanceID_z(parentNode); - - int gridx = __visc__getNumNodeInstances_x(thisNode); - int gridy = __visc__getNumNodeInstances_y(thisNode); - int gridz = __visc__getNumNodeInstances_z(thisNode); - - int i = gx * gridx + lx + 1; - int j = gy * gridy + ly + 1; - int k = gz * gridz + lz + 1; - - if(i<nx-1) - { - Anext[Index3D (nx, ny, i, j, k)] = c1 * - ( A0[Index3D (nx, ny, i, j, k + 1)] + - A0[Index3D (nx, ny, i, j, k - 1)] + - A0[Index3D (nx, ny, i, j + 1, k)] + - A0[Index3D (nx, ny, i, j - 1, k)] + - A0[Index3D (nx, ny, i + 1, j, k)] + - A0[Index3D (nx, ny, i - 1, j, k)] ) - - A0[Index3D (nx, ny, i, j, k)] * c0; - } +void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, + size_t bytes_Anext, int nx, int ny, int nz) { + __visc__hint(visc::DEVICE); + __visc__attributes(2, A0, Anext, 1, Anext); + + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + + 
int lx = __visc__getNodeInstanceID_x(thisNode); + int ly = __visc__getNodeInstanceID_y(thisNode); + int lz = __visc__getNodeInstanceID_z(thisNode); + + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); + int gz = __visc__getNodeInstanceID_z(parentNode); + + int gridx = __visc__getNumNodeInstances_x(thisNode); + int gridy = __visc__getNumNodeInstances_y(thisNode); + int gridz = __visc__getNumNodeInstances_z(thisNode); + + int i = gx * gridx + lx + 1; + int j = gy * gridy + ly + 1; + int k = gz * gridz + lz + 1; + + if (i < nx - 1) { + Anext[Index3D(nx, ny, i, j, k)] = c1 * (A0[Index3D(nx, ny, i, j, k + 1)] + + A0[Index3D(nx, ny, i, j, k - 1)] + + A0[Index3D(nx, ny, i, j + 1, k)] + + A0[Index3D(nx, ny, i, j - 1, k)] + + A0[Index3D(nx, ny, i + 1, j, k)] + + A0[Index3D(nx, ny, i - 1, j, k)]) - + A0[Index3D(nx, ny, i, j, k)] * c0; + } } -void stencilLvl1(float c0, float c1, - float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, - int nx, int ny, int nz, - size_t dim_X1, size_t dim_Y1, size_t dim_Z1) -{ - __visc__hint(visc::DEVICE); - __visc__attributes(2, A0, Anext, 1, Anext); - void* stencil_node = __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1); - __visc__bindIn(stencil_node, 0, 0, 0); - __visc__bindIn(stencil_node, 1, 1, 0); - __visc__bindIn(stencil_node, 2, 2, 0); - __visc__bindIn(stencil_node, 3, 3, 0); - __visc__bindIn(stencil_node, 4, 4, 0); - __visc__bindIn(stencil_node, 5, 5, 0); - __visc__bindIn(stencil_node, 6, 6, 0); - __visc__bindIn(stencil_node, 7, 7, 0); - __visc__bindIn(stencil_node, 8, 8, 0); +void stencilLvl1(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, + size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, + size_t dim_Y1, size_t dim_Z1) { + __visc__hint(visc::DEVICE); + __visc__attributes(2, A0, Anext, 1, Anext); + void *stencil_node = + __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1); + __visc__bindIn(stencil_node, 0, 0, 0); + 
__visc__bindIn(stencil_node, 1, 1, 0); + __visc__bindIn(stencil_node, 2, 2, 0); + __visc__bindIn(stencil_node, 3, 3, 0); + __visc__bindIn(stencil_node, 4, 4, 0); + __visc__bindIn(stencil_node, 5, 5, 0); + __visc__bindIn(stencil_node, 6, 6, 0); + __visc__bindIn(stencil_node, 7, 7, 0); + __visc__bindIn(stencil_node, 8, 8, 0); } -void stencilLvl2(float c0, float c1, - float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, - int nx, int ny, int nz, - size_t dim_X1, size_t dim_Y1, size_t dim_Z1, - size_t dim_X2, size_t dim_Y2, size_t dim_Z2) -{ - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, A0, Anext, 1, Anext); - void* stencil_node = __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2); - __visc__bindIn(stencil_node, 0, 0, 0); - __visc__bindIn(stencil_node, 1, 1, 0); - __visc__bindIn(stencil_node, 2, 2, 0); - __visc__bindIn(stencil_node, 3, 3, 0); - __visc__bindIn(stencil_node, 4, 4, 0); - __visc__bindIn(stencil_node, 5, 5, 0); - __visc__bindIn(stencil_node, 6, 6, 0); - __visc__bindIn(stencil_node, 7, 7, 0); - __visc__bindIn(stencil_node, 8, 8, 0); - __visc__bindIn(stencil_node, 9, 9, 0); - __visc__bindIn(stencil_node, 10, 10, 0); - __visc__bindIn(stencil_node, 11, 11, 0); +void stencilLvl2(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, + size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, + size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2, + size_t dim_Z2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, A0, Anext, 1, Anext); + void *stencil_node = + __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2); + __visc__bindIn(stencil_node, 0, 0, 0); + __visc__bindIn(stencil_node, 1, 1, 0); + __visc__bindIn(stencil_node, 2, 2, 0); + __visc__bindIn(stencil_node, 3, 3, 0); + __visc__bindIn(stencil_node, 4, 4, 0); + __visc__bindIn(stencil_node, 5, 5, 0); + __visc__bindIn(stencil_node, 6, 6, 0); + __visc__bindIn(stencil_node, 7, 7, 0); + __visc__bindIn(stencil_node, 8, 8, 0); + 
__visc__bindIn(stencil_node, 9, 9, 0); + __visc__bindIn(stencil_node, 10, 10, 0); + __visc__bindIn(stencil_node, 11, 11, 0); } -void stencilLvl3(float c0, float c1, - float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, - int nx, int ny, int nz, - size_t dim_X1, size_t dim_Y1, size_t dim_Z1, - size_t dim_X2, size_t dim_Y2, size_t dim_Z2) -{ - __visc__hint(visc::CPU_TARGET); - __visc__attributes(2, A0, Anext, 1, Anext); - void* stencil_node = __visc__createNodeND(0, stencilLvl2); - __visc__bindIn(stencil_node, 0, 0, 0); - __visc__bindIn(stencil_node, 1, 1, 0); - __visc__bindIn(stencil_node, 2, 2, 0); - __visc__bindIn(stencil_node, 3, 3, 0); - __visc__bindIn(stencil_node, 4, 4, 0); - __visc__bindIn(stencil_node, 5, 5, 0); - __visc__bindIn(stencil_node, 6, 6, 0); - __visc__bindIn(stencil_node, 7, 7, 0); - __visc__bindIn(stencil_node, 8, 8, 0); - __visc__bindIn(stencil_node, 9, 9, 0); - __visc__bindIn(stencil_node, 10, 10, 0); - __visc__bindIn(stencil_node, 11, 11, 0); - __visc__bindIn(stencil_node, 12, 12, 0); - __visc__bindIn(stencil_node, 13, 13, 0); - __visc__bindIn(stencil_node, 14, 14, 0); +void stencilLvl3(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, + size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, + size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2, + size_t dim_Z2) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, A0, Anext, 1, Anext); + void *stencil_node = __visc__createNodeND(0, stencilLvl2); + __visc__bindIn(stencil_node, 0, 0, 0); + __visc__bindIn(stencil_node, 1, 1, 0); + __visc__bindIn(stencil_node, 2, 2, 0); + __visc__bindIn(stencil_node, 3, 3, 0); + __visc__bindIn(stencil_node, 4, 4, 0); + __visc__bindIn(stencil_node, 5, 5, 0); + __visc__bindIn(stencil_node, 6, 6, 0); + __visc__bindIn(stencil_node, 7, 7, 0); + __visc__bindIn(stencil_node, 8, 8, 0); + __visc__bindIn(stencil_node, 9, 9, 0); + __visc__bindIn(stencil_node, 10, 10, 0); + __visc__bindIn(stencil_node, 11, 11, 0); + 
__visc__bindIn(stencil_node, 12, 12, 0); + __visc__bindIn(stencil_node, 13, 13, 0); + __visc__bindIn(stencil_node, 14, 14, 0); } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - //declaration - int nx,ny,nz; - size_t size; - int iteration; - float c0=1.0/6.0; - float c1=1.0/6.0/6.0; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - - //load data from files - - size=nx*ny*nz; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - - - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - __visc__init(); - - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(h_A0, sizeof(float)*size); - llvm_visc_track_mem(h_Anext, sizeof(float)*size); +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + // declaration + int nx, ny, nz; + size_t size; + int iteration; + float c0 = 1.0 / 6.0; + float c1 = 1.0 / 6.0 / 6.0; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" 
+ "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } + + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + + // load data from files + + size = nx * ny * nz; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + __visc__init(); + + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(h_A0, sizeof(float) * size); + llvm_visc_track_mem(h_Anext, sizeof(float) * size); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + memcpy(h_Anext, h_A0, sizeof(float) * size); + + // only use 1D thread block + size_t tx = 256; + size_t block[3] = {tx, 1, 1}; + size_t grid[3] = {((unsigned)nx - 2 + tx - 1) / tx * tx, (unsigned)ny - 2, + (unsigned)nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + + printf("grid(%ld, %ld, %ld), block(%ld, %ld, %ld)\n", grid[0], grid[1], + grid[2], block[0], block[1], block[2]); + // main execution + + int t; + size_t bytes = size * sizeof(float); + printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126, 1, 1)]); + printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]); + for (t = 0; t < iteration; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + void *root_in = malloc(sizeof(RootIn)); + RootIn root_in_local = {c0, + c1, + h_A0, + bytes, + h_Anext, + bytes, + nx, + ny, + nz, + block[0], + block[1], + block[2], + grid[0] / block[0], + grid[1] / block[1], + grid[2] / block[2]}; + *(RootIn *)root_in = root_in_local; + void *stencilDFG = __visc__launch(0, stencilLvl3, root_in); + + 
__visc__wait(stencilDFG); + // printf("iteration %d\n",t); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - memcpy (h_Anext,h_A0,sizeof(float)*size); - - - //only use 1D thread block - size_t tx = 256; - size_t block[3] = {tx,1,1}; - size_t grid[3] = {((unsigned)nx-2+tx-1)/tx*tx,(unsigned)ny-2,(unsigned)nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; - - printf("grid(%ld, %ld, %ld), block(%ld, %ld, %ld)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - //main execution - - int t; - size_t bytes = size*sizeof(float); - printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126,1,1)]); - printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125,1,1)]); - for(t=0; t<iteration; t++) - { - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - void* root_in = malloc(sizeof(RootIn)); - RootIn root_in_local = { - c0, c1, - h_A0, bytes, - h_Anext, bytes, - nx, ny, nz, - block[0], block[1], block[2], - grid[0]/block[0], grid[1]/block[1], grid[2]/block[2] - }; - *(RootIn*)root_in = root_in_local; - void* stencilDFG = __visc__launch(0, stencilLvl3, root_in); - - __visc__wait(stencilDFG); - //printf("iteration %d\n",t); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - float* h_temp = h_A0; - h_A0 = h_Anext; - h_Anext = h_temp; - } - - - float* h_temp = h_A0; + float *h_temp = h_A0; h_A0 = h_Anext; h_Anext = h_temp; - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(h_Anext, bytes); - printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126,1,1)]); - printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125,1,1)]); - - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + } - llvm_visc_untrack_mem(h_A0); - llvm_visc_untrack_mem(h_Anext); + float *h_temp = h_A0; + h_A0 = h_Anext; + h_Anext = h_temp; + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(h_Anext, bytes); + printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]); + printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]); - 
pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - __visc__cleanup(); + llvm_visc_untrack_mem(h_A0); + llvm_visc_untrack_mem(h_Anext); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext,nx,ny,nz); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - } - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); + __visc__cleanup(); - return 0; + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h index 042bd64a23d897959a4145e6d2b42df76053e74c..12a6d131c29067073fa79f09c4e6f91b8662969c 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h @@ -10,6 +10,6 @@ #define _COMMON_H_ //#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) // +3 for padding -#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))+3) +#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)) + 3) #define TCF 4 #endif diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c index 28fb87ac479a9270fbda08f958017fbf495130c1..bb6e45c932a68d951f5559bd856017ecf71aade6 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c @@ -7,179 +7,170 @@ *cr ***************************************************************************/ +#include "common.h" +#include "file.h" +#include 
<parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <parboil.h> #include <visc.h> -#include "file.h" -#include "common.h" -static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) -{ - int s=0; - int i,j,k; - for(i=0; i<nz; i++) - { - for(j=0; j<ny; j++) - { - for(k=0; k<nx; k++) - { - fread(A0+s,sizeof(float),1,fp); - s++; - } - } +static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { + int s = 0; + int i, j, k; + for (i = 0; i < nz; i++) { + for (j = 0; j < ny; j++) { + for (k = 0; k < nx; k++) { + fread(A0 + s, sizeof(float), 1, fp); + s++; + } } - return 0; + } + return 0; } -void naive_kernel(float c0,float c1, float* A0, float *Anext,int nx,int ny,int nz) -{ - __visc__attributes(2, A0, Anext, 1, Anext); - int i = get_global_id(0)+1; - int j = get_global_id(1)+1; - int k = get_global_id(2)+1; - - if(i<nx-1) - { - Anext[Index3D (nx, ny, i, j, k)] = c1 * - ( A0[Index3D (nx, ny, i, j, k + 1)] + - A0[Index3D (nx, ny, i, j, k - 1)] + - A0[Index3D (nx, ny, i, j + 1, k)] + - A0[Index3D (nx, ny, i, j - 1, k)] + - A0[Index3D (nx, ny, i + 1, j, k)] + - A0[Index3D (nx, ny, i - 1, j, k)] ) - - A0[Index3D (nx, ny, i, j, k)] * c0; - } +void naive_kernel(float c0, float c1, float *A0, float *Anext, int nx, int ny, + int nz) { + __visc__attributes(2, A0, Anext, 1, Anext); + int i = get_global_id(0) + 1; + int j = get_global_id(1) + 1; + int k = get_global_id(2) + 1; + + if (i < nx - 1) { + Anext[Index3D(nx, ny, i, j, k)] = c1 * (A0[Index3D(nx, ny, i, j, k + 1)] + + A0[Index3D(nx, ny, i, j, k - 1)] + + A0[Index3D(nx, ny, i, j + 1, k)] + + A0[Index3D(nx, ny, i, j - 1, k)] + + A0[Index3D(nx, ny, i + 1, j, k)] + + A0[Index3D(nx, ny, i - 1, j, k)]) - + A0[Index3D(nx, ny, i, j, k)] * c0; + } } -int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated 7 points stencil codes****\n"); - printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); - parameters = 
pb_ReadParameters(&argc, argv); - - /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - - //declaration - int nx,ny,nz; - size_t size; - int iteration; - float c0=1.0/6.0; - float c1=1.0/6.0/6.0; - - if (argc<5) - { - printf("Usage: probe nx ny nz t\n" - "nx: the grid size x\n" - "ny: the grid size y\n" - "nz: the grid size z\n" - "t: the iteration time\n"); - return -1; - } - - nx = atoi(argv[1]); - if (nx<1) - return -1; - ny = atoi(argv[2]); - if (ny<1) - return -1; - nz = atoi(argv[3]); - if (nz<1) - return -1; - iteration = atoi(argv[4]); - if(iteration<1) - return -1; - - //host data - float *h_A0; - float *h_Anext; - - //load data from files - - size=nx*ny*nz; - - // Padding in the beginning to get aligned loads and stores - size = size+3; - - h_A0=(float*)malloc(sizeof(float)*size); - h_Anext=(float*)malloc(sizeof(float)*size); - - - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - FILE *fp = fopen(parameters->inpFiles[0], "rb"); - read_data(h_A0+3, nx,ny,nz,fp); - fclose(fp); - - pb_InitializeTimerSet(&timers); - __visc__init(); - - pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); - llvm_visc_track_mem(h_A0, sizeof(float)*size); - llvm_visc_track_mem(h_Anext, sizeof(float)*size); - +int main(int argc, char **argv) { + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated 7 points stencil codes****\n"); + printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + // declaration + int nx, ny, nz; + size_t size; + int iteration; + float c0 = 1.0 / 6.0; + float c1 = 1.0 / 6.0 / 6.0; + + if (argc < 5) { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } + + nx = atoi(argv[1]); + if (nx < 1) + return -1; + ny = atoi(argv[2]); + if (ny < 1) + return -1; + nz = atoi(argv[3]); + if (nz < 1) + return -1; + 
iteration = atoi(argv[4]); + if (iteration < 1) + return -1; + + // host data + float *h_A0; + float *h_Anext; + + // load data from files + + size = nx * ny * nz; + + // Padding in the beginning to get aligned loads and stores + size = size + 3; + + h_A0 = (float *)malloc(sizeof(float) * size); + h_Anext = (float *)malloc(sizeof(float) * size); + + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + FILE *fp = fopen(parameters->inpFiles[0], "rb"); + read_data(h_A0 + 3, nx, ny, nz, fp); + fclose(fp); + + pb_InitializeTimerSet(&timers); + __visc__init(); + + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(h_A0, sizeof(float) * size); + llvm_visc_track_mem(h_Anext, sizeof(float) * size); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + memcpy(h_Anext, h_A0, sizeof(float) * size); + + // only use 1D thread block + int tx = 256 / TCF; + int block[3] = {tx, 1, 1}; + int grid[3] = {(nx - 2 + TCF * tx - 1) / (TCF * tx) * tx, ny - 2, nz - 2}; + // size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1, 1, 1}; + + printf("grid(%d, %d, %d), block(%d, %d, %d)\n", grid[0], grid[1], grid[2], + block[0], block[1], block[2]); + // main execution + + int t; + size_t bytes = size * sizeof(float); + printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126, 1, 1)]); + printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]); + for (t = 0; t < iteration; t++) { + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + unsigned stencilDFG = __visc__node( + naive_kernel, 2, 3, block[0], block[1], block[2], grid[0] / block[0], + grid[1] / block[1], grid[2] / block[2], 9, (float)c0, (float)c1, h_A0, + bytes, h_Anext, bytes, nx, ny, nz, 0); + __visc__wait(stencilDFG); + // printf("iteration %d\n",t); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - memcpy (h_Anext,h_A0,sizeof(float)*size); - - - //only use 1D thread block - int tx =256/TCF; - int block[3] = {tx,1,1}; - int grid[3] = {(nx-2+TCF*tx-1)/(TCF*tx)*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - 
size_t offset[3] = {1,1,1}; - - printf("grid(%d, %d, %d), block(%d, %d, %d)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - //main execution - - int t; - size_t bytes = size*sizeof(float); - printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126,1,1)]); - printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125,1,1)]); - for(t=0; t<iteration; t++) - { - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - unsigned stencilDFG = __visc__node(naive_kernel, 2, 3, block[0], block[1], block[2], grid[0]/block[0], grid[1]/block[1], grid[2]/block[2], 9, (float)c0, (float)c1, h_A0, bytes, h_Anext, bytes, nx, ny, nz, 0); - __visc__wait(stencilDFG); - //printf("iteration %d\n",t); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - float* h_temp = h_A0; - h_A0 = h_Anext; - h_Anext = h_temp; - - } - - - float* h_temp = h_A0; + float *h_temp = h_A0; h_A0 = h_Anext; h_Anext = h_temp; - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_visc_request_mem(h_Anext, bytes); - printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126,1,1)]); - printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125,1,1)]); + } - pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + float *h_temp = h_A0; + h_A0 = h_Anext; + h_Anext = h_temp; + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + llvm_visc_request_mem(h_Anext, bytes); + printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]); + printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]); - llvm_visc_untrack_mem(h_A0); - llvm_visc_untrack_mem(h_Anext); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); + llvm_visc_untrack_mem(h_A0); + llvm_visc_untrack_mem(h_Anext); - __visc__cleanup(); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); - if (parameters->outFile) { - /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ - outputData(parameters->outFile,h_Anext+3,nx,ny,nz); + __visc__cleanup(); - } - /*pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE);*/ - free(h_A0); - free(h_Anext); - pb_FreeParameters(parameters); - - return 0; + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile, h_Anext + 3, nx, ny, nz); + } + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + free(h_A0); + free(h_Anext); + pb_FreeParameters(parameters); + + return 0; } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + 
args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c index 2bafdf4580c5a6f4402cf40991c93bffcf8ce3ee..da9b51a7202148e43f9b5bf51156b0d651473571 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c @@ -5,18 +5,17 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <strings.h> -#include <unistd.h> #include <sys/time.h> -#include <math.h> +#include <unistd.h> #include "args.h" #include "model.h" -int main( int argc, char **argv ) -{ +int main(int argc, char **argv) { struct pb_TimerSet timers; struct pb_Parameters *params; int rf, k, nbins, npd, npr; @@ -26,77 +25,65 @@ int main( int argc, char **argv ) struct cartesian *data, *random; FILE *outfile; - pb_InitializeTimerSet( &timers ); - params = pb_ReadParameters( &argc, argv ); + pb_InitializeTimerSet(&timers); + params = pb_ReadParameters(&argc, argv); options 
args; - parse_args( argc, argv, &args ); - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - - log10(min_arcmin))); - memsize = (nbins+2)*sizeof(long long); - + parse_args(argc, argv, &args); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - log10(min_arcmin))); + memsize = (nbins + 2) * sizeof(long long); + // memory for bin boundaries - binb = (float *)malloc((nbins+1)*sizeof(float)); - if (binb == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } - for (k = 0; k < nbins+1; k++) - { - binb[k] = cos(pow(10, log10(min_arcmin) + - k*1.0/bins_per_dec) / 60.0*D2R); - } - + binb = (float *)malloc((nbins + 1) * sizeof(float)); + if (binb == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } + for (k = 0; k < nbins + 1; k++) { + binb[k] = + cos(pow(10, log10(min_arcmin) + k * 1.0 / bins_per_dec) / 60.0 * D2R); + } + // memory for DD - DD = (long long*)malloc(memsize); - if (DD == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } + DD = (long long *)malloc(memsize); + if (DD == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } bzero(DD, memsize); - + // memory for RR - RRS = (long long*)malloc(memsize); - if (RRS == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } + RRS = (long long *)malloc(memsize); + if (RRS == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } bzero(RRS, memsize); - + // memory for DR - DRS = (long long*)malloc(memsize); - if (DRS == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } + DRS = (long long *)malloc(memsize); + if (DRS == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } bzero(DRS, memsize); - + // memory for input data - data = (struct cartesian*)malloc - (args.npoints* sizeof(struct cartesian)); - if (data == NULL) - { - 
fprintf(stderr, - "Unable to allocate memory for % data points (#1)\n", - args.npoints); - return(0); - } - - random = (struct cartesian*)malloc - (args.npoints*sizeof(struct cartesian)); - if (random == NULL) - { - fprintf(stderr, - "Unable to allocate memory for % data points (#2)\n", - args.npoints); - return(0); - } + data = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian)); + if (data == NULL) { + fprintf(stderr, "Unable to allocate memory for % data points (#1)\n", + args.npoints); + return (0); + } + + random = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian)); + if (random == NULL) { + fprintf(stderr, "Unable to allocate memory for % data points (#2)\n", + args.npoints); + return (0); + } printf("Min distance: %f arcmin\n", min_arcmin); printf("Max distance: %f arcmin\n", max_arcmin); @@ -104,58 +91,51 @@ int main( int argc, char **argv ) printf("Total bins : %i\n", nbins); // read data file - pb_SwitchToTimer( &timers, pb_TimerID_IO ); + pb_SwitchToTimer(&timers, pb_TimerID_IO); npd = readdatafile(params->inpFiles[0], data, args.npoints); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if (npd != args.npoints) - { - fprintf(stderr, - "Error: read %i data points out of %i\n", - npd, args.npoints); - return(0); - } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + if (npd != args.npoints) { + fprintf(stderr, "Error: read %i data points out of %i\n", npd, + args.npoints); + return (0); + } // compute DD doCompute(data, npd, NULL, 0, 1, DD, nbins, binb); // loop through random data files - for (rf = 0; rf < args.random_count; rf++) - { - // read random file - pb_SwitchToTimer( &timers, pb_TimerID_IO ); - npr = readdatafile(params->inpFiles[rf+1], random, args.npoints); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if (npr != args.npoints) - { - fprintf(stderr, - "Error: read %i random points out of %i in file %s\n", - npr, args.npoints, params->inpFiles[rf+1]); - return(0); - } - - // compute RR - 
doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb); - - // compute DR - doCompute(data, npd, random, npr, 0, DRS, nbins, binb); + for (rf = 0; rf < args.random_count; rf++) { + // read random file + pb_SwitchToTimer(&timers, pb_TimerID_IO); + npr = readdatafile(params->inpFiles[rf + 1], random, args.npoints); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + if (npr != args.npoints) { + fprintf(stderr, "Error: read %i random points out of %i in file %s\n", + npr, args.npoints, params->inpFiles[rf + 1]); + return (0); } + // compute RR + doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb); + + // compute DR + doCompute(data, npd, random, npr, 0, DRS, nbins, binb); + } + // compute and output results - if ((outfile = fopen(params->outFile, "w")) == NULL) - { - fprintf(stderr, - "Unable to open output file %s for writing, assuming stdout\n", - params->outFile); - outfile = stdout; - } + if ((outfile = fopen(params->outFile, "w")) == NULL) { + fprintf(stderr, + "Unable to open output file %s for writing, assuming stdout\n", + params->outFile); + outfile = stdout; + } - pb_SwitchToTimer( &timers, pb_TimerID_IO ); - for (k = 1; k < nbins+1; k++) - { - fprintf(outfile, "%d\n%d\n%d\n", DD[k], DRS[k], RRS[k]); - } + pb_SwitchToTimer(&timers, pb_TimerID_IO); + for (k = 1; k < nbins + 1; k++) { + fprintf(outfile, "%d\n%d\n%d\n", DD[k], DRS[k], RRS[k]); + } - if(outfile != stdout) + if (outfile != stdout) fclose(outfile); // free memory @@ -165,9 +145,8 @@ int main( int argc, char **argv ) free(DD); free(RRS); free(DRS); - - pb_SwitchToTimer( &timers, pb_TimerID_NONE ); - pb_PrintTimerSet( &timers ); - pb_FreeParameters( params ); -} + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); +} diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h index f2b182ba412bb3c7f91bb38b0e43ef1df498dbcf..14d4f40df6d2942140610375cf9568def79d631e 100644 --- 
a/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h @@ -10,9 +10,9 @@ #include <parboil.h> -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -22,21 +22,18 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); -int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, - int n2, int doSelf, long long *data_bins, - int nbins, float *binb); +int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2, + int doSelf, long long *data_bins, int nbins, float *binb); void initBinB(struct pb_TimerSet *timers); diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c index 1e2a114b4d7d1f97646dfa7a15ac477f2a7f1745..b74f27ffd443d6cf6727863b986ee187bada299f 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c @@ -5,64 +5,51 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include <sys/time.h> -#include <string.h> #include <math.h> -#include <stdio.h> +#include <stdio.h> +#include <string.h> +#include <sys/time.h> #include "model.h" -int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, - int n2, int doSelf, long long *data_bins, - int nbins, float *binb) -{ +int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2, + int doSelf, long long 
*data_bins, int nbins, float *binb) { int i, j, k; - if (doSelf) - { - n2 = n1; - data2 = data1; - } - - for (i = 0; i < ((doSelf) ? n1-1 : n1); i++) - { - const register float xi = data1[i].x; - const register float yi = data1[i].y; - const register float zi = data1[i].z; - - for (j = ((doSelf) ? i+1 : 0); j < n2; j++) - { - register float dot = xi * data2[j].x + yi * data2[j].y + - zi * data2[j].z; - - // run binary search - register int min = 0; - register int max = nbins; - register int k, indx; - - while (max > min+1) - { - k = (min + max) / 2; - if (dot >= binb[k]) - max = k; - else - min = k; - }; - - if (dot >= binb[min]) - { - data_bins[min] += 1; /*k = min;*/ - } - else if (dot < binb[max]) - { - data_bins[max+1] += 1; /*k = max+1;*/ - } - else - { - data_bins[max] += 1; /*k = max;*/ - } - } + if (doSelf) { + n2 = n1; + data2 = data1; + } + + for (i = 0; i < ((doSelf) ? n1 - 1 : n1); i++) { + const register float xi = data1[i].x; + const register float yi = data1[i].y; + const register float zi = data1[i].z; + + for (j = ((doSelf) ? 
i + 1 : 0); j < n2; j++) { + register float dot = xi * data2[j].x + yi * data2[j].y + zi * data2[j].z; + + // run binary search + register int min = 0; + register int max = nbins; + register int k, indx; + + while (max > min + 1) { + k = (min + max) / 2; + if (dot >= binb[k]) + max = k; + else + min = k; + }; + + if (dot >= binb[min]) { + data_bins[min] += 1; /*k = min;*/ + } else if (dot < binb[max]) { + data_bins[max + 1] += 1; /*k = max+1;*/ + } else { + data_bins[max] += 1; /*k = max;*/ + } } - + } + return 0; } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c index 3ee12500dcb5ccbc7f36b9db1da41d5e12f93126..ddc37cfb2b288b6bf8d5ebbd84ccd34d563e26fe 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c @@ -5,45 +5,40 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include <sys/time.h> -#include <stdio.h> #include <math.h> +#include <stdio.h> #include <strings.h> -#include <math.h> +#include <sys/time.h> #include "model.h" -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; - - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = 
sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); + + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + 
args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h index a8a855872a20342e58a0906faf0c73cc9763f355..fdde265dd1f43582c3b97231189878ee9ea35b5f 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h @@ -10,9 +10,9 @@ #include <parboil.h> -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -23,21 +23,19 @@ #define SINGLE_PRECISION 1 #if SINGLE_PRECISION - #define REAL float +#define REAL float #else - #define REAL double +#define REAL double #endif typedef unsigned long hist_t; -struct spherical -{ - REAL ra, dec; // latitude, longitude pair +struct spherical { + REAL ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - REAL x, y, z; // cartesian coodrinates + +struct cartesian { + REAL x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); diff --git 
a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc index 23a21458f35ebb3d43dcd127691556650ca399d7..182f9ed43ef5579601c17d080a8cdea4d487da09 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc @@ -5,49 +5,44 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include <sys/time.h> -#include <stdio.h> #include <math.h> +#include <stdio.h> #include <strings.h> -#include <math.h> +#include <sys/time.h> #include "model.h" -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; REAL ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { +#if SINGLE_PRECISION + if (fscanf(infile, "%f %f", &ra, &dec) != 2) +#else + if (fscanf(infile, "%lf %lf", &ra, &dec) != 2) +#endif + break; - for (lcount = 0; lcount < npoints; lcount++) { - #if SINGLE_PRECISION - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - #else - if (fscanf(infile, "%lf %lf", &ra, &dec) != 2) - #endif - break; - - { - // data conversion - REAL rarad = D2R * ra; - REAL decrad = D2R * dec; - REAL cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + // data conversion + REAL rarad = D2R * ra; + REAL decrad = D2R * dec; + REAL cd = cos(decrad); + + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } - diff --git 
a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h 
b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h index a3c273f0c5b45f8486728ef6fee8b1ab9404136e..a4ffce895fbbf4846681ace0848f2ecbe5a5e741 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h @@ -10,9 +10,9 @@ #include <parboil.h> -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -22,14 +22,12 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc index 1374ba4b19e7352a7717241e7e2f662cc7c18fad..ddc37cfb2b288b6bf8d5ebbd84ccd34d563e26fe 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc @@ -5,45 
+5,40 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include <sys/time.h> -#include <stdio.h> #include <math.h> +#include <stdio.h> #include <strings.h> -#include <math.h> +#include <sys/time.h> #include "model.h" -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; - - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); + + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include 
"args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); 
#endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c index 2bafdf4580c5a6f4402cf40991c93bffcf8ce3ee..da9b51a7202148e43f9b5bf51156b0d651473571 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c @@ -5,18 +5,17 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <strings.h> -#include <unistd.h> #include <sys/time.h> -#include <math.h> +#include <unistd.h> #include "args.h" #include "model.h" -int main( int argc, char **argv ) -{ +int main(int argc, char **argv) { struct pb_TimerSet timers; struct pb_Parameters *params; int rf, k, nbins, npd, npr; @@ -26,77 +25,65 @@ int main( int argc, char **argv ) struct cartesian *data, *random; FILE *outfile; - pb_InitializeTimerSet( &timers ); - params = pb_ReadParameters( &argc, argv ); + pb_InitializeTimerSet(&timers); + params = pb_ReadParameters(&argc, argv); options args; - parse_args( argc, argv, &args ); - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - - log10(min_arcmin))); - memsize = (nbins+2)*sizeof(long long); - + parse_args(argc, argv, &args); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - log10(min_arcmin))); + memsize = (nbins + 2) * sizeof(long long); + // memory for bin boundaries - binb = (float *)malloc((nbins+1)*sizeof(float)); - if (binb == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } - for (k = 0; k < nbins+1; k++) - { - binb[k] = cos(pow(10, log10(min_arcmin) + - k*1.0/bins_per_dec) / 60.0*D2R); - } - + binb = (float *)malloc((nbins + 1) * sizeof(float)); + if (binb == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } + for (k = 0; k < nbins + 1; k++) { + 
binb[k] = + cos(pow(10, log10(min_arcmin) + k * 1.0 / bins_per_dec) / 60.0 * D2R); + } + // memory for DD - DD = (long long*)malloc(memsize); - if (DD == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } + DD = (long long *)malloc(memsize); + if (DD == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } bzero(DD, memsize); - + // memory for RR - RRS = (long long*)malloc(memsize); - if (RRS == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } + RRS = (long long *)malloc(memsize); + if (RRS == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } bzero(RRS, memsize); - + // memory for DR - DRS = (long long*)malloc(memsize); - if (DRS == NULL) - { - fprintf(stderr, "Unable to allocate memory\n"); - exit(-1); - } + DRS = (long long *)malloc(memsize); + if (DRS == NULL) { + fprintf(stderr, "Unable to allocate memory\n"); + exit(-1); + } bzero(DRS, memsize); - + // memory for input data - data = (struct cartesian*)malloc - (args.npoints* sizeof(struct cartesian)); - if (data == NULL) - { - fprintf(stderr, - "Unable to allocate memory for % data points (#1)\n", - args.npoints); - return(0); - } - - random = (struct cartesian*)malloc - (args.npoints*sizeof(struct cartesian)); - if (random == NULL) - { - fprintf(stderr, - "Unable to allocate memory for % data points (#2)\n", - args.npoints); - return(0); - } + data = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian)); + if (data == NULL) { + fprintf(stderr, "Unable to allocate memory for % data points (#1)\n", + args.npoints); + return (0); + } + + random = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian)); + if (random == NULL) { + fprintf(stderr, "Unable to allocate memory for % data points (#2)\n", + args.npoints); + return (0); + } printf("Min distance: %f arcmin\n", min_arcmin); printf("Max distance: %f arcmin\n", max_arcmin); @@ -104,58 +91,51 @@ int main( int argc, char **argv ) 
printf("Total bins : %i\n", nbins); // read data file - pb_SwitchToTimer( &timers, pb_TimerID_IO ); + pb_SwitchToTimer(&timers, pb_TimerID_IO); npd = readdatafile(params->inpFiles[0], data, args.npoints); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if (npd != args.npoints) - { - fprintf(stderr, - "Error: read %i data points out of %i\n", - npd, args.npoints); - return(0); - } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + if (npd != args.npoints) { + fprintf(stderr, "Error: read %i data points out of %i\n", npd, + args.npoints); + return (0); + } // compute DD doCompute(data, npd, NULL, 0, 1, DD, nbins, binb); // loop through random data files - for (rf = 0; rf < args.random_count; rf++) - { - // read random file - pb_SwitchToTimer( &timers, pb_TimerID_IO ); - npr = readdatafile(params->inpFiles[rf+1], random, args.npoints); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if (npr != args.npoints) - { - fprintf(stderr, - "Error: read %i random points out of %i in file %s\n", - npr, args.npoints, params->inpFiles[rf+1]); - return(0); - } - - // compute RR - doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb); - - // compute DR - doCompute(data, npd, random, npr, 0, DRS, nbins, binb); + for (rf = 0; rf < args.random_count; rf++) { + // read random file + pb_SwitchToTimer(&timers, pb_TimerID_IO); + npr = readdatafile(params->inpFiles[rf + 1], random, args.npoints); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + if (npr != args.npoints) { + fprintf(stderr, "Error: read %i random points out of %i in file %s\n", + npr, args.npoints, params->inpFiles[rf + 1]); + return (0); } + // compute RR + doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb); + + // compute DR + doCompute(data, npd, random, npr, 0, DRS, nbins, binb); + } + // compute and output results - if ((outfile = fopen(params->outFile, "w")) == NULL) - { - fprintf(stderr, - "Unable to open output file %s for writing, assuming stdout\n", - params->outFile); - outfile = stdout; - } + if 
((outfile = fopen(params->outFile, "w")) == NULL) { + fprintf(stderr, + "Unable to open output file %s for writing, assuming stdout\n", + params->outFile); + outfile = stdout; + } - pb_SwitchToTimer( &timers, pb_TimerID_IO ); - for (k = 1; k < nbins+1; k++) - { - fprintf(outfile, "%d\n%d\n%d\n", DD[k], DRS[k], RRS[k]); - } + pb_SwitchToTimer(&timers, pb_TimerID_IO); + for (k = 1; k < nbins + 1; k++) { + fprintf(outfile, "%d\n%d\n%d\n", DD[k], DRS[k], RRS[k]); + } - if(outfile != stdout) + if (outfile != stdout) fclose(outfile); // free memory @@ -165,9 +145,8 @@ int main( int argc, char **argv ) free(DD); free(RRS); free(DRS); - - pb_SwitchToTimer( &timers, pb_TimerID_NONE ); - pb_PrintTimerSet( &timers ); - pb_FreeParameters( params ); -} + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); +} diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h index f2b182ba412bb3c7f91bb38b0e43ef1df498dbcf..14d4f40df6d2942140610375cf9568def79d631e 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h @@ -10,9 +10,9 @@ #include <parboil.h> -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -22,21 +22,18 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); -int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, - int n2, int doSelf, long long *data_bins, - int nbins, float *binb); +int 
doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2, + int doSelf, long long *data_bins, int nbins, float *binb); void initBinB(struct pb_TimerSet *timers); diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c index 25f9e4400c4545ba4acce2183fff76f89ab94ed5..d6f0dee83a044590f37de32426de2b4ff3cd56d6 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c @@ -5,69 +5,55 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include <sys/time.h> -#include <string.h> #include <math.h> -#include <stdio.h> +#include <stdio.h> +#include <string.h> +#include <sys/time.h> #include "model.h" -int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, - int n2, int doSelf, long long *data_bins, - int nbins, float *binb) -{ +int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2, + int doSelf, long long *data_bins, int nbins, float *binb) { int i, j, k; - if (doSelf) - { - n2 = n1; - data2 = data1; - } -// #pragma omp parallel for - for (i = 0; i < ((doSelf) ? n1-1 : n1); i++) - { - const register float xi = data1[i].x; - const register float yi = data1[i].y; - const register float zi = data1[i].z; + if (doSelf) { + n2 = n1; + data2 = data1; + } + // #pragma omp parallel for + for (i = 0; i < ((doSelf) ? n1 - 1 : n1); i++) { + const register float xi = data1[i].x; + const register float yi = data1[i].y; + const register float zi = data1[i].z; + +#pragma omp parallel for + for (j = ((doSelf) ? i + 1 : 0); j < n2; j++) { + register float dot = xi * data2[j].x + yi * data2[j].y + zi * data2[j].z; - #pragma omp parallel for - for (j = ((doSelf) ? 
i+1 : 0); j < n2; j++) - { - register float dot = xi * data2[j].x + yi * data2[j].y + - zi * data2[j].z; - - // run binary search - register int min = 0; - register int max = nbins; - register int k, indx; - + // run binary search + register int min = 0; + register int max = nbins; + register int k, indx; - while (max > min+1) - { - k = (min + max) / 2; - if (dot >= binb[k]) - max = k; - else - min = k; - }; - #pragma omp critical - if (dot >= binb[min]) - { -// #pragma omp critical - data_bins[min] += 1; /*k = min;*/ - } - else if (dot < binb[max]) - { - // #pragma omp critical - data_bins[max+1] += 1; /*k = max+1;*/ - } - else - { - // #pragma omp critical - data_bins[max] += 1; /*k = max;*/ - } - } + while (max > min + 1) { + k = (min + max) / 2; + if (dot >= binb[k]) + max = k; + else + min = k; + }; +#pragma omp critical + if (dot >= binb[min]) { + // #pragma omp critical + data_bins[min] += 1; /*k = min;*/ + } else if (dot < binb[max]) { + // #pragma omp critical + data_bins[max + 1] += 1; /*k = max+1;*/ + } else { + // #pragma omp critical + data_bins[max] += 1; /*k = max;*/ + } } - + } + return 0; } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c index 3ee12500dcb5ccbc7f36b9db1da41d5e12f93126..ddc37cfb2b288b6bf8d5ebbd84ccd34d563e26fe 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c +++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c @@ -5,45 +5,40 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include <sys/time.h> -#include <stdio.h> #include <math.h> +#include <stdio.h> #include <strings.h> -#include <math.h> +#include <sys/time.h> #include "model.h" -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == 
NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; - - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); + + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) 
args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc index 49d9f243c48352ea866e35add863aacb002a3a55..d945bccf4eae7f296394d74ac0617f3e20426dcd 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc @@ -6,11 +6,11 @@ *cr ***************************************************************************/ 
#include <CL/cl.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "args.h" #include "model.h" @@ -19,114 +19,109 @@ extern unsigned int NUM_SETS; extern unsigned int NUM_ELEMENTS; // create the bin boundaries -void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue) -{ - float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float)); - for (int k = 0; k < NUM_BINS+1; k++) - { - binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) - / 60.0*D2R); - } +void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb, + cl_command_queue clCommandQueue) { + float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float)); + for (int k = 0; k < NUM_BINS + 1; k++) { + binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) / + 60.0 * D2R); + } - pb_SwitchToTimer( timers, pb_TimerID_COPY ); + pb_SwitchToTimer(timers, pb_TimerID_COPY); cl_int clStatus; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL); + clStatus = + clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0, + (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); free(binb); } -void TPACF(cl_mem histograms, cl_mem d_x_data, - cl_mem dev_binb, - cl_command_queue clCommandQueue, cl_kernel clKernel) -{ +void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb, + cl_command_queue clCommandQueue, cl_kernel clKernel) { size_t dimBlock = BLOCK_SIZE; - size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock; - + size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock; + cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms); - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_x_data); - clStatus = 
clSetKernelArg(clKernel,2,sizeof(cl_mem),&dev_binb); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),&NUM_SETS); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),&NUM_ELEMENTS); - + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms); + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_x_data); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &dev_binb); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &NUM_SETS); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), &NUM_ELEMENTS); + CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid, + &dimBlock, 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int -main( int argc, char** argv) -{ +int main(int argc, char **argv) { struct pb_TimerSet timers; struct pb_Parameters *params; - params = pb_ReadParameters( &argc, argv ); + params = pb_ReadParameters(&argc, argv); options args; parse_args(argc, argv, &args); - - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); NUM_ELEMENTS = args.npoints; NUM_SETS = args.random_count; - int num_elements = NUM_ELEMENTS; - + int num_elements = NUM_ELEMENTS; + printf("Min distance: %f arcmin\n", min_arcmin); printf("Max distance: %f arcmin\n", max_arcmin); printf("Bins per dec: %i\n", bins_per_dec); printf("Total bins : %i\n", NUM_BINS); - //read in files - unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian); - unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float); + // read in files + unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian); + unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float); // container for all the points read from files struct cartesian *h_all_data; - h_all_data = (struct 
cartesian*) malloc(mem_size); + h_all_data = (struct cartesian *)malloc(mem_size); // Until I can get libs fixed - + // iterator for data files struct cartesian *working = h_all_data; - + // go through and read all data and random points into h_all_data - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); readdatafile(params->inpFiles[0], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); working += num_elements; - for(int i = 0; i < (NUM_SETS); i++) - { - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); - char fileName[50]; - readdatafile(params->inpFiles[i+1], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - - working += num_elements; - } + for (int i = 0; i < (NUM_SETS); i++) { + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); + char fileName[50]; + readdatafile(params->inpFiles[i + 1], working, num_elements); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + working += num_elements; + } - pb_InitializeTimerSet( &timers ); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // split into x, y, and z arrays - float * h_x_data = (float*) malloc (3*f_mem_size); - float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1); - float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1); - for(int i = 0; i < (NUM_SETS+1); ++i) - { - for(int j = 0; j < NUM_ELEMENTS; ++j) - { - h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x; - h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y; - h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z; - } + float *h_x_data = (float *)malloc(3 * f_mem_size); + float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1); + float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1); + for (int i = 0; i < (NUM_SETS + 1); ++i) { + for (int j = 0; j < NUM_ELEMENTS; ++j) { + h_x_data[i * 
NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x; + h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y; + h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z; } + } // from on use x, y, and z arrays, free h_all_data free(h_all_data); @@ -134,136 +129,141 @@ main( int argc, char** argv) cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_base/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base"); + sprintf(clOptions, "-I src/opencl_base"); - clStatus = 
clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus); + cl_kernel clKernel = clCreateKernel(clProgram, "gen_hists", &clStatus); CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); // allocate OpenCL memory to hold all points - //Sub-buffers are not defined in OpenCL 1.0 + // Sub-buffers are not defined in OpenCL 1.0 cl_mem d_x_data; - d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus); + d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") // allocate OpenCL memory to hold final histograms // (1 for dd, and NUM_SETS for dr and rr apiece) cl_mem d_hists; - d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus); + d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") cl_mem dev_binb; - dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus); + dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + (NUM_BINS + 1) * sizeof(float), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate system memory for final histograms - hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)* - sizeof(hist_t)); + hist_t *new_hists = + (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t)); // Initialize the boundary constants for bin search initBinB(&timers, dev_binb, clCommandQueue); // **===------------------ Kick off TPACF on OpenCL------------------===** - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = 
clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0, + 3 * f_mem_size, h_x_data, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel); + TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), + new_hists, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // **===-------------------------------------------------------------===** // references into output histograms hist_t *dd_hist = new_hists; hist_t *rr_hist = dd_hist + NUM_BINS; - hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS; + hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS; // add up values within dr and rr int rr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - rr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - rr[j] += rr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + rr[i] = 0; + } + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + rr[j] += rr_hist[i * NUM_BINS + j]; } + } int dr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - dr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - dr[j] += dr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + dr[i] = 0; + } + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 
0; j < NUM_BINS; j++) { + dr[j] += dr_hist[i * NUM_BINS + j]; } + } pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); FILE *outfile; - if ((outfile = fopen(params->outFile, "w")) == NULL) - { - fprintf(stderr, "Unable to open output file %s for writing, " - "assuming stdout\n", params->outFile); - outfile = stdout; - } - - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + if ((outfile = fopen(params->outFile, "w")) == NULL) { + fprintf(stderr, + "Unable to open output file %s for writing, " + "assuming stdout\n", + params->outFile); + outfile = stdout; + } + + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); // print out final histograms + omega (while calculating omega) - for(int i=0; i<NUM_BINS; i++) - { - fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); - } + for (int i = 0; i < NUM_BINS; i++) { + fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); + } - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if(outfile != stdout) + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + if (outfile != stdout) fclose(outfile); // cleanup memory free(new_hists); - free( h_x_data); + free(h_x_data); - //pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + // pb_SwitchToTimer( &timers, pb_TimerID_COPY ); clStatus = clReleaseMemObject(d_hists); clStatus = clReleaseMemObject(d_x_data); clStatus = clReleaseMemObject(dev_binb); @@ -273,8 +273,7 @@ main( int argc, char** argv) clStatus = clReleaseContext(clContext); CHECK_ERROR("clReleaseContext") - free((void*)clSource[0]); + free((void *)clSource[0]); pb_FreeParameters(params); } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc @@ -6,83 +6,75 @@ *cr 
***************************************************************************/ #include <CL/cl.h> -#include <sys/time.h> -#include <stdio.h> -#include <math.h> -#include <strings.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <strings.h> +#include <sys/time.h> #include "model.h" unsigned int NUM_SETS; unsigned int NUM_ELEMENTS; -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error: Cannot open kernel file for reading!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error: Cannot open kernel file for reading!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = 
(char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error: Cannot allocated buffer for file contents!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error: Cannot allocated buffer for file contents!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error: Cannot read kernel file contents!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error: Cannot read kernel file contents!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h @@ -8,9 +8,9 @@ #ifndef __MODEL_H__ #define __MODEL_H__ -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -21,26 +21,23 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define 
CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + 
break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc index 54f4f85539b2e71a24d220f08a8f59373d0976ce..453a983ee0dc19b6f4b6d32e883bf822407b281a 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc @@ -6,147 +6,142 @@ *cr ***************************************************************************/ #include <CL/cl.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "args.h" #include "model.h" - #define WARP_SIZE 32 -#define NUM_WARPS (BLOCK_SIZE/WARP_SIZE) +#define NUM_WARPS (BLOCK_SIZE / WARP_SIZE) #define HISTS_PER_WARP 16 -#define NUM_HISTOGRAMS (NUM_WARPS*HISTS_PER_WARP) +#define NUM_HISTOGRAMS (NUM_WARPS * HISTS_PER_WARP) extern unsigned int NUM_SETS; extern unsigned int NUM_ELEMENTS; // create the bin boundaries -void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue) 
-{ - float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float)); - for (int k = 0; k < NUM_BINS+1; k++) - { - binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) - / 60.0*D2R); - } +void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb, + cl_command_queue clCommandQueue) { + float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float)); + for (int k = 0; k < NUM_BINS + 1; k++) { + binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) / + 60.0 * D2R); + } - pb_SwitchToTimer( timers, pb_TimerID_COPY ); + pb_SwitchToTimer(timers, pb_TimerID_COPY); cl_int clStatus; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL); + clStatus = + clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0, + (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); free(binb); } -void TPACF(cl_mem histograms, cl_mem d_x_data, - cl_mem dev_binb, - cl_command_queue clCommandQueue, cl_kernel clKernel) -{ +void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb, + cl_command_queue clCommandQueue, cl_kernel clKernel) { size_t dimBlock = BLOCK_SIZE; - size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock; - long shSize = NUM_BINS*NUM_HISTOGRAMS*sizeof(unsigned int); + size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock; + long shSize = NUM_BINS * NUM_HISTOGRAMS * sizeof(unsigned int); long glSize = 0L; cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms); - clStatus = clSetKernelArg(clKernel,1,sizeof(long),&glSize); + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms); + clStatus = clSetKernelArg(clKernel, 1, sizeof(long), &glSize); + + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_x_data); + clStatus = clSetKernelArg(clKernel, 3, sizeof(long), &glSize); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_x_data); - 
clStatus = clSetKernelArg(clKernel,3,sizeof(long),&glSize); + clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &dev_binb); + clStatus = clSetKernelArg(clKernel, 5, sizeof(long), &glSize); - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&dev_binb); - clStatus = clSetKernelArg(clKernel,5,sizeof(long),&glSize); + clStatus = clSetKernelArg(clKernel, 6, sizeof(int), &NUM_SETS); - clStatus = clSetKernelArg(clKernel,6,sizeof(int),&NUM_SETS); + clStatus = clSetKernelArg(clKernel, 7, sizeof(int), &NUM_ELEMENTS); - clStatus = clSetKernelArg(clKernel,7,sizeof(int),&NUM_ELEMENTS); + clStatus = clSetKernelArg( + clKernel, 8, NUM_BINS * NUM_HISTOGRAMS * sizeof(unsigned int), NULL); + clStatus = clSetKernelArg(clKernel, 9, sizeof(long), &shSize); - clStatus = clSetKernelArg(clKernel,8,NUM_BINS*NUM_HISTOGRAMS*sizeof(unsigned int),NULL); - clStatus = clSetKernelArg(clKernel,9,sizeof(long),&shSize); - CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid, + &dimBlock, 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int -main( int argc, char** argv) -{ +int main(int argc, char **argv) { struct pb_TimerSet timers; struct pb_Parameters *params; - params = pb_ReadParameters( &argc, argv ); + params = pb_ReadParameters(&argc, argv); options args; parse_args(argc, argv, &args); - - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); NUM_ELEMENTS = args.npoints; NUM_SETS = args.random_count; - int num_elements = NUM_ELEMENTS; - + int num_elements = NUM_ELEMENTS; + printf("Min distance: %f arcmin\n", min_arcmin); printf("Max distance: %f arcmin\n", max_arcmin); printf("Bins per dec: %i\n", bins_per_dec); printf("Total bins : %i\n", NUM_BINS); - //read in files - unsigned mem_size = 
(1+NUM_SETS)*num_elements*sizeof(struct cartesian); - unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float); + // read in files + unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian); + unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float); // container for all the points read from files struct cartesian *h_all_data; - h_all_data = (struct cartesian*) malloc(mem_size); + h_all_data = (struct cartesian *)malloc(mem_size); // Until I can get libs fixed - + // iterator for data files struct cartesian *working = h_all_data; - + // go through and read all data and random points into h_all_data - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); readdatafile(params->inpFiles[0], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); working += num_elements; - for(int i = 0; i < (NUM_SETS); i++) - { - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); - char fileName[50]; - readdatafile(params->inpFiles[i+1], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - - working += num_elements; - } + for (int i = 0; i < (NUM_SETS); i++) { + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); + char fileName[50]; + readdatafile(params->inpFiles[i + 1], working, num_elements); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + working += num_elements; + } - pb_InitializeTimerSet( &timers ); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // split into x, y, and z arrays - float * h_x_data = (float*) malloc (3*f_mem_size); - float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1); - float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1); - for(int i = 0; i < (NUM_SETS+1); ++i) - { - for(int j = 0; j < NUM_ELEMENTS; ++j) - { - h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x; - 
h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y; - h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z; - } + float *h_x_data = (float *)malloc(3 * f_mem_size); + float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1); + float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1); + for (int i = 0; i < (NUM_SETS + 1); ++i) { + for (int j = 0; j < NUM_ELEMENTS; ++j) { + h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x; + h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y; + h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z; } + } // from on use x, y, and z arrays, free h_all_data free(h_all_data); @@ -154,136 +149,141 @@ main( int argc, char** argv) cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_base_dynamic1d/kernel.cl")}; - cl_program clProgram = 
clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_base_dynamic1d/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base_dynamic1d"); + sprintf(clOptions, "-I src/opencl_base_dynamic1d"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus); + cl_kernel clKernel = clCreateKernel(clProgram, "gen_hists", &clStatus); CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); // allocate OpenCL memory to hold all points - //Sub-buffers are not defined in OpenCL 1.0 + // Sub-buffers are not defined in OpenCL 1.0 cl_mem d_x_data; - d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus); + d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") // allocate OpenCL memory to hold final histograms // (1 for dd, and NUM_SETS for dr and rr apiece) cl_mem d_hists; - d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus); + d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") cl_mem dev_binb; - dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus); + dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + (NUM_BINS + 1) * sizeof(float), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate system memory for final 
histograms - hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)* - sizeof(hist_t)); + hist_t *new_hists = + (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t)); // Initialize the boundary constants for bin search initBinB(&timers, dev_binb, clCommandQueue); // **===------------------ Kick off TPACF on OpenCL------------------===** - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0, + 3 * f_mem_size, h_x_data, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel); + TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), + new_hists, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // **===-------------------------------------------------------------===** // references into output histograms hist_t *dd_hist = new_hists; hist_t *rr_hist = dd_hist + NUM_BINS; - hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS; + hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS; // add up values within dr and rr int rr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - rr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - rr[j] += rr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + rr[i] = 0; + } + for (int 
i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + rr[j] += rr_hist[i * NUM_BINS + j]; } + } int dr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - dr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - dr[j] += dr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + dr[i] = 0; + } + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + dr[j] += dr_hist[i * NUM_BINS + j]; } + } pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); FILE *outfile; - if ((outfile = fopen(params->outFile, "w")) == NULL) - { - fprintf(stderr, "Unable to open output file %s for writing, " - "assuming stdout\n", params->outFile); - outfile = stdout; - } - - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + if ((outfile = fopen(params->outFile, "w")) == NULL) { + fprintf(stderr, + "Unable to open output file %s for writing, " + "assuming stdout\n", + params->outFile); + outfile = stdout; + } + + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); // print out final histograms + omega (while calculating omega) - for(int i=0; i<NUM_BINS; i++) - { - fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); - } + for (int i = 0; i < NUM_BINS; i++) { + fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); + } - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if(outfile != stdout) + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + if (outfile != stdout) fclose(outfile); // cleanup memory free(new_hists); - free( h_x_data); + free(h_x_data); - //pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + // pb_SwitchToTimer( &timers, pb_TimerID_COPY ); clStatus = clReleaseMemObject(d_hists); clStatus = clReleaseMemObject(d_x_data); clStatus = clReleaseMemObject(dev_binb); @@ -293,8 +293,7 @@ main( int argc, char** argv) clStatus = clReleaseContext(clContext); CHECK_ERROR("clReleaseContext") - free((void*)clSource[0]); + free((void *)clSource[0]); pb_FreeParameters(params); } - diff --git 
a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc @@ -6,83 +6,75 @@ *cr ***************************************************************************/ #include <CL/cl.h> -#include <sys/time.h> -#include <stdio.h> -#include <math.h> -#include <strings.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <strings.h> +#include <sys/time.h> #include "model.h" unsigned int NUM_SETS; unsigned int NUM_ELEMENTS; -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == 
NULL) - { - printf("Error: Cannot open kernel file for reading!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error: Cannot open kernel file for reading!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error: Cannot allocated buffer for file contents!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error: Cannot allocated buffer for file contents!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error: Cannot read kernel file contents!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error: Cannot read kernel file contents!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h @@ -8,9 +8,9 @@ #ifndef __MODEL_H__ #define __MODEL_H__ -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -21,26 +21,23 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct 
cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - 
args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc index cf57cd447fb6612dc8eb7c849aa46fe34cedc40b..791b5fbdd6aa70359d37ca5a85139c7f8374c56d 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc @@ -6,11 +6,11 @@ *cr ***************************************************************************/ #include <CL/cl.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "args.h" #include "model.h" @@ -19,114 +19,109 @@ extern unsigned int NUM_SETS; extern unsigned int 
NUM_ELEMENTS; // create the bin boundaries -void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue) -{ - float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float)); - for (int k = 0; k < NUM_BINS+1; k++) - { - binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) - / 60.0*D2R); - } +void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb, + cl_command_queue clCommandQueue) { + float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float)); + for (int k = 0; k < NUM_BINS + 1; k++) { + binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) / + 60.0 * D2R); + } - pb_SwitchToTimer( timers, pb_TimerID_COPY ); + pb_SwitchToTimer(timers, pb_TimerID_COPY); cl_int clStatus; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL); + clStatus = + clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0, + (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); free(binb); } -void TPACF(cl_mem histograms, cl_mem d_x_data, - cl_mem dev_binb, - cl_command_queue clCommandQueue, cl_kernel clKernel) -{ +void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb, + cl_command_queue clCommandQueue, cl_kernel clKernel) { size_t dimBlock = BLOCK_SIZE; - size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock; - + size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock; + cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms); - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_x_data); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&dev_binb); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),&NUM_SETS); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),&NUM_ELEMENTS); - + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms); + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_x_data); + 
clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &dev_binb); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &NUM_SETS); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), &NUM_ELEMENTS); + CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid, + &dimBlock, 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int -main( int argc, char** argv) -{ +int main(int argc, char **argv) { struct pb_TimerSet timers; struct pb_Parameters *params; - params = pb_ReadParameters( &argc, argv ); + params = pb_ReadParameters(&argc, argv); options args; parse_args(argc, argv, &args); - - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); NUM_ELEMENTS = args.npoints; NUM_SETS = args.random_count; - int num_elements = NUM_ELEMENTS; - + int num_elements = NUM_ELEMENTS; + printf("Min distance: %f arcmin\n", min_arcmin); printf("Max distance: %f arcmin\n", max_arcmin); printf("Bins per dec: %i\n", bins_per_dec); printf("Total bins : %i\n", NUM_BINS); - //read in files - unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian); - unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float); + // read in files + unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian); + unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float); // container for all the points read from files struct cartesian *h_all_data; - h_all_data = (struct cartesian*) malloc(mem_size); + h_all_data = (struct cartesian *)malloc(mem_size); // Until I can get libs fixed - + // iterator for data files struct cartesian *working = h_all_data; - + // go through and read all data and random points into h_all_data - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + // pb_SwitchToTimer( &timers, 
pb_TimerID_IO ); readdatafile(params->inpFiles[0], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); working += num_elements; - for(int i = 0; i < (NUM_SETS); i++) - { - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); - char fileName[50]; - readdatafile(params->inpFiles[i+1], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - - working += num_elements; - } + for (int i = 0; i < (NUM_SETS); i++) { + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); + char fileName[50]; + readdatafile(params->inpFiles[i + 1], working, num_elements); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + working += num_elements; + } - pb_InitializeTimerSet( &timers ); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // split into x, y, and z arrays - float * h_x_data = (float*) malloc (3*f_mem_size); - float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1); - float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1); - for(int i = 0; i < (NUM_SETS+1); ++i) - { - for(int j = 0; j < NUM_ELEMENTS; ++j) - { - h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x; - h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y; - h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z; - } + float *h_x_data = (float *)malloc(3 * f_mem_size); + float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1); + float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1); + for (int i = 0; i < (NUM_SETS + 1); ++i) { + for (int j = 0; j < NUM_ELEMENTS; ++j) { + h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x; + h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y; + h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z; } + } // from on use x, y, and z arrays, free h_all_data free(h_all_data); @@ -134,139 +129,145 @@ main( int argc, char** argv) cl_int 
clStatus; cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id clPlatform[numPlatforms]; clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + clStatus = + clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - cl_context clContext = clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform[1], 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_cpu_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_cpu_base/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-I src/opencl_base"); + sprintf(clOptions, "-I src/opencl_base"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus); + cl_kernel clKernel 
= clCreateKernel(clProgram, "gen_hists", &clStatus); CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); // allocate OpenCL memory to hold all points - //Sub-buffers are not defined in OpenCL 1.0 + // Sub-buffers are not defined in OpenCL 1.0 cl_mem d_x_data; - d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus); + d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") // allocate OpenCL memory to hold final histograms // (1 for dd, and NUM_SETS for dr and rr apiece) cl_mem d_hists; - d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus); + d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") cl_mem dev_binb; - dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus); + dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + (NUM_BINS + 1) * sizeof(float), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate system memory for final histograms - hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)* - sizeof(hist_t)); + hist_t *new_hists = + (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t)); // Initialize the boundary constants for bin search initBinB(&timers, dev_binb, clCommandQueue); // **===------------------ Kick off TPACF on OpenCL------------------===** - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0, + 3 * f_mem_size, h_x_data, 0, NULL, NULL); 
CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel); + TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), + new_hists, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // **===-------------------------------------------------------------===** // references into output histograms hist_t *dd_hist = new_hists; hist_t *rr_hist = dd_hist + NUM_BINS; - hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS; + hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS; // add up values within dr and rr int rr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - rr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - rr[j] += rr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + rr[i] = 0; + } + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + rr[j] += rr_hist[i * NUM_BINS + j]; } + } int dr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - dr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - dr[j] += dr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + dr[i] = 0; + } + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + dr[j] += dr_hist[i * NUM_BINS + j]; } + } pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); FILE *outfile; - if ((outfile = fopen(params->outFile, "w")) == NULL) - { - fprintf(stderr, "Unable to open output 
file %s for writing, " - "assuming stdout\n", params->outFile); - outfile = stdout; - } - - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + if ((outfile = fopen(params->outFile, "w")) == NULL) { + fprintf(stderr, + "Unable to open output file %s for writing, " + "assuming stdout\n", + params->outFile); + outfile = stdout; + } + + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); // print out final histograms + omega (while calculating omega) - for(int i=0; i<NUM_BINS; i++) - { - fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); - } + for (int i = 0; i < NUM_BINS; i++) { + fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); + } - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if(outfile != stdout) + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + if (outfile != stdout) fclose(outfile); // cleanup memory free(new_hists); - free( h_x_data); + free(h_x_data); - //pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + // pb_SwitchToTimer( &timers, pb_TimerID_COPY ); clStatus = clReleaseMemObject(d_hists); clStatus = clReleaseMemObject(d_x_data); clStatus = clReleaseMemObject(dev_binb); @@ -276,8 +277,7 @@ main( int argc, char** argv) clStatus = clReleaseContext(clContext); CHECK_ERROR("clReleaseContext") - free((void*)clSource[0]); + free((void *)clSource[0]); pb_FreeParameters(params); } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc @@ -6,83 +6,75 @@ *cr ***************************************************************************/ #include <CL/cl.h> -#include <sys/time.h> -#include <stdio.h> -#include <math.h> -#include <strings.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <strings.h> +#include 
<sys/time.h> #include "model.h" unsigned int NUM_SETS; unsigned int NUM_ELEMENTS; -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error: Cannot open kernel file for reading!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error: Cannot open kernel file for reading!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error: Cannot allocated buffer for file contents!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error: Cannot allocated buffer 
for file contents!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error: Cannot read kernel file contents!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error: Cannot read kernel file contents!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h @@ -8,9 +8,9 @@ #ifndef __MODEL_H__ #define __MODEL_H__ -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -21,26 +21,23 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc 
b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h index 
97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc index e6f4e6a0f92991e87a640d9bac61a419eccf8569..773fc20258b3301ee507e9957efae089a074d047 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc @@ -6,11 +6,11 @@ *cr ***************************************************************************/ #include <CL/cl.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "args.h" #include "model.h" @@ -19,114 +19,109 @@ extern unsigned int NUM_SETS; extern unsigned int NUM_ELEMENTS; // create the bin boundaries -void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue) -{ - float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float)); - for (int k = 0; k < NUM_BINS+1; k++) - { - binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) - / 60.0*D2R); - } +void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb, + cl_command_queue clCommandQueue) { + float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float)); + for (int k = 0; k < NUM_BINS + 1; k++) { + binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) / + 60.0 * D2R); + } - pb_SwitchToTimer( timers, 
pb_TimerID_COPY ); + pb_SwitchToTimer(timers, pb_TimerID_COPY); cl_int clStatus; - clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL); + clStatus = + clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0, + (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); free(binb); } -void TPACF(cl_mem histograms, cl_mem d_x_data, - cl_mem dev_binb, - cl_command_queue clCommandQueue, cl_kernel clKernel) -{ +void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb, + cl_command_queue clCommandQueue, cl_kernel clKernel) { size_t dimBlock = BLOCK_SIZE; - size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock; - + size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock; + cl_int clStatus; - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms); - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_x_data); - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&dev_binb); - clStatus = clSetKernelArg(clKernel,3,sizeof(int),&NUM_SETS); - clStatus = clSetKernelArg(clKernel,4,sizeof(int),&NUM_ELEMENTS); - + clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms); + clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_x_data); + clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &dev_binb); + clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &NUM_SETS); + clStatus = clSetKernelArg(clKernel, 4, sizeof(int), &NUM_ELEMENTS); + CHECK_ERROR("clSetKernelArg") - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid, + &dimBlock, 0, NULL, NULL); CHECK_ERROR("clEnqueueNDRangeKernel") clStatus = clFinish(clCommandQueue); CHECK_ERROR("clFinish") } -int -main( int argc, char** argv) -{ +int main(int argc, char **argv) { struct pb_TimerSet timers; struct 
pb_Parameters *params; - params = pb_ReadParameters( &argc, argv ); + params = pb_ReadParameters(&argc, argv); options args; parse_args(argc, argv, &args); - - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); NUM_ELEMENTS = args.npoints; NUM_SETS = args.random_count; - int num_elements = NUM_ELEMENTS; - + int num_elements = NUM_ELEMENTS; + printf("Min distance: %f arcmin\n", min_arcmin); printf("Max distance: %f arcmin\n", max_arcmin); printf("Bins per dec: %i\n", bins_per_dec); printf("Total bins : %i\n", NUM_BINS); - //read in files - unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian); - unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float); + // read in files + unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian); + unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float); // container for all the points read from files struct cartesian *h_all_data; - h_all_data = (struct cartesian*) malloc(mem_size); + h_all_data = (struct cartesian *)malloc(mem_size); // Until I can get libs fixed - + // iterator for data files struct cartesian *working = h_all_data; - + // go through and read all data and random points into h_all_data - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); readdatafile(params->inpFiles[0], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); working += num_elements; - for(int i = 0; i < (NUM_SETS); i++) - { - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); - char fileName[50]; - readdatafile(params->inpFiles[i+1], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - - working += num_elements; - } + for (int i = 0; i < (NUM_SETS); i++) { + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); + char fileName[50]; + readdatafile(params->inpFiles[i + 1], working, num_elements); + // 
pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + working += num_elements; + } - pb_InitializeTimerSet( &timers ); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // split into x, y, and z arrays - float * h_x_data = (float*) malloc (3*f_mem_size); - float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1); - float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1); - for(int i = 0; i < (NUM_SETS+1); ++i) - { - for(int j = 0; j < NUM_ELEMENTS; ++j) - { - h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x; - h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y; - h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z; - } + float *h_x_data = (float *)malloc(3 * f_mem_size); + float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1); + float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1); + for (int i = 0; i < (NUM_SETS + 1); ++i) { + for (int j = 0; j < NUM_ELEMENTS; ++j) { + h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x; + h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y; + h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z; } + } // from on use x, y, and z arrays, free h_all_data free(h_all_data); @@ -134,150 +129,157 @@ main( int argc, char** argv) cl_int clStatus; cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + clStatus = clGetPlatformIDs(1, &clPlatform, NULL); CHECK_ERROR("clGetPlatformIDs") - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)clPlatform, 0}; + cl_context clContext = + clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus); CHECK_ERROR("clCreateContextFromType") - + cl_device_id clDevice; - clStatus = 
clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL); CHECK_ERROR("clGetDeviceIDs") - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + cl_command_queue clCommandQueue = clCreateCommandQueue( + clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus); CHECK_ERROR("clCreateCommandQueue") pb_SetOpenCL(&clContext, &clCommandQueue); - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + cl_program clProgram = + clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus); CHECK_ERROR("clCreateProgramWithSource") char clOptions[50]; - sprintf(clOptions,"-I src/opencl_nvidia"); + sprintf(clOptions, "-I src/opencl_nvidia"); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL); CHECK_ERROR("clBuildProgram") - cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus); + cl_kernel clKernel = clCreateKernel(clProgram, "gen_hists", &clStatus); CHECK_ERROR("clCreateKernel") - - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + // Get program binary // Query binary (PTX file) size - size_t bin_sz; - clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bin_sz, NULL); - - // Read binary (PTX file) to memory buffer - unsigned char *bin = (unsigned char *)malloc(bin_sz); - clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL); - - // Save PTX to add_vectors_ocl.ptx - FILE* fp = fopen("tpacf.nvptx.s", "wb"); - fwrite(bin, sizeof(char), bin_sz, fp); - fclose(fp); - free(bin); + size_t bin_sz; + clStatus = 
clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t), &bin_sz, NULL); + + // Read binary (PTX file) to memory buffer + unsigned char *bin = (unsigned char *)malloc(bin_sz); + clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, + sizeof(unsigned char *), &bin, NULL); + + // Save PTX to add_vectors_ocl.ptx + FILE *fp = fopen("tpacf.nvptx.s", "wb"); + fwrite(bin, sizeof(char), bin_sz, fp); + fclose(fp); + free(bin); // allocate OpenCL memory to hold all points - //Sub-buffers are not defined in OpenCL 1.0 + // Sub-buffers are not defined in OpenCL 1.0 cl_mem d_x_data; - d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus); + d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") // allocate OpenCL memory to hold final histograms // (1 for dd, and NUM_SETS for dr and rr apiece) cl_mem d_hists; - d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus); + d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL, + &clStatus); CHECK_ERROR("clCreateBuffer") cl_mem dev_binb; - dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus); + dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + (NUM_BINS + 1) * sizeof(float), NULL, &clStatus); CHECK_ERROR("clCreateBuffer") - - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate system memory for final histograms - hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)* - sizeof(hist_t)); + hist_t *new_hists = + (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t)); // Initialize the boundary constants for bin search initBinB(&timers, dev_binb, clCommandQueue); // **===------------------ Kick off TPACF on OpenCL------------------===** - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus 
= clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0, + 3 * f_mem_size, h_x_data, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel); + TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); - clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0, + NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), + new_hists, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // **===-------------------------------------------------------------===** // references into output histograms hist_t *dd_hist = new_hists; hist_t *rr_hist = dd_hist + NUM_BINS; - hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS; + hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS; // add up values within dr and rr int rr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - rr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - rr[j] += rr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + rr[i] = 0; + } + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + rr[j] += rr_hist[i * NUM_BINS + j]; } + } int dr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { - dr[i] = 0; - } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - dr[j] += dr_hist[i*NUM_BINS + j]; - } + for (int i = 0; i < NUM_BINS; i++) { + dr[i] = 0; + } + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < 
NUM_BINS; j++) { + dr[j] += dr_hist[i * NUM_BINS + j]; } + } pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); FILE *outfile; - if ((outfile = fopen(params->outFile, "w")) == NULL) - { - fprintf(stderr, "Unable to open output file %s for writing, " - "assuming stdout\n", params->outFile); - outfile = stdout; - } - - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + if ((outfile = fopen(params->outFile, "w")) == NULL) { + fprintf(stderr, + "Unable to open output file %s for writing, " + "assuming stdout\n", + params->outFile); + outfile = stdout; + } + + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); // print out final histograms + omega (while calculating omega) - for(int i=0; i<NUM_BINS; i++) - { - fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); - } + for (int i = 0; i < NUM_BINS; i++) { + fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); + } - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if(outfile != stdout) + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + if (outfile != stdout) fclose(outfile); // cleanup memory free(new_hists); - free( h_x_data); + free(h_x_data); - //pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + // pb_SwitchToTimer( &timers, pb_TimerID_COPY ); clStatus = clReleaseMemObject(d_hists); clStatus = clReleaseMemObject(d_x_data); clStatus = clReleaseMemObject(dev_binb); @@ -287,8 +289,7 @@ main( int argc, char** argv) clStatus = clReleaseContext(clContext); CHECK_ERROR("clReleaseContext") - free((void*)clSource[0]); + free((void *)clSource[0]); pb_FreeParameters(params); } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc @@ -6,83 +6,75 @@ *cr 
***************************************************************************/ #include <CL/cl.h> -#include <sys/time.h> -#include <stdio.h> -#include <math.h> -#include <strings.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <strings.h> +#include <sys/time.h> #include "model.h" unsigned int NUM_SETS; unsigned int NUM_ELEMENTS; -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error: Cannot open kernel file for reading!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error: Cannot open kernel file for reading!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = 
(char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error: Cannot allocated buffer for file contents!\n"); - fclose(fp); - exit(1); - } + char *buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error: Cannot allocated buffer for file contents!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error: Cannot read kernel file contents!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error: Cannot read kernel file contents!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h @@ -8,9 +8,9 @@ #ifndef __MODEL_H__ #define __MODEL_H__ -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -21,26 +21,23 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define 
CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc @@ -5,22 +5,21 @@ *cr All Rights Reserved *cr ***************************************************************************/ +#include "args.h" +#include <stdio.h> #include <stdlib.h> #include <unistd.h> -#include <stdio.h> -#include "args.h" extern char *optarg; -void usage(char *name) -{ +void usage(char *name) { printf("Usage: %s <-d data_file_name> <-r rnd_file_name> " - "<-m rnd_count> <-p count> <-o file_name>\n", name); + "<-m rnd_count> <-p count> <-o file_name>\n", + name); exit(0); } -void parse_args(int argc, char **argv, options* args) -{ +void parse_args(int argc, char **argv, options *args) { int c; args->data_name = NULL; @@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args) args->random_count = 0; args->npoints = 0; args->output_name = NULL; - - while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) - { - switch (c) - { - case 'd': - args->data_name = optarg; - break; - case 'r': - args->random_name = optarg; - break; - case 'n': - args->random_count = atoi(optarg); - break; - case 'o': - args->output_name = optarg; - break; - case 'p': - args->npoints = atol(optarg); - break; - default: - usage(argv[0]); - } + + while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) { + switch (c) { + case 'd': + args->data_name = optarg; + break; + case 'r': + args->random_name = optarg; + break; + case 'n': + args->random_count = atoi(optarg); + break; + case 'o': + args->output_name = optarg; + break; + case 'p': + args->npoints = atol(optarg); + break; + 
default: + usage(argv[0]); } + } } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h @@ -8,8 +8,7 @@ #ifndef __ARGS_H__ #define __ARGS_H__ -typedef struct _options_ -{ +typedef struct _options_ { char *data_name; char *random_name; int random_count; @@ -18,6 +17,6 @@ typedef struct _options_ } options; void usage(char *name); -void parse_args(int argc, char **argv, options* args); +void parse_args(int argc, char **argv, options *args); #endif diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc index d1482d732947aefc2f3eafb380f584680e692f7f..3239be6c92f641422f2ba6910894ae68cc8b220e 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc @@ -5,11 +5,11 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "args.h" #include "model.h" @@ -19,45 +19,35 @@ extern unsigned int NUM_SETS; extern unsigned int NUM_ELEMENTS; - #define WARP_SIZE 32 #define NUM_BANKS 16 #define LOG_NUM_BANKS 4 #define BLOCK_SIZE 256 -#define NUM_WARPS (BLOCK_SIZE/WARP_SIZE) +#define NUM_WARPS (BLOCK_SIZE / WARP_SIZE) #define HISTS_PER_WARP 16 -#define NUM_HISTOGRAMS (NUM_WARPS*HISTS_PER_WARP) -#define THREADS_PER_HIST (WARP_SIZE/HISTS_PER_WARP) - -#define warp_hists(x,y) warp_hists[(x)*NUM_HISTOGRAMS+(y)] +#define NUM_HISTOGRAMS (NUM_WARPS * HISTS_PER_WARP) +#define THREADS_PER_HIST (WARP_SIZE / HISTS_PER_WARP) +#define warp_hists(x, y) warp_hists[(x)*NUM_HISTOGRAMS + 
(y)] typedef struct __attribute__((__packed__)) { - hist_t* histograms; + hist_t *histograms; size_t bytes_histograms; - float* all_x_data; + float *all_x_data; size_t bytes_all_data; - float* binb; + float *binb; size_t bytes_binb; int NUM_SETS; int NUM_ELEMENTS; long block; long grid; -} -RootIn; - -void packData( RootIn* args, - hist_t* histograms, - size_t bytes_histograms, - float* all_x_data, - size_t bytes_all_data, - float* binb, - size_t bytes_binb, - int NUM_SETS, - int NUM_ELEMENTS, - long block, - long grid) { +} RootIn; + +void packData(RootIn *args, hist_t *histograms, size_t bytes_histograms, + float *all_x_data, size_t bytes_all_data, float *binb, + size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block, + long grid) { args->histograms = histograms; args->bytes_histograms = bytes_histograms; args->all_x_data = all_x_data; @@ -70,205 +60,186 @@ void packData( RootIn* args, args->grid = grid; } - void Allocation(long block) { // Memory shared between threadblocks - //void* data_s = __visc__malloc(sizeof(struct cartesian)*BLOCK_SIZE); - void* warp_hists = __visc__malloc(sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS); + // void* data_s = __visc__malloc(sizeof(struct cartesian)*BLOCK_SIZE); + void *warp_hists = + __visc__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); //__visc__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE, - //warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS); - __visc__return(2, warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS); + // warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS); + __visc__return(2, warp_hists, + sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); } -void TPACFLeaf(hist_t* histograms, size_t bytes_histograms, +void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, // next two args are read-only arrays - float* all_x_data, size_t bytes_all_data, - float* binb, size_t bytes_binb, - int NUM_SETS, int NUM_ELEMENTS, + float *all_x_data, size_t bytes_all_data, float *binb, 
+ size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, // shared memory args - //struct cartesian* data_s, size_t bytes_data_s, - unsigned int* warp_hists, size_t bytes_warp_hists) { + // struct cartesian* data_s, size_t bytes_data_s, + unsigned int *warp_hists, size_t bytes_warp_hists) { __visc__hint(visc::DEVICE); __visc__attributes(2, all_x_data, binb, 1, histograms); - void* thisNode = __visc__getNode(); - void* parentNode = __visc__getParentNode(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); int lx = __visc__getNodeInstanceID_x(thisNode); int gx = __visc__getNodeInstanceID_x(parentNode); int dimx = __visc__getNumNodeInstances_x(thisNode); - float* all_y_data = all_x_data + NUM_ELEMENTS*(NUM_SETS+1); - float* all_z_data = all_y_data + NUM_ELEMENTS*(NUM_SETS+1); + float *all_y_data = all_x_data + NUM_ELEMENTS * (NUM_SETS + 1); + float *all_z_data = all_y_data + NUM_ELEMENTS * (NUM_SETS + 1); unsigned int bx = gx; unsigned int tid = lx; bool do_self = (bx < (NUM_SETS + 1)); - float* data_x; - float* data_y; - float* data_z; - float* random_x; - float* random_y; - float* random_z; - - for(unsigned int w = 0; w < NUM_BINS*NUM_HISTOGRAMS; w += BLOCK_SIZE ) - { - if((w+tid) < (NUM_BINS*NUM_HISTOGRAMS)) - { - warp_hists((w+tid)/NUM_HISTOGRAMS, (w+tid)%NUM_HISTOGRAMS) = 0; - } + float *data_x; + float *data_y; + float *data_z; + float *random_x; + float *random_y; + float *random_z; + + for (unsigned int w = 0; w < NUM_BINS * NUM_HISTOGRAMS; w += BLOCK_SIZE) { + if ((w + tid) < (NUM_BINS * NUM_HISTOGRAMS)) { + warp_hists((w + tid) / NUM_HISTOGRAMS, (w + tid) % NUM_HISTOGRAMS) = 0; } + } // Get stuff into shared memory to kick off the loop. 
- if( !do_self) - { - data_x = all_x_data; - data_y = all_y_data; - data_z = all_z_data; - - random_x = all_x_data + NUM_ELEMENTS * (bx - NUM_SETS); - random_y = all_y_data + NUM_ELEMENTS * (bx - NUM_SETS); - random_z = all_z_data + NUM_ELEMENTS * (bx - NUM_SETS); - } - else - { - random_x = all_x_data + NUM_ELEMENTS * (bx); - random_y = all_y_data + NUM_ELEMENTS * (bx); - random_z = all_z_data + NUM_ELEMENTS * (bx); - - data_x = random_x; - data_y = random_y; - data_z = random_z; - } - + if (!do_self) { + data_x = all_x_data; + data_y = all_y_data; + data_z = all_z_data; + + random_x = all_x_data + NUM_ELEMENTS * (bx - NUM_SETS); + random_y = all_y_data + NUM_ELEMENTS * (bx - NUM_SETS); + random_z = all_z_data + NUM_ELEMENTS * (bx - NUM_SETS); + } else { + random_x = all_x_data + NUM_ELEMENTS * (bx); + random_y = all_y_data + NUM_ELEMENTS * (bx); + random_z = all_z_data + NUM_ELEMENTS * (bx); + + data_x = random_x; + data_y = random_y; + data_z = random_z; + } + // Iterate over all random points - for(unsigned int j = 0; j < NUM_ELEMENTS; j += BLOCK_SIZE) - { - // load current random point values - float random_x_s; - float random_y_s; - float random_z_s; - - if(tid + j < NUM_ELEMENTS) - { - random_x_s = random_x[tid + j]; - random_y_s = random_y[tid + j]; - random_z_s = random_z[tid + j]; - } - - // Iterate over all data points - // If do_self, then use a tighter bound on the number of data points. - for(unsigned int k = 0; - k < NUM_ELEMENTS && (do_self ? 
k < j + BLOCK_SIZE : 1); k++) - { - // do actual calculations on the values: - float distance = data_x[k] * random_x_s + - data_y[k] * random_y_s + - data_z[k] * random_z_s ; - - unsigned int bin_index; - - // run binary search to find bin_index - unsigned int min = 0; - unsigned int max = NUM_BINS; - { - unsigned int k2; - - while (max > min+1) - { - k2 = (min + max) / 2; - if (distance >= binb[k2]) - max = k2; - else - min = k2; - } - bin_index = max - 1; - } - - unsigned int warpnum = tid / (WARP_SIZE/HISTS_PER_WARP); - if((distance < binb[min]) && (distance >= binb[max]) && - (!do_self || (tid + j > k)) && ((tid + j) < NUM_ELEMENTS)) - { - __visc__atomic_add((int*)&(warp_hists(bin_index, warpnum)), 1); - } - } + for (unsigned int j = 0; j < NUM_ELEMENTS; j += BLOCK_SIZE) { + // load current random point values + float random_x_s; + float random_y_s; + float random_z_s; + + if (tid + j < NUM_ELEMENTS) { + random_x_s = random_x[tid + j]; + random_y_s = random_y[tid + j]; + random_z_s = random_z[tid + j]; } - + + // Iterate over all data points + // If do_self, then use a tighter bound on the number of data points. + for (unsigned int k = 0; + k < NUM_ELEMENTS && (do_self ? 
k < j + BLOCK_SIZE : 1); k++) { + // do actual calculations on the values: + float distance = data_x[k] * random_x_s + data_y[k] * random_y_s + + data_z[k] * random_z_s; + + unsigned int bin_index; + + // run binary search to find bin_index + unsigned int min = 0; + unsigned int max = NUM_BINS; + { + unsigned int k2; + + while (max > min + 1) { + k2 = (min + max) / 2; + if (distance >= binb[k2]) + max = k2; + else + min = k2; + } + bin_index = max - 1; + } + + unsigned int warpnum = tid / (WARP_SIZE / HISTS_PER_WARP); + if ((distance < binb[min]) && (distance >= binb[max]) && + (!do_self || (tid + j > k)) && ((tid + j) < NUM_ELEMENTS)) { + __visc__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1); + } + } + } + // coalesce the histograms in a block - unsigned int warp_index = tid & ( (NUM_HISTOGRAMS>>1) - 1); - unsigned int bin_index = tid / (NUM_HISTOGRAMS>>1); - for(unsigned int offset = NUM_HISTOGRAMS >> 1; offset > 0; - offset >>= 1) - { - for(unsigned int bin_base = 0; bin_base < NUM_BINS; - bin_base += BLOCK_SIZE/ (NUM_HISTOGRAMS>>1)) - { - __visc__barrier(); - if(warp_index < offset && bin_base+bin_index < NUM_BINS ) - { - unsigned long sum = - warp_hists(bin_base + bin_index, warp_index) + - warp_hists(bin_base + bin_index, warp_index+offset); - warp_hists(bin_base + bin_index, warp_index) = sum; - } - } + unsigned int warp_index = tid & ((NUM_HISTOGRAMS >> 1) - 1); + unsigned int bin_index = tid / (NUM_HISTOGRAMS >> 1); + for (unsigned int offset = NUM_HISTOGRAMS >> 1; offset > 0; offset >>= 1) { + for (unsigned int bin_base = 0; bin_base < NUM_BINS; + bin_base += BLOCK_SIZE / (NUM_HISTOGRAMS >> 1)) { + __visc__barrier(); + if (warp_index < offset && bin_base + bin_index < NUM_BINS) { + unsigned long sum = + warp_hists(bin_base + bin_index, warp_index) + + warp_hists(bin_base + bin_index, warp_index + offset); + warp_hists(bin_base + bin_index, warp_index) = sum; + } } - + } + __visc__barrier(); - + // Put the results back in the real histogram // 
warp_hists(x, 0) holds sum of all locations of bin x - hist_t* hist_base = histograms + NUM_BINS * bx; - if(tid < NUM_BINS) - { - hist_base[tid] = warp_hists(tid, 0); - } + hist_t *hist_base = histograms + NUM_BINS * bx; + if (tid < NUM_BINS) { + hist_base[tid] = warp_hists(tid, 0); + } } -void BlockingTPACF(hist_t* histograms, size_t bytes_histograms, - float* all_x_data, size_t bytes_all_data, +void BlockingTPACF(hist_t *histograms, size_t bytes_histograms, + float *all_x_data, size_t bytes_all_data, // next arg is read-only constant - float* binb, size_t bytes_binb, - int NUM_SETS, int NUM_ELEMENTS, - long block) { + float *binb, size_t bytes_binb, int NUM_SETS, + int NUM_ELEMENTS, long block) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, all_x_data, binb, 1, histograms); - void* AllocationNode = __visc__createNodeND(0, Allocation); - void* TPACFLeafNode = __visc__createNodeND(1, TPACFLeaf, block); + void *AllocationNode = __visc__createNodeND(0, Allocation); + void *TPACFLeafNode = __visc__createNodeND(1, TPACFLeaf, block); // Bind Inputs __visc__bindIn(AllocationNode, 8, 0, 0); // Bind block - __visc__bindIn(TPACFLeafNode, 0, 0, 0); // Bind histograms - __visc__bindIn(TPACFLeafNode, 1, 1, 0); // Bind bytes_histograms - __visc__bindIn(TPACFLeafNode, 2, 2, 0); // Bind all_x_data - __visc__bindIn(TPACFLeafNode, 3, 3, 0); // Bind bytes_all_data - __visc__bindIn(TPACFLeafNode, 4, 4, 0); // Bind binb - __visc__bindIn(TPACFLeafNode, 5, 5, 0); // Bind bytes_binb - __visc__bindIn(TPACFLeafNode, 6, 6, 0); // Bind NUM_SETS - __visc__bindIn(TPACFLeafNode, 7, 7, 0); // Bind NUM_ELEMENTS + __visc__bindIn(TPACFLeafNode, 0, 0, 0); // Bind histograms + __visc__bindIn(TPACFLeafNode, 1, 1, 0); // Bind bytes_histograms + __visc__bindIn(TPACFLeafNode, 2, 2, 0); // Bind all_x_data + __visc__bindIn(TPACFLeafNode, 3, 3, 0); // Bind bytes_all_data + __visc__bindIn(TPACFLeafNode, 4, 4, 0); // Bind binb + __visc__bindIn(TPACFLeafNode, 5, 5, 0); // Bind bytes_binb + 
__visc__bindIn(TPACFLeafNode, 6, 6, 0); // Bind NUM_SETS + __visc__bindIn(TPACFLeafNode, 7, 7, 0); // Bind NUM_ELEMENTS // Create Edges __visc__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists - __visc__edge(AllocationNode, TPACFLeafNode, 1, 1, 9, 0); // Edge bytes_warp_hists - + __visc__edge(AllocationNode, TPACFLeafNode, 1, 1, 9, + 0); // Edge bytes_warp_hists } -void TPACFRoot(hist_t* histograms, size_t bytes_histograms, - float* all_x_data, size_t bytes_all_data, +void TPACFRoot(hist_t *histograms, size_t bytes_histograms, float *all_x_data, + size_t bytes_all_data, // next arg is read-only constant - float* binb, size_t bytes_binb, - int NUM_SETS, int NUM_ELEMENTS, - long block, - long grid) { + float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, + long block, long grid) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, all_x_data, binb, 1, histograms); - void* BlockingTPACFNode = __visc__createNodeND(1, BlockingTPACF, grid); + void *BlockingTPACFNode = __visc__createNodeND(1, BlockingTPACF, grid); // Bind Inputs __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms @@ -280,21 +251,17 @@ void TPACFRoot(hist_t* histograms, size_t bytes_histograms, __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block - } -void TPACFWrapper( - hist_t* histograms, size_t bytes_histograms, - float* all_x_data, size_t bytes_all_data, - // next arg is read-only constant - float* binb, size_t bytes_binb, - int NUM_SETS, int NUM_ELEMENTS, - long block, long grid -) { +void TPACFWrapper(hist_t *histograms, size_t bytes_histograms, + float *all_x_data, size_t bytes_all_data, + // next arg is read-only constant + float *binb, size_t bytes_binb, int NUM_SETS, + int NUM_ELEMENTS, long block, long grid) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, all_x_data, binb, 1, histograms); - void* BlockingTPACFNode 
= __visc__createNodeND(0, TPACFRoot); + void *BlockingTPACFNode = __visc__createNodeND(0, TPACFRoot); // Bind Inputs __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms @@ -311,18 +278,15 @@ void TPACFWrapper( // **===-----------------------------------------------------------===** -int -main( int argc, char** argv) -{ +int main(int argc, char **argv) { struct pb_TimerSet timers; - struct pb_Parameters* params; + struct pb_Parameters *params; - params = pb_ReadParameters( &argc, argv ); + params = pb_ReadParameters(&argc, argv); options args; parse_args(argc, argv, &args); - NUM_ELEMENTS = args.npoints; NUM_SETS = args.random_count; int num_elements = NUM_ELEMENTS; @@ -332,54 +296,50 @@ main( int argc, char** argv) printf("Bins per dec: %i\n", bins_per_dec); printf("Total bins : %i\n", NUM_BINS); - //read in files - unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian); - unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float); + // read in files + unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian); + unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float); // container for all the points read from files struct cartesian *h_all_data; - h_all_data = (struct cartesian*) malloc(mem_size); + h_all_data = (struct cartesian *)malloc(mem_size); // Until I can get libs fixed // iterator for data files struct cartesian *working = h_all_data; - // go through and read all data and random points into h_all_data - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); readdatafile(params->inpFiles[0], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); working += num_elements; - for(int i = 0; i < (NUM_SETS); i++) - { - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); - readdatafile(params->inpFiles[i+1], working, num_elements); - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + for (int i 
= 0; i < (NUM_SETS); i++) { + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); + readdatafile(params->inpFiles[i + 1], working, num_elements); + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); working += num_elements; } - pb_InitializeTimerSet( &timers ); + pb_InitializeTimerSet(&timers); __visc__init(); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // split into x, y, and z arrays // AOS to SOA transformation - size_t bytes_h_x_data = 3*f_mem_size; - float * h_x_data = (float*) malloc (bytes_h_x_data); + size_t bytes_h_x_data = 3 * f_mem_size; + float *h_x_data = (float *)malloc(bytes_h_x_data); llvm_visc_track_mem(h_x_data, bytes_h_x_data); - float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1); - float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1); - for(int i = 0; i < (NUM_SETS+1); ++i) - { - for(int j = 0; j < NUM_ELEMENTS; ++j) - { - h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x; - h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y; - h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z; + float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1); + float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1); + for (int i = 0; i < (NUM_SETS + 1); ++i) { + for (int j = 0; j < NUM_ELEMENTS; ++j) { + h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x; + h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y; + h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z; } } @@ -387,77 +347,61 @@ main( int argc, char** argv) free(h_all_data); // allocate system memory for final histograms - size_t bytes_hists = NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t); - hist_t *hists = (hist_t *) malloc(bytes_hists); + size_t bytes_hists = NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t); + hist_t *hists = (hist_t *)malloc(bytes_hists); llvm_visc_track_mem(hists, bytes_hists); // Initialize the boundary constants for bin search - size_t bytes_binb = 
(NUM_BINS+1)*sizeof(float); - float *binb = (float*)malloc(bytes_binb); + size_t bytes_binb = (NUM_BINS + 1) * sizeof(float); + float *binb = (float *)malloc(bytes_binb); llvm_visc_track_mem(binb, bytes_binb); - for (int k = 0; k < NUM_BINS+1; k++) - { - binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) - / 60.0*D2R); + for (int k = 0; k < NUM_BINS + 1; k++) { + binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) / + 60.0 * D2R); } // **===------------------ Kick off TPACF on OpenCL------------------===** long block = BLOCK_SIZE; - long grid = (NUM_SETS*2 + 1); - - RootIn* graph_args = (RootIn*) malloc (sizeof(RootIn)); - packData(graph_args, - hists, - bytes_hists, - h_x_data, - bytes_h_x_data, - binb, - bytes_binb, - NUM_SETS, - NUM_ELEMENTS, - block, - grid); - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); - - void* TPACF_DFG = __visc__launch(0, TPACFRoot, (void*)graph_args); + long grid = (NUM_SETS * 2 + 1); + + RootIn *graph_args = (RootIn *)malloc(sizeof(RootIn)); + packData(graph_args, hists, bytes_hists, h_x_data, bytes_h_x_data, binb, + bytes_binb, NUM_SETS, NUM_ELEMENTS, block, grid); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); + + void *TPACF_DFG = __visc__launch(0, TPACFRoot, (void *)graph_args); __visc__wait(TPACF_DFG); - pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // **===-------------------------------------------------------------===** llvm_visc_request_mem(hists, bytes_hists); // references into output histograms hist_t *dd_hist = hists; hist_t *rr_hist = dd_hist + NUM_BINS; - hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS; + hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS; // add up values within dr and rr int rr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { + for (int i = 0; i < NUM_BINS; i++) { rr[i] = 0; } - for(int i=0; i<NUM_SETS; i++) - { - 
for(int j=0; j<NUM_BINS; j++) - { - rr[j] += rr_hist[i*NUM_BINS + j]; + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + rr[j] += rr_hist[i * NUM_BINS + j]; } } int dr[NUM_BINS]; - for(int i=0; i<NUM_BINS; i++) - { + for (int i = 0; i < NUM_BINS; i++) { dr[i] = 0; } - for(int i=0; i<NUM_SETS; i++) - { - for(int j=0; j<NUM_BINS; j++) - { - dr[j] += dr_hist[i*NUM_BINS + j]; + for (int i = 0; i < NUM_SETS; i++) { + for (int j = 0; j < NUM_BINS; j++) { + dr[j] += dr_hist[i * NUM_BINS + j]; } } @@ -466,27 +410,25 @@ main( int argc, char** argv) __visc__cleanup(); FILE *outfile; - if ((outfile = fopen(params->outFile, "w")) == NULL) - { - fprintf(stderr, "Unable to open output file %s for writing, " - "assuming stdout\n", params->outFile); + if ((outfile = fopen(params->outFile, "w")) == NULL) { + fprintf(stderr, + "Unable to open output file %s for writing, " + "assuming stdout\n", + params->outFile); outfile = stdout; } - //pb_SwitchToTimer( &timers, pb_TimerID_IO ); + // pb_SwitchToTimer( &timers, pb_TimerID_IO ); // print out final histograms + omega (while calculating omega) - for(int i=0; i<NUM_BINS; i++) - { + for (int i = 0; i < NUM_BINS; i++) { fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]); } - //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - if(outfile != stdout) + // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + if (outfile != stdout) fclose(outfile); // cleanup memory free(hists); free(h_x_data); - } - diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc @@ -6,83 +6,75 @@ *cr ***************************************************************************/ #include <CL/cl.h> -#include <sys/time.h> -#include <stdio.h> -#include <math.h> 
-#include <strings.h> #include <math.h> #include <parboil.h> +#include <stdio.h> +#include <strings.h> +#include <sys/time.h> #include "model.h" unsigned int NUM_SETS; unsigned int NUM_ELEMENTS; -int readdatafile(char *fname, struct cartesian *data, int npoints) -{ +int readdatafile(char *fname, struct cartesian *data, int npoints) { FILE *infile; int lcount = 0; float ra, dec; - if ((infile = fopen(fname, "r")) == NULL) - { - fprintf(stderr, "Unable to open data file %s for reading\n", fname); - return lcount; - } + if ((infile = fopen(fname, "r")) == NULL) { + fprintf(stderr, "Unable to open data file %s for reading\n", fname); + return lcount; + } + + for (lcount = 0; lcount < npoints; lcount++) { + if (fscanf(infile, "%f %f", &ra, &dec) != 2) + break; - for (lcount = 0; lcount < npoints; lcount++) { - if (fscanf(infile, "%f %f", &ra, &dec) != 2) - break; + // data conversion + float rarad = D2R * ra; + float decrad = D2R * dec; + float cd = cos(decrad); - { - // data conversion - float rarad = D2R * ra; - float decrad = D2R * dec; - float cd = cos(decrad); - - data[lcount].x = cos(rarad) * cd; - data[lcount].y = sin(rarad) * cd; - data[lcount].z = sin(decrad); - } + data[lcount].x = cos(rarad) * cd; + data[lcount].y = sin(rarad) * cd; + data[lcount].z = sin(decrad); } + } fclose(infile); - + return lcount; } -char* readFile(const char* fileName) -{ - FILE* fp; - fp = fopen(fileName,"r"); - if(fp == NULL) - { - printf("Error: Cannot open kernel file for reading!\n"); - exit(1); - } +char *readFile(const char *fileName) { + FILE *fp; + fp = fopen(fileName, "r"); + if (fp == NULL) { + printf("Error: Cannot open kernel file for reading!\n"); + exit(1); + } - fseek(fp,0,SEEK_END); - long size = ftell(fp); - rewind(fp); + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + rewind(fp); - char* buffer = (char*)malloc(sizeof(char)*(size+1)); - if(buffer == NULL) - { - printf("Error: Cannot allocated buffer for file contents!\n"); - fclose(fp); - exit(1); - } + char 
*buffer = (char *)malloc(sizeof(char) * (size + 1)); + if (buffer == NULL) { + printf("Error: Cannot allocated buffer for file contents!\n"); + fclose(fp); + exit(1); + } - size_t res = fread(buffer,1,size,fp); - if(res != size) - { - printf("Error: Cannot read kernel file contents!\n"); - fclose(fp); - exit(1); - } + size_t res = fread(buffer, 1, size, fp); + if (res != size) { + printf("Error: Cannot read kernel file contents!\n"); + fclose(fp); + exit(1); + } - buffer[size] = 0; - fclose(fp); - return buffer; + buffer[size] = 0; + fclose(fp); + return buffer; } diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h @@ -8,9 +8,9 @@ #ifndef __MODEL_H__ #define __MODEL_H__ -#define D2R M_PI/180.0 -#define R2D 180.0/M_PI -#define R2AM 60.0*180.0/M_PI +#define D2R M_PI / 180.0 +#define R2D 180.0 / M_PI +#define R2AM 60.0 * 180.0 / M_PI #define bins_per_dec 5 #define min_arcmin 1.0 @@ -21,26 +21,23 @@ typedef unsigned long hist_t; -struct spherical -{ - float ra, dec; // latitude, longitude pair +struct spherical { + float ra, dec; // latitude, longitude pair }; - -struct cartesian -{ - float x, y, z; // cartesian coodrinates + +struct cartesian { + float x, y, z; // cartesian coodrinates }; int readdatafile(char *fname, struct cartesian *data, int npoints); -char* readFile(const char*); +char *readFile(const char *); -#define CHECK_ERROR(errorMessage) \ - if(clStatus != CL_SUCCESS) \ - { \ - printf("Error: %s!\n",errorMessage); \ - printf("Line: %d\n",__LINE__); \ - exit(1); \ +#define CHECK_ERROR(errorMessage) \ + if (clStatus != CL_SUCCESS) { \ + printf("Error: %s!\n", errorMessage); \ + printf("Line: %d\n", __LINE__); \ + exit(1); \ } #endif diff --git 
a/hpvm/test/parboil/common/include/parboil.h b/hpvm/test/parboil/common/include/parboil.h index 41f78a99c07cea79e013eee86bc746f223517fdc..30ad6721c3190610dd08ec131603b6fe622f897e 100644 --- a/hpvm/test/parboil/common/include/parboil.h +++ b/hpvm/test/parboil/common/include/parboil.h @@ -12,13 +12,13 @@ extern "C" { /* Command line parameters for benchmarks */ struct pb_Parameters { - char *outFile; /* If not NULL, the raw output of the - * computation should be saved to this - * file. The string is owned. */ - char **inpFiles; /* A NULL-terminated array of strings - * holding the input file(s) for the - * computation. The array and strings - * are owned. */ + char *outFile; /* If not NULL, the raw output of the + * computation should be saved to this + * file. The string is owned. */ + char **inpFiles; /* A NULL-terminated array of strings + * holding the input file(s) for the + * computation. The array and strings + * are owned. */ }; /* Read command-line parameters. @@ -30,24 +30,21 @@ struct pb_Parameters { * If there is an error, then an error message is printed on stderr * and NULL is returned. */ -struct pb_Parameters * -pb_ReadParameters(int *_argc, char **argv); +struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv); /* Free an instance of struct pb_Parameters. */ -void -pb_FreeParameters(struct pb_Parameters *p); +void pb_FreeParameters(struct pb_Parameters *p); /* Count the number of input files in a pb_Parameters instance. */ -int -pb_Parameters_CountInputs(struct pb_Parameters *p); +int pb_Parameters_CountInputs(struct pb_Parameters *p); /* A time or duration. 
*/ #if _POSIX_VERSION >= 200112L typedef unsigned long long pb_Timestamp; /* time in microseconds */ #else -# error "Timestamps not implemented" +#error "Timestamps not implemented" #endif enum pb_TimerState { @@ -57,57 +54,53 @@ enum pb_TimerState { struct pb_Timer { enum pb_TimerState state; - pb_Timestamp elapsed; /* Amount of time elapsed so far */ - pb_Timestamp init; /* Beginning of the current time interval, - * if state is RUNNING. End of the last - * recorded time interfal otherwise. */ + pb_Timestamp elapsed; /* Amount of time elapsed so far */ + pb_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interfal otherwise. */ }; /* Reset a timer. * Use this to initialize a timer or to clear * its elapsed time. The reset timer is stopped. */ -void -pb_ResetTimer(struct pb_Timer *timer); +void pb_ResetTimer(struct pb_Timer *timer); /* Start a timer. The timer is set to RUNNING mode and * time elapsed while the timer is running is added to * the timer. * The timer should not already be running. */ -void -pb_StartTimer(struct pb_Timer *timer); +void pb_StartTimer(struct pb_Timer *timer); /* Stop a timer. * This stops adding elapsed time to the timer. * The timer should not already be stopped. */ -void -pb_StopTimer(struct pb_Timer *timer); +void pb_StopTimer(struct pb_Timer *timer); /* Get the elapsed time in seconds. */ -double -pb_GetElapsedTime(struct pb_Timer *timer); +double pb_GetElapsedTime(struct pb_Timer *timer); /* Execution time is assigned to one of these categories. 
*/ enum pb_TimerID { pb_TimerID_NONE = 0, - pb_TimerID_IO, /* Time spent in input/output */ - pb_TimerID_KERNEL, /* Time spent computing on the device, - * recorded asynchronously */ - pb_TimerID_COPY, /* Time spent synchronously moving data - * to/from device and allocating/freeing - * memory on the device */ - pb_TimerID_DRIVER, /* Time spent in the host interacting with the - * driver, primarily for recording the time - * spent queueing asynchronous operations */ - pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ - pb_TimerID_COMPUTE, /* Time for all program execution other - * than parsing command line arguments, - * I/O, kernel, and copy */ - pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and - * host activity: automatically filled in, - * not intended for direct usage */ + pb_TimerID_IO, /* Time spent in input/output */ + pb_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + pb_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + pb_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + pb_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ // GPU FUNCTION visc_TimerID_INIT_CTX, visc_TimerID_CLEAR_CTX, @@ -127,17 +120,17 @@ enum pb_TimerID { visc_TimerID_OUTPUT_PACK, visc_TimerID_OUTPUT_UNPACK, - pb_TimerID_LAST /* Number of timer IDs */ + pb_TimerID_LAST /* Number of timer IDs */ }; /* Dynamic list of asynchronously tracked times between events */ struct pb_async_time_marker_list { - char *label; // actually just a pointer to a string - 
enum pb_TimerID timerID; /* The ID to which the interval beginning - * with this marker should be attributed */ - void * marker; - //cudaEvent_t marker; /* The driver event for this marker */ - struct pb_async_time_marker_list *next; + char *label; // actually just a pointer to a string + enum pb_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void *marker; + // cudaEvent_t marker; /* The driver event for this marker */ + struct pb_async_time_marker_list *next; }; struct pb_SubTimer { @@ -154,7 +147,7 @@ struct pb_SubTimerList { /* A set of timers for recording execution times. */ struct pb_TimerSet { enum pb_TimerID current; - struct pb_async_time_marker_list* async_markers; + struct pb_async_time_marker_list *async_markers; pb_Timestamp async_begin; pb_Timestamp wall_begin; struct pb_Timer timers[pb_TimerID_LAST]; @@ -162,37 +155,33 @@ struct pb_TimerSet { }; /* Reset all timers in the set. */ -void -pb_InitializeTimerSet(struct pb_TimerSet *timers); +void pb_InitializeTimerSet(struct pb_TimerSet *timers); -void -pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category); +void pb_AddSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID pb_Category); /* Select which timer the next interval of time should be accounted * to. The selected timer is started and other timers are stopped. * Using pb_TimerID_NONE stops all timers. */ -void -pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer); +void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer); -void -pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category); +void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID category); /* Print timer values to standard output. 
*/ -void -pb_PrintTimerSet(struct pb_TimerSet *timers); +void pb_PrintTimerSet(struct pb_TimerSet *timers); /* Release timer resources */ -void -pb_DestroyTimerSet(struct pb_TimerSet * timers); +void pb_DestroyTimerSet(struct pb_TimerSet *timers); -void -pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr); +void pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr); -void -pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKerenlPtr); +void pb_CreateAndBuildKernelFromBinary(const char *file, const char *kernel, + void *clContextPtr, void *clDevicePtr, + void *clProgramPtr, void *clKerenlPtr); #ifdef __cplusplus } #endif -#endif //PARBOIL_HEADER +#endif // PARBOIL_HEADER diff --git a/hpvm/test/parboil/common/include/visc.h b/hpvm/test/parboil/common/include/visc.h index b0a0f141e575b104f0f3934416956cf9cd1f1904..6edc07a0a39353566d2b8edb72a2a39a83dba288 100644 --- a/hpvm/test/parboil/common/include/visc.h +++ b/hpvm/test/parboil/common/include/visc.h @@ -20,54 +20,55 @@ void __visc__hint(enum Target) noexcept; #endif #ifdef __cplusplus -void* __visc__node(...) noexcept; -//void* __visc__createNode(...) noexcept; -//void* __visc__createNode1D(...) noexcept; -//void* __visc__createNode2D(...) noexcept; -//void* __visc__createNode3D(...) noexcept; -//void __visc__return(...) noexcept; +void *__visc__node(...) noexcept; +// void* __visc__createNode(...) noexcept; +// void* __visc__createNode1D(...) noexcept; +// void* __visc__createNode2D(...) noexcept; +// void* __visc__createNode3D(...) noexcept; +// void __visc__return(...) noexcept; #endif -void* __visc__createNodeND(unsigned, ...) noexcept; +void *__visc__createNodeND(unsigned, ...) noexcept; void __visc__return(unsigned, ...) noexcept; void __visc__attributes(unsigned, ...) 
noexcept; void __visc__init() noexcept; void __visc__cleanup() noexcept; -void __visc__bindIn(void*, unsigned, unsigned, unsigned) noexcept; -void __visc__bindOut(void*, unsigned, unsigned, unsigned) noexcept; -void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned) noexcept; -void __visc__push(void*, void*) noexcept; -void* __visc__pop(void*) noexcept; -void* __visc__launch(unsigned, ...) noexcept; -void __visc__wait(void*) noexcept; - -void* __visc__getNode() noexcept; -void* __visc__getParentNode(void*) noexcept; +void __visc__bindIn(void *, unsigned, unsigned, unsigned) noexcept; +void __visc__bindOut(void *, unsigned, unsigned, unsigned) noexcept; +void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, + unsigned) noexcept; +void __visc__push(void *, void *) noexcept; +void *__visc__pop(void *) noexcept; +void *__visc__launch(unsigned, ...) noexcept; +void __visc__wait(void *) noexcept; + +void *__visc__getNode() noexcept; +void *__visc__getParentNode(void *) noexcept; void __visc__barrier() noexcept; -void* __visc__malloc(long) noexcept; -long __visc__getNodeInstanceID_x(void*) noexcept; -long __visc__getNodeInstanceID_y(void*) noexcept; -long __visc__getNodeInstanceID_z(void*) noexcept; -long __visc__getNumNodeInstances_x(void*) noexcept; -long __visc__getNumNodeInstances_y(void*) noexcept; -long __visc__getNumNodeInstances_z(void*) noexcept; +void *__visc__malloc(long) noexcept; +long __visc__getNodeInstanceID_x(void *) noexcept; +long __visc__getNodeInstanceID_y(void *) noexcept; +long __visc__getNodeInstanceID_z(void *) noexcept; +long __visc__getNumNodeInstances_x(void *) noexcept; +long __visc__getNumNodeInstances_y(void *) noexcept; +long __visc__getNumNodeInstances_z(void *) noexcept; // Atomic // signed int -int __visc__atomic_cmpxchg(int*, int, int) noexcept; -int __visc__atomic_add(int*, int) noexcept; -int __visc__atomic_sub(int*, int) noexcept; -int __visc__atomic_xchg(int*, int) noexcept; -int __visc__atomic_inc(int*) 
noexcept; -int __visc__atomic_dec(int*) noexcept; -int __visc__atomic_min(int*, int) noexcept; -int __visc__atomic_max(int*, int) noexcept; -int __visc__atomic_umax(int*, int) noexcept; -int __visc__atomic_umin(int*, int) noexcept; -int __visc__atomic_and(int*, int) noexcept; -int __visc__atomic_or(int*, int) noexcept; -int __visc__atomic_xor(int*, int) noexcept; +int __visc__atomic_cmpxchg(int *, int, int) noexcept; +int __visc__atomic_add(int *, int) noexcept; +int __visc__atomic_sub(int *, int) noexcept; +int __visc__atomic_xchg(int *, int) noexcept; +int __visc__atomic_inc(int *) noexcept; +int __visc__atomic_dec(int *) noexcept; +int __visc__atomic_min(int *, int) noexcept; +int __visc__atomic_max(int *, int) noexcept; +int __visc__atomic_umax(int *, int) noexcept; +int __visc__atomic_umin(int *, int) noexcept; +int __visc__atomic_and(int *, int) noexcept; +int __visc__atomic_or(int *, int) noexcept; +int __visc__atomic_xor(int *, int) noexcept; // Special Func float __visc__floor(float) noexcept; @@ -76,18 +77,17 @@ float __visc__sqrt(float) noexcept; float __visc__sin(float) noexcept; float __visc__cos(float) noexcept; // unsigned int -//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned) noexcept; -//unsigned __visc__atomic_add(unsigned*, unsigned) noexcept; -//unsigned __visc__atomic_sub(unsigned*, unsigned) noexcept; -//unsigned __visc__atomic_xchg(unsigned*, unsigned) noexcept; -//unsigned __visc__atomic_inc(unsigned*) noexcept; -//unsigned __visc__atomic_dec(unsigned*) noexcept; -//unsigned __visc__atomic_min(unsigned*, unsigned) noexcept; -//unsigned __visc__atomic_max(unsigned*, unsigned) noexcept; -//unsigned __visc__atomic_and(unsigned*, unsigned) noexcept; -//unsigned __visc__atomic_or(unsigned*, unsigned) noexcept; -//unsigned __visc__atomic_xor(unsigned*, unsigned) noexcept; - +// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned) noexcept; +// unsigned __visc__atomic_add(unsigned*, unsigned) noexcept; +// unsigned 
__visc__atomic_sub(unsigned*, unsigned) noexcept; +// unsigned __visc__atomic_xchg(unsigned*, unsigned) noexcept; +// unsigned __visc__atomic_inc(unsigned*) noexcept; +// unsigned __visc__atomic_dec(unsigned*) noexcept; +// unsigned __visc__atomic_min(unsigned*, unsigned) noexcept; +// unsigned __visc__atomic_max(unsigned*, unsigned) noexcept; +// unsigned __visc__atomic_and(unsigned*, unsigned) noexcept; +// unsigned __visc__atomic_or(unsigned*, unsigned) noexcept; +// unsigned __visc__atomic_xor(unsigned*, unsigned) noexcept; #include <unistd.h> @@ -96,12 +96,10 @@ long get_group_id(int) noexcept; long get_local_id(int) noexcept; long get_local_size(int) noexcept; - -void llvm_visc_track_mem(void*, size_t) noexcept; -void llvm_visc_untrack_mem(void*) noexcept; -void llvm_visc_request_mem(void*, size_t) noexcept; +void llvm_visc_track_mem(void *, size_t) noexcept; +void llvm_visc_untrack_mem(void *) noexcept; +void llvm_visc_request_mem(void *, size_t) noexcept; #ifdef __cplusplus } #endif - diff --git a/hpvm/test/parboil/common/src/parboil.c b/hpvm/test/parboil/common/src/parboil.c index 2115271c46a4012889b45fcbffda404068850c2a..bd8f453abbd3af6311fd8df48ae40de8f1183025 100644 --- a/hpvm/test/parboil/common/src/parboil.c +++ b/hpvm/test/parboil/common/src/parboil.c @@ -3,41 +3,41 @@ */ #include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stdio.h> #if _POSIX_VERSION >= 200112L #include <time.h> #endif -#define BILLION 1000000000LL +#define BILLION 1000000000LL #define true 1 /* Free an array of owned strings. */ -static void -free_string_array(char **string_array) -{ +static void free_string_array(char **string_array) { char **p; - if (!string_array) return; - for (p = string_array; *p; p++) free(*p); + if (!string_array) + return; + for (p = string_array; *p; p++) + free(*p); free(string_array); } /* Parse a comma-delimited list of strings into an * array of strings. 
*/ -static char ** -read_string_array(char *in) -{ +static char **read_string_array(char *in) { char **ret; int i; - int count; /* Number of items in the input */ - char *substring; /* Current substring within 'in' */ + int count; /* Number of items in the input */ + char *substring; /* Current substring within 'in' */ /* Count the number of items in the string */ count = 1; - for (i = 0; in[i]; i++) if (in[i] == ',') count++; + for (i = 0; in[i]; i++) + if (in[i] == ',') + count++; /* Allocate storage */ ret = (char **)malloc((count + 1) * sizeof(char *)); @@ -50,8 +50,8 @@ read_string_array(char *in) /* Find length of substring */ for (substring_end = substring; - (*substring_end != ',') && (*substring_end != 0); - substring_end++); + (*substring_end != ',') && (*substring_end != 0); substring_end++) + ; substring_length = substring_end - substring; @@ -63,41 +63,35 @@ read_string_array(char *in) /* go to next substring */ substring = substring_end + 1; } - ret[i] = NULL; /* Write the sentinel value */ + ret[i] = NULL; /* Write the sentinel value */ return ret; } struct argparse { - int argc; /* Number of arguments. Mutable. */ - char **argv; /* Argument values. Immutable. */ + int argc; /* Number of arguments. Mutable. */ + char **argv; /* Argument values. Immutable. */ - int argn; /* Current argument number. */ - char **argv_get; /* Argument value being read. */ - char **argv_put; /* Argument value being written. - * argv_put <= argv_get. */ + int argn; /* Current argument number. */ + char **argv_get; /* Argument value being read. */ + char **argv_put; /* Argument value being written. + * argv_put <= argv_get. 
*/ }; -static void -initialize_argparse(struct argparse *ap, int argc, char **argv) -{ +static void initialize_argparse(struct argparse *ap, int argc, char **argv) { ap->argc = argc; ap->argn = 0; ap->argv_get = ap->argv_put = ap->argv = argv; } -static void -finalize_argparse(struct argparse *ap) -{ +static void finalize_argparse(struct argparse *ap) { /* Move the remaining arguments */ - for(; ap->argn < ap->argc; ap->argn++) + for (; ap->argn < ap->argc; ap->argn++) *ap->argv_put++ = *ap->argv_get++; } /* Delete the current argument. */ -static void -delete_argument(struct argparse *ap) -{ +static void delete_argument(struct argparse *ap) { if (ap->argn >= ap->argc) { fprintf(stderr, "delete_argument\n"); } @@ -107,9 +101,7 @@ delete_argument(struct argparse *ap) /* Go to the next argument. Also, move the current argument to its * final location in argv. */ -static void -next_argument(struct argparse *ap) -{ +static void next_argument(struct argparse *ap) { if (ap->argn >= ap->argc) { fprintf(stderr, "next_argument\n"); } @@ -118,33 +110,23 @@ next_argument(struct argparse *ap) ap->argn++; } -static int -is_end_of_arguments(struct argparse *ap) -{ +static int is_end_of_arguments(struct argparse *ap) { return ap->argn == ap->argc; } -static char * -get_argument(struct argparse *ap) -{ - return *ap->argv_get; -} +static char *get_argument(struct argparse *ap) { return *ap->argv_get; } -static char * -consume_argument(struct argparse *ap) -{ +static char *consume_argument(struct argparse *ap) { char *ret = get_argument(ap); delete_argument(ap); return ret; } -struct pb_Parameters * -pb_ReadParameters(int *_argc, char **argv) -{ +struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv) { char *err_message; struct argparse ap; struct pb_Parameters *ret = - (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); + (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); /* Initialize the parameters structure */ ret->outFile = NULL; @@ -153,59 
+135,54 @@ pb_ReadParameters(int *_argc, char **argv) /* Each argument */ initialize_argparse(&ap, *_argc, argv); - while(!is_end_of_arguments(&ap)) { + while (!is_end_of_arguments(&ap)) { char *arg = get_argument(&ap); /* Single-character flag */ if ((arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0)) { - delete_argument(&ap); /* This argument is consumed here */ - - switch(arg[1]) { - case 'o': /* Output file name */ - if (is_end_of_arguments(&ap)) - { - err_message = "Expecting file name after '-o'\n"; - goto error; - } - free(ret->outFile); - ret->outFile = strdup(consume_argument(&ap)); - break; - case 'i': /* Input file name */ - if (is_end_of_arguments(&ap)) - { - err_message = "Expecting file name after '-i'\n"; - goto error; - } - ret->inpFiles = read_string_array(consume_argument(&ap)); - break; - case '-': /* End of options */ - goto end_of_options; + delete_argument(&ap); /* This argument is consumed here */ + + switch (arg[1]) { + case 'o': /* Output file name */ + if (is_end_of_arguments(&ap)) { + err_message = "Expecting file name after '-o'\n"; + goto error; + } + free(ret->outFile); + ret->outFile = strdup(consume_argument(&ap)); + break; + case 'i': /* Input file name */ + if (is_end_of_arguments(&ap)) { + err_message = "Expecting file name after '-i'\n"; + goto error; + } + ret->inpFiles = read_string_array(consume_argument(&ap)); + break; + case '-': /* End of options */ + goto end_of_options; default: - err_message = "Unexpected command-line parameter\n"; - goto error; + err_message = "Unexpected command-line parameter\n"; + goto error; } - } - else { + } else { /* Other parameters are ignored */ next_argument(&ap); } } /* end for each argument */ - end_of_options: - *_argc = ap.argc; /* Save the modified argc value */ +end_of_options: + *_argc = ap.argc; /* Save the modified argc value */ finalize_argparse(&ap); return ret; - error: +error: fputs(err_message, stderr); pb_FreeParameters(ret); return NULL; } -void -pb_FreeParameters(struct 
pb_Parameters *p) -{ +void pb_FreeParameters(struct pb_Parameters *p) { char **cpp; free(p->outFile); @@ -213,56 +190,47 @@ pb_FreeParameters(struct pb_Parameters *p) free(p); } -int -pb_Parameters_CountInputs(struct pb_Parameters *p) -{ +int pb_Parameters_CountInputs(struct pb_Parameters *p) { int n; - for (n = 0; p->inpFiles[n]; n++); + for (n = 0; p->inpFiles[n]; n++) + ; return n; } /*****************************************************************************/ /* Timer routines */ -static void -accumulate_time(pb_Timestamp *accum, - pb_Timestamp start, - pb_Timestamp end) -{ +static void accumulate_time(pb_Timestamp *accum, pb_Timestamp start, + pb_Timestamp end) { #if _POSIX_VERSION >= 200112L *accum += end - start; #else -# error "Timestamps not implemented for this system" +#error "Timestamps not implemented for this system" #endif } #if _POSIX_VERSION >= 200112L -static pb_Timestamp get_time() -{ +static pb_Timestamp get_time() { struct timespec tv; clock_gettime(CLOCK_MONOTONIC, &tv); - return (pb_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec); + return (pb_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); } #else -# error "no supported time libraries are available on this platform" +#error "no supported time libraries are available on this platform" #endif -void -pb_ResetTimer(struct pb_Timer *timer) -{ +void pb_ResetTimer(struct pb_Timer *timer) { timer->state = pb_Timer_STOPPED; #if _POSIX_VERSION >= 200112L timer->elapsed = 0; #else -# error "pb_ResetTimer: not implemented for this system" +#error "pb_ResetTimer: not implemented for this system" #endif } -void -pb_StartTimer(struct pb_Timer *timer) -{ +void pb_StartTimer(struct pb_Timer *timer) { if (timer->state != pb_Timer_STOPPED) { fputs("Ignoring attempt to start a running timer\n", stderr); return; @@ -277,13 +245,12 @@ pb_StartTimer(struct pb_Timer *timer) timer->init = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "pb_StartTimer: not implemented for this system" +#error "pb_StartTimer: not 
implemented for this system" #endif } -void -pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) -{ +void pb_StartTimerAndSubTimer(struct pb_Timer *timer, + struct pb_Timer *subtimer) { unsigned int numNotStopped = 0x3; // 11 if (timer->state != pb_Timer_STOPPED) { fputs("Warning: Timer was not stopped\n", stderr); @@ -305,24 +272,21 @@ pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { struct timespec tv; clock_gettime(CLOCK_MONOTONIC, &tv); - + if (numNotStopped & 0x2) { timer->init = tv.tv_sec * BILLION + tv.tv_nsec; } - + if (numNotStopped & 0x1) { subtimer->init = tv.tv_sec * BILLION + tv.tv_nsec; } } #else -# error "pb_StartTimer: not implemented for this system" +#error "pb_StartTimer: not implemented for this system" #endif - } -void -pb_StopTimer(struct pb_Timer *timer) -{ +void pb_StopTimer(struct pb_Timer *timer) { pb_Timestamp fini; @@ -340,15 +304,15 @@ pb_StopTimer(struct pb_Timer *timer) fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "pb_StopTimer: not implemented for this system" +#error "pb_StopTimer: not implemented for this system" #endif accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; - } -void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, + struct pb_Timer *subtimer) { pb_Timestamp fini; @@ -366,7 +330,6 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) return; } - timer->state = pb_Timer_STOPPED; subtimer->state = pb_Timer_STOPPED; @@ -377,25 +340,22 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "pb_StopTimer: not implemented for this system" +#error "pb_StopTimer: not implemented for this system" #endif if (numNotRunning & 0x2) { accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; } - + if (numNotRunning & 0x1) { 
accumulate_time(&subtimer->elapsed, subtimer->init, fini); subtimer->init = fini; } - } /* Get the elapsed time in seconds. */ -double -pb_GetElapsedTime(struct pb_Timer *timer) -{ +double pb_GetElapsedTime(struct pb_Timer *timer) { double ret; if (timer->state != pb_Timer_STOPPED) { @@ -405,22 +365,19 @@ pb_GetElapsedTime(struct pb_Timer *timer) #if _POSIX_VERSION >= 200112L ret = timer->elapsed / 1e9; #else -# error "pb_GetElapsedTime: not implemented for this system" +#error "pb_GetElapsedTime: not implemented for this system" #endif return ret; } -void -pb_InitializeTimerSet(struct pb_TimerSet *timers) -{ +void pb_InitializeTimerSet(struct pb_TimerSet *timers) { int n; - + timers->wall_begin = get_time(); timers->current = pb_TimerID_NONE; timers->async_markers = NULL; - for (n = 0; n < pb_TimerID_LAST; n++) { pb_ResetTimer(&timers->timers[n]); @@ -428,24 +385,24 @@ pb_InitializeTimerSet(struct pb_TimerSet *timers) } } -void -pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { - - struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc - (sizeof(struct pb_SubTimer)); - +void pb_AddSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID pb_Category) { + + struct pb_SubTimer *subtimer = + (struct pb_SubTimer *)malloc(sizeof(struct pb_SubTimer)); + int len = strlen(label); - - subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + + subtimer->label = (char *)malloc(sizeof(char) * (len + 1)); sprintf(subtimer->label, "%s\0", label); - + pb_ResetTimer(&subtimer->timer); subtimer->next = NULL; - + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; if (subtimerlist == NULL) { - subtimerlist = (struct pb_SubTimerList *) malloc - (sizeof(struct pb_SubTimerList)); + subtimerlist = + (struct pb_SubTimerList *)malloc(sizeof(struct pb_SubTimerList)); subtimerlist->subtimer_list = subtimer; timers->sub_timer_list[pb_Category] = subtimerlist; } else { @@ -456,28 +413,30 @@ pb_AddSubTimer(struct 
pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ } element->next = subtimer; } - } -void -pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) -{ +void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID category) { + + // switchToSub( NULL, NONE + // switchToSub( NULL, some + // switchToSub( some, some + // switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which + // won't be printed -// switchToSub( NULL, NONE -// switchToSub( NULL, some -// switchToSub( some, some -// switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which won't be printed - struct pb_Timer *topLevelToStop = NULL; if (timers->current != category && timers->current != pb_TimerID_NONE) { - // Switching to subtimer in a different category needs to stop the top-level current, different categoried timer. - // NONE shouldn't have a timer associated with it, so exclude from branch + // Switching to subtimer in a different category needs to stop the top-level + // current, different categoried timer. NONE shouldn't have a timer + // associated with it, so exclude from branch topLevelToStop = &timers->timers[timers->current]; - } + } + + struct pb_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = + (subtimerlist == NULL) ? NULL : subtimerlist->current; - struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct pb_SubTimer *curr = (subtimerlist == NULL) ? 
NULL : subtimerlist->current; - if (timers->current != pb_TimerID_NONE) { if (curr != NULL && topLevelToStop != NULL) { pb_StopTimerAndSubTimer(topLevelToStop, &curr->timer); @@ -487,11 +446,11 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat pb_StopTimer(topLevelToStop); } } - + subtimerlist = timers->sub_timer_list[category]; struct pb_SubTimer *subtimer = NULL; - - if (label != NULL) { + + if (label != NULL) { subtimer = subtimerlist->subtimer_list; while (subtimer != NULL) { if (strcmp(subtimer->label, label) == 0) { @@ -500,48 +459,47 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat subtimer = subtimer->next; } } - } - + } + if (category != pb_TimerID_NONE) { - + if (subtimerlist != NULL) { subtimerlist->current = subtimer; } - + if (category != timers->current && subtimer != NULL) { pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); } else if (subtimer != NULL) { // Same category, different non-NULL subtimer pb_StartTimer(&subtimer->timer); - } else{ - // Different category, but no subtimer (not found or specified as NULL) -- unprefered way of setting topLevel timer + } else { + // Different category, but no subtimer (not found or specified as NULL) -- + // unprefered way of setting topLevel timer pb_StartTimer(&timers->timers[category]); } - } - + } + timers->current = category; - } -void -pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) -{ - if(timer == pb_TimerID_KERNEL) +void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) { + if (timer == pb_TimerID_KERNEL) printf("In parboil.c\n"); /* Stop the currently running timer */ if (timers->current != pb_TimerID_NONE) { struct pb_SubTimer *currSubTimer = NULL; - struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - - if ( subtimerlist != NULL) { + struct pb_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + + if (subtimerlist != NULL) 
{ currSubTimer = timers->sub_timer_list[timers->current]->current; } - if ( currSubTimer!= NULL) { - pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + if (currSubTimer != NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], + &currSubTimer->timer); } else { pb_StopTimer(&timers->timers[timers->current]); } - } timers->current = timer; @@ -551,40 +509,39 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) } } -void -pb_PrintTimerSet(struct pb_TimerSet *timers) -{ +void pb_PrintTimerSet(struct pb_TimerSet *timers) { printf("Printing Parboil Timer: Default\n"); pb_Timestamp wall_end = get_time(); struct pb_Timer *t = timers->timers; - struct pb_SubTimer* sub = NULL; - + struct pb_SubTimer *sub = NULL; + int maxSubLength; - -// const char *categories[] = { -// "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" -// }; - const char *categories[] = { - "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap", - "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", - "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc", - "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack" - }; + // const char *categories[] = { + // "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + // }; + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", + "Copy Async", "Compute", "Overlap", "Init_Ctx", + "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", + "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", + "Misc", "Pthread_Create", "Arg_Pack", "Arg_Unpack", + "Computation", "Output_Pack", "Output_Unpack"}; - const int maxCategoryLength = 20; - + int i; - for(i = 1; i < pb_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format - if(pb_GetElapsedTime(&t[i]) != 0 || true) { - + for (i = 1; i < pb_TimerID_LAST; + ++i) { // exclude NONE and OVRELAP from this format + if (pb_GetElapsedTime(&t[i]) != 0 || true) { + // Print Category 
Timer - printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); - + printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1], + pb_GetElapsedTime(&t[i])); + if (timers->sub_timer_list[i] != NULL) { sub = timers->sub_timer_list[i]->subtimer_list; maxSubLength = 0; @@ -595,44 +552,44 @@ pb_PrintTimerSet(struct pb_TimerSet *timers) } sub = sub->next; } - + // Fit to Categories if (maxSubLength <= maxCategoryLength) { - maxSubLength = maxCategoryLength; + maxSubLength = maxCategoryLength; } - + sub = timers->sub_timer_list[i]->subtimer_list; - + // Print SubTimers while (sub != NULL) { - printf(" -%-*s: %.9f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + printf(" -%-*s: %.9f\n", maxSubLength, sub->label, + pb_GetElapsedTime(&sub->timer)); sub = sub->next; } } } } - - if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) - printf("CPU/Kernel Overlap: %.9f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); - - float walltime = (wall_end - timers->wall_begin)/ 1e9; - printf("Timer Wall Time: %.9f\n", walltime); - + + if (pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %.9f\n", + pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin) / 1e9; + printf("Timer Wall Time: %.9f\n", walltime); } -void pb_DestroyTimerSet(struct pb_TimerSet * timers) -{ +void pb_DestroyTimerSet(struct pb_TimerSet *timers) { /* clean up all of the async event markers */ - struct pb_async_time_marker_list ** event = &(timers->async_markers); - while( *event != NULL) { - struct pb_async_time_marker_list ** next = &((*event)->next); + struct pb_async_time_marker_list **event = &(timers->async_markers); + while (*event != NULL) { + struct pb_async_time_marker_list **next = &((*event)->next); free(*event); (*event) = NULL; event = next; } - + int i = 0; - for(i = 0; i < pb_TimerID_LAST; ++i) { + for (i = 0; i < pb_TimerID_LAST; ++i) { if (timers->sub_timer_list[i] != NULL) { struct 
pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; struct pb_SubTimer *prev = NULL; @@ -646,5 +603,3 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers) } } } - - diff --git a/hpvm/test/parboil/common/src/parboil_cuda.c b/hpvm/test/parboil/common/src/parboil_cuda.c index d1bf554cc3219e20ce4bc0e76c6acfdd0091a9a7..9fd64661643c9afec5cb470beaa516d545017bd3 100644 --- a/hpvm/test/parboil/common/src/parboil_cuda.c +++ b/hpvm/test/parboil/common/src/parboil_cuda.c @@ -3,9 +3,9 @@ */ #include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stdio.h> #ifndef __MCUDA__ #include <cuda_runtime_api.h> #else @@ -13,35 +13,35 @@ #endif #if _POSIX_VERSION >= 200112L -# include <sys/time.h> +#include <sys/time.h> #endif #define true 1 /* Free an array of owned strings. */ -static void -free_string_array(char **string_array) -{ +static void free_string_array(char **string_array) { char **p; - if (!string_array) return; - for (p = string_array; *p; p++) free(*p); + if (!string_array) + return; + for (p = string_array; *p; p++) + free(*p); free(string_array); } /* Parse a comma-delimited list of strings into an * array of strings. 
*/ -static char ** -read_string_array(char *in) -{ +static char **read_string_array(char *in) { char **ret; int i; - int count; /* Number of items in the input */ - char *substring; /* Current substring within 'in' */ + int count; /* Number of items in the input */ + char *substring; /* Current substring within 'in' */ /* Count the number of items in the string */ count = 1; - for (i = 0; in[i]; i++) if (in[i] == ',') count++; + for (i = 0; in[i]; i++) + if (in[i] == ',') + count++; /* Allocate storage */ ret = (char **)malloc((count + 1) * sizeof(char *)); @@ -54,8 +54,8 @@ read_string_array(char *in) /* Find length of substring */ for (substring_end = substring; - (*substring_end != ',') && (*substring_end != 0); - substring_end++); + (*substring_end != ',') && (*substring_end != 0); substring_end++) + ; substring_length = substring_end - substring; @@ -67,41 +67,35 @@ read_string_array(char *in) /* go to next substring */ substring = substring_end + 1; } - ret[i] = NULL; /* Write the sentinel value */ + ret[i] = NULL; /* Write the sentinel value */ return ret; } struct argparse { - int argc; /* Number of arguments. Mutable. */ - char **argv; /* Argument values. Immutable. */ + int argc; /* Number of arguments. Mutable. */ + char **argv; /* Argument values. Immutable. */ - int argn; /* Current argument number. */ - char **argv_get; /* Argument value being read. */ - char **argv_put; /* Argument value being written. - * argv_put <= argv_get. */ + int argn; /* Current argument number. */ + char **argv_get; /* Argument value being read. */ + char **argv_put; /* Argument value being written. + * argv_put <= argv_get. 
*/ }; -static void -initialize_argparse(struct argparse *ap, int argc, char **argv) -{ +static void initialize_argparse(struct argparse *ap, int argc, char **argv) { ap->argc = argc; ap->argn = 0; ap->argv_get = ap->argv_put = ap->argv = argv; } -static void -finalize_argparse(struct argparse *ap) -{ +static void finalize_argparse(struct argparse *ap) { /* Move the remaining arguments */ - for(; ap->argn < ap->argc; ap->argn++) + for (; ap->argn < ap->argc; ap->argn++) *ap->argv_put++ = *ap->argv_get++; } /* Delete the current argument. */ -static void -delete_argument(struct argparse *ap) -{ +static void delete_argument(struct argparse *ap) { if (ap->argn >= ap->argc) { fprintf(stderr, "delete_argument\n"); } @@ -111,9 +105,7 @@ delete_argument(struct argparse *ap) /* Go to the next argument. Also, move the current argument to its * final location in argv. */ -static void -next_argument(struct argparse *ap) -{ +static void next_argument(struct argparse *ap) { if (ap->argn >= ap->argc) { fprintf(stderr, "next_argument\n"); } @@ -122,33 +114,23 @@ next_argument(struct argparse *ap) ap->argn++; } -static int -is_end_of_arguments(struct argparse *ap) -{ +static int is_end_of_arguments(struct argparse *ap) { return ap->argn == ap->argc; } -static char * -get_argument(struct argparse *ap) -{ - return *ap->argv_get; -} +static char *get_argument(struct argparse *ap) { return *ap->argv_get; } -static char * -consume_argument(struct argparse *ap) -{ +static char *consume_argument(struct argparse *ap) { char *ret = get_argument(ap); delete_argument(ap); return ret; } -struct pb_Parameters * -pb_ReadParameters(int *_argc, char **argv) -{ +struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv) { char *err_message; struct argparse ap; struct pb_Parameters *ret = - (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); + (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); /* Initialize the parameters structure */ ret->outFile = NULL; @@ -157,59 
+139,54 @@ pb_ReadParameters(int *_argc, char **argv) /* Each argument */ initialize_argparse(&ap, *_argc, argv); - while(!is_end_of_arguments(&ap)) { + while (!is_end_of_arguments(&ap)) { char *arg = get_argument(&ap); /* Single-character flag */ if ((arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0)) { - delete_argument(&ap); /* This argument is consumed here */ - - switch(arg[1]) { - case 'o': /* Output file name */ - if (is_end_of_arguments(&ap)) - { - err_message = "Expecting file name after '-o'\n"; - goto error; - } - free(ret->outFile); - ret->outFile = strdup(consume_argument(&ap)); - break; - case 'i': /* Input file name */ - if (is_end_of_arguments(&ap)) - { - err_message = "Expecting file name after '-i'\n"; - goto error; - } - ret->inpFiles = read_string_array(consume_argument(&ap)); - break; - case '-': /* End of options */ - goto end_of_options; + delete_argument(&ap); /* This argument is consumed here */ + + switch (arg[1]) { + case 'o': /* Output file name */ + if (is_end_of_arguments(&ap)) { + err_message = "Expecting file name after '-o'\n"; + goto error; + } + free(ret->outFile); + ret->outFile = strdup(consume_argument(&ap)); + break; + case 'i': /* Input file name */ + if (is_end_of_arguments(&ap)) { + err_message = "Expecting file name after '-i'\n"; + goto error; + } + ret->inpFiles = read_string_array(consume_argument(&ap)); + break; + case '-': /* End of options */ + goto end_of_options; default: - err_message = "Unexpected command-line parameter\n"; - goto error; + err_message = "Unexpected command-line parameter\n"; + goto error; } - } - else { + } else { /* Other parameters are ignored */ next_argument(&ap); } } /* end for each argument */ - end_of_options: - *_argc = ap.argc; /* Save the modified argc value */ +end_of_options: + *_argc = ap.argc; /* Save the modified argc value */ finalize_argparse(&ap); return ret; - error: +error: fputs(err_message, stderr); pb_FreeParameters(ret); return NULL; } -void -pb_FreeParameters(struct 
pb_Parameters *p) -{ +void pb_FreeParameters(struct pb_Parameters *p) { char **cpp; free(p->outFile); @@ -217,61 +194,54 @@ pb_FreeParameters(struct pb_Parameters *p) free(p); } -int -pb_Parameters_CountInputs(struct pb_Parameters *p) -{ +int pb_Parameters_CountInputs(struct pb_Parameters *p) { int n; - for (n = 0; p->inpFiles[n]; n++); + for (n = 0; p->inpFiles[n]; n++) + ; return n; } /*****************************************************************************/ /* Timer routines */ -static int is_async(enum pb_TimerID timer) -{ - return (timer == pb_TimerID_KERNEL) || - (timer == pb_TimerID_COPY_ASYNC); +static int is_async(enum pb_TimerID timer) { + return (timer == pb_TimerID_KERNEL) || (timer == pb_TimerID_COPY_ASYNC); } -static int is_blocking(enum pb_TimerID timer) -{ +static int is_blocking(enum pb_TimerID timer) { return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE); } #define INVALID_TIMERID pb_TimerID_LAST -static int asyncs_outstanding(struct pb_TimerSet* timers) -{ - return (timers->async_markers != NULL) && - (timers->async_markers->timerID != INVALID_TIMERID); +static int asyncs_outstanding(struct pb_TimerSet *timers) { + return (timers->async_markers != NULL) && + (timers->async_markers->timerID != INVALID_TIMERID); } -static struct pb_async_time_marker_list * -get_last_async(struct pb_TimerSet* timers) -{ +static struct pb_async_time_marker_list * +get_last_async(struct pb_TimerSet *timers) { /* Find the last event recorded thus far */ - struct pb_async_time_marker_list * last_event = timers->async_markers; - if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { - while(last_event->next != NULL && - last_event->next->timerID != INVALID_TIMERID) + struct pb_async_time_marker_list *last_event = timers->async_markers; + if (last_event != NULL && last_event->timerID != INVALID_TIMERID) { + while (last_event->next != NULL && + last_event->next->timerID != INVALID_TIMERID) last_event = last_event->next; return last_event; } else 
return NULL; -} +} -static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer) -{ - struct pb_async_time_marker_list ** new_event = &(tset->async_markers); +static void insert_marker(struct pb_TimerSet *tset, enum pb_TimerID timer) { + struct pb_async_time_marker_list **new_event = &(tset->async_markers); - while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) + while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) new_event = &((*new_event)->next); - if(*new_event == NULL) { - *new_event = (struct pb_async_time_marker_list *) - malloc(sizeof(struct pb_async_time_marker_list)); + if (*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *)malloc( + sizeof(struct pb_async_time_marker_list)); (*new_event)->marker = malloc(sizeof(cudaEvent_t)); cudaEventCreate((*new_event)->marker); (*new_event)->next = NULL; @@ -281,19 +251,18 @@ static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer) (*new_event)->label = NULL; (*new_event)->timerID = timer; cudaEventRecord(*((cudaEvent_t *)((*new_event)->marker)), 0); - } -static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer) -{ - struct pb_async_time_marker_list ** new_event = &(tset->async_markers); +static void insert_submarker(struct pb_TimerSet *tset, char *label, + enum pb_TimerID timer) { + struct pb_async_time_marker_list **new_event = &(tset->async_markers); - while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) + while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) new_event = &((*new_event)->next); - if(*new_event == NULL) { - *new_event = (struct pb_async_time_marker_list *) - malloc(sizeof(struct pb_async_time_marker_list)); + if (*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *)malloc( + sizeof(struct pb_async_time_marker_list)); (*new_event)->marker = malloc(sizeof(cudaEvent_t)); cudaEventCreate((*new_event)->marker); @@ -304,84 +273,73 @@ 
static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_Time (*new_event)->label = label; (*new_event)->timerID = timer; cudaEventRecord(*((cudaEvent_t *)((*new_event)->marker)), 0); - } - /* Assumes that all recorded events have completed */ -static pb_Timestamp record_async_times(struct pb_TimerSet* tset) -{ - struct pb_async_time_marker_list * next_interval = NULL; - struct pb_async_time_marker_list * last_marker = get_last_async(tset); +static pb_Timestamp record_async_times(struct pb_TimerSet *tset) { + struct pb_async_time_marker_list *next_interval = NULL; + struct pb_async_time_marker_list *last_marker = get_last_async(tset); pb_Timestamp total_async_time = 0; enum pb_TimerID timer; - for(next_interval = tset->async_markers; next_interval != last_marker; - next_interval = next_interval->next) { + for (next_interval = tset->async_markers; next_interval != last_marker; + next_interval = next_interval->next) { float interval_time_ms; - cudaEventElapsedTime(&interval_time_ms, *((cudaEvent_t *)next_interval->marker), - *((cudaEvent_t *)next_interval->next->marker)); - pb_Timestamp interval = (pb_Timestamp) (interval_time_ms * 1e3); + cudaEventElapsedTime(&interval_time_ms, + *((cudaEvent_t *)next_interval->marker), + *((cudaEvent_t *)next_interval->next->marker)); + pb_Timestamp interval = (pb_Timestamp)(interval_time_ms * 1e3); tset->timers[next_interval->timerID].elapsed += interval; if (next_interval->label != NULL) { - struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; + struct pb_SubTimer *subtimer = + tset->sub_timer_list[next_interval->timerID]->subtimer_list; while (subtimer != NULL) { - if ( strcmp(subtimer->label, next_interval->label) == 0) { + if (strcmp(subtimer->label, next_interval->label) == 0) { subtimer->timer.elapsed += interval; break; } subtimer = subtimer->next; - } - } + } + } total_async_time += interval; next_interval->timerID = INVALID_TIMERID; } - if(next_interval != NULL) + 
if (next_interval != NULL) next_interval->timerID = INVALID_TIMERID; - - return total_async_time; } -static void -accumulate_time(pb_Timestamp *accum, - pb_Timestamp start, - pb_Timestamp end) -{ +static void accumulate_time(pb_Timestamp *accum, pb_Timestamp start, + pb_Timestamp end) { #if _POSIX_VERSION >= 200112L *accum += end - start; #else -# error "Timestamps not implemented for this system" +#error "Timestamps not implemented for this system" #endif } #if _POSIX_VERSION >= 200112L -static pb_Timestamp get_time() -{ +static pb_Timestamp get_time() { struct timeval tv; gettimeofday(&tv, NULL); - return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec); + return (pb_Timestamp)(tv.tv_sec * 1000000LL + tv.tv_usec); } #else -# error "no supported time libraries are available on this platform" +#error "no supported time libraries are available on this platform" #endif -void -pb_ResetTimer(struct pb_Timer *timer) -{ +void pb_ResetTimer(struct pb_Timer *timer) { timer->state = pb_Timer_STOPPED; #if _POSIX_VERSION >= 200112L timer->elapsed = 0; #else -# error "pb_ResetTimer: not implemented for this system" +#error "pb_ResetTimer: not implemented for this system" #endif } -void -pb_StartTimer(struct pb_Timer *timer) -{ +void pb_StartTimer(struct pb_Timer *timer) { if (timer->state != pb_Timer_STOPPED) { fputs("Ignoring attempt to start a running timer\n", stderr); return; @@ -396,13 +354,12 @@ pb_StartTimer(struct pb_Timer *timer) timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; } #else -# error "pb_StartTimer: not implemented for this system" +#error "pb_StartTimer: not implemented for this system" #endif } -void -pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) -{ +void pb_StartTimerAndSubTimer(struct pb_Timer *timer, + struct pb_Timer *subtimer) { unsigned int numNotStopped = 0x3; // 11 if (timer->state != pb_Timer_STOPPED) { @@ -425,24 +382,21 @@ pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { struct 
timeval tv; gettimeofday(&tv, NULL); - + if (numNotStopped & 0x2) { timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; } - + if (numNotStopped & 0x1) { subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec; } } #else -# error "pb_StartTimer: not implemented for this system" +#error "pb_StartTimer: not implemented for this system" #endif - } -void -pb_StopTimer(struct pb_Timer *timer) -{ +void pb_StopTimer(struct pb_Timer *timer) { pb_Timestamp fini; if (timer->state != pb_Timer_RUNNING) { @@ -459,14 +413,15 @@ pb_StopTimer(struct pb_Timer *timer) fini = tv.tv_sec * 1000000LL + tv.tv_usec; } #else -# error "pb_StopTimer: not implemented for this system" +#error "pb_StopTimer: not implemented for this system" #endif accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; } -void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, + struct pb_Timer *subtimer) { pb_Timestamp fini; @@ -484,7 +439,6 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) return; } - timer->state = pb_Timer_STOPPED; subtimer->state = pb_Timer_STOPPED; @@ -495,25 +449,22 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) fini = tv.tv_sec * 1000000LL + tv.tv_usec; } #else -# error "pb_StopTimer: not implemented for this system" +#error "pb_StopTimer: not implemented for this system" #endif if (numNotRunning & 0x2) { accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; } - + if (numNotRunning & 0x1) { accumulate_time(&subtimer->elapsed, subtimer->init, fini); subtimer->init = fini; } - } /* Get the elapsed time in seconds. 
*/ -double -pb_GetElapsedTime(struct pb_Timer *timer) -{ +double pb_GetElapsedTime(struct pb_Timer *timer) { double ret; if (timer->state != pb_Timer_STOPPED) { @@ -523,14 +474,12 @@ pb_GetElapsedTime(struct pb_Timer *timer) #if _POSIX_VERSION >= 200112L ret = timer->elapsed / 1e6; #else -# error "pb_GetElapsedTime: not implemented for this system" +#error "pb_GetElapsedTime: not implemented for this system" #endif return ret; } -void -pb_InitializeTimerSet(struct pb_TimerSet *timers) -{ +void pb_InitializeTimerSet(struct pb_TimerSet *timers) { int n; timers->wall_begin = get_time(); @@ -544,29 +493,26 @@ pb_InitializeTimerSet(struct pb_TimerSet *timers) } } -void -pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr) { +void pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr) {} -} +void pb_AddSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID pb_Category) { + + struct pb_SubTimer *subtimer = + (struct pb_SubTimer *)malloc(sizeof(struct pb_SubTimer)); -void -pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { - - struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc - (sizeof(struct pb_SubTimer)); - int len = strlen(label); - - subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + + subtimer->label = (char *)malloc(sizeof(char) * (len + 1)); sprintf(subtimer->label, "%s\0", label); - + pb_ResetTimer(&subtimer->timer); subtimer->next = NULL; - + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; if (subtimerlist == NULL) { - subtimerlist = (struct pb_SubTimerList *) malloc - (sizeof(struct pb_SubTimerList)); + subtimerlist = + (struct pb_SubTimerList *)malloc(sizeof(struct pb_SubTimerList)); subtimerlist->subtimer_list = subtimer; timers->sub_timer_list[pb_Category] = subtimerlist; } else { @@ -577,21 +523,21 @@ pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ } element->next = subtimer; } - } -void -pb_SwitchToTimer(struct pb_TimerSet 
*timers, enum pb_TimerID timer) -{ +void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) { /* Stop the currently running timer */ if (timers->current != pb_TimerID_NONE) { - struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; - - if (!is_async(timers->current) ) { + struct pb_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + struct pb_SubTimer *currSubTimer = + (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (!is_async(timers->current)) { if (timers->current != timer) { if (currSubTimer != NULL) { - pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + pb_StopTimerAndSubTimer(&timers->timers[timers->current], + &currSubTimer->timer); } else { pb_StopTimer(&timers->timers[timers->current]); } @@ -607,67 +553,68 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) } } } - + pb_Timestamp currentTime = get_time(); - /* The only cases we check for asynchronous task completion is - * when an overlapping CPU operation completes, or the next + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(timer) ) ) { + if (asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(timer))) { - struct pb_async_time_marker_list * last_event = get_last_async(timers); + struct pb_async_time_marker_list *last_event = get_last_async(timers); /* cudaSuccess if completed */ - cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker)); + cudaError_t async_done = + cudaEventQuery(*((cudaEvent_t *)last_event->marker)); - if(is_blocking(timer)) { - /* Async operations completed after previous CPU operations: - * overlapped time 
is the total CPU time since this set of async + if (is_blocking(timer)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async * operations were first issued */ - - // timer to switch to is COPY or NONE - if(async_done != cudaSuccess) - accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); + + // timer to switch to is COPY or NONE + if (async_done != cudaSuccess) + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin, currentTime); /* Wait on async operation completion */ cudaEventSynchronize(*((cudaEvent_t *)last_event->marker)); pb_Timestamp total_async_time = record_async_times(timers); - /* Async operations completed before previous CPU operations: + /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - if(async_done == cudaSuccess) + if (async_done == cudaSuccess) timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; - } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == cudaSuccess) { - /* Async operations completed before previous CPU operations: + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. 
Current Not Async (not KERNEL/COPY_ASYNC) but there are + // outstanding so something is deeper in stack + if (async_done == cudaSuccess) { + /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); - } + } } /* Start the new timer */ if (timer != pb_TimerID_NONE) { - if(!is_async(timer)) { + if (!is_async(timer)) { pb_StartTimer(&timers->timers[timer]); } else { // toSwitchTo Is Async (KERNEL/COPY_ASYNC) if (!asyncs_outstanding(timers)) { /* No asyncs outstanding, insert a fresh async marker */ - + insert_marker(timers, timer); timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { + } else if (!is_async(timers->current)) { /* Previous asyncs still in flight, but a previous SwitchTo - * already marked the end of the most recent async operation, - * so we can rename that marker as the beginning of this async + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async * operation */ - - struct pb_async_time_marker_list * last_event = get_last_async(timers); + + struct pb_async_time_marker_list *last_event = get_last_async(timers); last_event->label = NULL; last_event->timerID = timer; } @@ -677,20 +624,21 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) } } timers->current = timer; - } -void -pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) -{ - struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; - +void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID category) { + struct pb_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = + (subtimerlist != NULL) ? 
subtimerlist->current : NULL; + if (timers->current != pb_TimerID_NONE) { - if (!is_async(timers->current) ) { + if (!is_async(timers->current)) { if (timers->current != category) { if (curr != NULL) { - pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); + pb_StopTimerAndSubTimer(&timers->timers[timers->current], + &curr->timer); } else { pb_StopTimer(&timers->timers[timers->current]); } @@ -709,56 +657,59 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat pb_Timestamp currentTime = get_time(); - /* The only cases we check for asynchronous task completion is - * when an overlapping CPU operation completes, or the next + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(category) ) ) { + if (asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(category))) { - struct pb_async_time_marker_list * last_event = get_last_async(timers); + struct pb_async_time_marker_list *last_event = get_last_async(timers); /* cudaSuccess if completed */ - cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker)); + cudaError_t async_done = + cudaEventQuery(*((cudaEvent_t *)last_event->marker)); - if(is_blocking(category)) { - /* Async operations completed after previous CPU operations: - * overlapped time is the total CPU time since this set of async + if (is_blocking(category)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async * operations were first issued */ - - // timer to switch to is COPY or NONE - // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP - // anything happening after now isn't OVERLAP because everything is being stopped to wait for 
synchronization - // it seems that the extra sync wall time isn't being recorded anywhere - if(async_done != cudaSuccess) - accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); + + // timer to switch to is COPY or NONE + // if it hasn't already finished, then just take now and use that as the + // elapsed time in OVERLAP anything happening after now isn't OVERLAP + // because everything is being stopped to wait for synchronization it + // seems that the extra sync wall time isn't being recorded anywhere + if (async_done != cudaSuccess) + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin, currentTime); /* Wait on async operation completion */ cudaEventSynchronize(*((cudaEvent_t *)last_event->marker)); pb_Timestamp total_async_time = record_async_times(timers); - /* Async operations completed before previous CPU operations: + /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - // If it did finish, then accumulate all the async time that did happen into OVERLAP - // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. - if(async_done == cudaSuccess) + // If it did finish, then accumulate all the async time that did happen + // into OVERLAP the immediately preceding EventSynchronize theoretically + // didn't have any effect since it was already completed. + if (async_done == cudaSuccess) timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; - } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == cudaSuccess) { - /* Async operations completed before previous CPU operations: + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. 
Current Not Async (not KERNEL/COPY_ASYNC) but there are + // outstanding so something is deeper in stack + if (async_done == cudaSuccess) { + /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); - } + } // else, this isn't blocking, so just check the next time around } - + subtimerlist = timers->sub_timer_list[category]; struct pb_SubTimer *subtimer = NULL; - - if (label != NULL) { + + if (label != NULL) { subtimer = subtimerlist->subtimer_list; while (subtimer != NULL) { if (strcmp(subtimer->label, label) == 0) { @@ -771,80 +722,81 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat /* Start the new timer */ if (category != pb_TimerID_NONE) { - if(!is_async(category)) { - + if (!is_async(category)) { + if (subtimerlist != NULL) { subtimerlist->current = subtimer; } - + if (category != timers->current && subtimer != NULL) { pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); } else if (subtimer != NULL) { pb_StartTimer(&subtimer->timer); } else { pb_StartTimer(&timers->timers[category]); - } + } } else { if (subtimerlist != NULL) { subtimerlist->current = subtimer; } - + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) if (!asyncs_outstanding(timers)) { /* No asyncs outstanding, insert a fresh async marker */ insert_submarker(timers, label, category); timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { + } else if (!is_async(timers->current)) { /* Previous asyncs still in flight, but a previous SwitchTo - * already marked the end of the most recent async operation, - * so we can rename that marker as the beginning of this async + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async * operation */ - - struct pb_async_time_marker_list * last_event = get_last_async(timers); + + struct 
pb_async_time_marker_list *last_event = get_last_async(timers); last_event->timerID = category; last_event->label = label; } // else, marker for switchToThis was already inserted - - //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running + + // toSwitchto is already asynchronous, but if current/prev state is async + // too, then DRIVER is already running if (!is_async(timers->current)) { pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); } } } - - timers->current = category; + + timers->current = category; } -void -pb_PrintTimerSet(struct pb_TimerSet *timers) -{ +void pb_PrintTimerSet(struct pb_TimerSet *timers) { pb_Timestamp wall_end = get_time(); struct pb_Timer *t = timers->timers; - struct pb_SubTimer* sub = NULL; - + struct pb_SubTimer *sub = NULL; + int maxSubLength; - -// const char *categories[] = { -// "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" -// }; - const char *categories[] = { - "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap", - "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", - "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc", - "Pthread_Create", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack" - }; + // const char *categories[] = { + // "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + // }; + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", + "Copy Async", "Compute", "Overlap", "Init_Ctx", + "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", + "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", + "Misc", "Pthread_Create", "Arg_Unpack", "Computation", + "Output_Pack", "Output_Unpack"}; const int maxCategoryLength = 10; - + int i; - for(i = 1; i < pb_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format - if(pb_GetElapsedTime(&t[i]) != 0 || true) { - + for (i = 1; i < pb_TimerID_LAST; + ++i) { // exclude NONE and OVRELAP from this format + if (pb_GetElapsedTime(&t[i]) != 0 || true) { + 
// Print Category Timer - printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); - + printf("%-*s: %f\n", maxCategoryLength, categories[i - 1], + pb_GetElapsedTime(&t[i])); + if (timers->sub_timer_list[i] != NULL) { sub = timers->sub_timer_list[i]->subtimer_list; maxSubLength = 0; @@ -855,47 +807,47 @@ pb_PrintTimerSet(struct pb_TimerSet *timers) } sub = sub->next; } - + // Fit to Categories if (maxSubLength <= maxCategoryLength) { - maxSubLength = maxCategoryLength; + maxSubLength = maxCategoryLength; } - + sub = timers->sub_timer_list[i]->subtimer_list; - + // Print SubTimers while (sub != NULL) { - printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + printf(" -%-*s: %f\n", maxSubLength, sub->label, + pb_GetElapsedTime(&sub->timer)); sub = sub->next; } } } } - - if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) - printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); - - float walltime = (wall_end - timers->wall_begin)/ 1e6; + + if (pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %f\n", + pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin) / 1e6; printf("Timer Wall Time: %f\n", walltime); - } -void pb_DestroyTimerSet(struct pb_TimerSet * timers) -{ +void pb_DestroyTimerSet(struct pb_TimerSet *timers) { /* clean up all of the async event markers */ - struct pb_async_time_marker_list ** event = &(timers->async_markers); - while( *event != NULL) { + struct pb_async_time_marker_list **event = &(timers->async_markers); + while (*event != NULL) { cudaEventSynchronize(*((cudaEvent_t *)(*event)->marker)); cudaEventDestroy(*((cudaEvent_t *)(*event)->marker)); free((*event)->marker); - struct pb_async_time_marker_list ** next = &((*event)->next); + struct pb_async_time_marker_list **next = &((*event)->next); free(*event); (*event) = NULL; event = next; } - + int i = 0; - for(i = 0; i < pb_TimerID_LAST; ++i) { + for 
(i = 0; i < pb_TimerID_LAST; ++i) { if (timers->sub_timer_list[i] != NULL) { struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; struct pb_SubTimer *prev = NULL; @@ -909,5 +861,3 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers) } } } - - diff --git a/hpvm/test/parboil/common/src/parboil_opencl.c b/hpvm/test/parboil/common/src/parboil_opencl.c index 5f1937f356892489bd78ed5b2fb238d886de2f9a..d493992acee859186d58330a9988ef7ef2571f73 100644 --- a/hpvm/test/parboil/common/src/parboil_opencl.c +++ b/hpvm/test/parboil/common/src/parboil_opencl.c @@ -2,47 +2,47 @@ * (c) 2007 The Board of Trustees of the University of Illinois. */ +#include <CL/cl.h> +#include <assert.h> #include <parboil.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stdio.h> -#include <assert.h> -#include <CL/cl.h> #if _POSIX_VERSION >= 200112L #include <time.h> #endif -#define BILLION 1000000000LL -#define true 1 +#define BILLION 1000000000LL +#define true 1 cl_context *clContextPtr; cl_command_queue *clCommandQueuePtr; /* Free an array of owned strings. */ -static void -free_string_array(char **string_array) -{ +static void free_string_array(char **string_array) { char **p; - if (!string_array) return; - for (p = string_array; *p; p++) free(*p); + if (!string_array) + return; + for (p = string_array; *p; p++) + free(*p); free(string_array); } /* Parse a comma-delimited list of strings into an * array of strings. 
*/ -static char ** -read_string_array(char *in) -{ +static char **read_string_array(char *in) { char **ret; int i; - int count; /* Number of items in the input */ - char *substring; /* Current substring within 'in' */ + int count; /* Number of items in the input */ + char *substring; /* Current substring within 'in' */ /* Count the number of items in the string */ count = 1; - for (i = 0; in[i]; i++) if (in[i] == ',') count++; + for (i = 0; in[i]; i++) + if (in[i] == ',') + count++; /* Allocate storage */ ret = (char **)malloc((count + 1) * sizeof(char *)); @@ -55,8 +55,8 @@ read_string_array(char *in) /* Find length of substring */ for (substring_end = substring; - (*substring_end != ',') && (*substring_end != 0); - substring_end++); + (*substring_end != ',') && (*substring_end != 0); substring_end++) + ; substring_length = substring_end - substring; @@ -68,43 +68,37 @@ read_string_array(char *in) /* go to next substring */ substring = substring_end + 1; } - ret[i] = NULL; /* Write the sentinel value */ + ret[i] = NULL; /* Write the sentinel value */ return ret; } struct argparse { - int argc; /* Number of arguments. Mutable. */ - char **argv; /* Argument values. Immutable. */ + int argc; /* Number of arguments. Mutable. */ + char **argv; /* Argument values. Immutable. */ - int argn; /* Current argument number. */ - char **argv_get; /* Argument value being read. */ - char **argv_put; /* Argument value being written. - * argv_put <= argv_get. */ + int argn; /* Current argument number. */ + char **argv_get; /* Argument value being read. */ + char **argv_put; /* Argument value being written. + * argv_put <= argv_get. 
*/ }; -static void -initialize_argparse(struct argparse *ap, int argc, char **argv) -{ +static void initialize_argparse(struct argparse *ap, int argc, char **argv) { ap->argc = argc; ap->argn = 0; ap->argv_get = ap->argv_put = ap->argv = argv; } -static void -finalize_argparse(struct argparse *ap) -{ +static void finalize_argparse(struct argparse *ap) { /* Move the remaining arguments */ - for(; ap->argn < ap->argc; ap->argn++) + for (; ap->argn < ap->argc; ap->argn++) *ap->argv_put++ = *ap->argv_get++; } /* Delete the current argument. */ -static void -delete_argument(struct argparse *ap) -{ +static void delete_argument(struct argparse *ap) { if (ap->argn >= ap->argc) { - //fprintf(stderr, "delete_argument\n"); + // fprintf(stderr, "delete_argument\n"); } ap->argc--; ap->argv_get++; @@ -112,44 +106,32 @@ delete_argument(struct argparse *ap) /* Go to the next argument. Also, move the current argument to its * final location in argv. */ -static void -next_argument(struct argparse *ap) -{ +static void next_argument(struct argparse *ap) { if (ap->argn >= ap->argc) { - //fprintf(stderr, "next_argument\n"); + // fprintf(stderr, "next_argument\n"); } /* Move argument to its new location. 
*/ *ap->argv_put++ = *ap->argv_get++; ap->argn++; } -static int -is_end_of_arguments(struct argparse *ap) -{ +static int is_end_of_arguments(struct argparse *ap) { return ap->argn == ap->argc; } -static char * -get_argument(struct argparse *ap) -{ - return *ap->argv_get; -} +static char *get_argument(struct argparse *ap) { return *ap->argv_get; } -static char * -consume_argument(struct argparse *ap) -{ +static char *consume_argument(struct argparse *ap) { char *ret = get_argument(ap); delete_argument(ap); return ret; } -struct pb_Parameters * -pb_ReadParameters(int *_argc, char **argv) -{ +struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv) { char *err_message; struct argparse ap; struct pb_Parameters *ret = - (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); + (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); /* Initialize the parameters structure */ ret->outFile = NULL; @@ -158,59 +140,54 @@ pb_ReadParameters(int *_argc, char **argv) /* Each argument */ initialize_argparse(&ap, *_argc, argv); - while(!is_end_of_arguments(&ap)) { + while (!is_end_of_arguments(&ap)) { char *arg = get_argument(&ap); /* Single-character flag */ if ((arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0)) { - delete_argument(&ap); /* This argument is consumed here */ - - switch(arg[1]) { - case 'o': /* Output file name */ - if (is_end_of_arguments(&ap)) - { - err_message = "Expecting file name after '-o'\n"; - goto error; - } - free(ret->outFile); - ret->outFile = strdup(consume_argument(&ap)); - break; - case 'i': /* Input file name */ - if (is_end_of_arguments(&ap)) - { - err_message = "Expecting file name after '-i'\n"; - goto error; - } - ret->inpFiles = read_string_array(consume_argument(&ap)); - break; - case '-': /* End of options */ - goto end_of_options; + delete_argument(&ap); /* This argument is consumed here */ + + switch (arg[1]) { + case 'o': /* Output file name */ + if (is_end_of_arguments(&ap)) { + err_message = "Expecting file name after 
'-o'\n"; + goto error; + } + free(ret->outFile); + ret->outFile = strdup(consume_argument(&ap)); + break; + case 'i': /* Input file name */ + if (is_end_of_arguments(&ap)) { + err_message = "Expecting file name after '-i'\n"; + goto error; + } + ret->inpFiles = read_string_array(consume_argument(&ap)); + break; + case '-': /* End of options */ + goto end_of_options; default: - err_message = "Unexpected command-line parameter\n"; - goto error; + err_message = "Unexpected command-line parameter\n"; + goto error; } - } - else { + } else { /* Other parameters are ignored */ next_argument(&ap); } } /* end for each argument */ - end_of_options: - *_argc = ap.argc; /* Save the modified argc value */ +end_of_options: + *_argc = ap.argc; /* Save the modified argc value */ finalize_argparse(&ap); return ret; - error: +error: fputs(err_message, stderr); pb_FreeParameters(ret); return NULL; } -void -pb_FreeParameters(struct pb_Parameters *p) -{ +void pb_FreeParameters(struct pb_Parameters *p) { char **cpp; free(p->outFile); @@ -218,79 +195,72 @@ pb_FreeParameters(struct pb_Parameters *p) free(p); } -int -pb_Parameters_CountInputs(struct pb_Parameters *p) -{ +int pb_Parameters_CountInputs(struct pb_Parameters *p) { int n; - for (n = 0; p->inpFiles[n]; n++); + for (n = 0; p->inpFiles[n]; n++) + ; return n; } /*****************************************************************************/ /* Timer routines */ -static int is_async(enum pb_TimerID timer) -{ +static int is_async(enum pb_TimerID timer) { #ifndef OPENCL_CPU - return (timer == pb_TimerID_KERNEL) || - (timer == pb_TimerID_COPY_ASYNC); + return (timer == pb_TimerID_KERNEL) || (timer == pb_TimerID_COPY_ASYNC); #else return (timer == pb_TimerID_COPY_ASYNC); #endif } -static int is_blocking(enum pb_TimerID timer) -{ +static int is_blocking(enum pb_TimerID timer) { return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE); } #define INVALID_TIMERID pb_TimerID_LAST -static int asyncs_outstanding(struct pb_TimerSet* 
timers) -{ +static int asyncs_outstanding(struct pb_TimerSet *timers) { return (timers->async_markers != NULL) && - (timers->async_markers->timerID != INVALID_TIMERID); + (timers->async_markers->timerID != INVALID_TIMERID); } static struct pb_async_time_marker_list * -get_last_async(struct pb_TimerSet* timers) -{ +get_last_async(struct pb_TimerSet *timers) { /* Find the last event recorded thus far */ - struct pb_async_time_marker_list * last_event = timers->async_markers; - if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { - while(last_event->next != NULL && - last_event->next->timerID != INVALID_TIMERID) + struct pb_async_time_marker_list *last_event = timers->async_markers; + if (last_event != NULL && last_event->timerID != INVALID_TIMERID) { + while (last_event->next != NULL && + last_event->next->timerID != INVALID_TIMERID) last_event = last_event->next; return last_event; } else return NULL; } -static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer) -{ +static void insert_marker(struct pb_TimerSet *tset, enum pb_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + struct pb_async_time_marker_list **new_event = &(tset->async_markers); - while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } - if(*new_event == NULL) { - *new_event = (struct pb_async_time_marker_list *) - malloc(sizeof(struct pb_async_time_marker_list)); + if (*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *)malloc( + sizeof(struct pb_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* - // I don't think this is needed at all. 
I believe clEnqueueMarker 'creates' the event -#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) -fprintf(stderr, "Creating Marker [%d]\n", timer); - *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Creating User Event Object!\n"); + // I don't think this is needed at all. I believe clEnqueueMarker 'creates' +the event #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) fprintf(stderr, "Creating +Marker [%d]\n", timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, +&ciErrNum); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Creating User +Event Object!\n"); } - ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Setting User Event Status!\n"); + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), +CL_QUEUED); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Setting User +Event Status!\n"); } #endif */ @@ -300,36 +270,36 @@ fprintf(stderr, "Creating Marker [%d]\n", timer); /* valid event handle now aquired: insert the event record */ (*new_event)->label = NULL; (*new_event)->timerID = timer; - ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + ciErrNum = + clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); + fprintf(stderr, "Error Enqueueing Marker!\n"); } - } -static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer) -{ +static void insert_submarker(struct pb_TimerSet *tset, char *label, + enum pb_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + struct pb_async_time_marker_list **new_event = &(tset->async_markers); - while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + while (*new_event 
!= NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } - if(*new_event == NULL) { - *new_event = (struct pb_async_time_marker_list *) - malloc(sizeof(struct pb_async_time_marker_list)); + if (*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *)malloc( + sizeof(struct pb_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); - *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Creating User Event Object!\n"); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, +&ciErrNum); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Creating User +Event Object!\n"); } - ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Setting User Event Status!\n"); + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), +CL_QUEUED); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Setting User +Event Status!\n"); } #endif */ @@ -339,43 +309,48 @@ fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); /* valid event handle now aquired: insert the event record */ (*new_event)->label = label; (*new_event)->timerID = timer; - ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + ciErrNum = + clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); + fprintf(stderr, "Error Enqueueing Marker!\n"); } - } - /* Assumes that all recorded events have completed */ -static pb_Timestamp record_async_times(struct pb_TimerSet* tset) -{ - struct pb_async_time_marker_list * next_interval = NULL; - struct pb_async_time_marker_list * last_marker = get_last_async(tset); 
+static pb_Timestamp record_async_times(struct pb_TimerSet *tset) { + struct pb_async_time_marker_list *next_interval = NULL; + struct pb_async_time_marker_list *last_marker = get_last_async(tset); pb_Timestamp total_async_time = 0; enum pb_TimerID timer; - for(next_interval = tset->async_markers; next_interval != last_marker; - next_interval = next_interval->next) { - cl_ulong command_start=0, command_end=0; + for (next_interval = tset->async_markers; next_interval != last_marker; + next_interval = next_interval->next) { + cl_ulong command_start = 0, command_end = 0; cl_int ciErrNum = CL_SUCCESS; - ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL); + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), + CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &command_start, NULL); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); } - ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL); + ciErrNum = clGetEventProfilingInfo( + *((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &command_end, NULL); if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum); + fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", + ciErrNum); } - pb_Timestamp interval = (pb_Timestamp) (((double)(command_end - command_start))); + pb_Timestamp interval = + (pb_Timestamp)(((double)(command_end - command_start))); tset->timers[next_interval->timerID].elapsed += interval; if (next_interval->label != NULL) { - struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; + struct pb_SubTimer *subtimer = + tset->sub_timer_list[next_interval->timerID]->subtimer_list; while (subtimer != NULL) { - if ( strcmp(subtimer->label, 
next_interval->label) == 0) { + if (strcmp(subtimer->label, next_interval->label) == 0) { subtimer->timer.elapsed += interval; break; } @@ -386,50 +361,42 @@ static pb_Timestamp record_async_times(struct pb_TimerSet* tset) next_interval->timerID = INVALID_TIMERID; } - if(next_interval != NULL) + if (next_interval != NULL) next_interval->timerID = INVALID_TIMERID; return total_async_time; } -static void -accumulate_time(pb_Timestamp *accum, - pb_Timestamp start, - pb_Timestamp end) -{ +static void accumulate_time(pb_Timestamp *accum, pb_Timestamp start, + pb_Timestamp end) { #if _POSIX_VERSION >= 200112L *accum += end - start; #else -# error "Timestamps not implemented for this system" +#error "Timestamps not implemented for this system" #endif } #if _POSIX_VERSION >= 200112L -static pb_Timestamp get_time() -{ +static pb_Timestamp get_time() { struct timespec tv; clock_gettime(CLOCK_MONOTONIC, &tv); - return (pb_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec); + return (pb_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); } #else -# error "no supported time libraries are available on this platform" +#error "no supported time libraries are available on this platform" #endif -void -pb_ResetTimer(struct pb_Timer *timer) -{ +void pb_ResetTimer(struct pb_Timer *timer) { timer->state = pb_Timer_STOPPED; #if _POSIX_VERSION >= 200112L timer->elapsed = 0; #else -# error "pb_ResetTimer: not implemented for this system" +#error "pb_ResetTimer: not implemented for this system" #endif } -void -pb_StartTimer(struct pb_Timer *timer) -{ +void pb_StartTimer(struct pb_Timer *timer) { if (timer->state != pb_Timer_STOPPED) { fputs("Ignoring attempt to start a running timer\n", stderr); return; @@ -444,13 +411,12 @@ pb_StartTimer(struct pb_Timer *timer) timer->init = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "pb_StartTimer: not implemented for this system" +#error "pb_StartTimer: not implemented for this system" #endif } -void -pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct 
pb_Timer *subtimer) -{ +void pb_StartTimerAndSubTimer(struct pb_Timer *timer, + struct pb_Timer *subtimer) { unsigned int numNotStopped = 0x3; // 11 if (timer->state != pb_Timer_STOPPED) { @@ -483,14 +449,11 @@ pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) } } #else -# error "pb_StartTimer: not implemented for this system" +#error "pb_StartTimer: not implemented for this system" #endif - } -void -pb_StopTimer(struct pb_Timer *timer) -{ +void pb_StopTimer(struct pb_Timer *timer) { pb_Timestamp fini; if (timer->state != pb_Timer_RUNNING) { @@ -507,14 +470,15 @@ pb_StopTimer(struct pb_Timer *timer) fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "pb_StopTimer: not implemented for this system" +#error "pb_StopTimer: not implemented for this system" #endif accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; } -void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, + struct pb_Timer *subtimer) { pb_Timestamp fini; @@ -532,7 +496,6 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) return; } - timer->state = pb_Timer_STOPPED; subtimer->state = pb_Timer_STOPPED; @@ -543,7 +506,7 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -# error "pb_StopTimer: not implemented for this system" +#error "pb_StopTimer: not implemented for this system" #endif if (numNotRunning & 0x2) { @@ -555,13 +518,10 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) accumulate_time(&subtimer->elapsed, subtimer->init, fini); subtimer->init = fini; } - } /* Get the elapsed time in seconds. 
*/ -double -pb_GetElapsedTime(struct pb_Timer *timer) -{ +double pb_GetElapsedTime(struct pb_Timer *timer) { double ret; if (timer->state != pb_Timer_STOPPED) { @@ -571,14 +531,12 @@ pb_GetElapsedTime(struct pb_Timer *timer) #if _POSIX_VERSION >= 200112L ret = timer->elapsed / 1e9; #else -# error "pb_GetElapsedTime: not implemented for this system" +#error "pb_GetElapsedTime: not implemented for this system" #endif return ret; } -void -pb_InitializeTimerSet(struct pb_TimerSet *timers) -{ +void pb_InitializeTimerSet(struct pb_TimerSet *timers) { int n; timers->wall_begin = get_time(); @@ -597,16 +555,14 @@ void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) { clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr); } -static char* LoadProgSource(const char* Filename, size_t* szFinalLength) -{ +static char *LoadProgSource(const char *Filename, size_t *szFinalLength) { // locals - FILE* pFileStream = NULL; + FILE *pFileStream = NULL; size_t szSourceLength; // open the OpenCL source code file pFileStream = fopen(Filename, "rb"); - if(pFileStream == 0) - { + if (pFileStream == 0) { return NULL; } @@ -616,60 +572,62 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength) fseek(pFileStream, 0, SEEK_SET); // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + 1); - if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; + char *cSourceString = (char *)malloc(szSourceLength + 1); + if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) { + fclose(pFileStream); + free(cSourceString); + return 0; } - // close the file and return the total length of the combined (preamble + source) string + // close the file and return the total length of the combined (preamble + + // source) string fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength; + if (szFinalLength != 0) { 
+ *szFinalLength = szSourceLength; } cSourceString[szSourceLength] = '\0'; return cSourceString; } -static inline void checkErr(cl_int err, cl_int success, const char * name) { +static inline void checkErr(cl_int err, cl_int success, const char *name) { if (err != success) { printf("ERROR: %s\n", name); exit(EXIT_FAILURE); } } -void pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKernelPtr) { +void pb_CreateAndBuildKernelFromBinary(const char *file, const char *kernel, + void *clContextPtr, void *clDevicePtr, + void *clProgramPtr, void *clKernelPtr) { size_t kernelLength; char *programSource = LoadProgSource(file, &kernelLength); - checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary"); + checkErr(programSource != NULL, 1 /*bool true*/, + "Failure to load Program Binary"); cl_int binaryStatus; cl_int errcode; - cl_device_id clDevice = *(cl_device_id*) clDevicePtr; - cl_context clContext = *(cl_context*) clContextPtr; - cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, - &kernelLength, - (const unsigned char **)&programSource, - &binaryStatus, &errcode); + cl_device_id clDevice = *(cl_device_id *)clDevicePtr; + cl_context clContext = *(cl_context *)clContextPtr; + cl_program clProgram = clCreateProgramWithBinary( + clContext, 1, &clDevice, &kernelLength, + (const unsigned char **)&programSource, &binaryStatus, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); // printf("Building kernel - %s, from file %s\n", kernel, file); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); // If build fails, get build log from device - if(errcode != CL_SUCCESS) { + if (errcode != CL_SUCCESS) { printf("ERROR: Failure to build program\n"); size_t len = 0; - errcode = clGetProgramBuildInfo(clProgram, clDevice , CL_PROGRAM_BUILD_LOG, 0, - NULL, &len); + errcode = clGetProgramBuildInfo(clProgram, 
clDevice, CL_PROGRAM_BUILD_LOG, + 0, NULL, &len); printf("LOG LENGTH: %lu\n", len); - checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length"); - char *log = (char*) malloc(len*sizeof(char)); - errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, len, - log, NULL); + checkErr(errcode, CL_SUCCESS, + "Failure to collect program build log length"); + char *log = (char *)malloc(len * sizeof(char)); + errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, + len, log, NULL); checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); printf("Device Build Log: %s\n", log); @@ -679,22 +637,22 @@ void pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, voi cl_kernel clKernel = clCreateKernel(clProgram, kernel, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); - - *(cl_program*) clProgramPtr = clProgram; - *(cl_kernel*)clKernelPtr = clKernel; + + *(cl_program *)clProgramPtr = clProgram; + *(cl_kernel *)clKernelPtr = clKernel; free(programSource); } -void -pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { +void pb_AddSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID pb_Category) { - struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc - (sizeof(struct pb_SubTimer)); + struct pb_SubTimer *subtimer = + (struct pb_SubTimer *)malloc(sizeof(struct pb_SubTimer)); int len = strlen(label); - subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + subtimer->label = (char *)malloc(sizeof(char) * (len + 1)); sprintf(subtimer->label, "%s\0", label); pb_ResetTimer(&subtimer->timer); @@ -702,8 +660,8 @@ pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; if (subtimerlist == NULL) { - subtimerlist = (struct pb_SubTimerList *) calloc - (1, sizeof(struct pb_SubTimerList)); + subtimerlist = + (struct 
pb_SubTimerList *)calloc(1, sizeof(struct pb_SubTimerList)); subtimerlist->subtimer_list = subtimer; timers->sub_timer_list[pb_Category] = subtimerlist; } else { @@ -714,21 +672,21 @@ pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ } element->next = subtimer; } - } -void -pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) -{ +void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) { /* Stop the currently running timer */ if (timers->current != pb_TimerID_NONE) { - struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; + struct pb_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + struct pb_SubTimer *currSubTimer = + (subtimerlist != NULL) ? subtimerlist->current : NULL; - if (!is_async(timers->current) ) { + if (!is_async(timers->current)) { if (timers->current != timer) { if (currSubTimer != NULL) { - pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + pb_StopTimerAndSubTimer(&timers->timers[timers->current], + &currSubTimer->timer); } else { pb_StopTimer(&timers->timers[timers->current]); } @@ -750,30 +708,31 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(timer) ) ) { + if (asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(timer))) { - struct pb_async_time_marker_list * last_event = get_last_async(timers); + struct pb_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; cl_int async_done = CL_COMPLETE; - ciErrNum = clGetEventInfo(*((cl_event 
*)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), + CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), + &async_done, NULL); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Querying EventInfo!\n"); } - - if(is_blocking(timer)) { + if (is_blocking(timer)) { /* Async operations completed after previous CPU operations: * overlapped time is the total CPU time since this set of async * operations were first issued */ // timer to switch to is COPY or NONE - if(async_done != CL_COMPLETE) { + if (async_done != CL_COMPLETE) { accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); + timers->async_begin, currentTime); } /* Wait on async operation completion */ @@ -786,16 +745,17 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - if(async_done == CL_COMPLETE) { - //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time); + if (async_done == CL_COMPLETE) { + // fprintf(stderr, "Async_done: total_async_type = %lld\n", + // total_async_time); timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; } } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == CL_COMPLETE ) { + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. 
Current Not Async (not KERNEL/COPY_ASYNC) but there are + // outstanding so something is deeper in stack + if (async_done == CL_COMPLETE) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); @@ -804,7 +764,7 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) /* Start the new timer */ if (timer != pb_TimerID_NONE) { - if(!is_async(timer)) { + if (!is_async(timer)) { pb_StartTimer(&timers->timers[timer]); } else { // toSwitchTo Is Async (KERNEL/COPY_ASYNC) @@ -813,13 +773,13 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) insert_marker(timers, timer); timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { + } else if (!is_async(timers->current)) { /* Previous asyncs still in flight, but a previous SwitchTo * already marked the end of the most recent async operation, * so we can rename that marker as the beginning of this async * operation */ - struct pb_async_time_marker_list * last_event = get_last_async(timers); + struct pb_async_time_marker_list *last_event = get_last_async(timers); last_event->label = NULL; last_event->timerID = timer; } @@ -829,20 +789,21 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) } } timers->current = timer; - } -void -pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) -{ - struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; +void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, + enum pb_TimerID category) { + struct pb_SubTimerList *subtimerlist = + timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = + (subtimerlist != NULL) ? 
subtimerlist->current : NULL; if (timers->current != pb_TimerID_NONE) { - if (!is_async(timers->current) ) { + if (!is_async(timers->current)) { if (timers->current != category) { if (curr != NULL) { - pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); + pb_StopTimerAndSubTimer(&timers->timers[timers->current], + &curr->timer); } else { pb_StopTimer(&timers->timers[timers->current]); } @@ -864,32 +825,35 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(category) ) ) { + if (asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(category))) { - struct pb_async_time_marker_list * last_event = get_last_async(timers); + struct pb_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; cl_int async_done = CL_COMPLETE; - ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), + CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), + &async_done, NULL); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Querying EventInfo!\n"); } - if(is_blocking(category)) { + if (is_blocking(category)) { /* Async operations completed after previous CPU operations: * overlapped time is the total CPU time since this set of async * operations were first issued */ // timer to switch to is COPY or NONE - // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP - // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization - // it seems that the extra sync wall time 
isn't being recorded anywhere - if(async_done != CL_COMPLETE) + // if it hasn't already finished, then just take now and use that as the + // elapsed time in OVERLAP anything happening after now isn't OVERLAP + // because everything is being stopped to wait for synchronization it + // seems that the extra sync wall time isn't being recorded anywhere + if (async_done != CL_COMPLETE) accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); + timers->async_begin, currentTime); /* Wait on async operation completion */ ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); @@ -900,16 +864,17 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - // If it did finish, then accumulate all the async time that did happen into OVERLAP - // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. - if(async_done == CL_COMPLETE /*cudaSuccess*/) + // If it did finish, then accumulate all the async time that did happen + // into OVERLAP the immediately preceding EventSynchronize theoretically + // didn't have any effect since it was already completed. + if (async_done == CL_COMPLETE /*cudaSuccess*/) timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == CL_COMPLETE /*cudaSuccess*/) { + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. 
Current Not Async (not KERNEL/COPY_ASYNC) but there are + // outstanding so something is deeper in stack + if (async_done == CL_COMPLETE /*cudaSuccess*/) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); @@ -933,7 +898,7 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat /* Start the new timer */ if (category != pb_TimerID_NONE) { - if(!is_async(category)) { + if (!is_async(category)) { if (subtimerlist != NULL) { subtimerlist->current = subtimer; } @@ -955,18 +920,19 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat /* No asyncs outstanding, insert a fresh async marker */ insert_submarker(timers, label, category); timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { + } else if (!is_async(timers->current)) { /* Previous asyncs still in flight, but a previous SwitchTo * already marked the end of the most recent async operation, * so we can rename that marker as the beginning of this async * operation */ - struct pb_async_time_marker_list * last_event = get_last_async(timers); + struct pb_async_time_marker_list *last_event = get_last_async(timers); last_event->timerID = category; last_event->label = label; } // else, marker for switchToThis was already inserted - //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running + // toSwitchto is already asynchronous, but if current/prev state is async + // too, then DRIVER is already running if (!is_async(timers->current)) { pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); } @@ -976,36 +942,36 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat timers->current = category; } -void -pb_PrintTimerSet(struct pb_TimerSet *timers) -{ +void pb_PrintTimerSet(struct pb_TimerSet *timers) { printf("Printing Parboil Timer: 
Default\n"); pb_Timestamp wall_end = get_time(); struct pb_Timer *t = timers->timers; - struct pb_SubTimer* sub = NULL; + struct pb_SubTimer *sub = NULL; int maxSubLength; -// const char *categories[] = { -// "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" -// }; + // const char *categories[] = { + // "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + // }; const char *categories[] = { - "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap", - "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", - "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc", - "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack" - }; - + "IO", "Kernel", "Copy", "Driver", + "Copy Async", "Compute", "Overlap", "Init_Ctx", + "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", + "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", + "Misc", "Pthread_Create", "Arg_Pack", "Arg_Unpack", + "Computation", "Output_Pack", "Output_Unpack"}; const int maxCategoryLength = 20; int i; - for(i = 1; i < pb_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format - if(pb_GetElapsedTime(&t[i]) != 0 || true) { + for (i = 1; i < pb_TimerID_LAST; + ++i) { // exclude NONE and OVRELAP from this format + if (pb_GetElapsedTime(&t[i]) != 0 || true) { // Print Category Timer - printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); + printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1], + pb_GetElapsedTime(&t[i])); if (timers->sub_timer_list[i] != NULL) { sub = timers->sub_timer_list[i]->subtimer_list; @@ -1020,47 +986,47 @@ pb_PrintTimerSet(struct pb_TimerSet *timers) // Fit to Categories if (maxSubLength <= maxCategoryLength) { - maxSubLength = maxCategoryLength; + maxSubLength = maxCategoryLength; } sub = timers->sub_timer_list[i]->subtimer_list; // Print SubTimers while (sub != NULL) { - printf(" -%-*s: %.9f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + printf(" 
-%-*s: %.9f\n", maxSubLength, sub->label, + pb_GetElapsedTime(&sub->timer)); sub = sub->next; } } } } - if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) - printf("CPU/Kernel Overlap: %.9f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + if (pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %.9f\n", + pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); - float walltime = (wall_end - timers->wall_begin)/ 1e9; + float walltime = (wall_end - timers->wall_begin) / 1e9; printf("Timer Wall Time: %.9f\n", walltime); - } -void pb_DestroyTimerSet(struct pb_TimerSet * timers) -{ +void pb_DestroyTimerSet(struct pb_TimerSet *timers) { /* clean up all of the async event markers */ - struct pb_async_time_marker_list* event = timers->async_markers; - while(event != NULL) { + struct pb_async_time_marker_list *event = timers->async_markers; + while (event != NULL) { cl_int ciErrNum = CL_SUCCESS; ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); if (ciErrNum != CL_SUCCESS) { - //fprintf(stderr, "Error Waiting for Events!\n"); + // fprintf(stderr, "Error Waiting for Events!\n"); } - ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) ); + ciErrNum = clReleaseEvent(*((cl_event *)(event)->marker)); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Release Events!\n"); } free((event)->marker); - struct pb_async_time_marker_list* next = ((event)->next); + struct pb_async_time_marker_list *next = ((event)->next); free(event); @@ -1069,7 +1035,7 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers) } int i = 0; - for(i = 0; i < pb_TimerID_LAST; ++i) { + for (i = 0; i < pb_TimerID_LAST; ++i) { if (timers->sub_timer_list[i] != NULL) { struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; struct pb_SubTimer *prev = NULL; @@ -1083,5 +1049,3 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers) } } } - - diff --git a/hpvm/test/pipeline/src/io.cc b/hpvm/test/pipeline/src/io.cc index 
045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644 --- a/hpvm/test/pipeline/src/io.cc +++ b/hpvm/test/pipeline/src/io.cc @@ -10,47 +10,42 @@ * layout */ -#include<fstream> -#include<iostream> -#include<vector> +#include <fstream> +#include <iostream> +#include <vector> -char* readFile(const char* fileName) -{ - std::fstream f(fileName,std::fstream::in); - if(!f.good()) - { - std::cerr<<"Error Reading File!!"<<std::endl; - return NULL; - } +char *readFile(const char *fileName) { + std::fstream f(fileName, std::fstream::in); + if (!f.good()) { + std::cerr << "Error Reading File!!" << std::endl; + return NULL; + } + + f.seekg(0, std::ios::end); + int length = f.tellg(); + f.seekg(0, std::ios::beg); - f.seekg(0,std::ios::end); - int length = f.tellg(); - f.seekg(0,std::ios::beg); + char *buffer; - char* buffer; + if (length > 0) { + buffer = new char[length]; + f.read(buffer, length); + buffer[length - 1] = 0; + } else { + buffer = new char; + buffer[0] = 0; + } - if(length>0) - { - buffer = new char[length]; - f.read(buffer,length); - buffer[length-1]=0; - } - else - { - buffer = new char; - buffer[0] = 0; - } - - f.close(); + f.close(); - return buffer; + return buffer; } -bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << std::endl; +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << std::endl; std::fstream f(fn, std::fstream::in); - if ( !f.good() ) { + if (!f.good()) { return false; } @@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto f >> nr_col; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; - while (f.good() ) { + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; + while (f.good()) { f >> data; v.push_back(data); } v.pop_back(); // remove the 
duplicated last element return true; - } -bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) -{ - std::cerr << "Opening file:"<< fn << " for write." << std::endl; +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, + std::vector<float> &v) { + std::cerr << "Opening file:" << fn << " for write." << std::endl; std::fstream f(fn, std::fstream::out); - if ( !f.good() ) { + if (!f.good()) { return false; } // Read # of rows and cols - f << nr_row << " "<<nr_col<<" "; + f << nr_row << " " << nr_col << " "; float data; - std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl; for (int i = 0; i < v.size(); ++i) { f << v[i] << ' '; } f << "\n"; return true; - } diff --git a/hpvm/test/pipeline/src/main.cc b/hpvm/test/pipeline/src/main.cc index 0d2582d41be135645155676b988e44538c472273..9314833d25d0a3a25f13dfb24fb8a239b94956b1 100644 --- a/hpvm/test/pipeline/src/main.cc +++ b/hpvm/test/pipeline/src/main.cc @@ -10,52 +10,47 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include "opencv2/opencv.hpp" #include "opencv2/ocl/ocl.hpp" -#include <stdio.h> +#include "opencv2/opencv.hpp" +#include <cassert> +#include <iostream> +#include <malloc.h> #include <math.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> -#include <malloc.h> -#include <iostream> -#include <cassert> #include <visc.h> - -#define NUM_RUNS 100 +#define NUM_RUNS 100 #define DEPTH 3 #define HEIGHT 640 #define WIDTH 480 -std::string input_window = "GPU Pipeline - Input Video"; +std::string input_window = "GPU Pipeline - Input Video"; std::string output_window = "GPU Pipeline - Edge Mapping"; - #ifdef MIDDLE - #define POSX_IN 640 - #define POSY_IN 0 - #define POSX_OUT 640 - #define POSY_OUT 540 +#define POSX_IN 640 +#define POSY_IN 0 +#define POSX_OUT 640 +#define POSY_OUT 540 #elif RIGHT - #define POSX_IN 1280 - #define 
POSY_IN 0 - #define POSX_OUT 1280 - #define POSY_OUT 540 +#define POSX_IN 1280 +#define POSY_IN 0 +#define POSX_OUT 1280 +#define POSY_OUT 540 #else // LEFT - #define POSX_IN 0 - #define POSY_IN 0 - #define POSX_OUT 0 - #define POSY_OUT 540 +#define POSX_IN 0 +#define POSY_IN 0 +#define POSX_OUT 0 +#define POSY_OUT 540 #endif - //#define NUM_FRAMES 20 - - // Definitions of sizes for edge detection kernels #define MIN_BR 0.0f @@ -66,33 +61,33 @@ std::string output_window = "GPU Pipeline - Edge Mapping"; #define REDUCTION_TILE_SZ 1024 -#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y)) -#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y)) +#define _MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#define _MAX(X, Y) ((X) > (Y) ? (X) : (Y)) extern "C" { struct __attribute__((__packed__)) InStruct { - float* I ; + float *I; size_t bytesI; - float* Is ; + float *Is; size_t bytesIs; - float* L; + float *L; size_t bytesL; - float* S; + float *S; size_t bytesS; - float* G; + float *G; size_t bytesG; - float* maxG; + float *maxG; size_t bytesMaxG; - float* E; + float *E; size_t bytesE; - float* Gs; + float *Gs; size_t bytesGs; - float* B; + float *B; size_t bytesB; - float* Sx; + float *Sx; size_t bytesSx; - float* Sy; + float *Sy; size_t bytesSy; long m; long n; @@ -100,20 +95,12 @@ struct __attribute__((__packed__)) InStruct { long grid_x; }; - -void packData(struct InStruct* args, float* I, size_t bytesI, - float* Is, size_t bytesIs, - float* L, size_t bytesL, - float* S, size_t bytesS, - float* G, size_t bytesG, - float* maxG, size_t bytesMaxG, - float* E, size_t bytesE, - float* Gs, size_t bytesGs, - float* B, size_t bytesB, - float* Sx, size_t bytesSx, - float* Sy, size_t bytesSy, - long m, long n, - long block_x, long grid_x) { +void packData(struct InStruct *args, float *I, size_t bytesI, float *Is, + size_t bytesIs, float *L, size_t bytesL, float *S, size_t bytesS, + float *G, size_t bytesG, float *maxG, size_t bytesMaxG, float *E, + size_t bytesE, float *Gs, size_t bytesGs, float *B, size_t bytesB, 
+ float *Sx, size_t bytesSx, float *Sy, size_t bytesSy, long m, + long n, long block_x, long grid_x) { args->I = I; args->bytesI = bytesI; args->Is = Is; @@ -142,13 +129,13 @@ void packData(struct InStruct* args, float* I, size_t bytesI, args->grid_x = grid_x; } -/* +/* * Gaussian smoothing of image I of size m x n * I : input image * Gs : gaussian filter * Is: output (smoothed image) * m, n : dimensions - * + * * Need 2D grid, a thread per pixel * No use of separable algorithm because we need to do this in one kernel * No use of shared memory because @@ -157,19 +144,17 @@ void packData(struct InStruct* args, float* I, size_t bytesI, #define GAUSSIAN_SIZE 7 #define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2) -void gaussianSmoothing(float *I, size_t bytesI, - float *Gs, size_t bytesGs, - float *Is, size_t bytesIs, - long m, long n) { +void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs, + float *Is, size_t bytesIs, long m, long n) { __visc__hint(visc::DEVICE); __visc__attributes(2, I, Gs, 1, Is); - void* thisNode = __visc__getNode(); + void *thisNode = __visc__getNode(); long gx = __visc__getNodeInstanceID_x(thisNode); long gy = __visc__getNodeInstanceID_y(thisNode); - int gloc = gx + gy*n; + int gloc = gx + gy * n; float smoothedVal = 0; float gval; @@ -179,37 +164,38 @@ void gaussianSmoothing(float *I, size_t bytesI, for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++) for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) { - loadOffset = gloc + i*n + j; - + loadOffset = gloc + i * n + j; + if ((gy + i) < 0) // top contour loadOffset = gx + j; - else if ((gy + i) > m-1 ) // bottom contour - loadOffset = (m-1)*n + gx + j; - else - loadOffset = gloc + i*n + j; // within image vertically + else if ((gy + i) > m - 1) // bottom contour + loadOffset = (m - 1) * n + gx + j; + else + loadOffset = gloc + i * n + j; // within image vertically // Adjust so we are within image horizonally if ((gx + j) < 0) // left contour - loadOffset -= (gx+j); - else 
if ((gx + j) > n-1 ) // right contour + loadOffset -= (gx + j); + else if ((gx + j) > n - 1) // right contour loadOffset = loadOffset - gx - j + n - 1; gval = I[loadOffset]; - smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j]; + smoothedVal += + gval * + Gs[(GAUSSIAN_RADIUS + i) * GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j]; } - + Is[gloc] = smoothedVal; } __visc__return(2, bytesIs, bytesIs); } -void WrapperGaussianSmoothing(float *I, size_t bytesI, - float *Gs, size_t bytesGs, - float *Is, size_t bytesIs, - long m, long n) { +void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs, + size_t bytesGs, float *Is, size_t bytesIs, long m, + long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, I, Gs, 1, Is); - void* GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n); + void *GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n); __visc__bindIn(GSNode, 0, 0, 0); // Bind I __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs @@ -223,7 +209,6 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI, __visc__bindOut(GSNode, 1, 1, 0); // bind output bytesIs } - /* Compute a non-linear laplacian estimate of input image I of size m x n */ /* * Is : blurred imput image @@ -231,88 +216,99 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI, * B : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0]) * L : output (laplacian of the image) * Need 2D grid, a thread per pixel -*/ -void laplacianEstimate(float *Is, size_t bytesIs, - float *B, size_t bytesB, - float *L, size_t bytesL, - long m, long n) { + */ +void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB, + float *L, size_t bytesL, long m, long n) { __visc__hint(visc::DEVICE); __visc__attributes(2, Is, B, 1, L); // 3x3 image area - float imageArea[SZB*SZB]; + float imageArea[SZB * SZB]; - void* thisNode = __visc__getNode(); + void *thisNode = __visc__getNode(); long gx = 
__visc__getNodeInstanceID_x(thisNode); long gy = __visc__getNodeInstanceID_y(thisNode); int i, j; if ((gx < n) && (gy < m)) { // Data copy for dilation filter - imageArea[1 * SZB +1] = Is[gy * n + gx]; + imageArea[1 * SZB + 1] = Is[gy * n + gx]; if (gx == 0) { - imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MIN_BR; + imageArea[0 * SZB + 0] = imageArea[1 * SZB + 0] = imageArea[2 * SZB + 0] = + MIN_BR; } else { - imageArea[1 * SZB +0] = Is[gy * n + gx - 1]; - imageArea[0 * SZB +0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR; - imageArea[2 * SZB +0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR; + imageArea[1 * SZB + 0] = Is[gy * n + gx - 1]; + imageArea[0 * SZB + 0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR; + imageArea[2 * SZB + 0] = + (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR; } if (gx == n - 1) { - imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MIN_BR; + imageArea[0 * SZB + 2] = imageArea[1 * SZB + 2] = imageArea[2 * SZB + 2] = + MIN_BR; } else { - imageArea[1 * SZB +2] = Is[gy * n + gx + 1]; - imageArea[0 * SZB +2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR; - imageArea[2 * SZB +2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR; + imageArea[1 * SZB + 2] = Is[gy * n + gx + 1]; + imageArea[0 * SZB + 2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR; + imageArea[2 * SZB + 2] = + (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR; } - imageArea[0 * SZB +1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR; - imageArea[2 * SZB +1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR; + imageArea[0 * SZB + 1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR; + imageArea[2 * SZB + 1] = (gy < m - 1) ? 
Is[(gy + 1) * n + gx] : MIN_BR; // Compute pixel of dilated image float dilatedPixel = MIN_BR; for (i = 0; i < SZB; i++) for (j = 0; j < SZB; j++) - dilatedPixel = _MAX(dilatedPixel, imageArea[i * SZB +j] * B[i*SZB + j]); + dilatedPixel = + _MAX(dilatedPixel, imageArea[i * SZB + j] * B[i * SZB + j]); // Data copy for erotion filter - only change the boundary conditions if (gx == 0) { - imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MAX_BR; + imageArea[0 * SZB + 0] = imageArea[1 * SZB + 0] = imageArea[2 * SZB + 0] = + MAX_BR; } else { - if (gy == 0) imageArea[0 * SZB +0] = MAX_BR; - if (gy == m-1) imageArea[2 * SZB +0] = MAX_BR; + if (gy == 0) + imageArea[0 * SZB + 0] = MAX_BR; + if (gy == m - 1) + imageArea[2 * SZB + 0] = MAX_BR; } if (gx == n - 1) { - imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MAX_BR; + imageArea[0 * SZB + 2] = imageArea[1 * SZB + 2] = imageArea[2 * SZB + 2] = + MAX_BR; } else { - if (gy == 0) imageArea[0 * SZB +2] = MAX_BR; - if (gy == m-1) imageArea[2 * SZB +2] = MAX_BR; + if (gy == 0) + imageArea[0 * SZB + 2] = MAX_BR; + if (gy == m - 1) + imageArea[2 * SZB + 2] = MAX_BR; } - if (gy == 0) imageArea[0 * SZB +1] = MAX_BR; - if (gy == m-1) imageArea[2 * SZB +1] = MAX_BR; + if (gy == 0) + imageArea[0 * SZB + 1] = MAX_BR; + if (gy == m - 1) + imageArea[2 * SZB + 1] = MAX_BR; // Compute pixel of eroded image float erodedPixel = MAX_BR; for (i = 0; i < SZB; i++) for (j = 0; j < SZB; j++) - erodedPixel = _MIN(erodedPixel, imageArea[i * SZB +j] * B[i*SZB + j]); + erodedPixel = + _MIN(erodedPixel, imageArea[i * SZB + j] * B[i * SZB + j]); - float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB +1]; - L[gy*n+gx] = laplacian; + float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB + 1]; + L[gy * n + gx] = laplacian; } __visc__return(1, bytesL); } -void WrapperlaplacianEstimate(float *Is, size_t bytesIs, - float *B, size_t bytesB, - float *L, size_t bytesL, - long m, long n) 
{ +void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B, + size_t bytesB, float *L, size_t bytesL, long m, + long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, Is, B, 1, L); - void* LNode = __visc__createNodeND(2, laplacianEstimate, m, n); + void *LNode = __visc__createNodeND(2, laplacianEstimate, m, n); __visc__bindIn(LNode, 0, 0, 0); // Bind Is __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs __visc__bindIn(LNode, 2, 2, 0); // Bind B @@ -323,7 +319,6 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs, __visc__bindIn(LNode, 7, 7, 0); // Bind n __visc__bindOut(LNode, 0, 0, 0); // bind output bytesL - } /* Compute the zero crossings of input image L of size m x n */ @@ -334,10 +329,8 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs, * S : output (sign of the image) * Need 2D grid, a thread per pixel */ -void computeZeroCrossings(float *L, size_t bytesL, - float *B, size_t bytesB, - float *S, size_t bytesS, - long m, long n) { +void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB, + float *S, size_t bytesS, long m, long n) { __visc__hint(visc::DEVICE); //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, L, B, 1, S); @@ -345,89 +338,93 @@ void computeZeroCrossings(float *L, size_t bytesL, // 3x3 image area float imageArea[SZB][SZB]; - void* thisNode = __visc__getNode(); + void *thisNode = __visc__getNode(); long gx = __visc__getNodeInstanceID_x(thisNode); long gy = __visc__getNodeInstanceID_y(thisNode); int i, j; if ((gx < n) && (gy < m)) { // Data copy for dilation filter - imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR; + imageArea[1][1] = L[gy * n + gx] > MIN_BR ? MAX_BR : MIN_BR; if (gx == 0) { // left most line imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; - } else { - imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR; - imageArea[0][0] = (gy > 0) ? - (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) - : MIN_BR; - imageArea[2][0] = (gy < m - 1) ? 
- (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) - : MIN_BR; + } else { + imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR ? MAX_BR : MIN_BR; + imageArea[0][0] = + (gy > 0) ? (L[(gy - 1) * n + gx - 1] > MIN_BR ? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][0] = + (gy < m - 1) ? (L[(gy + 1) * n + gx - 1] > MIN_BR ? MAX_BR : MIN_BR) + : MIN_BR; } if (gx == n - 1) { imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; } else { - imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR; - imageArea[0][2] = (gy > 0) ? - (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) - : MIN_BR; - imageArea[2][2] = (gy < m - 1) ? - (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) - : MIN_BR; + imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR ? MAX_BR : MIN_BR; + imageArea[0][2] = + (gy > 0) ? (L[(gy - 1) * n + gx + 1] > MIN_BR ? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][2] = + (gy < m - 1) ? (L[(gy + 1) * n + gx + 1] > MIN_BR ? MAX_BR : MIN_BR) + : MIN_BR; } - imageArea[0][1] = (gy > 0) ? - (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) - : MIN_BR; - imageArea[2][1] = (gy < m - 1)? - (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) + imageArea[0][1] = + (gy > 0) ? (L[(gy - 1) * n + gx] > MIN_BR ? MAX_BR : MIN_BR) : MIN_BR; + imageArea[2][1] = (gy < m - 1) + ? (L[(gy + 1) * n + gx] > MIN_BR ? 
MAX_BR : MIN_BR) : MIN_BR; // Compute pixel of dilated image float dilatedPixel = MIN_BR; for (i = 0; i < SZB; i++) for (j = 0; j < SZB; j++) - dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i * SZB + j]); // Data copy for erotion filter - only change the boundary conditions if (gx == 0) { imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; } else { - if (gy == 0) imageArea[0][0] = MAX_BR; - if (gy == m-1) imageArea[2][0] = MAX_BR; + if (gy == 0) + imageArea[0][0] = MAX_BR; + if (gy == m - 1) + imageArea[2][0] = MAX_BR; } if (gx == n - 1) { imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; } else { - if (gy == 0) imageArea[0][2] = MAX_BR; - if (gy == m-1) imageArea[2][2] = MAX_BR; + if (gy == 0) + imageArea[0][2] = MAX_BR; + if (gy == m - 1) + imageArea[2][2] = MAX_BR; } - if (gy == 0) imageArea[0][1] = MAX_BR; - if (gy == m-1) imageArea[2][1] = MAX_BR; + if (gy == 0) + imageArea[0][1] = MAX_BR; + if (gy == m - 1) + imageArea[2][1] = MAX_BR; // Compute pixel of eroded image float erodedPixel = MAX_BR; for (i = 0; i < SZB; i++) for (j = 0; j < SZB; j++) - erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i * SZB + j]); float pixelSign = dilatedPixel - erodedPixel; - S[gy*n+gx] = pixelSign; + S[gy * n + gx] = pixelSign; } - __visc__return(1, bytesS); + __visc__return(1, bytesS); } -void WrapperComputeZeroCrossings(float *L, size_t bytesL, - float *B, size_t bytesB, - float *S, size_t bytesS, - long m, long n) { +void WrapperComputeZeroCrossings(float *L, size_t bytesL, float *B, + size_t bytesB, float *S, size_t bytesS, long m, + long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, L, B, 1, S); - void* ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n); + void *ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n); __visc__bindIn(ZCNode, 0, 0, 0); // Bind L 
__visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL __visc__bindIn(ZCNode, 2, 2, 0); // Bind B @@ -438,7 +435,6 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL, __visc__bindIn(ZCNode, 7, 7, 0); // Bind n __visc__bindOut(ZCNode, 0, 0, 0); // bind output bytesS - } /* @@ -458,20 +454,18 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL, #define SOBEL_SIZE 3 #define SOBEL_RADIUS (SOBEL_SIZE / 2) -void computeGradient(float *Is, size_t bytesIs, - float *Sx, size_t bytesSx, - float *Sy, size_t bytesSy, - float *G, size_t bytesG, - long m, long n) { +void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx, + float *Sy, size_t bytesSy, float *G, size_t bytesG, long m, + long n) { __visc__hint(visc::DEVICE); __visc__attributes(3, Is, Sx, Sy, 1, G); - void* thisNode = __visc__getNode(); + void *thisNode = __visc__getNode(); long gx = __visc__getNodeInstanceID_x(thisNode); long gy = __visc__getNodeInstanceID_y(thisNode); - int gloc = gx + gy*n; + int gloc = gx + gy * n; float Gx = 0; float Gy = 0; @@ -482,39 +476,37 @@ void computeGradient(float *Is, size_t bytesIs, for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++) for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) { - loadOffset = gloc + i*n + j; - + loadOffset = gloc + i * n + j; + if ((gy + i) < 0) // top contour loadOffset = gx + j; - else if ((gy + i) > m-1 ) // bottom contour - loadOffset = (m-1)*n + gx + j; - else - loadOffset = gloc + i*n + j; // within image vertically + else if ((gy + i) > m - 1) // bottom contour + loadOffset = (m - 1) * n + gx + j; + else + loadOffset = gloc + i * n + j; // within image vertically // Adjust so we are within image horizonally if ((gx + j) < 0) // left contour - loadOffset -= (gx+j); - else if ((gx + j) > n-1 ) // right contour + loadOffset -= (gx + j); + else if ((gx + j) > n - 1) // right contour loadOffset = loadOffset - gx - j + n - 1; gval = Is[loadOffset]; - Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; - Gy += gval 
* Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; + Gx += gval * Sx[(SOBEL_RADIUS + i) * SOBEL_SIZE + SOBEL_RADIUS + j]; + Gy += gval * Sy[(SOBEL_RADIUS + i) * SOBEL_SIZE + SOBEL_RADIUS + j]; } - G[gloc] = sqrt(Gx*Gx + Gy*Gy); + G[gloc] = sqrt(Gx * Gx + Gy * Gy); } __visc__return(1, bytesG); } -void WrapperComputeGradient(float *Is, size_t bytesIs, - float *Sx, size_t bytesSx, - float *Sy, size_t bytesSy, - float *G, size_t bytesG, - long m, long n) { +void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx, + size_t bytesSx, float *Sy, size_t bytesSy, float *G, + size_t bytesG, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(3, Is, Sx, Sy, 1, G); - void* CGNode = __visc__createNodeND(2, computeGradient, m, n); + void *CGNode = __visc__createNodeND(2, computeGradient, m, n); __visc__bindIn(CGNode, 0, 0, 0); // Bind Is __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx @@ -529,35 +521,34 @@ void WrapperComputeGradient(float *Is, size_t bytesIs, __visc__bindOut(CGNode, 0, 0, 0); // bind output bytesG } -/* +/* * Reduction * G : input * maxG: output * m, n: input size * Needs a single thread block */ -void computeMaxGradientLeaf(float *G, size_t bytesG, - float *maxG, size_t bytesMaxG, - long m, long n) { +void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG, + size_t bytesMaxG, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(1, G, 1, maxG); - void* thisNode = __visc__getNode(); + void *thisNode = __visc__getNode(); - long lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x - long dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x + long lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x + long dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x // Assume a single thread block // Thread block iterates over all elements - for (int i = lx + dimx; i < m*n; i+= dimx) { + for (int i = lx + dimx; i < m * n; i += dimx) { if (G[lx] 
< G[i]) G[lx] = G[i]; } // First thread iterates over all elements of the thread block - long bounds = dimx < m*n ? dimx : m*n; - if (lx == 0) { + long bounds = dimx < m * n ? dimx : m * n; + if (lx == 0) { for (int i = 1; i < bounds; i++) if (G[lx] < G[i]) G[lx] = G[i]; @@ -568,13 +559,11 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, __visc__return(1, bytesMaxG); } -void computeMaxGradientTB(float *G, size_t bytesG, - float *maxG, size_t bytesMaxG, - long m, long n, - long block_x) { +void computeMaxGradientTB(float *G, size_t bytesG, float *maxG, + size_t bytesMaxG, long m, long n, long block_x) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); - void* CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x); + void *CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x); __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG @@ -585,13 +574,12 @@ void computeMaxGradientTB(float *G, size_t bytesG, __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG } -void WrapperComputeMaxGradient(float *G, size_t bytesG, - float *maxG, size_t bytesMaxG, - long m, long n, - long block_x, long grid_x) { +void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG, + size_t bytesMaxG, long m, long n, long block_x, + long grid_x) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); - void* CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x); + void *CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x); __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG @@ -613,327 +601,307 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG, */ #define THETA 0.1 -void rejectZeroCrossings(float *S, size_t bytesS, - float *G, size_t bytesG, - float *maxG, size_t bytesMaxG, - 
float *E, size_t bytesE, +void rejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, float *E, size_t bytesE, long m, long n) { __visc__hint(visc::DEVICE); __visc__attributes(3, S, G, maxG, 1, E); - void* thisNode = __visc__getNode(); + void *thisNode = __visc__getNode(); int gx = __visc__getNodeInstanceID_x(thisNode); int gy = __visc__getNodeInstanceID_y(thisNode); float mG = *maxG; if ((gx < n) && (gy < m)) { - E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ; + E[gy * n + gx] = + ((S[gy * n + gx] > 0.0) && (G[gy * n + gx] > THETA * mG)) ? 1.0 : 0.0; } __visc__return(1, bytesE); } -void WrapperRejectZeroCrossings(float *S, size_t bytesS, - float *G, size_t bytesG, - float *maxG, size_t bytesMaxG, - float *E, size_t bytesE, - long m, long n) { +void WrapperRejectZeroCrossings(float *S, size_t bytesS, float *G, + size_t bytesG, float *maxG, size_t bytesMaxG, + float *E, size_t bytesE, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(3, S, G, maxG, 1, E); - void* RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n); - __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S - __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS - __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G - __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG - __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG - __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG - __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E - __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE - __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m + void *RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n); + __visc__bindIn(RZCNode, 0, 0, 0); // Bind S + __visc__bindIn(RZCNode, 1, 1, 0); // Bind bytesS + __visc__bindIn(RZCNode, 2, 2, 0); // Bind G + __visc__bindIn(RZCNode, 3, 3, 0); // Bind bytesG + __visc__bindIn(RZCNode, 4, 4, 0); // Bind maxG + __visc__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG + __visc__bindIn(RZCNode, 6, 6, 0); // Bind E + 
__visc__bindIn(RZCNode, 7, 7, 0); // Bind bytesE + __visc__bindIn(RZCNode, 8, 8, 0); // Bind m __visc__bindIn(RZCNode, 9, 9, 0); // Bind n __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesE } - - // Pipelined Root node -void edgeDetection(float *I, size_t bytesI, // 0 - float *Is, size_t bytesIs, // 2 - float *L, size_t bytesL, // 4 - float *S, size_t bytesS, // 6 - float *G, size_t bytesG, // 8 +void edgeDetection(float *I, size_t bytesI, // 0 + float *Is, size_t bytesIs, // 2 + float *L, size_t bytesL, // 4 + float *S, size_t bytesS, // 6 + float *G, size_t bytesG, // 8 float *maxG, size_t bytesMaxG, // 10 - float *E, size_t bytesE, // 12 - float *Gs, size_t bytesGs, // 14 - float *B, size_t bytesB, // 16 - float *Sx, size_t bytesSx, // 18 - float *Sy, size_t bytesSy, // 20 - long m, // 22 - long n, // 23 - long block_x, // 24 - long grid_x // 25 - ) { + float *E, size_t bytesE, // 12 + float *Gs, size_t bytesGs, // 14 + float *B, size_t bytesB, // 16 + float *Sx, size_t bytesSx, // 18 + float *Sy, size_t bytesSy, // 20 + long m, // 22 + long n, // 23 + long block_x, // 24 + long grid_x // 25 +) { __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); __visc__hint(visc::CPU_TARGET); - void* GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing); - void* LNode = __visc__createNodeND(0, WrapperlaplacianEstimate); - void* CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings); - void* CGNode = __visc__createNodeND(0, WrapperComputeGradient); - void* CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient); - void* RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings); + void *GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing); + void *LNode = __visc__createNodeND(0, WrapperlaplacianEstimate); + void *CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings); + void *CGNode = __visc__createNodeND(0, WrapperComputeGradient); + void *CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient); + void *RZCNode = 
__visc__createNodeND(0, WrapperRejectZeroCrossings); // Gaussian Inputs - __visc__bindIn(GSNode, 0 , 0, 1); // Bind I - __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI + __visc__bindIn(GSNode, 0, 0, 1); // Bind I + __visc__bindIn(GSNode, 1, 1, 1); // Bind bytesI __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs - __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is - __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs + __visc__bindIn(GSNode, 2, 4, 1); // Bind Is + __visc__bindIn(GSNode, 3, 5, 1); // Bind bytesIs __visc__bindIn(GSNode, 22, 6, 1); // Bind m __visc__bindIn(GSNode, 23, 7, 1); // Bind n // Laplacian Inputs - __visc__bindIn(LNode, 2 , 0, 1); // Bind Is + __visc__bindIn(LNode, 2, 0, 1); // Bind Is __visc__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs - __visc__bindIn(LNode, 16, 2, 1); // Bind B - __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB - __visc__bindIn(LNode, 4 , 4, 1); // Bind L - __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL - __visc__bindIn(LNode, 22, 6, 1); // Bind m - __visc__bindIn(LNode, 23, 7, 1); // Bind n + __visc__bindIn(LNode, 16, 2, 1); // Bind B + __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(LNode, 4, 4, 1); // Bind L + __visc__bindIn(LNode, 5, 5, 1); // Bind bytesL + __visc__bindIn(LNode, 22, 6, 1); // Bind m + __visc__bindIn(LNode, 23, 7, 1); // Bind n // Compute ZC Inputs - __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L + __visc__bindIn(CZCNode, 4, 0, 1); // Bind L __visc__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL - __visc__bindIn(CZCNode, 16, 2, 1); // Bind B - __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB - __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S - __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS - __visc__bindIn(CZCNode, 22, 6, 1); // Bind m - __visc__bindIn(CZCNode, 23, 7, 1); // Bind n + __visc__bindIn(CZCNode, 16, 2, 1); // Bind B + __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(CZCNode, 6, 4, 1); // Bind S + 
__visc__bindIn(CZCNode, 7, 5, 1); // Bind bytesS + __visc__bindIn(CZCNode, 22, 6, 1); // Bind m + __visc__bindIn(CZCNode, 23, 7, 1); // Bind n // Gradient Inputs - __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is + __visc__bindIn(CGNode, 2, 0, 1); // Bind Is __visc__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs - __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx - __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx - __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy - __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy - __visc__bindIn(CGNode, 8 , 6, 1); // Bind G - __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG - __visc__bindIn(CGNode, 22, 8, 1); // Bind m - __visc__bindIn(CGNode, 23, 9, 1); // Bind n + __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx + __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx + __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy + __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy + __visc__bindIn(CGNode, 8, 6, 1); // Bind G + __visc__bindIn(CGNode, 9, 7, 1); // Bind bytesG + __visc__bindIn(CGNode, 22, 8, 1); // Bind m + __visc__bindIn(CGNode, 23, 9, 1); // Bind n // Max Gradient Inputs - __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G + __visc__bindIn(CMGNode, 8, 0, 1); // Bind G __visc__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG - __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG - __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG - __visc__bindIn(CMGNode, 22, 4, 1); // Bind m - __visc__bindIn(CMGNode, 23, 5, 1); // Bind n - __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x - __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x + __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG + __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG + __visc__bindIn(CMGNode, 22, 4, 1); // Bind m + __visc__bindIn(CMGNode, 23, 5, 1); // Bind n + __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x + __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x // Reject ZC Inputs - __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S + __visc__bindIn(RZCNode, 6, 0, 1); // Bind S __visc__edge(CZCNode, 
RZCNode, 1, 0, 1, 1); // Get bytesS - __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G - __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG - __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG + __visc__bindIn(RZCNode, 8, 2, 1); // Bind G + __visc__bindIn(RZCNode, 9, 3, 1); // Bind bytesG + __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG __visc__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG - __visc__bindIn(RZCNode, 12, 6, 1); // Bind E - __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE - __visc__bindIn(RZCNode, 22, 8, 1); // Bind m - __visc__bindIn(RZCNode, 23, 9, 1); // Bind n + __visc__bindIn(RZCNode, 12, 6, 1); // Bind E + __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE + __visc__bindIn(RZCNode, 22, 8, 1); // Bind m + __visc__bindIn(RZCNode, 23, 9, 1); // Bind n __visc__bindOut(RZCNode, 0, 0, 1); // Bind output } - } using namespace cv; -void getNextFrame(VideoCapture& VC, Mat& F) { +void getNextFrame(VideoCapture &VC, Mat &F) { VC >> F; /// Convert the image to grayscale if image colored - if(F.channels() == 3) - cvtColor( F, F, CV_BGR2GRAY ); + if (F.channels() == 3) + cvtColor(F, F, CV_BGR2GRAY); - F.convertTo(F, CV_32F, 1.0/255.0); + F.convertTo(F, CV_32F, 1.0 / 255.0); } -int main (int argc, char *argv[]) { +int main(int argc, char *argv[]) { - if (argc<2) { - fprintf(stderr, "Expecting input image filename\n"); - exit(-1); - } - char* inFile = argv[1]; - fprintf(stderr, "Running pipeline on %s\n", inFile); + if (argc < 2) { + fprintf(stderr, "Expecting input image filename\n"); + exit(-1); + } + char *inFile = argv[1]; + fprintf(stderr, "Running pipeline on %s\n", inFile); - size_t I_sz; - long block_x, grid_x; + size_t I_sz; + long block_x, grid_x; - std::cout << "Using OpenCV" << CV_VERSION << "\n"; + std::cout << "Using OpenCV" << CV_VERSION << "\n"; - /* Read in data */ - std::cout << "Reading video file: " << inFile << "\n"; - VideoCapture cap(inFile); - if(!cap.isOpened()) { - std::cout << "Could not open video file" << "\n"; - return -1; - } + /* 
Read in data */ + std::cout << "Reading video file: " << inFile << "\n"; + VideoCapture cap(inFile); + if (!cap.isOpened()) { + std::cout << "Could not open video file" + << "\n"; + return -1; + } - int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT); - NUM_FRAMES = 600; - std::cout << "Number of frames = " << NUM_FRAMES << "\n"; + int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT); + NUM_FRAMES = 600; + std::cout << "Number of frames = " << NUM_FRAMES << "\n"; - namedWindow(input_window, CV_WINDOW_AUTOSIZE); - namedWindow(output_window, CV_WINDOW_AUTOSIZE); - moveWindow(input_window, POSX_IN, POSY_IN); - moveWindow(output_window, POSX_OUT, POSY_OUT); + namedWindow(input_window, CV_WINDOW_AUTOSIZE); + namedWindow(output_window, CV_WINDOW_AUTOSIZE); + moveWindow(input_window, POSX_IN, POSY_IN); + moveWindow(output_window, POSX_OUT, POSY_OUT); - Mat src, Is, L, S, G, E; + Mat src, Is, L, S, G, E; - getNextFrame(cap, src); + getNextFrame(cap, src); - std::cout << "Image dimension = " << src.size() << "\n"; - if(!src.isContinuous()) { - std::cout << "Expecting contiguous storage of image in memory!\n"; - exit(-1); - } + std::cout << "Image dimension = " << src.size() << "\n"; + if (!src.isContinuous()) { + std::cout << "Expecting contiguous storage of image in memory!\n"; + exit(-1); + } - Is = Mat(src.size[0], src.size[1], CV_32F); - L = Mat(src.size[0], src.size[1], CV_32F); - S = Mat(src.size[0], src.size[1], CV_32F); - G = Mat(src.size[0], src.size[1], CV_32F); - E = Mat(src.size[0], src.size[1], CV_32F); - - // All these matrices need to have their data array contiguous in memory - assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous()); - - __visc__init(); - - // copy A to device memory - I_sz = src.size[0]*src.size[1]*sizeof(float); - - size_t bytesMaxG = sizeof(float); - float* maxG = (float*)malloc(bytesMaxG); - - float B[] = { 1, 1, 1, - 1, 1, 1, - 1, 1, 1 }; - size_t bytesB = 
9*sizeof(float); - float Sx[] = { -1, 0, 1, - -2, 0, 2, - -1, 0, 1 }; - size_t bytesSx = 9*sizeof(float); - float Sy[] = { -1, -2, -1, - 0, 0, 0, - 1, 2, 1 }; - size_t bytesSy = 9*sizeof(float); - - float Gs [] = { - 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036, - 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, - 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, - 0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291, - 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, - 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, - 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 }; - size_t bytesGs = 7*7*sizeof(float); - - block_x = 256; - // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs - grid_x = 1; - - Mat in, out; - resize(src, in, Size(HEIGHT, WIDTH)); - resize(E, out, Size(HEIGHT, WIDTH)); - imshow(input_window, in); - imshow(output_window, out); - waitKey(0); - - struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); - packData(args, (float*)src.data, I_sz, - (float*)Is.data, I_sz, - (float*)L.data, I_sz, - (float*)S.data, I_sz, - (float*)G.data, I_sz, - maxG, bytesMaxG, - (float*)E.data, I_sz, - Gs, bytesGs, - B, bytesB, - Sx, bytesSx, - Sy, bytesSy, - src.size[0], src.size[1], - block_x, grid_x); - - // Check if the total elements is a multiple of block size - assert(src.size[0]*src.size[1] % block_x == 0); - - for(unsigned j=0; j<NUM_RUNS; j++) { - std::cout << "Run: " << j << "\n"; - void* DFG = __visc__launch(1, edgeDetection, (void*)args); - - cap = VideoCapture(inFile); - getNextFrame(cap, src); - - if(NUM_FRAMES >=2) { - for(int i=0; i<NUM_FRAMES; i++) { - std::cout << "Frame " << i << "\n"; - args->I = (float*) src.data; - - *maxG = 0.0; - - llvm_visc_track_mem(src.data, I_sz); - llvm_visc_track_mem(Is.data, I_sz); - llvm_visc_track_mem(L.data, I_sz); - llvm_visc_track_mem(S.data, 
I_sz); - llvm_visc_track_mem(G.data, I_sz); - llvm_visc_track_mem(maxG, bytesMaxG); - llvm_visc_track_mem(E.data, I_sz); - llvm_visc_track_mem(Gs, bytesGs); - llvm_visc_track_mem(B, bytesB); - llvm_visc_track_mem(Sx, bytesSx); - llvm_visc_track_mem(Sy, bytesSy); - - __visc__push(DFG, args); - void *ret = __visc__pop(DFG); - std::cout << "Returned size: " << *(size_t *)ret - << " expected " << I_sz << '\n'; - - llvm_visc_request_mem(maxG, bytesMaxG); - llvm_visc_request_mem(E.data, I_sz); - - Mat in, out; - resize(src, in, Size(HEIGHT, WIDTH)); - resize(E, out, Size(HEIGHT, WIDTH)); - imshow(output_window, out); - imshow(input_window, in); - waitKey(1); - - llvm_visc_untrack_mem(src.data); - llvm_visc_untrack_mem(Is.data); - llvm_visc_untrack_mem(L.data); - llvm_visc_untrack_mem(S.data); - llvm_visc_untrack_mem(G.data); - llvm_visc_untrack_mem(maxG); - llvm_visc_untrack_mem(E.data); - llvm_visc_untrack_mem(Gs); - llvm_visc_untrack_mem(B); - llvm_visc_untrack_mem(Sx); - llvm_visc_untrack_mem(Sy); - - getNextFrame(cap, src); - } - } - else { - __visc__push(DFG, args); - __visc__pop(DFG); + Is = Mat(src.size[0], src.size[1], CV_32F); + L = Mat(src.size[0], src.size[1], CV_32F); + S = Mat(src.size[0], src.size[1], CV_32F); + G = Mat(src.size[0], src.size[1], CV_32F); + E = Mat(src.size[0], src.size[1], CV_32F); + + // All these matrices need to have their data array contiguous in memory + assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && + S.isContinuous() && G.isContinuous() && E.isContinuous()); + + __visc__init(); + + // copy A to device memory + I_sz = src.size[0] * src.size[1] * sizeof(float); + + size_t bytesMaxG = sizeof(float); + float *maxG = (float *)malloc(bytesMaxG); + + float B[] = {1, 1, 1, 1, 1, 1, 1, 1, 1}; + size_t bytesB = 9 * sizeof(float); + float Sx[] = {-1, 0, 1, -2, 0, 2, -1, 0, 1}; + size_t bytesSx = 9 * sizeof(float); + float Sy[] = {-1, -2, -1, 0, 0, 0, 1, 2, 1}; + size_t bytesSy = 9 * sizeof(float); + + float Gs[] = { + 
0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036}; + size_t bytesGs = 7 * 7 * sizeof(float); + + block_x = 256; + // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs + grid_x = 1; + + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(input_window, in); + imshow(output_window, out); + waitKey(0); + + struct InStruct *args = (struct InStruct *)malloc(sizeof(InStruct)); + packData(args, (float *)src.data, I_sz, (float *)Is.data, I_sz, + (float *)L.data, I_sz, (float *)S.data, I_sz, (float *)G.data, I_sz, + maxG, bytesMaxG, (float *)E.data, I_sz, Gs, bytesGs, B, bytesB, Sx, + bytesSx, Sy, bytesSy, src.size[0], src.size[1], block_x, grid_x); + + // Check if the total elements is a multiple of block size + assert(src.size[0] * src.size[1] % block_x == 0); + + for (unsigned j = 0; j < NUM_RUNS; j++) { + std::cout << "Run: " << j << "\n"; + void *DFG = __visc__launch(1, edgeDetection, (void *)args); + + cap = VideoCapture(inFile); + getNextFrame(cap, src); + + if (NUM_FRAMES >= 2) { + for (int i = 0; i < NUM_FRAMES; i++) { + std::cout << "Frame " << i << "\n"; + args->I = (float *)src.data; + + *maxG = 0.0; + + llvm_visc_track_mem(src.data, I_sz); + llvm_visc_track_mem(Is.data, I_sz); + llvm_visc_track_mem(L.data, I_sz); + llvm_visc_track_mem(S.data, I_sz); + llvm_visc_track_mem(G.data, I_sz); + llvm_visc_track_mem(maxG, bytesMaxG); + llvm_visc_track_mem(E.data, I_sz); + llvm_visc_track_mem(Gs, bytesGs); + llvm_visc_track_mem(B, bytesB); + 
llvm_visc_track_mem(Sx, bytesSx); + llvm_visc_track_mem(Sy, bytesSy); + + __visc__push(DFG, args); + void *ret = __visc__pop(DFG); + std::cout << "Returned size: " << *(size_t *)ret << " expected " << I_sz + << '\n'; + + llvm_visc_request_mem(maxG, bytesMaxG); + llvm_visc_request_mem(E.data, I_sz); + + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(output_window, out); + imshow(input_window, in); + waitKey(1); + + llvm_visc_untrack_mem(src.data); + llvm_visc_untrack_mem(Is.data); + llvm_visc_untrack_mem(L.data); + llvm_visc_untrack_mem(S.data); + llvm_visc_untrack_mem(G.data); + llvm_visc_untrack_mem(maxG); + llvm_visc_untrack_mem(E.data); + llvm_visc_untrack_mem(Gs); + llvm_visc_untrack_mem(B); + llvm_visc_untrack_mem(Sx); + llvm_visc_untrack_mem(Sy); + + getNextFrame(cap, src); } - __visc__wait(DFG); + } else { + __visc__push(DFG, args); + __visc__pop(DFG); } - __visc__cleanup(); - return 0; + __visc__wait(DFG); + } + __visc__cleanup(); + return 0; } diff --git a/hpvm/test/pipeline/src/visc.h b/hpvm/test/pipeline/src/visc.h index 3a05f49e299a0a63a2251db65762561c25ed3981..917aec5a3773657e63655191b7897b9035b6d378 100644 --- a/hpvm/test/pipeline/src/visc.h +++ b/hpvm/test/pipeline/src/visc.h @@ -15,62 +15,62 @@ #ifdef __cplusplus extern "C" { void __visc__hint(visc::Target); -//void __visc__wait(void*); +// void __visc__wait(void*); #else void __visc__hint(enum Target); -//void __visc__wait(unsigned); +// void __visc__wait(unsigned); #endif #ifdef __cplusplus -//void* __visc__node(...); -//void* __visc__createNode(...); -//void* __visc__createNode1D(...); -//void* __visc__createNode2D(...); -//void* __visc__createNode3D(...); -//void __visc__return(...); +// void* __visc__node(...); +// void* __visc__createNode(...); +// void* __visc__createNode1D(...); +// void* __visc__createNode2D(...); +// void* __visc__createNode3D(...); +// void __visc__return(...); #endif -void* __visc__createNodeND(unsigned,...); +void 
*__visc__createNodeND(unsigned, ...); void __visc__return(unsigned, ...); void __visc__attributes(unsigned, ...); void __visc__init(); void __visc__cleanup(); -void __visc__bindIn(void*, unsigned, unsigned, unsigned); -void __visc__bindOut(void*, unsigned, unsigned, unsigned); -void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned); -void __visc__push(void*, void*); -void* __visc__pop(void*); -void* __visc__launch(unsigned, ...); -void __visc__wait(void*); +void __visc__bindIn(void *, unsigned, unsigned, unsigned); +void __visc__bindOut(void *, unsigned, unsigned, unsigned); +void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, unsigned); +void __visc__push(void *, void *); +void *__visc__pop(void *); +void *__visc__launch(unsigned, ...); +void __visc__wait(void *); -void* __visc__getNode(); -void* __visc__getParentNode(void*); +void *__visc__getNode(); +void *__visc__getParentNode(void *); void __visc__barrier(); -void* __visc__malloc(long); -long __visc__getNodeInstanceID_x(void*); -long __visc__getNodeInstanceID_y(void*); -long __visc__getNodeInstanceID_z(void*); -long __visc__getNumNodeInstances_x(void*); -long __visc__getNumNodeInstances_y(void*); -long __visc__getNumNodeInstances_z(void*); +void *__visc__malloc(long); +long __visc__getNodeInstanceID_x(void *); +long __visc__getNodeInstanceID_y(void *); +long __visc__getNodeInstanceID_z(void *); +long __visc__getNumNodeInstances_x(void *); +long __visc__getNumNodeInstances_y(void *); +long __visc__getNumNodeInstances_z(void *); // Atomic // signed int -int __visc__atomic_cmpxchg(int*, int, int); -int __visc__atomic_add(int*, int); -int __visc__atomic_sub(int*, int); -int __visc__atomic_xchg(int*, int); -int __visc__atomic_inc(int*); -int __visc__atomic_dec(int*); -int __visc__atomic_min(int*, int); -int __visc__atomic_max(int*, int); -int __visc__atomic_umax(int*, int); -int __visc__atomic_umin(int*, int); -int __visc__atomic_and(int*, int); -int __visc__atomic_or(int*, int); -int 
__visc__atomic_xor(int*, int); +int __visc__atomic_cmpxchg(int *, int, int); +int __visc__atomic_add(int *, int); +int __visc__atomic_sub(int *, int); +int __visc__atomic_xchg(int *, int); +int __visc__atomic_inc(int *); +int __visc__atomic_dec(int *); +int __visc__atomic_min(int *, int); +int __visc__atomic_max(int *, int); +int __visc__atomic_umax(int *, int); +int __visc__atomic_umin(int *, int); +int __visc__atomic_and(int *, int); +int __visc__atomic_or(int *, int); +int __visc__atomic_xor(int *, int); // Special Func float __visc__floor(float); @@ -79,18 +79,17 @@ float __visc__sqrt(float); float __visc__sin(float); float __visc__cos(float); // unsigned int -//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned); -//unsigned __visc__atomic_add(unsigned*, unsigned); -//unsigned __visc__atomic_sub(unsigned*, unsigned); -//unsigned __visc__atomic_xchg(unsigned*, unsigned); -//unsigned __visc__atomic_inc(unsigned*); -//unsigned __visc__atomic_dec(unsigned*); -//unsigned __visc__atomic_min(unsigned*, unsigned); -//unsigned __visc__atomic_max(unsigned*, unsigned); -//unsigned __visc__atomic_and(unsigned*, unsigned); -//unsigned __visc__atomic_or(unsigned*, unsigned); -//unsigned __visc__atomic_xor(unsigned*, unsigned); - +// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned); +// unsigned __visc__atomic_add(unsigned*, unsigned); +// unsigned __visc__atomic_sub(unsigned*, unsigned); +// unsigned __visc__atomic_xchg(unsigned*, unsigned); +// unsigned __visc__atomic_inc(unsigned*); +// unsigned __visc__atomic_dec(unsigned*); +// unsigned __visc__atomic_min(unsigned*, unsigned); +// unsigned __visc__atomic_max(unsigned*, unsigned); +// unsigned __visc__atomic_and(unsigned*, unsigned); +// unsigned __visc__atomic_or(unsigned*, unsigned); +// unsigned __visc__atomic_xor(unsigned*, unsigned); #include <unistd.h> @@ -99,12 +98,10 @@ long get_group_id(int); long get_local_id(int); long get_local_size(int); - -void llvm_visc_track_mem(void*, size_t); 
-void llvm_visc_untrack_mem(void*); -void llvm_visc_request_mem(void*, size_t); +void llvm_visc_track_mem(void *, size_t); +void llvm_visc_untrack_mem(void *); +void llvm_visc_request_mem(void *, size_t); #ifdef __cplusplus } #endif - diff --git a/hpvm/test/unitTests/CreateNodeAndEdge.c b/hpvm/test/unitTests/CreateNodeAndEdge.c index f8ba09217de591d4ccc7cd81896d0d865b6d7ba5..1b6b1cff211d5af5a909065af988aadbe979f2ec 100644 --- a/hpvm/test/unitTests/CreateNodeAndEdge.c +++ b/hpvm/test/unitTests/CreateNodeAndEdge.c @@ -1,52 +1,50 @@ -#include <stdio.h> #include "visc.h" +#include <stdio.h> struct Root { int *input; int *output; }; - void Func1(int *In, int *Out) { - __visc__hint (CPU_TARGET); + __visc__hint(CPU_TARGET); __visc__attributes(1, In, 1, Out); __visc__return(1, Out); } void Func2(int *BindIn, int *SrcIn, int *Out) { - __visc__hint (CPU_TARGET); + __visc__hint(CPU_TARGET); __visc__attributes(2, BindIn, SrcIn, 1, Out); __visc__return(1, Out); } void PipeRoot(int *In, int *Out) { - __visc__hint (CPU_TARGET); + __visc__hint(CPU_TARGET); - __visc__attributes(1, In, 1, Out); + __visc__attributes(1, In, 1, Out); - void* SrcNode = __visc__createNodeND(0, Func1); - void* DestNode = __visc__createNodeND(0, Func2); + void *SrcNode = __visc__createNodeND(0, Func1); + void *DestNode = __visc__createNodeND(0, Func2); - __visc__bindIn(SrcNode, 0, 0, 0); + __visc__bindIn(SrcNode, 0, 0, 0); - __visc__bindIn(DestNode, 0, 0, 0); - __visc__edge(SrcNode, DestNode, 1, 0, 1, 0); + __visc__bindIn(DestNode, 0, 0, 0); + __visc__edge(SrcNode, DestNode, 1, 0, 1, 0); - __visc__bindOut(SrcNode, 0, 0, 0); + __visc__bindOut(SrcNode, 0, 0, 0); } -int main(void) { - int In = 1; - int Out = 0; - struct Root RootArgs = {(int *) &In, (int *) &Out}; +int main(void) { + int In = 1; + int Out = 0; + struct Root RootArgs = {(int *)&In, (int *)&Out}; - __visc__init(); - void* PipeDFG = __visc__launch(0, PipeRoot, (void *) &RootArgs); - __visc__wait(PipeDFG); - __visc__cleanup(); + __visc__init(); + 
void *PipeDFG = __visc__launch(0, PipeRoot, (void *)&RootArgs); + __visc__wait(PipeDFG); + __visc__cleanup(); - return 0; + return 0; } - diff --git a/hpvm/test/unitTests/MallocIntrinsic.c b/hpvm/test/unitTests/MallocIntrinsic.c index fbc3d3ef0802198e71f69cd1cbd2347a413e2a3e..cfd041a991d976c24b372a81b35842598b571d89 100644 --- a/hpvm/test/unitTests/MallocIntrinsic.c +++ b/hpvm/test/unitTests/MallocIntrinsic.c @@ -1,5 +1,5 @@ -#include <stdlib.h> #include "visc.h" +#include <stdlib.h> struct Root { int *input; @@ -7,32 +7,31 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __visc__hint (CPU_TARGET); + __visc__hint(CPU_TARGET); __visc__attributes(1, In, 1, Out); - Out = (int *)__visc__malloc(*In); + Out = (int *)__visc__malloc(*In); __visc__return(1, Out); } -int main(void) { - int In, Out; +int main(void) { + int In, Out; - // struct Root RootArgs; - // RootArgs.input = (int *)&In; - // RootArgs.output = (int *)&Out; + // struct Root RootArgs; + // RootArgs.input = (int *)&In; + // RootArgs.output = (int *)&Out; - struct Root* RootArgs = (struct Root *) malloc(sizeof(struct Root)); - RootArgs->input = (int *)&In; - RootArgs->output = (int *)&Out; + struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root)); + RootArgs->input = (int *)&In; + RootArgs->output = (int *)&Out; - __visc__init(); + __visc__init(); - void* PipeDFG = __visc__launch(0, PipeRoot, (void *) RootArgs); - __visc__wait(PipeDFG); + void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); + __visc__wait(PipeDFG); - __visc__cleanup(); + __visc__cleanup(); - return 0; + return 0; } - diff --git a/hpvm/test/unitTests/PipelineIntrinsics.c b/hpvm/test/unitTests/PipelineIntrinsics.c index 0c1932129266eb55564b199d2451b73c0ce21a73..2a9bf83402891beddf13d96c6346e8fed924d17e 100644 --- a/hpvm/test/unitTests/PipelineIntrinsics.c +++ b/hpvm/test/unitTests/PipelineIntrinsics.c @@ -1,5 +1,5 @@ -#include <stdlib.h> #include "visc.h" +#include <stdlib.h> struct Root { int *input; @@ -7,29 +7,28 
@@ struct Root { }; void PipeRoot(int *In, int *Out) { - __visc__hint (CPU_TARGET); + __visc__hint(CPU_TARGET); __visc__attributes(1, In, 1, Out); __visc__return(1, Out); } -int main(void) { - int In, Out; +int main(void) { + int In, Out; - // struct Root RootArgs; - // RootArgs.input = (int *)&In; - // RootArgs.output = (int *)&Out; + // struct Root RootArgs; + // RootArgs.input = (int *)&In; + // RootArgs.output = (int *)&Out; - struct Root* RootArgs = (struct Root *) malloc(sizeof(struct Root)); - RootArgs->input = (int *)&In; - RootArgs->output = (int *)&Out; + struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root)); + RootArgs->input = (int *)&In; + RootArgs->output = (int *)&Out; - __visc__init(); + __visc__init(); - void* PipeDFG = __visc__launch(0, PipeRoot, (void *) RootArgs); - __visc__wait(PipeDFG); + void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); + __visc__wait(PipeDFG); - __visc__cleanup(); + __visc__cleanup(); - return 0; + return 0; } - diff --git a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c index e81e489f4faead6bb099b1a588191df98e737cdc..36fc02d22b066025be4a57695265779d8e55652a 100644 --- a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c +++ b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c @@ -1,5 +1,5 @@ -#include <stdlib.h> #include "visc.h" +#include <stdlib.h> struct Root { int *input; @@ -7,25 +7,24 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __visc__hint (CPU_TARGET); + __visc__hint(CPU_TARGET); __visc__attributes(1, In, 1, Out); __visc__return(1, Out); } -int main(void) { - int In, Out; +int main(void) { + int In, Out; - __visc__init(); + __visc__init(); - struct Root* RootArgs = (struct Root *) malloc(sizeof(struct Root)); - RootArgs->input = (int *)&In; - RootArgs->output = (int *)&Out; + struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root)); + RootArgs->input = (int *)&In; + RootArgs->output = (int *)&Out; - void* PipeDFG = 
__visc__launch(0, PipeRoot, (void *) RootArgs); - __visc__wait(PipeDFG); + void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); + __visc__wait(PipeDFG); - __visc__cleanup(); + __visc__cleanup(); - return 0; + return 0; } - diff --git a/hpvm/test/unitTests/visc.h b/hpvm/test/unitTests/visc.h index 4faba0d93f16d85272ae4bfcbb3dec1c4b37e140..0b52345b59f7d30e5e00a4dc4102f024444af47c 100644 --- a/hpvm/test/unitTests/visc.h +++ b/hpvm/test/unitTests/visc.h @@ -20,54 +20,54 @@ void __visc__hint(enum Target); #endif #ifdef __cplusplus -void* __visc__node(...); -//void* __visc__createNode(...); -//void* __visc__createNode1D(...); -//void* __visc__createNode2D(...); -//void* __visc__createNode3D(...); -//void __visc__return(...); +void *__visc__node(...); +// void* __visc__createNode(...); +// void* __visc__createNode1D(...); +// void* __visc__createNode2D(...); +// void* __visc__createNode3D(...); +// void __visc__return(...); #endif -void* __visc__createNodeND(unsigned, ...); +void *__visc__createNodeND(unsigned, ...); void __visc__return(unsigned, ...); void __visc__attributes(unsigned, ...); void __visc__init(); void __visc__cleanup(); -void __visc__bindIn(void*, unsigned, unsigned, unsigned); -void __visc__bindOut(void*, unsigned, unsigned, unsigned); -void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned); -void __visc__push(void*, void*); -void* __visc__pop(void*); -void* __visc__launch(unsigned, ...); -void __visc__wait(void*); +void __visc__bindIn(void *, unsigned, unsigned, unsigned); +void __visc__bindOut(void *, unsigned, unsigned, unsigned); +void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, unsigned); +void __visc__push(void *, void *); +void *__visc__pop(void *); +void *__visc__launch(unsigned, ...); +void __visc__wait(void *); -void* __visc__getNode(); -void* __visc__getParentNode(void*); +void *__visc__getNode(); +void *__visc__getParentNode(void *); void __visc__barrier(); -void* __visc__malloc(long); -long 
__visc__getNodeInstanceID_x(void*); -long __visc__getNodeInstanceID_y(void*); -long __visc__getNodeInstanceID_z(void*); -long __visc__getNumNodeInstances_x(void*); -long __visc__getNumNodeInstances_y(void*); -long __visc__getNumNodeInstances_z(void*); +void *__visc__malloc(long); +long __visc__getNodeInstanceID_x(void *); +long __visc__getNodeInstanceID_y(void *); +long __visc__getNodeInstanceID_z(void *); +long __visc__getNumNodeInstances_x(void *); +long __visc__getNumNodeInstances_y(void *); +long __visc__getNumNodeInstances_z(void *); // Atomic // signed int -int __visc__atomic_cmpxchg(int*, int, int); -int __visc__atomic_add(int*, int); -int __visc__atomic_sub(int*, int); -int __visc__atomic_xchg(int*, int); -int __visc__atomic_inc(int*); -int __visc__atomic_dec(int*); -int __visc__atomic_min(int*, int); -int __visc__atomic_max(int*, int); -int __visc__atomic_umax(int*, int); -int __visc__atomic_umin(int*, int); -int __visc__atomic_and(int*, int); -int __visc__atomic_or(int*, int); -int __visc__atomic_xor(int*, int); +int __visc__atomic_cmpxchg(int *, int, int); +int __visc__atomic_add(int *, int); +int __visc__atomic_sub(int *, int); +int __visc__atomic_xchg(int *, int); +int __visc__atomic_inc(int *); +int __visc__atomic_dec(int *); +int __visc__atomic_min(int *, int); +int __visc__atomic_max(int *, int); +int __visc__atomic_umax(int *, int); +int __visc__atomic_umin(int *, int); +int __visc__atomic_and(int *, int); +int __visc__atomic_or(int *, int); +int __visc__atomic_xor(int *, int); // Special Func float __visc__floor(float); @@ -76,18 +76,17 @@ float __visc__sqrt(float); float __visc__sin(float); float __visc__cos(float); // unsigned int -//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned); -//unsigned __visc__atomic_add(unsigned*, unsigned); -//unsigned __visc__atomic_sub(unsigned*, unsigned); -//unsigned __visc__atomic_xchg(unsigned*, unsigned); -//unsigned __visc__atomic_inc(unsigned*); -//unsigned __visc__atomic_dec(unsigned*); 
-//unsigned __visc__atomic_min(unsigned*, unsigned); -//unsigned __visc__atomic_max(unsigned*, unsigned); -//unsigned __visc__atomic_and(unsigned*, unsigned); -//unsigned __visc__atomic_or(unsigned*, unsigned); -//unsigned __visc__atomic_xor(unsigned*, unsigned); - +// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned); +// unsigned __visc__atomic_add(unsigned*, unsigned); +// unsigned __visc__atomic_sub(unsigned*, unsigned); +// unsigned __visc__atomic_xchg(unsigned*, unsigned); +// unsigned __visc__atomic_inc(unsigned*); +// unsigned __visc__atomic_dec(unsigned*); +// unsigned __visc__atomic_min(unsigned*, unsigned); +// unsigned __visc__atomic_max(unsigned*, unsigned); +// unsigned __visc__atomic_and(unsigned*, unsigned); +// unsigned __visc__atomic_or(unsigned*, unsigned); +// unsigned __visc__atomic_xor(unsigned*, unsigned); #include <unistd.h> @@ -96,12 +95,10 @@ long get_group_id(int); long get_local_id(int); long get_local_size(int); - -void llvm_visc_track_mem(void*, size_t); -void llvm_visc_untrack_mem(void*); -void llvm_visc_request_mem(void*, size_t); +void llvm_visc_track_mem(void *, size_t); +void llvm_visc_untrack_mem(void *); +void llvm_visc_request_mem(void *, size_t); #ifdef __cplusplus } #endif -