diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index e9660aa6e7a4aad995c6003fce9697a8304f3ccf..5f2e483cee789a63fc77d1e99723c77a530747ff 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -807,8 +807,7 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { if(KernelLaunchNode == NULL) errs () << "No kernel launch node\n"; else { - errs() << "KernelLaunchNode is not null: "<< KernelLaunchNode<<"\n"; - errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; + errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; } if (!KernelLaunchNode) { @@ -1285,9 +1284,9 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { break; case Intrinsic::visc_barrier: { - errs() << F_nvptx->getName() << "\t: Handling barrier\n"; - errs() << "Substitute with barrier()\n"; - errs() << *II << "\n"; + DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n"); + DEBUG(errs() << "Substitute with barrier()\n"); + DEBUG(errs() << *II << "\n"); FunctionType* FT = FunctionType::get(Type::getVoidTy(getGlobalContext() /*KernelM.getContext()*/), std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), false); @@ -1315,7 +1314,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { //case Intrinsic::visc_atomic_inc: //case Intrinsic::visc_atomic_dec: { - errs() << *II << "\n"; + DEBUG(errs() << *II << "\n"); // Only have support for i32 atomic intrinsics assert(II->getType() == Type::getInt32Ty(II->getContext()) && "Only support i32 atomic intrinsics for now"); @@ -1332,7 +1331,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { AtomicRMWInst* AtomicInst = new AtomicRMWInst(getAtomicOp(II->getIntrinsicID()), Ptr, Val, llvm::SequentiallyConsistent, llvm::CrossThread, II); AtomicInst->setVolatile(true); - errs() << "Substitute with: " << *AtomicInst << "\n"; + DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n"); II->replaceAllUsesWith(AtomicInst); IItoRemove.push_back(II); } @@ -1348,7 +1347,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); if(calleeF->isDeclaration()) { // Add the declaration to kernel module - errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"; + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); KernelM.getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) { // Now handle a few specific intrinsics @@ -1357,7 +1356,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { case Intrinsic::sin: case Intrinsic::cos: { - errs() << "Found sincos: " << *II << "\n"; + DEBUG(errs() << "Found sincos: " << *II << "\n"); // Get the libclc function // libclc uses mangled name for sin cos assert(II->getType()->isFloatTy() @@ -1381,7 +1380,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { } case Intrinsic::floor: { - errs() << "Found floor intrinsic\n"; + DEBUG(errs() << "Found floor intrinsic\n"); F = Intrinsic::getDeclaration(&KernelM, Intrinsic::nvvm_floor_f); FunctionType* FTy = F->getFunctionType(); DEBUG(errs() << *F << "\n"); @@ -1434,7 +1433,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // their subsequent uses are still around for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), re = IItoRemove.rend(); ri != re; ++ri) { - errs() << "Erasing: " << **ri << "\n"; + DEBUG(errs() << "Erasing: " << **ri << "\n"); (*ri)->eraseFromParent(); } @@ -1709,7 +1708,7 @@ static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector } else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { // found a store in use chain - errs() << "Found store/atomicrmw instruction: " << *I << "\n"; + DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); return true; } else if(BuildDFG::isViscIntrinsic(I)) { @@ -1720,7 +1719,7 @@ static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector return true; } else { - errs() << "\tTraverse use chain of: " << *I << "\n"; + DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); if(findLoadStoreUses(I, UseList, VisitedList)) return true; } @@ -1770,7 +1769,7 @@ static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*Dependence // Function to check if argument arg can be changed to a constant memory pointer static bool canBePromoted(Argument* arg, Function* F) { - errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"; + DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); std::vector<Value*> UseList; std::vector<Value*> VisitedList; // recursively traverse use chain @@ -1788,7 +1787,7 @@ static bool canBePromoted(Argument* arg, Function* F) { if(isDependentOnNodeInstanceID(U, &DependenceList)) return false; } - errs() << "\tYes, Promotable to Constant Memory\n"; + DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); return true; } @@ -1853,7 +1852,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa // Iterate over the number of dimensions and store the global work group // size in that dimension for(unsigned i=0; i < WGSize.size(); i++) { - errs() << *WGSize[i] << "\n"; + DEBUG(errs() << *WGSize[i] << "\n"); assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); if(WGSize[i]->getType() != Int64Ty) { diff --git a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp index 31103c218bca660f11563f98093c93b2cda2b353..32612a85c1cd69cbbd1fb67a7920da8571b480b8 100644 --- a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp @@ -814,7 +814,6 @@ void CGT_SPIR::codeGen(DFInternalNode* N) { if(KernelLaunchNode == NULL) errs () << "No kernel launch node\n"; else { - errs() << "KernelLaunchNode is not null: "<< KernelLaunchNode<<"\n"; errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; } @@ -1309,9 +1308,9 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { break; case Intrinsic::visc_barrier: { - errs() << F_spir->getName() << "\t: Handling barrier\n"; - errs() << "Substitute with barrier()\n"; - errs() << *II << "\n"; + DEBUG(errs() << F_spir->getName() << "\t: Handling barrier\n"); + DEBUG(errs() << "Substitute with barrier()\n"); + DEBUG(errs() << *II << "\n"); FunctionType* FT = FunctionType::get(Type::getVoidTy(getGlobalContext() /*KernelM.getContext()*/), std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), false); @@ -1338,7 +1337,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { case Intrinsic::visc_atomic_inc: case Intrinsic::visc_atomic_dec: { - errs() << *II << "\n"; + DEBUG(errs() << *II << "\n"); // Only have support for i32 atomic intrinsics assert(II->getType() == Type::getInt32Ty(II->getContext()) && "Only support i32 atomic intrinsics for now"); @@ -1371,7 +1370,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { ArrayRef<Value*>(atomicArgs, 2), "", II); - errs() << "Substitute with: " << *AtomicInst << "\n"; + DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n"); II->replaceAllUsesWith(AtomicInst); IItoRemove.push_back(II); } @@ -1387,7 +1386,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); if(calleeF->isDeclaration()) { // Add the declaration to kernel module - errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"; + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); KernelM.getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) { // Now handle a few specific intrinsics @@ -1399,7 +1398,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { case Intrinsic::floor: case Intrinsic::nvvm_rsqrt_approx_f: { - errs() << "Found math function: " << *II << "\n"; + DEBUG(errs() << "Found math function: " << *II << "\n"); // Get the builtin function // SPIR uses mangled name for builtin math functions assert(II->getType()->isFloatTy() @@ -1418,7 +1417,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { break; } default: - errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ; + DEBUG(errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ); } } diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 75280bd50d674be701b7b59f36f75125ceed23e0..e609bb63a0e5ecdaa130e939334b905839f59a34 100644 --- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -55,6 +55,7 @@ private: // VISC Runtime API Constant* llvm_visc_x86_launch; Constant* llvm_visc_x86_wait; + Constant* llvm_visc_x86_argument_ptr; Constant* llvm_visc_streamLaunch; Constant* llvm_visc_streamPush; @@ -175,6 +176,7 @@ void CGT_X86::initRuntimeAPI() { DECLARE(llvm_visc_x86_launch); DECLARE(malloc); DECLARE(llvm_visc_x86_wait); + DECLARE(llvm_visc_x86_argument_ptr); DECLARE(llvm_visc_streamLaunch); DECLARE(llvm_visc_streamPush); DECLARE(llvm_visc_streamPop); @@ -1078,6 +1080,12 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { i->getName()+".addr", RI); } + else if(i->getType()->isFloatTy()) { + BI = CastInst::CreateFPCast(bufferIn, + i->getType(), + i->getName()+".addr", + RI); + } else { BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), @@ -1092,7 +1100,7 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { } /* Add a call to the generated function of the child node */ DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); - errs() << "Type: " << *C->getGenFunc()->getType() << "\n"; + DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, C->getGenFunc()->getName()+".output", RI); @@ -1281,6 +1289,36 @@ void CGT_X86::codeGen(DFLeafNode* N) { if(!N->getParent()->isChildGraphStreaming()) addIdxDimArgs(F_X86); + // Go through the arguments, and any pointer arguments with in attribute need + // to have x86_argument_ptr call to get the x86 ptr of the argument + // Insert these calls in a new BB which would dominate all other BBs + // Create new BB + BasicBlock* EntryBB = F_X86->begin(); + BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); + BranchInst* Terminator = BranchInst::Create(EntryBB, BB); + // Insert calls + for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); + ai != ae; ++ai) { + if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) { + assert(ai->getType()->isPointerTy() + && "Only pointer arguments can have visc in/out attributes "); + Argument* size = ai->getNextNode(); + assert(size->getType() == Type::getInt64Ty(M.getContext()) + && "Next argument after a pointer should be an i64 type"); + CastInst* BI = BitCastInst::CreatePointerCast(ai, + Type::getInt8PtrTy(M.getContext()), + ai->getName()+".i8ptr", + Terminator); + Value* ArgPtrCallArgs[] = {BI, size}; + CallInst::Create(llvm_visc_x86_argument_ptr, + ArrayRef<Value*>(ArgPtrCallArgs, 2), + "", + Terminator); + + } + } + errs() << *BB << "\n"; + // Go through all the instructions for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { Instruction *I = &(*i); diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp index 4cbe09538c9126021fed6653fe71c1e7b34df8e7..3b4c82b1cf6143de01121968a80b9274747004af 100755 --- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp +++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp @@ -1082,17 +1082,17 @@ bool GenVISC::runOnModule(Module &M) { // FIXME: What if the child node function has not been visited already. // i.e., it's return type has not been fixed. Function* F = I->getParent()->getParent(); - errs() << F->getName() << "\n"; + DEBUG(errs() << F->getName() << "\n";); IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0)); - errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"; + DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"); Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); - errs() << ChildF->getName() << "\n"; + DEBUG(errs() << ChildF->getName() << "\n";); int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue(); int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue(); StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType()); Type* ReturnType = F->getReturnType(); - errs() << *ReturnType << "\n"; + DEBUG(errs() << *ReturnType << "\n";); assert(ReturnType->isVoidTy() || isa<StructType>(ReturnType) && "Return type should either be a struct or void type!"); diff --git a/llvm/lib/Transforms/LocalMem/LocalMem.cpp b/llvm/lib/Transforms/LocalMem/LocalMem.cpp index 0de36b51cbbe11da3c050d90ddf061544468a69a..a6faa3ef23f6b0275a4f1af9f940993941076f9b 100644 --- a/llvm/lib/Transforms/LocalMem/LocalMem.cpp +++ b/llvm/lib/Transforms/LocalMem/LocalMem.cpp @@ -29,7 +29,6 @@ namespace { // Helper Functions static AllocationNodeProperty* isAllocationNode(DFLeafNode* N); -static Value* getOutValueAt(DFLeafNode*N, unsigned i); // LocalMem - The first implementation. struct LocalMem : public ModulePass { @@ -104,12 +103,12 @@ bool LocalMem::runOnModule(Module &M) { } void AT_OCL::codeGen(DFInternalNode* N) { - errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"; + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); } // Code generation for leaf nodes void AT_OCL::codeGen(DFLeafNode* N) { - errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"; + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); // Skip code generation if it is a dummy node if(N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); @@ -123,10 +122,10 @@ void AT_OCL::codeGen(DFLeafNode* N) { N->setProperty(DFNode::Allocation, ANP); AllocationNodeProperty* anp = (AllocationNodeProperty*) N->getProperty(DFNode::Allocation); AllocationNodeProperty::AllocationListType AL = anp->getAllocationList(); - errs() << "Total allocations = " << AL.size() << "\n"; + DEBUG(errs() << "Total allocations = " << AL.size() << "\n"); for(auto P: AL) { - errs() << " EdgePort: " << P.first->getDestPosition(); - errs() << " Size: " << *P.second << "\n"; + DEBUG(errs() << " EdgePort: " << P.first->getDestPosition()); + DEBUG(errs() << " Size: " << *P.second << "\n"); } } diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp old mode 100755 new mode 100644 index d78b7529c4affa864b58c86fc6f220a944a608f8..18c15167a472f6ae4e2251cf19377bdf201dac6a --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -183,6 +183,10 @@ static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL return d_input; } +void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) { + return llvm_visc_request_mem(ptr, size); +} + void* llvm_visc_request_mem(void* ptr, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n"); @@ -1537,6 +1541,7 @@ void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) { Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); + DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n"); //free(clDevices); free(programSource); diff --git a/llvm/projects/visc-rt/visc-rt.h b/llvm/projects/visc-rt/visc-rt.h old mode 100755 new mode 100644 index 379ac3bf08ff3b2f55399a56edb9840b45e3589f..a5fd214194d29191fb90a8e0de203a10d0eac776 --- a/llvm/projects/visc-rt/visc-rt.h +++ b/llvm/projects/visc-rt/visc-rt.h @@ -160,6 +160,8 @@ void* llvm_visc_x86_launch(void* (void*), void*); void llvm_visc_x86_wait(void*); void* llvm_visc_ocl_initContext(enum visc::Target); +void* llvm_visc_x86_argument_ptr(void*, size_t); + void llvm_visc_ocl_clearContext(void*); void llvm_visc_ocl_argument_shared(void*, int, size_t); void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t); diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile index 3a265f737f93433d5a83a5a6a503a6c879a09d8a..af1e1ac8d24d3e0468e78b294acefa85086e4381 100644 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile @@ -21,9 +21,9 @@ ifeq ($(POS),) endif ifeq ($(POS),middle) - APP_CXXFLAGS+=-DMIDDLE + APP_CXXFLAGS+=-DMIDDLE -DNAME=$(NAME) else ifeq ($(POS),right) - APP_CXXFLAGS+=-DRIGHT + APP_CXXFLAGS+=-DRIGHT -DNAME=$(NAME) endif BIN = $(addsuffix -$(VERSION), $(APP)) diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/copyToVersions.sh b/llvm/test/VISC/parboil/benchmarks/pipeline/copyToVersions.sh new file mode 100755 index 0000000000000000000000000000000000000000..3b9c19bad6dd86de7eb9a82edc7f17b92265155e --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/copyToVersions.sh @@ -0,0 +1,12 @@ + +declare -a versionList=("viscGPU" "viscVector" "viscScalar" "viscGPU-Scalar-MaxG" "viscVector-Scalar-MaxG" "viscGPU-Scalar-ZC" "viscVector-Scalar-ZC") +declare -a fileList=("Makefile" "io.cc" "main.cc") + +for version in "${versionList[@]}"; do + echo $version + for filename in "${fileList[@]}"; do + echo cp ./src/visc_parallel/$filename ./src/$version/ + cp ./src/visc_parallel/$filename ./src/$version/ + done + echo +done diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/run.sh b/llvm/test/VISC/parboil/benchmarks/pipeline/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..0c8435764bd87c92dd30ad51aa97011ddb07b339 --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/run.sh @@ -0,0 +1,20 @@ +#!/bin/bash +echo Pipeline Script $1 $2 + +version=$1 +pos=$2 + +if [[ ($version == *"GPU"*) || ($version == "visc_parallel") ]] +then + target="" +elif [[ $version == *"Vector"* ]] +then + target="x86" +else + target="seq" +fi + +make VERSION=$version TARGET=$target clean +make VERSION=$version TARGET=$target POS=$pos +make VERSION=$version TARGET=$target POS=$pos run + diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc index b501269f0aa55cb808b71a72a445ceb0d1bf67eb..7d609066b0ac6570af7152ca84822c8e9dff816f 100755 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc @@ -415,8 +415,8 @@ void edgeDetection(float *I, size_t bytesI, } } -#define NUM_RUNS 1 -#define NUM_FRAMES 20 +#define NUM_RUNS 5 +#define NUM_FRAMES 2000 using namespace cv; int main (int argc, char *argv[]) { @@ -441,7 +441,7 @@ int main (int argc, char *argv[]) { exit(-1); } - /* Read in data */ + //Read in data //std::cout << "Reading video file: " << params->inpFiles[0] << "\n"; //VideoCapture cap(params->inpFiles[0]); //if(!cap.isOpened()) { @@ -489,6 +489,9 @@ int main (int argc, char *argv[]) { exit(-1); } + // Read data from matrix file + //readColMajorMatrixFile(params->inpFiles[0], + //matIrow, matIcol, matI); pb_InitializeTimerSet(&timers); __visc__init(); diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/Makefile similarity index 96% rename from llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/Makefile rename to llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/Makefile index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/Makefile +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/Makefile @@ -4,8 +4,8 @@ LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include -APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include +APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib #OpenCV link flags all diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/io.cc similarity index 100% rename from llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/io.cc rename to llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/io.cc diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/main.cc new file mode 100755 index 0000000000000000000000000000000000000000..836cf1a01a56e6548f0fd11b3f11df0f303b3a8e --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/main.cc @@ -0,0 +1,1117 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* + * Main entry of dense matrix-matrix multiplication kernel + */ + +#include "opencv2/opencv.hpp" +#include "opencv2/core/ocl.hpp" +#include <stdio.h> +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <malloc.h> +#include <iostream> +#include <cassert> +#include <parboil.h> +#include <visc.h> + + +#define NUM_RUNS 10000 +#define DEPTH 3 +#define HEIGHT 640 +#define WIDTH 480 +std::string input_window = "GPU-CPU(No Vector)-MaxG Pipeline - Input Video"; +std::string output_window = "GPU-CPU(No Vector)-MaxG Pipeline - Edge Mapping"; + + +#ifdef MIDDLE + #define POSX_IN 640 + #define POSY_IN 0 + #define POSX_OUT 640 + #define POSY_OUT 540 + +#elif RIGHT + #define POSX_IN 1280 + #define POSY_IN 0 + #define POSX_OUT 1280 + #define POSY_OUT 540 + +#else // LEFT + #define POSX_IN 0 + #define POSY_IN 0 + #define POSX_OUT 0 + #define POSY_OUT 540 +#endif + + +//#define NUM_FRAMES 20 + + + +// Definitions of sizes for edge detection kernels + +#define MIN_BR 0.0f +#define MAX_BR 1.0f + +// Code needs to be changed for this to vary +#define SZB 3 + +#define REDUCTION_TILE_SZ 1024 + +#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y)) + +extern "C" { + +struct __attribute__((__packed__)) InStruct { + float* I ; + size_t bytesI; + float* Is ; + size_t bytesIs; + float* L; + size_t bytesL; + float* S; + size_t bytesS; + float* G; + size_t bytesG; + float* maxG; + size_t bytesMaxG; + float* E; + size_t bytesE; + float* Gs; + size_t bytesGs; + float* B; + size_t bytesB; + float* Sx; + size_t bytesSx; + float* Sy; + size_t bytesSy; + int m; + int n; + int block_x; + int grid_x; +}; + + +void packData(struct InStruct* args, float* I, size_t bytesI, + float* Is, size_t bytesIs, + float* L, size_t bytesL, + float* S, size_t bytesS, + float* G, size_t bytesG, + float* maxG, size_t bytesMaxG, + float* E, size_t bytesE, + float* Gs, size_t bytesGs, + float* B, size_t bytesB, + float* Sx, size_t bytesSx, + float* Sy, size_t bytesSy, + int m, int n, + int block_x, int grid_x) { + args->I = I; + args->bytesI = bytesI; + args->Is = Is; + args->bytesIs = bytesIs; + args->L = L; + args->bytesL = bytesL; + args->S = S; + args->bytesS = bytesS; + args->G = G; + args->bytesG = bytesG; + args->maxG = maxG; + args->bytesMaxG = bytesMaxG; + args->E = E; + args->bytesE = bytesE; + args->Gs = Gs; + args->bytesGs = bytesGs; + args->B = B; + args->bytesB = bytesB; + args->Sx = Sx; + args->bytesSx = bytesSx; + args->Sy = Sy; + args->bytesSy = bytesSy; + args->m = m; + args->n = n; + args->block_x = block_x; + args->grid_x = grid_x; +} + +/* + * Gaussian smoothing of image I of size m x n + * I : input image + * Gs : gaussian filter + * Is: output (smoothed image) + * m, n : dimensions + * + * Need 2D grid, a thread per pixel + * No use of separable algorithm because we need to do this in one kernel + * No use of shared memory because + * - we don't handle it in the X86 pass + */ + +#define GAUSSIAN_SIZE 7 +#define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2) +void gaussianSmoothing(float *I, size_t bytesI, + float *Gs, size_t bytesGs, + float *Is, size_t bytesIs, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(2, I, Gs, 1, Is); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + int gloc = gx + gy*n; + + float smoothedVal = 0; + float gval; + int loadOffset; + + if ((gx < n) && (gy < m)) { + for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++) + for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) { + + loadOffset = gloc + i*n + j; + + if ((gy + i) < 0) // top contour + loadOffset = gx + j; + else if ((gy + i) > m-1 ) // bottom contour + loadOffset = (m-1)*n + gx + j; + else + loadOffset = gloc + i*n + j; // within image vertically + + // Adjust so we are within image horizonally + if ((gx + j) < 0) // left contour + loadOffset -= (gx+j); + else if ((gx + j) > n-1 ) // right contour + loadOffset = loadOffset - gx - j + n - 1; + + gval = I[loadOffset]; + smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j]; + } + + Is[gloc] = smoothedVal; + } + __visc__return(m, n); +} + +void WrapperGaussianSmoothing(float *I, size_t bytesI, + float *Gs, size_t bytesGs, + float *Is, size_t bytesIs, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, I, Gs, 1, Is); + void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n); + __visc__bindIn(GSNode, 0, 0, 0); // Bind I + __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI + __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs + __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs + __visc__bindIn(GSNode, 4, 4, 0); // Bind Is + __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs + __visc__bindIn(GSNode, 6, 6, 0); // Bind m + __visc__bindIn(GSNode, 7, 7, 0); // Bind n + + __visc__bindOut(GSNode, 0, 0, 0); // bind output m + __visc__bindOut(GSNode, 1, 1, 0); // bind output n +} + + +/* Compute a non-linear laplacian estimate of input image I of size m x n */ +/* + * Is : blurred imput image + * m, n : dimensions + * B : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0]) + * L : output (laplacian of the image) + * Need 2D grid, a thread per pixel +*/ +void laplacianEstimate(float *Is, size_t bytesIs, + float *B, size_t bytesB, + float *L, size_t bytesL, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(2, Is, B, 1, L); + // 3x3 image area + float imageArea[SZB][SZB]; + + //int gx = get_global_id(0); + //int gy = get_global_id(1); + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + //if(gx == 0 && gy == 0) + //std::cout << "Entered laplacian\n"; + + int i, j; + + if ((gx < n) && (gy < m)) { + // Data copy for dilation filter + imageArea[1][1] = Is[gy * n + gx]; + + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; + } else { + imageArea[1][0] = Is[gy * n + gx - 1]; + imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR; + imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; + } else { + imageArea[1][2] = Is[gy * n + gx + 1]; + imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR; + imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR; + } + + imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR; + imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR; + + // Compute pixel of dilated image + float dilatedPixel = MIN_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + + // Data copy for erotion filter - only change the boundary conditions + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; + } else { + if (gy == 0) imageArea[0][0] = MAX_BR; + if (gy == m-1) imageArea[2][0] = MAX_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; + } else { + if (gy == 0) imageArea[0][2] = MAX_BR; + if (gy == m-1) imageArea[2][2] = MAX_BR; + } + + if (gy == 0) imageArea[0][1] = MAX_BR; + if (gy == m-1) imageArea[2][1] = MAX_BR; + + // Compute pixel of eroded image + float erodedPixel = MAX_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + + float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1]; + L[gy*n+gx] = laplacian; + } + //OutStruct output = {bytesB, bytesL}; + //if(gx == m-1 && gy == n-1) + //std::cout << "Exit laplacian\n"; + __visc__return(m); +} + +void WrapperlaplacianEstimate(float *Is, size_t bytesIs, + float *B, size_t bytesB, + float *L, size_t bytesL, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, Is, B, 1, L); + void* LNode = __visc__createNode2D(laplacianEstimate, m, n); + __visc__bindIn(LNode, 0, 0, 0); // Bind Is + __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs + __visc__bindIn(LNode, 2, 2, 0); // Bind B + __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(LNode, 4, 4, 0); // Bind L + __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL + __visc__bindIn(LNode, 6, 6, 0); // Bind m + __visc__bindIn(LNode, 7, 7, 0); // Bind n + + __visc__bindOut(LNode, 0, 0, 0); // bind output m + +} + +/* Compute the zero crossings of input image L of size m x n */ +/* + * L : imput image (computed Laplacian) + * m, n : dimensions + * B : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0]) + * S : output (sign of the image) + * Need 2D grid, a thread per pixel + */ +void computeZeroCrossings(float *L, size_t bytesL, + float *B, size_t bytesB, + float *S, size_t bytesS, + int m, int n) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); + + // 3x3 image area + float imageArea[SZB][SZB]; + + //int gx = get_global_id(0); + //int gy = get_global_id(1); + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + int i, j; + + //if(gx == 0 && gy == 0) + //std::cout << "Entered ZC\n"; + if ((gx < n) && (gy < m)) { + // Data copy for dilation filter + imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR; + + if (gx == 0) { // left most line + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; + } else { + imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR; + imageArea[0][0] = (gy > 0) ? + (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][0] = (gy < m - 1) ? + (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; + } else { + imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR; + imageArea[0][2] = (gy > 0) ? + (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][2] = (gy < m - 1) ? + (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + } + + imageArea[0][1] = (gy > 0) ? + (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][1] = (gy < m - 1)? + (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + + // Compute pixel of dilated image + float dilatedPixel = MIN_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + + // Data copy for erotion filter - only change the boundary conditions + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; + } else { + if (gy == 0) imageArea[0][0] = MAX_BR; + if (gy == m-1) imageArea[2][0] = MAX_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; + } else { + if (gy == 0) imageArea[0][2] = MAX_BR; + if (gy == m-1) imageArea[2][2] = MAX_BR; + } + + if (gy == 0) imageArea[0][1] = MAX_BR; + if (gy == m-1) imageArea[2][1] = MAX_BR; + + // Compute pixel of eroded image + float erodedPixel = MAX_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + + float pixelSign = dilatedPixel - erodedPixel; + S[gy*n+gx] = pixelSign; + } + //OutStruct output = {bytesB, bytesS}; + //if(gx == n-1 && gy == n-1) + //std::cout << "Exit ZC\n"; + __visc__return(m); +} + +void WrapperComputeZeroCrossings(float *L, size_t bytesL, + float *B, size_t bytesB, + float *S, size_t bytesS, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); + void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n); + __visc__bindIn(ZCNode, 0, 0, 0); // Bind L + __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL + __visc__bindIn(ZCNode, 2, 2, 0); // Bind B + __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(ZCNode, 4, 4, 0); // Bind S + __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS + __visc__bindIn(ZCNode, 6, 6, 0); // Bind m + __visc__bindIn(ZCNode, 7, 7, 0); // Bind n + + __visc__bindOut(ZCNode, 0, 0, 0); // bind output m + +} + +/* + * Gradient computation using Sobel filters + * Is : input (smoothed image) + * Sx, Sy: Sobel operators + * - Sx = [-1 0 1 ; -2 0 2 ; -1 0 1 ] + * - Sy = [-1 -2 -1 ; 0 0 0 ; 1 2 1 ] + * m, n : dimensions + * G: output, gradient magnitude : sqrt(Gx^2+Gy^2) + * Need 2D grid, a thread per pixel + * No use of separable algorithm because we need to do this in one kernel + * No use of shared memory because + * - we don't handle it in the X86 pass + */ + +#define SOBEL_SIZE 3 +#define SOBEL_RADIUS (SOBEL_SIZE / 2) + +void computeGradient(float *Is, size_t bytesIs, + float *Sx, size_t bytesSx, + float *Sy, size_t bytesSy, + float *G, size_t bytesG, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(3, Is, Sx, Sy, 1, G); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + int gloc = gx + gy*n; + + float Gx = 0; + float Gy = 0; + float gval; + int loadOffset; + + if ((gx < n) && (gy < m)) { + for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++) + for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) { + + loadOffset = gloc + i*n + j; + + if ((gy + i) < 0) // top contour + loadOffset = gx + j; + else if ((gy + i) > m-1 ) // bottom contour + loadOffset = (m-1)*n + gx + j; + else + loadOffset = gloc + i*n + j; // within image vertically + + // Adjust so we are within image horizonally + if ((gx + j) < 0) // left contour + loadOffset -= (gx+j); + else if ((gx + j) > n-1 ) // right contour + loadOffset = loadOffset - gx - j + n - 1; + + gval = Is[loadOffset]; + Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; + Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; + } + + G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy); + //G[gloc] = Gx*Gx + Gy*Gy; + } + __visc__return(n); +} + +void WrapperComputeGradient(float *Is, size_t bytesIs, + float *Sx, size_t bytesSx, + float *Sy, size_t bytesSy, + float *G, size_t bytesG, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, Is, Sx, Sy, 1, G); + void* CGNode = __visc__createNode2D(computeGradient, m, n); + __visc__bindIn(CGNode, 0, 0, 0); // Bind Is + __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs + __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx + __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx + __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy + __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy + __visc__bindIn(CGNode, 6, 6, 0); // Bind G + __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG + __visc__bindIn(CGNode, 8, 8, 0); // Bind m + __visc__bindIn(CGNode, 9, 9, 0); // Bind n + + __visc__bindOut(CGNode, 0, 0, 0); // bind output m +} + +/* + * Reduction + * G : input + * maxG: output + * m, n: input size + * Needs a single thread block + */ +void computeMaxGradientLeaf(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n) { + + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(1, G, 1, maxG); + + void* thisNode = __visc__getNode(); + + int lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x + int dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x + + + // Assume a single thread block + // Thread block iterates over all elements + for (int i = lx + dimx; i < m*n; i+= dimx) { + if (G[lx] < G[i]) + G[lx] = G[i]; + } + + // First thread iterates over all elements of the thread block + if (lx == 0) { + for (int i = 1; (i < dimx) && (i < m*n); i++) + if (G[lx] < G[i]) + G[lx] = G[i]; + + *maxG = G[lx]; + } + + __visc__return(n); +} + +/* + * Reduction + * G : input + * maxG: output + * Each static node processes 2*nodeDim elements + * Need 1D grid, a thread per 2 pixels + */ +//void computeMaxGradientLeaf(float *G, size_t bytesG, + //float *maxG, size_t bytesMaxG, + //int m, int n) { + + //__visc__hint(visc::DEVICE); + //TODO: maxG should be initialized to zero (MIN_BR) every time + //__visc__attributes(2, G, maxG, 1, maxG); + + //void* thisNode = __visc__getNode(); + //void* parentNode = __visc__getParentNode(thisNode); + + //int lx = __visc__getNodeInstanceID_x(thisNode); + //int px = __visc__getNodeInstanceID_x(parentNode); + //int dimx = __visc__getNumNodeInstances_x(thisNode); + + //int gid = lx + 2*px*dimx; + + //for (unsigned stride = dimx; stride > 32; stride >>= 1) { + //if ((gid + stride < m*n) && (lx < stride)) + //if (G[gid + stride] > G[gid]) + //G[gid] = G[gid + stride]; + //__visc__barrier(); + //} + + //for (unsigned stride = 32; stride >= 1; stride >>= 1) { + //if ((gid + stride < m*n) && (lx < stride)) + //if (G[gid + stride] > G[gid]) + //G[gid] = G[gid + stride]; + //} + + //if (lx == 0) + //__visc__atomic_max(maxG,G[gid]); + + //__visc__return(m); +//} + +void computeMaxGradientTB(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n, + int block_x) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); + __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n + + __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m +} + +void WrapperComputeMaxGradient(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n, + int block_x, int grid_x) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x); + __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n + __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x + + __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m +} + +/* Reject the zero crossings where the gradient is below a threshold */ +/* + * S : input (computed zero crossings) + * m, n : dimensions + * G: gradient of (smoothed) image + * E : output (edges of the image) + * Need 2D grid, a thread per pixel + */ + +#define THETA 0.1 +void rejectZeroCrossings(float *S, size_t bytesS, + float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + float *E, size_t bytesE, + int m, int n) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(3, S, G, maxG, 1, E); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + float mG = *maxG; + //float mG = 1.39203; + if ((gx < n) && (gy < m)) { + E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ; + } + __visc__return(m); +} + +void WrapperRejectZeroCrossings(float *S, size_t bytesS, + float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + float *E, size_t bytesE, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, S, G, maxG, 1, E); + void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n); + __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S + __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS + __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G + __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG + __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG + __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG + __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E + __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE + __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m + __visc__bindIn(RZCNode, 9, 9, 0); // Bind n + + __visc__bindOut(RZCNode, 0, 0, 0); // bind output m +} + + + +// Pipelined Root node +void edgeDetection(float *I, size_t bytesI, // 0 + float *Is, size_t bytesIs, // 2 + float *L, size_t bytesL, // 4 + float *S, size_t bytesS, // 6 + float *G, size_t bytesG, // 8 + float *maxG, size_t bytesMaxG, // 10 + float *E, size_t bytesE, // 12 + float *Gs, size_t bytesGs, // 14 + float *B, size_t bytesB, // 16 + float *Sx, size_t bytesSx, // 18 + float *Sy, size_t bytesSy, // 20 + int m, // 22 + int n, // 23 + int block_x, // 24 + int grid_x // 25 + ) { + __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); + __visc__hint(visc::CPU_TARGET); + void* GSNode = __visc__createNode(WrapperGaussianSmoothing); + void* LNode = __visc__createNode(WrapperlaplacianEstimate); + void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings); + void* CGNode = __visc__createNode(WrapperComputeGradient); + void* CMGNode = __visc__createNode(WrapperComputeMaxGradient); + void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings); + + // Gaussian Inputs + __visc__bindIn(GSNode, 0 , 0, 1); // Bind I + __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI + __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs + __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs + __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is + __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs + __visc__bindIn(GSNode, 22, 6, 1); // Bind m + __visc__bindIn(GSNode, 23, 7, 1); // Bind n + + // Laplacian Inputs + __visc__bindIn(LNode, 2 , 0, 1); // Bind Is + __visc__bindIn(LNode, 3 , 1, 1); // Bind bytesIs + __visc__bindIn(LNode, 16, 2, 1); // Bind B + __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(LNode, 4 , 4, 1); // Bind L + __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL +// __visc__bindIn(LNode, 22, 6, 1); // Bind m + __visc__edge(GSNode, LNode, 0, 6, 1); // Get m + __visc__bindIn(LNode, 23, 7, 1); // Bind n + + // Compute ZC Inputs + __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L + __visc__bindIn(CZCNode, 5 , 1, 1); // Bind bytesL + __visc__bindIn(CZCNode, 16, 2, 1); // Bind B + __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S + __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS + //__visc__bindIn(CZCNode, 22, 6, 1); // Bind m + __visc__edge(LNode, CZCNode, 0, 6, 1); // Get m + __visc__bindIn(CZCNode, 23, 7, 1); // Bind n + + // Gradient Inputs + __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is + __visc__bindIn(CGNode, 3 , 1, 1); // Bind bytesIs + __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx + __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx + __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy + __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy + __visc__bindIn(CGNode, 8 , 6, 1); // Bind G + __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG + __visc__bindIn(CGNode, 22, 8, 1); // Bind m + //__visc__edge(CZCNode, CGNode, 0, 8, 1); // Get m + //__visc__bindIn(CGNode, 23, 9, 1); // Bind n + __visc__edge(GSNode, CGNode, 1, 9, 1); // Get n + + // Max Gradient Inputs + __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G + __visc__bindIn(CMGNode, 9 , 1, 1); // Bind bytesG + __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG + __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG + __visc__bindIn(CMGNode, 22, 4, 1); // Bind m + //__visc__edge(CGNode, CMGNode, 0, 4, 1); // Get m + //__visc__bindIn(CMGNode, 23, 5, 1); // Bind n + __visc__edge(CGNode, CMGNode, 0, 5, 1); // Get n + __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x + __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x + + // Reject ZC Inputs + __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S + __visc__bindIn(RZCNode, 7 , 1, 1); // Bind bytesS + __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G + __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG + __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG + __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesMaxG + __visc__bindIn(RZCNode, 12, 6, 1); // Bind E + __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE + //__visc__bindIn(RZCNode, 22, 8, 1); // Bind m + __visc__edge(CZCNode, RZCNode, 0, 8, 1); // Get m + //__visc__bindIn(RZCNode, 23, 9, 1); // Bind n + __visc__edge(CMGNode, RZCNode, 0, 9, 1); // Get n + + __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality +} + +} + +using namespace cv; + +void getNextFrame(VideoCapture& VC, Mat& F) { + VC >> F; + /// Convert the image to grayscale if image colored + if(F.channels() == 3) + cvtColor( F, F, CV_BGR2GRAY ); + + F.convertTo(F, CV_32F, 1.0/255.0); + +} + +//void showInOut(Mat& Input, Mat& Output) { + //Mat in, out; + //resize(Input, in, Size(512, 768)); + //resize(Output, out, Size(512, 768)); + //imshow(input_window, in); + //imshow(output_window, out); +//} + + +int main (int argc, char *argv[]) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + size_t I_sz; + int block_x, grid_x; + + std::cout << "Using OpenCV" << CV_VERSION << "\n"; + + /* Read command line. Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) + || (params->inpFiles[1] != NULL)) + { + fprintf(stderr, "Expecting input image filename\n"); + exit(-1); + } + + /* Read in data */ + std::cout << "Reading video file: " << params->inpFiles[0] << "\n"; + VideoCapture cap(params->inpFiles[0]); + if(!cap.isOpened()) { + std::cout << "Could not open video file" << "\n"; + return -1; + } + + int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT); + std::cout << "Number of frames = " << NUM_FRAMES << "\n"; + namedWindow(input_window, CV_WINDOW_AUTOSIZE); + namedWindow(output_window, CV_WINDOW_AUTOSIZE); + moveWindow(input_window, POSX_IN, POSY_IN); + moveWindow(output_window, POSX_OUT, POSY_OUT); + + Mat src, Is, L, S, G, E; + + getNextFrame(cap, src); + + std::cout << "Image dimension = " << src.size() << "\n"; + if(!src.isContinuous()) { + std::cout << "Expecting contiguous storage of image in memory!\n"; + exit(-1); + } + + Is = Mat(src.size[0], src.size[1], CV_32F); + L = Mat(src.size[0], src.size[1], CV_32F); + S = Mat(src.size[0], src.size[1], CV_32F); + G = Mat(src.size[0], src.size[1], CV_32F); + E = Mat(src.size[0], src.size[1], CV_32F); + + // All these matrices need to have their data array contiguous in memory + assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous()); + + pb_InitializeTimerSet(&timers); + __visc__init(); + + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // copy A to device memory + I_sz = src.size[0]*src.size[1]*sizeof(float); + + size_t bytesMaxG = sizeof(float); + float* maxG = (float*)malloc(bytesMaxG); + + float B[] = { 1, 1, 1, + 1, 1, 1, + 1, 1, 1 }; + size_t bytesB = 9*sizeof(float); + //Sx = [-1 0 1 ; -2 0 2 ; -1 0 1 ] + //Sy = [-1 -2 -1 ; 0 0 0 ; 1 2 1 ] + float Sx[] = { -1, 0, 1, + -2, 0, 2, + -1, 0, 1 }; + size_t bytesSx = 9*sizeof(float); + float Sy[] = { -1, -2, -1, + 0, 0, 0, + 1, 2, 1 }; + size_t bytesSy = 9*sizeof(float); + + float Gs [] = { + 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 }; + size_t bytesGs = 7*7*sizeof(float); + + block_x = 256; + // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs + grid_x = 1; + + // Copy A and B^T into device memory + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + //showInOut(src, E); + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(input_window, in); + imshow(output_window, out); + waitKey(0); + + //NUM_FRAMES = 20; + pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); + packData(args, (float*)src.data, I_sz, + (float*)Is.data, I_sz, + (float*)L.data, I_sz, + (float*)S.data, I_sz, + (float*)G.data, I_sz, + maxG, bytesMaxG, + (float*)E.data, I_sz, + Gs, bytesGs, + B, bytesB, + Sx, bytesSx, + Sy, bytesSy, + src.size[0], src.size[1], + block_x, grid_x); + + // Check if the total elements is a multiple of block size + assert(src.size[0]*src.size[1] % block_x == 0); + + //imshow(input_window, src); + //imshow(output_window, E); + //waitKey(0); + for(unsigned j=0; j<NUM_RUNS; j++) { + std::cout << "Run: " << j << "\n"; + void* DFG = __visc__launch(1, edgeDetection, (void*)args); + + cap = VideoCapture(params->inpFiles[0]); + getNextFrame(cap, src); + + //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements); + + if(NUM_FRAMES >=2) { + //__visc__push(DFG, args); + //__visc__push(DFG, args); + for(int i=0; i<NUM_FRAMES; i++) { + //std::cout << "Frame " << i << "\n"; + args->I = (float*) src.data; + + *maxG = 0.0; + + llvm_visc_track_mem(src.data, I_sz); + llvm_visc_track_mem(Is.data, I_sz); + llvm_visc_track_mem(L.data, I_sz); + llvm_visc_track_mem(S.data, I_sz); + llvm_visc_track_mem(G.data, I_sz); + llvm_visc_track_mem(maxG, bytesMaxG); + llvm_visc_track_mem(E.data, I_sz); + llvm_visc_track_mem(Gs, bytesGs); + llvm_visc_track_mem(B, bytesB); + llvm_visc_track_mem(Sx, bytesSx); + llvm_visc_track_mem(Sy, bytesSy); + + __visc__push(DFG, args); + __visc__pop(DFG); + + //llvm_visc_request_mem(E.data, I_sz); + //std::cout << "Show E" << "\n"; + //imshow(window_name, E); + //waitKey(0); + //llvm_visc_request_mem(src.data, I_sz); + //llvm_visc_request_mem(Is.data, I_sz); + //llvm_visc_request_mem(L.data, I_sz); + //llvm_visc_request_mem(S.data, I_sz); + //llvm_visc_request_mem(G.data, I_sz); + llvm_visc_request_mem(maxG, bytesMaxG); + llvm_visc_request_mem(E.data, I_sz); + //std::cout << "src.data = " << (float*)src.data << "\n"; + //std::cout << "Is.data = " << (float*)Is.data << "\n"; + //std::cout << "L.data = " << (float*)L.data << "\n"; + //std::cout << "S.data = " << (float*)S.data << "\n"; + //std::cout << "G.data = " << (float*)G.data << "\n"; + //std::cout << "E.data = " << (float*)E.data << "\n"; + //std::cout << "Max G = " << *maxG << "\n"; + + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + //std::cout << "Show E\n"; + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(output_window, out); + imshow(input_window, in); + waitKey(1); + //waitKey(0); + //std::cout << "Show Is\n"; + //resize(Is, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show L\n"; + //resize(L, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show S\n"; + //resize(S, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show G\n"; + //resize(G, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + + llvm_visc_untrack_mem(src.data); + llvm_visc_untrack_mem(Is.data); + llvm_visc_untrack_mem(L.data); + llvm_visc_untrack_mem(S.data); + llvm_visc_untrack_mem(G.data); + llvm_visc_untrack_mem(maxG); + llvm_visc_untrack_mem(E.data); + llvm_visc_untrack_mem(Gs); + llvm_visc_untrack_mem(B); + llvm_visc_untrack_mem(Sx); + llvm_visc_untrack_mem(Sy); + + getNextFrame(cap, src); + + } + //__visc__pop(DFG); + //__visc__pop(DFG); + } + else { + __visc__push(DFG, args); + __visc__pop(DFG); + } + + + __visc__wait(DFG); + } + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + + pb_PrintTimerSet(&timers); + __visc__cleanup(); + + //if (params->outFile) { + + /* Write C to file */ + //pb_SwitchToTimer(&timers, pb_TimerID_IO); + //writeColMajorMatrixFile(params->outFile, + //src.size[0], src.size[1], matE); + //} + + //std::cout << "Show Is" << "\n"; + //Mat output(src.size[0], src.size[1], CV_32F); + //imshow(output_window, Is); + //waitKey(0); + + //std::cout << "Show G" << "\n"; + //imshow(output_window, L); + //waitKey(0); + + //std::cout << "Show L" << "\n"; + //imshow(output_window, S); + //waitKey(0); + + //std::cout << "Show S" << "\n"; + //imshow(output_window, G); + //waitKey(0); + + //std::cout << "Show E" << "\n"; + //imshow(output_window, E); + //waitKey(0); + + //double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + //std::cout<< "GFLOPs = " << 2.* src.size[0] * src.size[1] * src.size[1]/GPUtime/1e9 << std::endl; + pb_FreeParameters(params); + + return 0; +} diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f87cb91102c01826ecf87c2e698822d7caaaef5e --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/Makefile @@ -0,0 +1,12 @@ +# (c) 2010 The Board of Trustees of the University of Illinois. + +LANGUAGE=visc +SRCDIR_OBJS=io.ll #compute_gold.o +VISC_OBJS=main.visc.ll +APP_CUDALDFLAGS=-lm -lstdc++ +APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib + +#OpenCV link flags all +#/usr/bin/c++ -std=c++0x CMakeFiles/EdgeDetect.dir/EdgeDetect.cpp.o -o EdgeDetect -L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_cudev.so.3.0.0 /opt/opencv/lib/libopencv_cudawarping.so.3.0.0 /opt/opencv/lib/libopencv_cudastereo.so.3.0.0 /opt/opencv/lib/libopencv_cudaoptflow.so.3.0.0 /opt/opencv/lib/libopencv_cudaobjdetect.so.3.0.0 /opt/opencv/lib/libopencv_cudalegacy.so.3.0.0 /opt/opencv/lib/libopencv_cudaimgproc.so.3.0.0 /opt/opencv/lib/libopencv_cudafilters.so.3.0.0 /opt/opencv/lib/libopencv_cudafeatures2d.so.3.0.0 /opt/opencv/lib/libopencv_cudacodec.so.3.0.0 /opt/opencv/lib/libopencv_cudabgsegm.so.3.0.0 /opt/opencv/lib/libopencv_cudaarithm.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -lcudart -lnpp -lcufft -lcudart -lnpp -lcufft -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/io.cc @@ -0,0 +1,91 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* I/O routines for reading and writing matrices in column-major + * layout + */ + +#include<fstream> +#include<iostream> +#include<vector> + +char* readFile(const char* fileName) +{ + std::fstream f(fileName,std::fstream::in); + if(!f.good()) + { + std::cerr<<"Error Reading File!!"<<std::endl; + return NULL; + } + + f.seekg(0,std::ios::end); + int length = f.tellg(); + f.seekg(0,std::ios::beg); + + char* buffer; + + if(length>0) + { + buffer = new char[length]; + f.read(buffer,length); + buffer[length-1]=0; + } + else + { + buffer = new char; + buffer[0] = 0; + } + + f.close(); + + return buffer; +} + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << std::endl; + std::fstream f(fn, std::fstream::in); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f >> nr_row; + f >> nr_col; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + while (f.good() ) { + f >> data; + v.push_back(data); + } + v.pop_back(); // remove the duplicated last element + return true; + +} + +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << " for write." << std::endl; + std::fstream f(fn, std::fstream::out); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f << nr_row << " "<<nr_col<<" "; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + for (int i = 0; i < v.size(); ++i) { + f << v[i] << ' '; + } + f << "\n"; + return true; + +} diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/main.cc similarity index 94% rename from llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/main.cc rename to llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/main.cc index e67683e36e7f0fbe44f1f48c2eaa27d4cf821842..c77934beaedb52c020fc1d41aca3686671ab75e2 100755 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/main.cc +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/main.cc @@ -23,6 +23,39 @@ #include <parboil.h> #include <visc.h> + +#define NUM_RUNS 10000 +#define DEPTH 3 +#define HEIGHT 640 +#define WIDTH 480 +std::string input_window = "GPU-CPU(No Vector)-ZC Pipeline - Input Video"; +std::string output_window = "GPU-CPU(No Vector)-ZC Pipeline - Edge Mapping"; + + +#ifdef MIDDLE + #define POSX_IN 640 + #define POSY_IN 0 + #define POSX_OUT 640 + #define POSY_OUT 540 + +#elif RIGHT + #define POSX_IN 1280 + #define POSY_IN 0 + #define POSX_OUT 1280 + #define POSY_OUT 540 + +#else // LEFT + #define POSX_IN 0 + #define POSY_IN 0 + #define POSX_OUT 0 + #define POSY_OUT 540 +#endif + + +//#define NUM_FRAMES 20 + + + // Definitions of sizes for edge detection kernels #define MIN_BR 0.0f @@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::GPU_TARGET); __visc__attributes(2, Is, B, 1, L); // 3x3 image area float imageArea[SZB][SZB]; @@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL, float *S, size_t bytesS, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::SPIR_TARGET); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, L, B, 1, S); // 3x3 image area @@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, int m, int n) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(1, G, 1, maxG); void* thisNode = __visc__getNode(); @@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG, int m, int n, int block_x) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G @@ -645,7 +679,8 @@ void rejectZeroCrossings(float *S, size_t bytesS, float *maxG, size_t bytesMaxG, float *E, size_t bytesE, int m, int n) { - __visc__hint(visc::CPU_TARGET); + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(3, S, G, maxG, 1, E); void* thisNode = __visc__getNode(); @@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0 } } -#define NUM_RUNS 1 -#define DEPTH 3 -#define HEIGHT 640 -#define WIDTH 480 -std::string input_window = "GPU Pipeline - Input Video"; -std::string output_window = "GPU Pipeline - Edge Mapping"; -#define POSX_IN 0 -#define POSY_IN 0 -#define POSX_OUT 0 -#define POSY_OUT 540 -//#define NUM_FRAMES 20 using namespace cv; void getNextFrame(VideoCapture& VC, Mat& F) { @@ -826,8 +850,6 @@ int main (int argc, char *argv[]) { size_t I_sz; int block_x, grid_x; - std::string input_window = "Input Frame"; - std::string output_window = "Edge Map"; std::cout << "Using OpenCV" << CV_VERSION << "\n"; @@ -878,7 +900,7 @@ int main (int argc, char *argv[]) { pb_InitializeTimerSet(&timers); __visc__init(); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); // copy A to device memory I_sz = src.size[0]*src.size[1]*sizeof(float); @@ -915,17 +937,19 @@ int main (int argc, char *argv[]) { grid_x = 1; // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); - struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); - Mat in, out; //showInOut(src, E); + Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); resize(E, out, Size(HEIGHT, WIDTH)); imshow(input_window, in); imshow(output_window, out); waitKey(0); + + //NUM_FRAMES = 20; + pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); packData(args, (float*)src.data, I_sz, (float*)Is.data, I_sz, (float*)L.data, I_sz, @@ -952,7 +976,9 @@ int main (int argc, char *argv[]) { cap = VideoCapture(params->inpFiles[0]); getNextFrame(cap, src); + //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements); + if(NUM_FRAMES >=2) { //__visc__push(DFG, args); //__visc__push(DFG, args); @@ -961,6 +987,7 @@ int main (int argc, char *argv[]) { args->I = (float*) src.data; *maxG = 0.0; + llvm_visc_track_mem(src.data, I_sz); llvm_visc_track_mem(Is.data, I_sz); llvm_visc_track_mem(L.data, I_sz); @@ -972,7 +999,7 @@ int main (int argc, char *argv[]) { llvm_visc_track_mem(B, bytesB); llvm_visc_track_mem(Sx, bytesSx); llvm_visc_track_mem(Sy, bytesSy); - + __visc__push(DFG, args); __visc__pop(DFG); @@ -981,38 +1008,44 @@ int main (int argc, char *argv[]) { //imshow(window_name, E); //waitKey(0); //llvm_visc_request_mem(src.data, I_sz); - llvm_visc_request_mem(Is.data, I_sz); - llvm_visc_request_mem(L.data, I_sz); - llvm_visc_request_mem(S.data, I_sz); - llvm_visc_request_mem(G.data, I_sz); + //llvm_visc_request_mem(Is.data, I_sz); + //llvm_visc_request_mem(L.data, I_sz); + //llvm_visc_request_mem(S.data, I_sz); + //llvm_visc_request_mem(G.data, I_sz); llvm_visc_request_mem(maxG, bytesMaxG); llvm_visc_request_mem(E.data, I_sz); + //std::cout << "src.data = " << (float*)src.data << "\n"; + //std::cout << "Is.data = " << (float*)Is.data << "\n"; + //std::cout << "L.data = " << (float*)L.data << "\n"; + //std::cout << "S.data = " << (float*)S.data << "\n"; + //std::cout << "G.data = " << (float*)G.data << "\n"; + //std::cout << "E.data = " << (float*)E.data << "\n"; //std::cout << "Max G = " << *maxG << "\n"; Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); - std::cout << "Show E\n"; + //std::cout << "Show E\n"; resize(E, out, Size(HEIGHT, WIDTH)); - imshow(input_window, in); - imshow(output_window, out); - waitKey(0); - std::cout << "Show Is\n"; - resize(Is, out, Size(HEIGHT, WIDTH)); - imshow(output_window, out); - waitKey(0); - std::cout << "Show L\n"; - resize(L, out, Size(HEIGHT, WIDTH)); - imshow(output_window, out); - waitKey(0); - std::cout << "Show S\n"; - resize(S, out, Size(HEIGHT, WIDTH)); imshow(output_window, out); - waitKey(0); - std::cout << "Show G\n"; - resize(G, out, Size(HEIGHT, WIDTH)); - imshow(output_window, out); - waitKey(0); - getNextFrame(cap, src); + imshow(input_window, in); + waitKey(1); + //waitKey(0); + //std::cout << "Show Is\n"; + //resize(Is, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show L\n"; + //resize(L, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show S\n"; + //resize(S, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show G\n"; + //resize(G, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); llvm_visc_untrack_mem(src.data); llvm_visc_untrack_mem(Is.data); @@ -1026,7 +1059,7 @@ int main (int argc, char *argv[]) { llvm_visc_untrack_mem(Sx); llvm_visc_untrack_mem(Sy); - + getNextFrame(cap, src); } //__visc__pop(DFG); @@ -1036,7 +1069,8 @@ int main (int argc, char *argv[]) { __visc__push(DFG, args); __visc__pop(DFG); } - + + __visc__wait(DFG); } diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile @@ -4,8 +4,8 @@ LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include -APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include +APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib #OpenCV link flags all diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc index 0da2c030aa83aee13481f2c160d2cc99f7acb2e7..8f5c8451b85e2b02c8e4879e53d871002fc3b171 100755 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc @@ -23,6 +23,39 @@ #include <parboil.h> #include <visc.h> + +#define NUM_RUNS 10000 +#define DEPTH 3 +#define HEIGHT 640 +#define WIDTH 480 +std::string input_window = "GPU Pipeline - Input Video"; +std::string output_window = "GPU Pipeline - Edge Mapping"; + + +#ifdef MIDDLE + #define POSX_IN 640 + #define POSY_IN 0 + #define POSX_OUT 640 + #define POSY_OUT 540 + +#elif RIGHT + #define POSX_IN 1280 + #define POSY_IN 0 + #define POSX_OUT 1280 + #define POSY_OUT 540 + +#else // LEFT + #define POSX_IN 0 + #define POSY_IN 0 + #define POSX_OUT 0 + #define POSY_OUT 540 +#endif + + +//#define NUM_FRAMES 20 + + + // Definitions of sizes for edge detection kernels #define MIN_BR 0.0f @@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::GPU_TARGET); __visc__attributes(2, Is, B, 1, L); // 3x3 image area float imageArea[SZB][SZB]; @@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL, float *S, size_t bytesS, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::SPIR_TARGET); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, L, B, 1, S); // 3x3 image area @@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, int m, int n) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(1, G, 1, maxG); void* thisNode = __visc__getNode(); @@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG, int m, int n, int block_x) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G @@ -646,6 +680,7 @@ void rejectZeroCrossings(float *S, size_t bytesS, float *E, size_t bytesE, int m, int n) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(3, S, G, maxG, 1, E); void* thisNode = __visc__getNode(); @@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0 } } -#define NUM_RUNS 100 -#define DEPTH 3 -#define HEIGHT 640 -#define WIDTH 480 -std::string input_window = "GPU Pipeline - Input Video"; -std::string output_window = "GPU Pipeline - Edge Mapping"; -#define POSX_IN 0 -#define POSY_IN 0 -#define POSX_OUT 0 -#define POSY_OUT 540 -//#define NUM_FRAMES 20 using namespace cv; void getNextFrame(VideoCapture& VC, Mat& F) { @@ -826,8 +850,6 @@ int main (int argc, char *argv[]) { size_t I_sz; int block_x, grid_x; - std::string input_window = "Input Frame"; - std::string output_window = "Edge Map"; std::cout << "Using OpenCV" << CV_VERSION << "\n"; @@ -878,7 +900,7 @@ int main (int argc, char *argv[]) { pb_InitializeTimerSet(&timers); __visc__init(); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); // copy A to device memory I_sz = src.size[0]*src.size[1]*sizeof(float); @@ -915,17 +937,19 @@ int main (int argc, char *argv[]) { grid_x = 1; // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); - struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); - Mat in, out; //showInOut(src, E); + Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); resize(E, out, Size(HEIGHT, WIDTH)); imshow(input_window, in); imshow(output_window, out); waitKey(0); + + //NUM_FRAMES = 20; + pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); packData(args, (float*)src.data, I_sz, (float*)Is.data, I_sz, (float*)L.data, I_sz, @@ -952,7 +976,9 @@ int main (int argc, char *argv[]) { cap = VideoCapture(params->inpFiles[0]); getNextFrame(cap, src); + //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements); + if(NUM_FRAMES >=2) { //__visc__push(DFG, args); //__visc__push(DFG, args); @@ -961,6 +987,7 @@ int main (int argc, char *argv[]) { args->I = (float*) src.data; *maxG = 0.0; + llvm_visc_track_mem(src.data, I_sz); llvm_visc_track_mem(Is.data, I_sz); llvm_visc_track_mem(L.data, I_sz); @@ -972,7 +999,7 @@ int main (int argc, char *argv[]) { llvm_visc_track_mem(B, bytesB); llvm_visc_track_mem(Sx, bytesSx); llvm_visc_track_mem(Sy, bytesSy); - + __visc__push(DFG, args); __visc__pop(DFG); @@ -987,15 +1014,38 @@ int main (int argc, char *argv[]) { //llvm_visc_request_mem(G.data, I_sz); llvm_visc_request_mem(maxG, bytesMaxG); llvm_visc_request_mem(E.data, I_sz); + //std::cout << "src.data = " << (float*)src.data << "\n"; + //std::cout << "Is.data = " << (float*)Is.data << "\n"; + //std::cout << "L.data = " << (float*)L.data << "\n"; + //std::cout << "S.data = " << (float*)S.data << "\n"; + //std::cout << "G.data = " << (float*)G.data << "\n"; + //std::cout << "E.data = " << (float*)E.data << "\n"; //std::cout << "Max G = " << *maxG << "\n"; Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); + //std::cout << "Show E\n"; resize(E, out, Size(HEIGHT, WIDTH)); imshow(output_window, out); imshow(input_window, in); waitKey(1); - getNextFrame(cap, src); + //waitKey(0); + //std::cout << "Show Is\n"; + //resize(Is, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show L\n"; + //resize(L, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show S\n"; + //resize(S, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show G\n"; + //resize(G, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); llvm_visc_untrack_mem(src.data); llvm_visc_untrack_mem(Is.data); @@ -1009,7 +1059,7 @@ int main (int argc, char *argv[]) { llvm_visc_untrack_mem(Sx); llvm_visc_untrack_mem(Sy); - + getNextFrame(cap, src); } //__visc__pop(DFG); @@ -1019,7 +1069,8 @@ int main (int argc, char *argv[]) { __visc__push(DFG, args); __visc__pop(DFG); } - + + __visc__wait(DFG); } diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile @@ -4,8 +4,8 @@ LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include -APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include +APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib #OpenCV link flags all diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc index 717255cc9f9f3300566601cdc9a060503487d8b4..6d4b6e4304496ed555b52e630d80ff016a57d8e1 100755 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc @@ -23,6 +23,39 @@ #include <parboil.h> #include <visc.h> + +#define NUM_RUNS 10000 +#define DEPTH 3 +#define HEIGHT 640 +#define WIDTH 480 +std::string input_window = "CPU(No vector) Pipeline - Input Video"; +std::string output_window = "CPU(No vector) Pipeline - Edge Mapping"; + + +#ifdef MIDDLE + #define POSX_IN 640 + #define POSY_IN 0 + #define POSX_OUT 640 + #define POSY_OUT 540 + +#elif RIGHT + #define POSX_IN 1280 + #define POSY_IN 0 + #define POSX_OUT 1280 + #define POSY_OUT 540 + +#else // LEFT + #define POSX_IN 0 + #define POSY_IN 0 + #define POSX_OUT 0 + #define POSY_OUT 540 +#endif + + +//#define NUM_FRAMES 20 + + + // Definitions of sizes for edge detection kernels #define MIN_BR 0.0f @@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::GPU_TARGET); __visc__attributes(2, Is, B, 1, L); // 3x3 image area float imageArea[SZB][SZB]; @@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL, float *S, size_t bytesS, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::SPIR_TARGET); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, L, B, 1, S); // 3x3 image area @@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, int m, int n) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(1, G, 1, maxG); void* thisNode = __visc__getNode(); @@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG, int m, int n, int block_x) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G @@ -646,6 +680,7 @@ void rejectZeroCrossings(float *S, size_t bytesS, float *E, size_t bytesE, int m, int n) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(3, S, G, maxG, 1, E); void* thisNode = __visc__getNode(); @@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0 } } -#define NUM_RUNS 100 -#define DEPTH 3 -#define HEIGHT 640 -#define WIDTH 480 -std::string input_window = "Scalar Pipeline - Input Video"; -std::string output_window = "Scalar Pipeline - Edge Mapping"; -#define POSX_IN 1280 -#define POSY_IN 0 -#define POSX_OUT 1280 -#define POSY_OUT 540 -//#define NUM_FRAMES 20 using namespace cv; void getNextFrame(VideoCapture& VC, Mat& F) { @@ -826,8 +850,6 @@ int main (int argc, char *argv[]) { size_t I_sz; int block_x, grid_x; - std::string input_window = "Input Frame"; - std::string output_window = "Edge Map"; std::cout << "Using OpenCV" << CV_VERSION << "\n"; @@ -878,7 +900,7 @@ int main (int argc, char *argv[]) { pb_InitializeTimerSet(&timers); __visc__init(); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); // copy A to device memory I_sz = src.size[0]*src.size[1]*sizeof(float); @@ -915,17 +937,19 @@ int main (int argc, char *argv[]) { grid_x = 1; // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); - struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); - Mat in, out; //showInOut(src, E); + Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); resize(E, out, Size(HEIGHT, WIDTH)); imshow(input_window, in); imshow(output_window, out); waitKey(0); + + //NUM_FRAMES = 20; + pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); packData(args, (float*)src.data, I_sz, (float*)Is.data, I_sz, (float*)L.data, I_sz, @@ -952,7 +976,9 @@ int main (int argc, char *argv[]) { cap = VideoCapture(params->inpFiles[0]); getNextFrame(cap, src); + //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements); + if(NUM_FRAMES >=2) { //__visc__push(DFG, args); //__visc__push(DFG, args); @@ -961,6 +987,7 @@ int main (int argc, char *argv[]) { args->I = (float*) src.data; *maxG = 0.0; + llvm_visc_track_mem(src.data, I_sz); llvm_visc_track_mem(Is.data, I_sz); llvm_visc_track_mem(L.data, I_sz); @@ -972,7 +999,7 @@ int main (int argc, char *argv[]) { llvm_visc_track_mem(B, bytesB); llvm_visc_track_mem(Sx, bytesSx); llvm_visc_track_mem(Sy, bytesSy); - + __visc__push(DFG, args); __visc__pop(DFG); @@ -987,15 +1014,38 @@ int main (int argc, char *argv[]) { //llvm_visc_request_mem(G.data, I_sz); llvm_visc_request_mem(maxG, bytesMaxG); llvm_visc_request_mem(E.data, I_sz); + //std::cout << "src.data = " << (float*)src.data << "\n"; + //std::cout << "Is.data = " << (float*)Is.data << "\n"; + //std::cout << "L.data = " << (float*)L.data << "\n"; + //std::cout << "S.data = " << (float*)S.data << "\n"; + //std::cout << "G.data = " << (float*)G.data << "\n"; + //std::cout << "E.data = " << (float*)E.data << "\n"; //std::cout << "Max G = " << *maxG << "\n"; Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); + //std::cout << "Show E\n"; resize(E, out, Size(HEIGHT, WIDTH)); imshow(output_window, out); imshow(input_window, in); waitKey(1); - getNextFrame(cap, src); + //waitKey(0); + //std::cout << "Show Is\n"; + //resize(Is, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show L\n"; + //resize(L, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show S\n"; + //resize(S, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show G\n"; + //resize(G, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); llvm_visc_untrack_mem(src.data); llvm_visc_untrack_mem(Is.data); @@ -1009,7 +1059,7 @@ int main (int argc, char *argv[]) { llvm_visc_untrack_mem(Sx); llvm_visc_untrack_mem(Sy); - + getNextFrame(cap, src); } //__visc__pop(DFG); @@ -1019,7 +1069,8 @@ int main (int argc, char *argv[]) { __visc__push(DFG, args); __visc__pop(DFG); } - + + __visc__wait(DFG); } diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f87cb91102c01826ecf87c2e698822d7caaaef5e --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/Makefile @@ -0,0 +1,12 @@ +# (c) 2010 The Board of Trustees of the University of Illinois. + +LANGUAGE=visc +SRCDIR_OBJS=io.ll #compute_gold.o +VISC_OBJS=main.visc.ll +APP_CUDALDFLAGS=-lm -lstdc++ +APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib + +#OpenCV link flags all +#/usr/bin/c++ -std=c++0x CMakeFiles/EdgeDetect.dir/EdgeDetect.cpp.o -o EdgeDetect -L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_cudev.so.3.0.0 /opt/opencv/lib/libopencv_cudawarping.so.3.0.0 /opt/opencv/lib/libopencv_cudastereo.so.3.0.0 /opt/opencv/lib/libopencv_cudaoptflow.so.3.0.0 /opt/opencv/lib/libopencv_cudaobjdetect.so.3.0.0 /opt/opencv/lib/libopencv_cudalegacy.so.3.0.0 /opt/opencv/lib/libopencv_cudaimgproc.so.3.0.0 /opt/opencv/lib/libopencv_cudafilters.so.3.0.0 /opt/opencv/lib/libopencv_cudafeatures2d.so.3.0.0 /opt/opencv/lib/libopencv_cudacodec.so.3.0.0 /opt/opencv/lib/libopencv_cudabgsegm.so.3.0.0 /opt/opencv/lib/libopencv_cudaarithm.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -lcudart -lnpp -lcufft -lcudart -lnpp -lcufft -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/io.cc @@ -0,0 +1,91 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* I/O routines for reading and writing matrices in column-major + * layout + */ + +#include<fstream> +#include<iostream> +#include<vector> + +char* readFile(const char* fileName) +{ + std::fstream f(fileName,std::fstream::in); + if(!f.good()) + { + std::cerr<<"Error Reading File!!"<<std::endl; + return NULL; + } + + f.seekg(0,std::ios::end); + int length = f.tellg(); + f.seekg(0,std::ios::beg); + + char* buffer; + + if(length>0) + { + buffer = new char[length]; + f.read(buffer,length); + buffer[length-1]=0; + } + else + { + buffer = new char; + buffer[0] = 0; + } + + f.close(); + + return buffer; +} + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << std::endl; + std::fstream f(fn, std::fstream::in); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f >> nr_row; + f >> nr_col; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + while (f.good() ) { + f >> data; + v.push_back(data); + } + v.pop_back(); // remove the duplicated last element + return true; + +} + +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << " for write." << std::endl; + std::fstream f(fn, std::fstream::out); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f << nr_row << " "<<nr_col<<" "; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + for (int i = 0; i < v.size(); ++i) { + f << v[i] << ' '; + } + f << "\n"; + return true; + +} diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/main.cc new file mode 100755 index 0000000000000000000000000000000000000000..22197ad6fb4905660fedef106210d27f31543709 --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/main.cc @@ -0,0 +1,1117 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* + * Main entry of dense matrix-matrix multiplication kernel + */ + +#include "opencv2/opencv.hpp" +#include "opencv2/core/ocl.hpp" +#include <stdio.h> +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <malloc.h> +#include <iostream> +#include <cassert> +#include <parboil.h> +#include <visc.h> + + +#define NUM_RUNS 10000 +#define DEPTH 3 +#define HEIGHT 640 +#define WIDTH 480 +std::string input_window = "Vector-CPU(No Vector)-MaxG Pipeline - Input Video"; +std::string output_window = "Vector-CPU(No Vector)-MaxG Pipeline - Edge Mapping"; + + +#ifdef MIDDLE + #define POSX_IN 640 + #define POSY_IN 0 + #define POSX_OUT 640 + #define POSY_OUT 540 + +#elif RIGHT + #define POSX_IN 1280 + #define POSY_IN 0 + #define POSX_OUT 1280 + #define POSY_OUT 540 + +#else // LEFT + #define POSX_IN 0 + #define POSY_IN 0 + #define POSX_OUT 0 + #define POSY_OUT 540 +#endif + + +//#define NUM_FRAMES 20 + + + +// Definitions of sizes for edge detection kernels + +#define MIN_BR 0.0f +#define MAX_BR 1.0f + +// Code needs to be changed for this to vary +#define SZB 3 + +#define REDUCTION_TILE_SZ 1024 + +#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y)) + +extern "C" { + +struct __attribute__((__packed__)) InStruct { + float* I ; + size_t bytesI; + float* Is ; + size_t bytesIs; + float* L; + size_t bytesL; + float* S; + size_t bytesS; + float* G; + size_t bytesG; + float* maxG; + size_t bytesMaxG; + float* E; + size_t bytesE; + float* Gs; + size_t bytesGs; + float* B; + size_t bytesB; + float* Sx; + size_t bytesSx; + float* Sy; + size_t bytesSy; + int m; + int n; + int block_x; + int grid_x; +}; + + +void packData(struct InStruct* args, float* I, size_t bytesI, + float* Is, size_t bytesIs, + float* L, size_t bytesL, + float* S, size_t bytesS, + float* G, size_t bytesG, + float* maxG, size_t bytesMaxG, + float* E, size_t bytesE, + float* Gs, size_t bytesGs, + float* B, size_t bytesB, + float* Sx, size_t bytesSx, + float* Sy, size_t bytesSy, + int m, int n, + int block_x, int grid_x) { + args->I = I; + args->bytesI = bytesI; + args->Is = Is; + args->bytesIs = bytesIs; + args->L = L; + args->bytesL = bytesL; + args->S = S; + args->bytesS = bytesS; + args->G = G; + args->bytesG = bytesG; + args->maxG = maxG; + args->bytesMaxG = bytesMaxG; + args->E = E; + args->bytesE = bytesE; + args->Gs = Gs; + args->bytesGs = bytesGs; + args->B = B; + args->bytesB = bytesB; + args->Sx = Sx; + args->bytesSx = bytesSx; + args->Sy = Sy; + args->bytesSy = bytesSy; + args->m = m; + args->n = n; + args->block_x = block_x; + args->grid_x = grid_x; +} + +/* + * Gaussian smoothing of image I of size m x n + * I : input image + * Gs : gaussian filter + * Is: output (smoothed image) + * m, n : dimensions + * + * Need 2D grid, a thread per pixel + * No use of separable algorithm because we need to do this in one kernel + * No use of shared memory because + * - we don't handle it in the X86 pass + */ + +#define GAUSSIAN_SIZE 7 +#define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2) +void gaussianSmoothing(float *I, size_t bytesI, + float *Gs, size_t bytesGs, + float *Is, size_t bytesIs, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(2, I, Gs, 1, Is); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + int gloc = gx + gy*n; + + float smoothedVal = 0; + float gval; + int loadOffset; + + if ((gx < n) && (gy < m)) { + for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++) + for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) { + + loadOffset = gloc + i*n + j; + + if ((gy + i) < 0) // top contour + loadOffset = gx + j; + else if ((gy + i) > m-1 ) // bottom contour + loadOffset = (m-1)*n + gx + j; + else + loadOffset = gloc + i*n + j; // within image vertically + + // Adjust so we are within image horizonally + if ((gx + j) < 0) // left contour + loadOffset -= (gx+j); + else if ((gx + j) > n-1 ) // right contour + loadOffset = loadOffset - gx - j + n - 1; + + gval = I[loadOffset]; + smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j]; + } + + Is[gloc] = smoothedVal; + } + __visc__return(m, n); +} + +void WrapperGaussianSmoothing(float *I, size_t bytesI, + float *Gs, size_t bytesGs, + float *Is, size_t bytesIs, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, I, Gs, 1, Is); + void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n); + __visc__bindIn(GSNode, 0, 0, 0); // Bind I + __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI + __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs + __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs + __visc__bindIn(GSNode, 4, 4, 0); // Bind Is + __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs + __visc__bindIn(GSNode, 6, 6, 0); // Bind m + __visc__bindIn(GSNode, 7, 7, 0); // Bind n + + __visc__bindOut(GSNode, 0, 0, 0); // bind output m + __visc__bindOut(GSNode, 1, 1, 0); // bind output n +} + + +/* Compute a non-linear laplacian estimate of input image I of size m x n */ +/* + * Is : blurred imput image + * m, n : dimensions + * B : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0]) + * L : output (laplacian of the image) + * Need 2D grid, a thread per pixel +*/ +void laplacianEstimate(float *Is, size_t bytesIs, + float *B, size_t bytesB, + float *L, size_t bytesL, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(2, Is, B, 1, L); + // 3x3 image area + float imageArea[SZB][SZB]; + + //int gx = get_global_id(0); + //int gy = get_global_id(1); + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + //if(gx == 0 && gy == 0) + //std::cout << "Entered laplacian\n"; + + int i, j; + + if ((gx < n) && (gy < m)) { + // Data copy for dilation filter + imageArea[1][1] = Is[gy * n + gx]; + + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; + } else { + imageArea[1][0] = Is[gy * n + gx - 1]; + imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR; + imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; + } else { + imageArea[1][2] = Is[gy * n + gx + 1]; + imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR; + imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR; + } + + imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR; + imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR; + + // Compute pixel of dilated image + float dilatedPixel = MIN_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + + // Data copy for erotion filter - only change the boundary conditions + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; + } else { + if (gy == 0) imageArea[0][0] = MAX_BR; + if (gy == m-1) imageArea[2][0] = MAX_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; + } else { + if (gy == 0) imageArea[0][2] = MAX_BR; + if (gy == m-1) imageArea[2][2] = MAX_BR; + } + + if (gy == 0) imageArea[0][1] = MAX_BR; + if (gy == m-1) imageArea[2][1] = MAX_BR; + + // Compute pixel of eroded image + float erodedPixel = MAX_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + + float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1]; + L[gy*n+gx] = laplacian; + } + //OutStruct output = {bytesB, bytesL}; + //if(gx == m-1 && gy == n-1) + //std::cout << "Exit laplacian\n"; + __visc__return(m); +} + +void WrapperlaplacianEstimate(float *Is, size_t bytesIs, + float *B, size_t bytesB, + float *L, size_t bytesL, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, Is, B, 1, L); + void* LNode = __visc__createNode2D(laplacianEstimate, m, n); + __visc__bindIn(LNode, 0, 0, 0); // Bind Is + __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs + __visc__bindIn(LNode, 2, 2, 0); // Bind B + __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(LNode, 4, 4, 0); // Bind L + __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL + __visc__bindIn(LNode, 6, 6, 0); // Bind m + __visc__bindIn(LNode, 7, 7, 0); // Bind n + + __visc__bindOut(LNode, 0, 0, 0); // bind output m + +} + +/* Compute the zero crossings of input image L of size m x n */ +/* + * L : imput image (computed Laplacian) + * m, n : dimensions + * B : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0]) + * S : output (sign of the image) + * Need 2D grid, a thread per pixel + */ +void computeZeroCrossings(float *L, size_t bytesL, + float *B, size_t bytesB, + float *S, size_t bytesS, + int m, int n) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); + + // 3x3 image area + float imageArea[SZB][SZB]; + + //int gx = get_global_id(0); + //int gy = get_global_id(1); + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + int i, j; + + //if(gx == 0 && gy == 0) + //std::cout << "Entered ZC\n"; + if ((gx < n) && (gy < m)) { + // Data copy for dilation filter + imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR; + + if (gx == 0) { // left most line + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; + } else { + imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR; + imageArea[0][0] = (gy > 0) ? + (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][0] = (gy < m - 1) ? + (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; + } else { + imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR; + imageArea[0][2] = (gy > 0) ? + (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][2] = (gy < m - 1) ? + (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + } + + imageArea[0][1] = (gy > 0) ? + (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][1] = (gy < m - 1)? + (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + + // Compute pixel of dilated image + float dilatedPixel = MIN_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + + // Data copy for erotion filter - only change the boundary conditions + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; + } else { + if (gy == 0) imageArea[0][0] = MAX_BR; + if (gy == m-1) imageArea[2][0] = MAX_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; + } else { + if (gy == 0) imageArea[0][2] = MAX_BR; + if (gy == m-1) imageArea[2][2] = MAX_BR; + } + + if (gy == 0) imageArea[0][1] = MAX_BR; + if (gy == m-1) imageArea[2][1] = MAX_BR; + + // Compute pixel of eroded image + float erodedPixel = MAX_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + + float pixelSign = dilatedPixel - erodedPixel; + S[gy*n+gx] = pixelSign; + } + //OutStruct output = {bytesB, bytesS}; + //if(gx == n-1 && gy == n-1) + //std::cout << "Exit ZC\n"; + __visc__return(m); +} + +void WrapperComputeZeroCrossings(float *L, size_t bytesL, + float *B, size_t bytesB, + float *S, size_t bytesS, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); + void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n); + __visc__bindIn(ZCNode, 0, 0, 0); // Bind L + __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL + __visc__bindIn(ZCNode, 2, 2, 0); // Bind B + __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(ZCNode, 4, 4, 0); // Bind S + __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS + __visc__bindIn(ZCNode, 6, 6, 0); // Bind m + __visc__bindIn(ZCNode, 7, 7, 0); // Bind n + + __visc__bindOut(ZCNode, 0, 0, 0); // bind output m + +} + +/* + * Gradient computation using Sobel filters + * Is : input (smoothed image) + * Sx, Sy: Sobel operators + * - Sx = [-1 0 1 ; -2 0 2 ; -1 0 1 ] + * - Sy = [-1 -2 -1 ; 0 0 0 ; 1 2 1 ] + * m, n : dimensions + * G: output, gradient magnitude : sqrt(Gx^2+Gy^2) + * Need 2D grid, a thread per pixel + * No use of separable algorithm because we need to do this in one kernel + * No use of shared memory because + * - we don't handle it in the X86 pass + */ + +#define SOBEL_SIZE 3 +#define SOBEL_RADIUS (SOBEL_SIZE / 2) + +void computeGradient(float *Is, size_t bytesIs, + float *Sx, size_t bytesSx, + float *Sy, size_t bytesSy, + float *G, size_t bytesG, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(3, Is, Sx, Sy, 1, G); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + int gloc = gx + gy*n; + + float Gx = 0; + float Gy = 0; + float gval; + int loadOffset; + + if ((gx < n) && (gy < m)) { + for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++) + for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) { + + loadOffset = gloc + i*n + j; + + if ((gy + i) < 0) // top contour + loadOffset = gx + j; + else if ((gy + i) > m-1 ) // bottom contour + loadOffset = (m-1)*n + gx + j; + else + loadOffset = gloc + i*n + j; // within image vertically + + // Adjust so we are within image horizonally + if ((gx + j) < 0) // left contour + loadOffset -= (gx+j); + else if ((gx + j) > n-1 ) // right contour + loadOffset = loadOffset - gx - j + n - 1; + + gval = Is[loadOffset]; + Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; + Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; + } + + G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy); + //G[gloc] = Gx*Gx + Gy*Gy; + } + __visc__return(n); +} + +void WrapperComputeGradient(float *Is, size_t bytesIs, + float *Sx, size_t bytesSx, + float *Sy, size_t bytesSy, + float *G, size_t bytesG, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, Is, Sx, Sy, 1, G); + void* CGNode = __visc__createNode2D(computeGradient, m, n); + __visc__bindIn(CGNode, 0, 0, 0); // Bind Is + __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs + __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx + __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx + __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy + __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy + __visc__bindIn(CGNode, 6, 6, 0); // Bind G + __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG + __visc__bindIn(CGNode, 8, 8, 0); // Bind m + __visc__bindIn(CGNode, 9, 9, 0); // Bind n + + __visc__bindOut(CGNode, 0, 0, 0); // bind output m +} + +/* + * Reduction + * G : input + * maxG: output + * m, n: input size + * Needs a single thread block + */ +void computeMaxGradientLeaf(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n) { + + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(1, G, 1, maxG); + + void* thisNode = __visc__getNode(); + + int lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x + int dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x + + + // Assume a single thread block + // Thread block iterates over all elements + for (int i = lx + dimx; i < m*n; i+= dimx) { + if (G[lx] < G[i]) + G[lx] = G[i]; + } + + // First thread iterates over all elements of the thread block + if (lx == 0) { + for (int i = 1; (i < dimx) && (i < m*n); i++) + if (G[lx] < G[i]) + G[lx] = G[i]; + + *maxG = G[lx]; + } + + __visc__return(n); +} + +/* + * Reduction + * G : input + * maxG: output + * Each static node processes 2*nodeDim elements + * Need 1D grid, a thread per 2 pixels + */ +//void computeMaxGradientLeaf(float *G, size_t bytesG, + //float *maxG, size_t bytesMaxG, + //int m, int n) { + + //__visc__hint(visc::DEVICE); + //TODO: maxG should be initialized to zero (MIN_BR) every time + //__visc__attributes(2, G, maxG, 1, maxG); + + //void* thisNode = __visc__getNode(); + //void* parentNode = __visc__getParentNode(thisNode); + + //int lx = __visc__getNodeInstanceID_x(thisNode); + //int px = __visc__getNodeInstanceID_x(parentNode); + //int dimx = __visc__getNumNodeInstances_x(thisNode); + + //int gid = lx + 2*px*dimx; + + //for (unsigned stride = dimx; stride > 32; stride >>= 1) { + //if ((gid + stride < m*n) && (lx < stride)) + //if (G[gid + stride] > G[gid]) + //G[gid] = G[gid + stride]; + //__visc__barrier(); + //} + + //for (unsigned stride = 32; stride >= 1; stride >>= 1) { + //if ((gid + stride < m*n) && (lx < stride)) + //if (G[gid + stride] > G[gid]) + //G[gid] = G[gid + stride]; + //} + + //if (lx == 0) + //__visc__atomic_max(maxG,G[gid]); + + //__visc__return(m); +//} + +void computeMaxGradientTB(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n, + int block_x) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); + __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n + + __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m +} + +void WrapperComputeMaxGradient(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n, + int block_x, int grid_x) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x); + __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n + __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x + + __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m +} + +/* Reject the zero crossings where the gradient is below a threshold */ +/* + * S : input (computed zero crossings) + * m, n : dimensions + * G: gradient of (smoothed) image + * E : output (edges of the image) + * Need 2D grid, a thread per pixel + */ + +#define THETA 0.1 +void rejectZeroCrossings(float *S, size_t bytesS, + float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + float *E, size_t bytesE, + int m, int n) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(3, S, G, maxG, 1, E); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + float mG = *maxG; + //float mG = 1.39203; + if ((gx < n) && (gy < m)) { + E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ; + } + __visc__return(m); +} + +void WrapperRejectZeroCrossings(float *S, size_t bytesS, + float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + float *E, size_t bytesE, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, S, G, maxG, 1, E); + void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n); + __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S + __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS + __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G + __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG + __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG + __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG + __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E + __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE + __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m + __visc__bindIn(RZCNode, 9, 9, 0); // Bind n + + __visc__bindOut(RZCNode, 0, 0, 0); // bind output m +} + + + +// Pipelined Root node +void edgeDetection(float *I, size_t bytesI, // 0 + float *Is, size_t bytesIs, // 2 + float *L, size_t bytesL, // 4 + float *S, size_t bytesS, // 6 + float *G, size_t bytesG, // 8 + float *maxG, size_t bytesMaxG, // 10 + float *E, size_t bytesE, // 12 + float *Gs, size_t bytesGs, // 14 + float *B, size_t bytesB, // 16 + float *Sx, size_t bytesSx, // 18 + float *Sy, size_t bytesSy, // 20 + int m, // 22 + int n, // 23 + int block_x, // 24 + int grid_x // 25 + ) { + __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); + __visc__hint(visc::CPU_TARGET); + void* GSNode = __visc__createNode(WrapperGaussianSmoothing); + void* LNode = __visc__createNode(WrapperlaplacianEstimate); + void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings); + void* CGNode = __visc__createNode(WrapperComputeGradient); + void* CMGNode = __visc__createNode(WrapperComputeMaxGradient); + void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings); + + // Gaussian Inputs + __visc__bindIn(GSNode, 0 , 0, 1); // Bind I + __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI + __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs + __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs + __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is + __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs + __visc__bindIn(GSNode, 22, 6, 1); // Bind m + __visc__bindIn(GSNode, 23, 7, 1); // Bind n + + // Laplacian Inputs + __visc__bindIn(LNode, 2 , 0, 1); // Bind Is + __visc__bindIn(LNode, 3 , 1, 1); // Bind bytesIs + __visc__bindIn(LNode, 16, 2, 1); // Bind B + __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(LNode, 4 , 4, 1); // Bind L + __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL +// __visc__bindIn(LNode, 22, 6, 1); // Bind m + __visc__edge(GSNode, LNode, 0, 6, 1); // Get m + __visc__bindIn(LNode, 23, 7, 1); // Bind n + + // Compute ZC Inputs + __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L + __visc__bindIn(CZCNode, 5 , 1, 1); // Bind bytesL + __visc__bindIn(CZCNode, 16, 2, 1); // Bind B + __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S + __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS + //__visc__bindIn(CZCNode, 22, 6, 1); // Bind m + __visc__edge(LNode, CZCNode, 0, 6, 1); // Get m + __visc__bindIn(CZCNode, 23, 7, 1); // Bind n + + // Gradient Inputs + __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is + __visc__bindIn(CGNode, 3 , 1, 1); // Bind bytesIs + __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx + __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx + __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy + __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy + __visc__bindIn(CGNode, 8 , 6, 1); // Bind G + __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG + __visc__bindIn(CGNode, 22, 8, 1); // Bind m + //__visc__edge(CZCNode, CGNode, 0, 8, 1); // Get m + //__visc__bindIn(CGNode, 23, 9, 1); // Bind n + __visc__edge(GSNode, CGNode, 1, 9, 1); // Get n + + // Max Gradient Inputs + __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G + __visc__bindIn(CMGNode, 9 , 1, 1); // Bind bytesG + __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG + __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG + __visc__bindIn(CMGNode, 22, 4, 1); // Bind m + //__visc__edge(CGNode, CMGNode, 0, 4, 1); // Get m + //__visc__bindIn(CMGNode, 23, 5, 1); // Bind n + __visc__edge(CGNode, CMGNode, 0, 5, 1); // Get n + __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x + __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x + + // Reject ZC Inputs + __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S + __visc__bindIn(RZCNode, 7 , 1, 1); // Bind bytesS + __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G + __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG + __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG + __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesMaxG + __visc__bindIn(RZCNode, 12, 6, 1); // Bind E + __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE + //__visc__bindIn(RZCNode, 22, 8, 1); // Bind m + __visc__edge(CZCNode, RZCNode, 0, 8, 1); // Get m + //__visc__bindIn(RZCNode, 23, 9, 1); // Bind n + __visc__edge(CMGNode, RZCNode, 0, 9, 1); // Get n + + __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality +} + +} + +using namespace cv; + +void getNextFrame(VideoCapture& VC, Mat& F) { + VC >> F; + /// Convert the image to grayscale if image colored + if(F.channels() == 3) + cvtColor( F, F, CV_BGR2GRAY ); + + F.convertTo(F, CV_32F, 1.0/255.0); + +} + +//void showInOut(Mat& Input, Mat& Output) { + //Mat in, out; + //resize(Input, in, Size(512, 768)); + //resize(Output, out, Size(512, 768)); + //imshow(input_window, in); + //imshow(output_window, out); +//} + + +int main (int argc, char *argv[]) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + size_t I_sz; + int block_x, grid_x; + + std::cout << "Using OpenCV" << CV_VERSION << "\n"; + + /* Read command line. Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) + || (params->inpFiles[1] != NULL)) + { + fprintf(stderr, "Expecting input image filename\n"); + exit(-1); + } + + /* Read in data */ + std::cout << "Reading video file: " << params->inpFiles[0] << "\n"; + VideoCapture cap(params->inpFiles[0]); + if(!cap.isOpened()) { + std::cout << "Could not open video file" << "\n"; + return -1; + } + + int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT); + std::cout << "Number of frames = " << NUM_FRAMES << "\n"; + namedWindow(input_window, CV_WINDOW_AUTOSIZE); + namedWindow(output_window, CV_WINDOW_AUTOSIZE); + moveWindow(input_window, POSX_IN, POSY_IN); + moveWindow(output_window, POSX_OUT, POSY_OUT); + + Mat src, Is, L, S, G, E; + + getNextFrame(cap, src); + + std::cout << "Image dimension = " << src.size() << "\n"; + if(!src.isContinuous()) { + std::cout << "Expecting contiguous storage of image in memory!\n"; + exit(-1); + } + + Is = Mat(src.size[0], src.size[1], CV_32F); + L = Mat(src.size[0], src.size[1], CV_32F); + S = Mat(src.size[0], src.size[1], CV_32F); + G = Mat(src.size[0], src.size[1], CV_32F); + E = Mat(src.size[0], src.size[1], CV_32F); + + // All these matrices need to have their data array contiguous in memory + assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous()); + + pb_InitializeTimerSet(&timers); + __visc__init(); + + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // copy A to device memory + I_sz = src.size[0]*src.size[1]*sizeof(float); + + size_t bytesMaxG = sizeof(float); + float* maxG = (float*)malloc(bytesMaxG); + + float B[] = { 1, 1, 1, + 1, 1, 1, + 1, 1, 1 }; + size_t bytesB = 9*sizeof(float); + //Sx = [-1 0 1 ; -2 0 2 ; -1 0 1 ] + //Sy = [-1 -2 -1 ; 0 0 0 ; 1 2 1 ] + float Sx[] = { -1, 0, 1, + -2, 0, 2, + -1, 0, 1 }; + size_t bytesSx = 9*sizeof(float); + float Sy[] = { -1, -2, -1, + 0, 0, 0, + 1, 2, 1 }; + size_t bytesSy = 9*sizeof(float); + + float Gs [] = { + 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 }; + size_t bytesGs = 7*7*sizeof(float); + + block_x = 256; + // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs + grid_x = 1; + + // Copy A and B^T into device memory + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + //showInOut(src, E); + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(input_window, in); + imshow(output_window, out); + waitKey(0); + + //NUM_FRAMES = 20; + pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); + packData(args, (float*)src.data, I_sz, + (float*)Is.data, I_sz, + (float*)L.data, I_sz, + (float*)S.data, I_sz, + (float*)G.data, I_sz, + maxG, bytesMaxG, + (float*)E.data, I_sz, + Gs, bytesGs, + B, bytesB, + Sx, bytesSx, + Sy, bytesSy, + src.size[0], src.size[1], + block_x, grid_x); + + // Check if the total elements is a multiple of block size + assert(src.size[0]*src.size[1] % block_x == 0); + + //imshow(input_window, src); + //imshow(output_window, E); + //waitKey(0); + for(unsigned j=0; j<NUM_RUNS; j++) { + std::cout << "Run: " << j << "\n"; + void* DFG = __visc__launch(1, edgeDetection, (void*)args); + + cap = VideoCapture(params->inpFiles[0]); + getNextFrame(cap, src); + + //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements); + + if(NUM_FRAMES >=2) { + //__visc__push(DFG, args); + //__visc__push(DFG, args); + for(int i=0; i<NUM_FRAMES; i++) { + //std::cout << "Frame " << i << "\n"; + args->I = (float*) src.data; + + *maxG = 0.0; + + llvm_visc_track_mem(src.data, I_sz); + llvm_visc_track_mem(Is.data, I_sz); + llvm_visc_track_mem(L.data, I_sz); + llvm_visc_track_mem(S.data, I_sz); + llvm_visc_track_mem(G.data, I_sz); + llvm_visc_track_mem(maxG, bytesMaxG); + llvm_visc_track_mem(E.data, I_sz); + llvm_visc_track_mem(Gs, bytesGs); + llvm_visc_track_mem(B, bytesB); + llvm_visc_track_mem(Sx, bytesSx); + llvm_visc_track_mem(Sy, bytesSy); + + __visc__push(DFG, args); + __visc__pop(DFG); + + //llvm_visc_request_mem(E.data, I_sz); + //std::cout << "Show E" << "\n"; + //imshow(window_name, E); + //waitKey(0); + //llvm_visc_request_mem(src.data, I_sz); + //llvm_visc_request_mem(Is.data, I_sz); + //llvm_visc_request_mem(L.data, I_sz); + //llvm_visc_request_mem(S.data, I_sz); + //llvm_visc_request_mem(G.data, I_sz); + llvm_visc_request_mem(maxG, bytesMaxG); + llvm_visc_request_mem(E.data, I_sz); + //std::cout << "src.data = " << (float*)src.data << "\n"; + //std::cout << "Is.data = " << (float*)Is.data << "\n"; + //std::cout << "L.data = " << (float*)L.data << "\n"; + //std::cout << "S.data = " << (float*)S.data << "\n"; + //std::cout << "G.data = " << (float*)G.data << "\n"; + //std::cout << "E.data = " << (float*)E.data << "\n"; + //std::cout << "Max G = " << *maxG << "\n"; + + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + //std::cout << "Show E\n"; + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(output_window, out); + imshow(input_window, in); + waitKey(1); + //waitKey(0); + //std::cout << "Show Is\n"; + //resize(Is, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show L\n"; + //resize(L, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show S\n"; + //resize(S, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show G\n"; + //resize(G, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + + llvm_visc_untrack_mem(src.data); + llvm_visc_untrack_mem(Is.data); + llvm_visc_untrack_mem(L.data); + llvm_visc_untrack_mem(S.data); + llvm_visc_untrack_mem(G.data); + llvm_visc_untrack_mem(maxG); + llvm_visc_untrack_mem(E.data); + llvm_visc_untrack_mem(Gs); + llvm_visc_untrack_mem(B); + llvm_visc_untrack_mem(Sx); + llvm_visc_untrack_mem(Sy); + + getNextFrame(cap, src); + + } + //__visc__pop(DFG); + //__visc__pop(DFG); + } + else { + __visc__push(DFG, args); + __visc__pop(DFG); + } + + + __visc__wait(DFG); + } + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + + pb_PrintTimerSet(&timers); + __visc__cleanup(); + + //if (params->outFile) { + + /* Write C to file */ + //pb_SwitchToTimer(&timers, pb_TimerID_IO); + //writeColMajorMatrixFile(params->outFile, + //src.size[0], src.size[1], matE); + //} + + //std::cout << "Show Is" << "\n"; + //Mat output(src.size[0], src.size[1], CV_32F); + //imshow(output_window, Is); + //waitKey(0); + + //std::cout << "Show G" << "\n"; + //imshow(output_window, L); + //waitKey(0); + + //std::cout << "Show L" << "\n"; + //imshow(output_window, S); + //waitKey(0); + + //std::cout << "Show S" << "\n"; + //imshow(output_window, G); + //waitKey(0); + + //std::cout << "Show E" << "\n"; + //imshow(output_window, E); + //waitKey(0); + + //double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + //std::cout<< "GFLOPs = " << 2.* src.size[0] * src.size[1] * src.size[1]/GPUtime/1e9 << std::endl; + pb_FreeParameters(params); + + return 0; +} diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f87cb91102c01826ecf87c2e698822d7caaaef5e --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/Makefile @@ -0,0 +1,12 @@ +# (c) 2010 The Board of Trustees of the University of Illinois. + +LANGUAGE=visc +SRCDIR_OBJS=io.ll #compute_gold.o +VISC_OBJS=main.visc.ll +APP_CUDALDFLAGS=-lm -lstdc++ +APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib + +#OpenCV link flags all +#/usr/bin/c++ -std=c++0x CMakeFiles/EdgeDetect.dir/EdgeDetect.cpp.o -o EdgeDetect -L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_cudev.so.3.0.0 /opt/opencv/lib/libopencv_cudawarping.so.3.0.0 /opt/opencv/lib/libopencv_cudastereo.so.3.0.0 /opt/opencv/lib/libopencv_cudaoptflow.so.3.0.0 /opt/opencv/lib/libopencv_cudaobjdetect.so.3.0.0 /opt/opencv/lib/libopencv_cudalegacy.so.3.0.0 /opt/opencv/lib/libopencv_cudaimgproc.so.3.0.0 /opt/opencv/lib/libopencv_cudafilters.so.3.0.0 /opt/opencv/lib/libopencv_cudafeatures2d.so.3.0.0 /opt/opencv/lib/libopencv_cudacodec.so.3.0.0 /opt/opencv/lib/libopencv_cudabgsegm.so.3.0.0 /opt/opencv/lib/libopencv_cudaarithm.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -lcudart -lnpp -lcufft -lcudart -lnpp -lcufft -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/io.cc @@ -0,0 +1,91 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* I/O routines for reading and writing matrices in column-major + * layout + */ + +#include<fstream> +#include<iostream> +#include<vector> + +char* readFile(const char* fileName) +{ + std::fstream f(fileName,std::fstream::in); + if(!f.good()) + { + std::cerr<<"Error Reading File!!"<<std::endl; + return NULL; + } + + f.seekg(0,std::ios::end); + int length = f.tellg(); + f.seekg(0,std::ios::beg); + + char* buffer; + + if(length>0) + { + buffer = new char[length]; + f.read(buffer,length); + buffer[length-1]=0; + } + else + { + buffer = new char; + buffer[0] = 0; + } + + f.close(); + + return buffer; +} + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << std::endl; + std::fstream f(fn, std::fstream::in); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f >> nr_row; + f >> nr_col; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + while (f.good() ) { + f >> data; + v.push_back(data); + } + v.pop_back(); // remove the duplicated last element + return true; + +} + +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << " for write." << std::endl; + std::fstream f(fn, std::fstream::out); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f << nr_row << " "<<nr_col<<" "; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + for (int i = 0; i < v.size(); ++i) { + f << v[i] << ' '; + } + f << "\n"; + return true; + +} diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/main.cc new file mode 100755 index 0000000000000000000000000000000000000000..a061fc03e9e63d9dd12b2628ca3d72eeac9f8a6f --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/main.cc @@ -0,0 +1,1117 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* + * Main entry of dense matrix-matrix multiplication kernel + */ + +#include "opencv2/opencv.hpp" +#include "opencv2/core/ocl.hpp" +#include <stdio.h> +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <malloc.h> +#include <iostream> +#include <cassert> +#include <parboil.h> +#include <visc.h> + + +#define NUM_RUNS 10000 +#define DEPTH 3 +#define HEIGHT 640 +#define WIDTH 480 +std::string input_window = "Vector-CPU(No Vector)-ZC Pipeline - Input Video"; +std::string output_window = "Vector-CPU(No Vector)-ZC Pipeline - Edge Mapping"; + + +#ifdef MIDDLE + #define POSX_IN 640 + #define POSY_IN 0 + #define POSX_OUT 640 + #define POSY_OUT 540 + +#elif RIGHT + #define POSX_IN 1280 + #define POSY_IN 0 + #define POSX_OUT 1280 + #define POSY_OUT 540 + +#else // LEFT + #define POSX_IN 0 + #define POSY_IN 0 + #define POSX_OUT 0 + #define POSY_OUT 540 +#endif + + +//#define NUM_FRAMES 20 + + + +// Definitions of sizes for edge detection kernels + +#define MIN_BR 0.0f +#define MAX_BR 1.0f + +// Code needs to be changed for this to vary +#define SZB 3 + +#define REDUCTION_TILE_SZ 1024 + +#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y)) + +extern "C" { + +struct __attribute__((__packed__)) InStruct { + float* I ; + size_t bytesI; + float* Is ; + size_t bytesIs; + float* L; + size_t bytesL; + float* S; + size_t bytesS; + float* G; + size_t bytesG; + float* maxG; + size_t bytesMaxG; + float* E; + size_t bytesE; + float* Gs; + size_t bytesGs; + float* B; + size_t bytesB; + float* Sx; + size_t bytesSx; + float* Sy; + size_t bytesSy; + int m; + int n; + int block_x; + int grid_x; +}; + + +void packData(struct InStruct* args, float* I, size_t bytesI, + float* Is, size_t bytesIs, + float* L, size_t bytesL, + float* S, size_t bytesS, + float* G, size_t bytesG, + float* maxG, size_t bytesMaxG, + float* E, size_t bytesE, + float* Gs, size_t bytesGs, + float* B, size_t bytesB, + float* Sx, size_t bytesSx, + float* Sy, size_t bytesSy, + int m, int n, + int block_x, int grid_x) { + args->I = I; + args->bytesI = bytesI; + args->Is = Is; + args->bytesIs = bytesIs; + args->L = L; + args->bytesL = bytesL; + args->S = S; + args->bytesS = bytesS; + args->G = G; + args->bytesG = bytesG; + args->maxG = maxG; + args->bytesMaxG = bytesMaxG; + args->E = E; + args->bytesE = bytesE; + args->Gs = Gs; + args->bytesGs = bytesGs; + args->B = B; + args->bytesB = bytesB; + args->Sx = Sx; + args->bytesSx = bytesSx; + args->Sy = Sy; + args->bytesSy = bytesSy; + args->m = m; + args->n = n; + args->block_x = block_x; + args->grid_x = grid_x; +} + +/* + * Gaussian smoothing of image I of size m x n + * I : input image + * Gs : gaussian filter + * Is: output (smoothed image) + * m, n : dimensions + * + * Need 2D grid, a thread per pixel + * No use of separable algorithm because we need to do this in one kernel + * No use of shared memory because + * - we don't handle it in the X86 pass + */ + +#define GAUSSIAN_SIZE 7 +#define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2) +void gaussianSmoothing(float *I, size_t bytesI, + float *Gs, size_t bytesGs, + float *Is, size_t bytesIs, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(2, I, Gs, 1, Is); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + int gloc = gx + gy*n; + + float smoothedVal = 0; + float gval; + int loadOffset; + + if ((gx < n) && (gy < m)) { + for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++) + for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) { + + loadOffset = gloc + i*n + j; + + if ((gy + i) < 0) // top contour + loadOffset = gx + j; + else if ((gy + i) > m-1 ) // bottom contour + loadOffset = (m-1)*n + gx + j; + else + loadOffset = gloc + i*n + j; // within image vertically + + // Adjust so we are within image horizonally + if ((gx + j) < 0) // left contour + loadOffset -= (gx+j); + else if ((gx + j) > n-1 ) // right contour + loadOffset = loadOffset - gx - j + n - 1; + + gval = I[loadOffset]; + smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j]; + } + + Is[gloc] = smoothedVal; + } + __visc__return(m, n); +} + +void WrapperGaussianSmoothing(float *I, size_t bytesI, + float *Gs, size_t bytesGs, + float *Is, size_t bytesIs, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, I, Gs, 1, Is); + void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n); + __visc__bindIn(GSNode, 0, 0, 0); // Bind I + __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI + __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs + __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs + __visc__bindIn(GSNode, 4, 4, 0); // Bind Is + __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs + __visc__bindIn(GSNode, 6, 6, 0); // Bind m + __visc__bindIn(GSNode, 7, 7, 0); // Bind n + + __visc__bindOut(GSNode, 0, 0, 0); // bind output m + __visc__bindOut(GSNode, 1, 1, 0); // bind output n +} + + +/* Compute a non-linear laplacian estimate of input image I of size m x n */ +/* + * Is : blurred imput image + * m, n : dimensions + * B : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0]) + * L : output (laplacian of the image) + * Need 2D grid, a thread per pixel +*/ +void laplacianEstimate(float *Is, size_t bytesIs, + float *B, size_t bytesB, + float *L, size_t bytesL, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(2, Is, B, 1, L); + // 3x3 image area + float imageArea[SZB][SZB]; + + //int gx = get_global_id(0); + //int gy = get_global_id(1); + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + //if(gx == 0 && gy == 0) + //std::cout << "Entered laplacian\n"; + + int i, j; + + if ((gx < n) && (gy < m)) { + // Data copy for dilation filter + imageArea[1][1] = Is[gy * n + gx]; + + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; + } else { + imageArea[1][0] = Is[gy * n + gx - 1]; + imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR; + imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; + } else { + imageArea[1][2] = Is[gy * n + gx + 1]; + imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR; + imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR; + } + + imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR; + imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR; + + // Compute pixel of dilated image + float dilatedPixel = MIN_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + + // Data copy for erotion filter - only change the boundary conditions + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; + } else { + if (gy == 0) imageArea[0][0] = MAX_BR; + if (gy == m-1) imageArea[2][0] = MAX_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; + } else { + if (gy == 0) imageArea[0][2] = MAX_BR; + if (gy == m-1) imageArea[2][2] = MAX_BR; + } + + if (gy == 0) imageArea[0][1] = MAX_BR; + if (gy == m-1) imageArea[2][1] = MAX_BR; + + // Compute pixel of eroded image + float erodedPixel = MAX_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + + float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1]; + L[gy*n+gx] = laplacian; + } + //OutStruct output = {bytesB, bytesL}; + //if(gx == m-1 && gy == n-1) + //std::cout << "Exit laplacian\n"; + __visc__return(m); +} + +void WrapperlaplacianEstimate(float *Is, size_t bytesIs, + float *B, size_t bytesB, + float *L, size_t bytesL, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, Is, B, 1, L); + void* LNode = __visc__createNode2D(laplacianEstimate, m, n); + __visc__bindIn(LNode, 0, 0, 0); // Bind Is + __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs + __visc__bindIn(LNode, 2, 2, 0); // Bind B + __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(LNode, 4, 4, 0); // Bind L + __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL + __visc__bindIn(LNode, 6, 6, 0); // Bind m + __visc__bindIn(LNode, 7, 7, 0); // Bind n + + __visc__bindOut(LNode, 0, 0, 0); // bind output m + +} + +/* Compute the zero crossings of input image L of size m x n */ +/* + * L : imput image (computed Laplacian) + * m, n : dimensions + * B : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0]) + * S : output (sign of the image) + * Need 2D grid, a thread per pixel + */ +void computeZeroCrossings(float *L, size_t bytesL, + float *B, size_t bytesB, + float *S, size_t bytesS, + int m, int n) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); + + // 3x3 image area + float imageArea[SZB][SZB]; + + //int gx = get_global_id(0); + //int gy = get_global_id(1); + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + int i, j; + + //if(gx == 0 && gy == 0) + //std::cout << "Entered ZC\n"; + if ((gx < n) && (gy < m)) { + // Data copy for dilation filter + imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR; + + if (gx == 0) { // left most line + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; + } else { + imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR; + imageArea[0][0] = (gy > 0) ? + (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][0] = (gy < m - 1) ? + (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; + } else { + imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR; + imageArea[0][2] = (gy > 0) ? + (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][2] = (gy < m - 1) ? + (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + } + + imageArea[0][1] = (gy > 0) ? + (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + imageArea[2][1] = (gy < m - 1)? + (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR) + : MIN_BR; + + // Compute pixel of dilated image + float dilatedPixel = MIN_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + + // Data copy for erotion filter - only change the boundary conditions + if (gx == 0) { + imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; + } else { + if (gy == 0) imageArea[0][0] = MAX_BR; + if (gy == m-1) imageArea[2][0] = MAX_BR; + } + + if (gx == n - 1) { + imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; + } else { + if (gy == 0) imageArea[0][2] = MAX_BR; + if (gy == m-1) imageArea[2][2] = MAX_BR; + } + + if (gy == 0) imageArea[0][1] = MAX_BR; + if (gy == m-1) imageArea[2][1] = MAX_BR; + + // Compute pixel of eroded image + float erodedPixel = MAX_BR; + for (i = 0; i < SZB; i++) + for (j = 0; j < SZB; j++) + erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + + float pixelSign = dilatedPixel - erodedPixel; + S[gy*n+gx] = pixelSign; + } + //OutStruct output = {bytesB, bytesS}; + //if(gx == n-1 && gy == n-1) + //std::cout << "Exit ZC\n"; + __visc__return(m); +} + +void WrapperComputeZeroCrossings(float *L, size_t bytesL, + float *B, size_t bytesB, + float *S, size_t bytesS, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); + void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n); + __visc__bindIn(ZCNode, 0, 0, 0); // Bind L + __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL + __visc__bindIn(ZCNode, 2, 2, 0); // Bind B + __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(ZCNode, 4, 4, 0); // Bind S + __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS + __visc__bindIn(ZCNode, 6, 6, 0); // Bind m + __visc__bindIn(ZCNode, 7, 7, 0); // Bind n + + __visc__bindOut(ZCNode, 0, 0, 0); // bind output m + +} + +/* + * Gradient computation using Sobel filters + * Is : input (smoothed image) + * Sx, Sy: Sobel operators + * - Sx = [-1 0 1 ; -2 0 2 ; -1 0 1 ] + * - Sy = [-1 -2 -1 ; 0 0 0 ; 1 2 1 ] + * m, n : dimensions + * G: output, gradient magnitude : sqrt(Gx^2+Gy^2) + * Need 2D grid, a thread per pixel + * No use of separable algorithm because we need to do this in one kernel + * No use of shared memory because + * - we don't handle it in the X86 pass + */ + +#define SOBEL_SIZE 3 +#define SOBEL_RADIUS (SOBEL_SIZE / 2) + +void computeGradient(float *Is, size_t bytesIs, + float *Sx, size_t bytesSx, + float *Sy, size_t bytesSy, + float *G, size_t bytesG, + int m, int n) { + + __visc__hint(visc::DEVICE); + __visc__attributes(3, Is, Sx, Sy, 1, G); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + int gloc = gx + gy*n; + + float Gx = 0; + float Gy = 0; + float gval; + int loadOffset; + + if ((gx < n) && (gy < m)) { + for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++) + for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) { + + loadOffset = gloc + i*n + j; + + if ((gy + i) < 0) // top contour + loadOffset = gx + j; + else if ((gy + i) > m-1 ) // bottom contour + loadOffset = (m-1)*n + gx + j; + else + loadOffset = gloc + i*n + j; // within image vertically + + // Adjust so we are within image horizonally + if ((gx + j) < 0) // left contour + loadOffset -= (gx+j); + else if ((gx + j) > n-1 ) // right contour + loadOffset = loadOffset - gx - j + n - 1; + + gval = Is[loadOffset]; + Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; + Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; + } + + G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy); + //G[gloc] = Gx*Gx + Gy*Gy; + } + __visc__return(n); +} + +void WrapperComputeGradient(float *Is, size_t bytesIs, + float *Sx, size_t bytesSx, + float *Sy, size_t bytesSy, + float *G, size_t bytesG, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, Is, Sx, Sy, 1, G); + void* CGNode = __visc__createNode2D(computeGradient, m, n); + __visc__bindIn(CGNode, 0, 0, 0); // Bind Is + __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs + __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx + __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx + __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy + __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy + __visc__bindIn(CGNode, 6, 6, 0); // Bind G + __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG + __visc__bindIn(CGNode, 8, 8, 0); // Bind m + __visc__bindIn(CGNode, 9, 9, 0); // Bind n + + __visc__bindOut(CGNode, 0, 0, 0); // bind output m +} + +/* + * Reduction + * G : input + * maxG: output + * m, n: input size + * Needs a single thread block + */ +void computeMaxGradientLeaf(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n) { + + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(1, G, 1, maxG); + + void* thisNode = __visc__getNode(); + + int lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x + int dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x + + + // Assume a single thread block + // Thread block iterates over all elements + for (int i = lx + dimx; i < m*n; i+= dimx) { + if (G[lx] < G[i]) + G[lx] = G[i]; + } + + // First thread iterates over all elements of the thread block + if (lx == 0) { + for (int i = 1; (i < dimx) && (i < m*n); i++) + if (G[lx] < G[i]) + G[lx] = G[i]; + + *maxG = G[lx]; + } + + __visc__return(n); +} + +/* + * Reduction + * G : input + * maxG: output + * Each static node processes 2*nodeDim elements + * Need 1D grid, a thread per 2 pixels + */ +//void computeMaxGradientLeaf(float *G, size_t bytesG, + //float *maxG, size_t bytesMaxG, + //int m, int n) { + + //__visc__hint(visc::DEVICE); + //TODO: maxG should be initialized to zero (MIN_BR) every time + //__visc__attributes(2, G, maxG, 1, maxG); + + //void* thisNode = __visc__getNode(); + //void* parentNode = __visc__getParentNode(thisNode); + + //int lx = __visc__getNodeInstanceID_x(thisNode); + //int px = __visc__getNodeInstanceID_x(parentNode); + //int dimx = __visc__getNumNodeInstances_x(thisNode); + + //int gid = lx + 2*px*dimx; + + //for (unsigned stride = dimx; stride > 32; stride >>= 1) { + //if ((gid + stride < m*n) && (lx < stride)) + //if (G[gid + stride] > G[gid]) + //G[gid] = G[gid + stride]; + //__visc__barrier(); + //} + + //for (unsigned stride = 32; stride >= 1; stride >>= 1) { + //if ((gid + stride < m*n) && (lx < stride)) + //if (G[gid + stride] > G[gid]) + //G[gid] = G[gid + stride]; + //} + + //if (lx == 0) + //__visc__atomic_max(maxG,G[gid]); + + //__visc__return(m); +//} + +void computeMaxGradientTB(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n, + int block_x) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); + __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n + + __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m +} + +void WrapperComputeMaxGradient(float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + int m, int n, + int block_x, int grid_x) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x); + __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n + __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x + + __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m +} + +/* Reject the zero crossings where the gradient is below a threshold */ +/* + * S : input (computed zero crossings) + * m, n : dimensions + * G: gradient of (smoothed) image + * E : output (edges of the image) + * Need 2D grid, a thread per pixel + */ + +#define THETA 0.1 +void rejectZeroCrossings(float *S, size_t bytesS, + float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + float *E, size_t bytesE, + int m, int n) { + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(3, S, G, maxG, 1, E); + + void* thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); + + float mG = *maxG; + //float mG = 1.39203; + if ((gx < n) && (gy < m)) { + E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ; + } + __visc__return(m); +} + +void WrapperRejectZeroCrossings(float *S, size_t bytesS, + float *G, size_t bytesG, + float *maxG, size_t bytesMaxG, + float *E, size_t bytesE, + int m, int n) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, S, G, maxG, 1, E); + void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n); + __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S + __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS + __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G + __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG + __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG + __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG + __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E + __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE + __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m + __visc__bindIn(RZCNode, 9, 9, 0); // Bind n + + __visc__bindOut(RZCNode, 0, 0, 0); // bind output m +} + + + +// Pipelined Root node +void edgeDetection(float *I, size_t bytesI, // 0 + float *Is, size_t bytesIs, // 2 + float *L, size_t bytesL, // 4 + float *S, size_t bytesS, // 6 + float *G, size_t bytesG, // 8 + float *maxG, size_t bytesMaxG, // 10 + float *E, size_t bytesE, // 12 + float *Gs, size_t bytesGs, // 14 + float *B, size_t bytesB, // 16 + float *Sx, size_t bytesSx, // 18 + float *Sy, size_t bytesSy, // 20 + int m, // 22 + int n, // 23 + int block_x, // 24 + int grid_x // 25 + ) { + __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); + __visc__hint(visc::CPU_TARGET); + void* GSNode = __visc__createNode(WrapperGaussianSmoothing); + void* LNode = __visc__createNode(WrapperlaplacianEstimate); + void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings); + void* CGNode = __visc__createNode(WrapperComputeGradient); + void* CMGNode = __visc__createNode(WrapperComputeMaxGradient); + void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings); + + // Gaussian Inputs + __visc__bindIn(GSNode, 0 , 0, 1); // Bind I + __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI + __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs + __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs + __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is + __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs + __visc__bindIn(GSNode, 22, 6, 1); // Bind m + __visc__bindIn(GSNode, 23, 7, 1); // Bind n + + // Laplacian Inputs + __visc__bindIn(LNode, 2 , 0, 1); // Bind Is + __visc__bindIn(LNode, 3 , 1, 1); // Bind bytesIs + __visc__bindIn(LNode, 16, 2, 1); // Bind B + __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(LNode, 4 , 4, 1); // Bind L + __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL +// __visc__bindIn(LNode, 22, 6, 1); // Bind m + __visc__edge(GSNode, LNode, 0, 6, 1); // Get m + __visc__bindIn(LNode, 23, 7, 1); // Bind n + + // Compute ZC Inputs + __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L + __visc__bindIn(CZCNode, 5 , 1, 1); // Bind bytesL + __visc__bindIn(CZCNode, 16, 2, 1); // Bind B + __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S + __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS + //__visc__bindIn(CZCNode, 22, 6, 1); // Bind m + __visc__edge(LNode, CZCNode, 0, 6, 1); // Get m + __visc__bindIn(CZCNode, 23, 7, 1); // Bind n + + // Gradient Inputs + __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is + __visc__bindIn(CGNode, 3 , 1, 1); // Bind bytesIs + __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx + __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx + __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy + __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy + __visc__bindIn(CGNode, 8 , 6, 1); // Bind G + __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG + __visc__bindIn(CGNode, 22, 8, 1); // Bind m + //__visc__edge(CZCNode, CGNode, 0, 8, 1); // Get m + //__visc__bindIn(CGNode, 23, 9, 1); // Bind n + __visc__edge(GSNode, CGNode, 1, 9, 1); // Get n + + // Max Gradient Inputs + __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G + __visc__bindIn(CMGNode, 9 , 1, 1); // Bind bytesG + __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG + __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG + __visc__bindIn(CMGNode, 22, 4, 1); // Bind m + //__visc__edge(CGNode, CMGNode, 0, 4, 1); // Get m + //__visc__bindIn(CMGNode, 23, 5, 1); // Bind n + __visc__edge(CGNode, CMGNode, 0, 5, 1); // Get n + __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x + __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x + + // Reject ZC Inputs + __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S + __visc__bindIn(RZCNode, 7 , 1, 1); // Bind bytesS + __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G + __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG + __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG + __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesMaxG + __visc__bindIn(RZCNode, 12, 6, 1); // Bind E + __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE + //__visc__bindIn(RZCNode, 22, 8, 1); // Bind m + __visc__edge(CZCNode, RZCNode, 0, 8, 1); // Get m + //__visc__bindIn(RZCNode, 23, 9, 1); // Bind n + __visc__edge(CMGNode, RZCNode, 0, 9, 1); // Get n + + __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality +} + +} + +using namespace cv; + +void getNextFrame(VideoCapture& VC, Mat& F) { + VC >> F; + /// Convert the image to grayscale if image colored + if(F.channels() == 3) + cvtColor( F, F, CV_BGR2GRAY ); + + F.convertTo(F, CV_32F, 1.0/255.0); + +} + +//void showInOut(Mat& Input, Mat& Output) { + //Mat in, out; + //resize(Input, in, Size(512, 768)); + //resize(Output, out, Size(512, 768)); + //imshow(input_window, in); + //imshow(output_window, out); +//} + + +int main (int argc, char *argv[]) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + size_t I_sz; + int block_x, grid_x; + + std::cout << "Using OpenCV" << CV_VERSION << "\n"; + + /* Read command line. Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) + || (params->inpFiles[1] != NULL)) + { + fprintf(stderr, "Expecting input image filename\n"); + exit(-1); + } + + /* Read in data */ + std::cout << "Reading video file: " << params->inpFiles[0] << "\n"; + VideoCapture cap(params->inpFiles[0]); + if(!cap.isOpened()) { + std::cout << "Could not open video file" << "\n"; + return -1; + } + + int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT); + std::cout << "Number of frames = " << NUM_FRAMES << "\n"; + namedWindow(input_window, CV_WINDOW_AUTOSIZE); + namedWindow(output_window, CV_WINDOW_AUTOSIZE); + moveWindow(input_window, POSX_IN, POSY_IN); + moveWindow(output_window, POSX_OUT, POSY_OUT); + + Mat src, Is, L, S, G, E; + + getNextFrame(cap, src); + + std::cout << "Image dimension = " << src.size() << "\n"; + if(!src.isContinuous()) { + std::cout << "Expecting contiguous storage of image in memory!\n"; + exit(-1); + } + + Is = Mat(src.size[0], src.size[1], CV_32F); + L = Mat(src.size[0], src.size[1], CV_32F); + S = Mat(src.size[0], src.size[1], CV_32F); + G = Mat(src.size[0], src.size[1], CV_32F); + E = Mat(src.size[0], src.size[1], CV_32F); + + // All these matrices need to have their data array contiguous in memory + assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous()); + + pb_InitializeTimerSet(&timers); + __visc__init(); + + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // copy A to device memory + I_sz = src.size[0]*src.size[1]*sizeof(float); + + size_t bytesMaxG = sizeof(float); + float* maxG = (float*)malloc(bytesMaxG); + + float B[] = { 1, 1, 1, + 1, 1, 1, + 1, 1, 1 }; + size_t bytesB = 9*sizeof(float); + //Sx = [-1 0 1 ; -2 0 2 ; -1 0 1 ] + //Sy = [-1 -2 -1 ; 0 0 0 ; 1 2 1 ] + float Sx[] = { -1, 0, 1, + -2, 0, 2, + -1, 0, 1 }; + size_t bytesSx = 9*sizeof(float); + float Sy[] = { -1, -2, -1, + 0, 0, 0, + 1, 2, 1 }; + size_t bytesSy = 9*sizeof(float); + + float Gs [] = { + 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291, + 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, + 0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, + 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 }; + size_t bytesGs = 7*7*sizeof(float); + + block_x = 256; + // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs + grid_x = 1; + + // Copy A and B^T into device memory + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + //showInOut(src, E); + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(input_window, in); + imshow(output_window, out); + waitKey(0); + + //NUM_FRAMES = 20; + pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); + packData(args, (float*)src.data, I_sz, + (float*)Is.data, I_sz, + (float*)L.data, I_sz, + (float*)S.data, I_sz, + (float*)G.data, I_sz, + maxG, bytesMaxG, + (float*)E.data, I_sz, + Gs, bytesGs, + B, bytesB, + Sx, bytesSx, + Sy, bytesSy, + src.size[0], src.size[1], + block_x, grid_x); + + // Check if the total elements is a multiple of block size + assert(src.size[0]*src.size[1] % block_x == 0); + + //imshow(input_window, src); + //imshow(output_window, E); + //waitKey(0); + for(unsigned j=0; j<NUM_RUNS; j++) { + std::cout << "Run: " << j << "\n"; + void* DFG = __visc__launch(1, edgeDetection, (void*)args); + + cap = VideoCapture(params->inpFiles[0]); + getNextFrame(cap, src); + + //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements); + + if(NUM_FRAMES >=2) { + //__visc__push(DFG, args); + //__visc__push(DFG, args); + for(int i=0; i<NUM_FRAMES; i++) { + //std::cout << "Frame " << i << "\n"; + args->I = (float*) src.data; + + *maxG = 0.0; + + llvm_visc_track_mem(src.data, I_sz); + llvm_visc_track_mem(Is.data, I_sz); + llvm_visc_track_mem(L.data, I_sz); + llvm_visc_track_mem(S.data, I_sz); + llvm_visc_track_mem(G.data, I_sz); + llvm_visc_track_mem(maxG, bytesMaxG); + llvm_visc_track_mem(E.data, I_sz); + llvm_visc_track_mem(Gs, bytesGs); + llvm_visc_track_mem(B, bytesB); + llvm_visc_track_mem(Sx, bytesSx); + llvm_visc_track_mem(Sy, bytesSy); + + __visc__push(DFG, args); + __visc__pop(DFG); + + //llvm_visc_request_mem(E.data, I_sz); + //std::cout << "Show E" << "\n"; + //imshow(window_name, E); + //waitKey(0); + //llvm_visc_request_mem(src.data, I_sz); + //llvm_visc_request_mem(Is.data, I_sz); + //llvm_visc_request_mem(L.data, I_sz); + //llvm_visc_request_mem(S.data, I_sz); + //llvm_visc_request_mem(G.data, I_sz); + llvm_visc_request_mem(maxG, bytesMaxG); + llvm_visc_request_mem(E.data, I_sz); + //std::cout << "src.data = " << (float*)src.data << "\n"; + //std::cout << "Is.data = " << (float*)Is.data << "\n"; + //std::cout << "L.data = " << (float*)L.data << "\n"; + //std::cout << "S.data = " << (float*)S.data << "\n"; + //std::cout << "G.data = " << (float*)G.data << "\n"; + //std::cout << "E.data = " << (float*)E.data << "\n"; + //std::cout << "Max G = " << *maxG << "\n"; + + Mat in, out; + resize(src, in, Size(HEIGHT, WIDTH)); + //std::cout << "Show E\n"; + resize(E, out, Size(HEIGHT, WIDTH)); + imshow(output_window, out); + imshow(input_window, in); + waitKey(1); + //waitKey(0); + //std::cout << "Show Is\n"; + //resize(Is, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show L\n"; + //resize(L, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show S\n"; + //resize(S, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show G\n"; + //resize(G, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + + llvm_visc_untrack_mem(src.data); + llvm_visc_untrack_mem(Is.data); + llvm_visc_untrack_mem(L.data); + llvm_visc_untrack_mem(S.data); + llvm_visc_untrack_mem(G.data); + llvm_visc_untrack_mem(maxG); + llvm_visc_untrack_mem(E.data); + llvm_visc_untrack_mem(Gs); + llvm_visc_untrack_mem(B); + llvm_visc_untrack_mem(Sx); + llvm_visc_untrack_mem(Sy); + + getNextFrame(cap, src); + + } + //__visc__pop(DFG); + //__visc__pop(DFG); + } + else { + __visc__push(DFG, args); + __visc__pop(DFG); + } + + + __visc__wait(DFG); + } + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + + pb_PrintTimerSet(&timers); + __visc__cleanup(); + + //if (params->outFile) { + + /* Write C to file */ + //pb_SwitchToTimer(&timers, pb_TimerID_IO); + //writeColMajorMatrixFile(params->outFile, + //src.size[0], src.size[1], matE); + //} + + //std::cout << "Show Is" << "\n"; + //Mat output(src.size[0], src.size[1], CV_32F); + //imshow(output_window, Is); + //waitKey(0); + + //std::cout << "Show G" << "\n"; + //imshow(output_window, L); + //waitKey(0); + + //std::cout << "Show L" << "\n"; + //imshow(output_window, S); + //waitKey(0); + + //std::cout << "Show S" << "\n"; + //imshow(output_window, G); + //waitKey(0); + + //std::cout << "Show E" << "\n"; + //imshow(output_window, E); + //waitKey(0); + + //double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + //std::cout<< "GFLOPs = " << 2.* src.size[0] * src.size[1] * src.size[1]/GPUtime/1e9 << std::endl; + pb_FreeParameters(params); + + return 0; +} diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile @@ -4,8 +4,8 @@ LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include -APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include +APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib #OpenCV link flags all diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc index b1ff17bec86167e5b44ff3aa16a6af01f41dd8cc..9cbcf57b3d384fd96dacc7844bdff7412d8c24a3 100755 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc @@ -23,6 +23,39 @@ #include <parboil.h> #include <visc.h> + +#define NUM_RUNS 10000 +#define DEPTH 3 +#define HEIGHT 640 +#define WIDTH 480 +std::string input_window = "Vector Pipeline - Input Video"; +std::string output_window = "Vector Pipeline - Edge Mapping"; + + +#ifdef MIDDLE + #define POSX_IN 640 + #define POSY_IN 0 + #define POSX_OUT 640 + #define POSY_OUT 540 + +#elif RIGHT + #define POSX_IN 1280 + #define POSY_IN 0 + #define POSX_OUT 1280 + #define POSY_OUT 540 + +#else // LEFT + #define POSX_IN 0 + #define POSY_IN 0 + #define POSX_OUT 0 + #define POSY_OUT 540 +#endif + + +//#define NUM_FRAMES 20 + + + // Definitions of sizes for edge detection kernels #define MIN_BR 0.0f @@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::GPU_TARGET); __visc__attributes(2, Is, B, 1, L); // 3x3 image area float imageArea[SZB][SZB]; @@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL, float *S, size_t bytesS, int m, int n) { __visc__hint(visc::DEVICE); - //__visc__hint(visc::SPIR_TARGET); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, L, B, 1, S); // 3x3 image area @@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, int m, int n) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(1, G, 1, maxG); void* thisNode = __visc__getNode(); @@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG, int m, int n, int block_x) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G @@ -646,6 +680,7 @@ void rejectZeroCrossings(float *S, size_t bytesS, float *E, size_t bytesE, int m, int n) { __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); __visc__attributes(3, S, G, maxG, 1, E); void* thisNode = __visc__getNode(); @@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0 } } -#define NUM_RUNS 100 -#define DEPTH 3 -#define HEIGHT 640 -#define WIDTH 480 -std::string input_window = "Vector Pipeline - Input Video"; -std::string output_window = "Vector Pipeline - Edge Mapping"; -#define POSX_IN 640 -#define POSY_IN 0 -#define POSX_OUT 640 -#define POSY_OUT 540 -//#define NUM_FRAMES 20 using namespace cv; void getNextFrame(VideoCapture& VC, Mat& F) { @@ -826,8 +850,6 @@ int main (int argc, char *argv[]) { size_t I_sz; int block_x, grid_x; - std::string input_window = "Input Frame"; - std::string output_window = "Edge Map"; std::cout << "Using OpenCV" << CV_VERSION << "\n"; @@ -878,7 +900,7 @@ int main (int argc, char *argv[]) { pb_InitializeTimerSet(&timers); __visc__init(); - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); // copy A to device memory I_sz = src.size[0]*src.size[1]*sizeof(float); @@ -915,17 +937,19 @@ int main (int argc, char *argv[]) { grid_x = 1; // Copy A and B^T into device memory - pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); - pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); - struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); - Mat in, out; //showInOut(src, E); + Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); resize(E, out, Size(HEIGHT, WIDTH)); imshow(input_window, in); imshow(output_window, out); waitKey(0); + + //NUM_FRAMES = 20; + pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION ); + struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct)); packData(args, (float*)src.data, I_sz, (float*)Is.data, I_sz, (float*)L.data, I_sz, @@ -952,7 +976,9 @@ int main (int argc, char *argv[]) { cap = VideoCapture(params->inpFiles[0]); getNextFrame(cap, src); + //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements); + if(NUM_FRAMES >=2) { //__visc__push(DFG, args); //__visc__push(DFG, args); @@ -961,6 +987,7 @@ int main (int argc, char *argv[]) { args->I = (float*) src.data; *maxG = 0.0; + llvm_visc_track_mem(src.data, I_sz); llvm_visc_track_mem(Is.data, I_sz); llvm_visc_track_mem(L.data, I_sz); @@ -972,7 +999,7 @@ int main (int argc, char *argv[]) { llvm_visc_track_mem(B, bytesB); llvm_visc_track_mem(Sx, bytesSx); llvm_visc_track_mem(Sy, bytesSy); - + __visc__push(DFG, args); __visc__pop(DFG); @@ -987,15 +1014,38 @@ int main (int argc, char *argv[]) { //llvm_visc_request_mem(G.data, I_sz); llvm_visc_request_mem(maxG, bytesMaxG); llvm_visc_request_mem(E.data, I_sz); + //std::cout << "src.data = " << (float*)src.data << "\n"; + //std::cout << "Is.data = " << (float*)Is.data << "\n"; + //std::cout << "L.data = " << (float*)L.data << "\n"; + //std::cout << "S.data = " << (float*)S.data << "\n"; + //std::cout << "G.data = " << (float*)G.data << "\n"; + //std::cout << "E.data = " << (float*)E.data << "\n"; //std::cout << "Max G = " << *maxG << "\n"; Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); + //std::cout << "Show E\n"; resize(E, out, Size(HEIGHT, WIDTH)); imshow(output_window, out); imshow(input_window, in); waitKey(1); - getNextFrame(cap, src); + //waitKey(0); + //std::cout << "Show Is\n"; + //resize(Is, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show L\n"; + //resize(L, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show S\n"; + //resize(S, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); + //std::cout << "Show G\n"; + //resize(G, out, Size(HEIGHT, WIDTH)); + //imshow(output_window, out); + //waitKey(0); llvm_visc_untrack_mem(src.data); llvm_visc_untrack_mem(Is.data); @@ -1009,7 +1059,7 @@ int main (int argc, char *argv[]) { llvm_visc_untrack_mem(Sx); llvm_visc_untrack_mem(Sy); - + getNextFrame(cap, src); } //__visc__pop(DFG); @@ -1019,7 +1069,8 @@ int main (int argc, char *argv[]) { __visc__push(DFG, args); __visc__pop(DFG); } - + + __visc__wait(DFG); } diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc index 838d461e247df0f58e9c7d701e9714f4065048dd..5953021cf2fe8f27e0899464ca493bf47bef876f 100755 --- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc +++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc @@ -24,11 +24,12 @@ #include <visc.h> -#define NUM_RUNS 5 +#define NUM_RUNS 10000 #define DEPTH 3 #define HEIGHT 640 #define WIDTH 480 -std::string input_window = "GPU Pipeline - Input Video"; + +std::string input_window = "GPU Pipeline - Input Video"; std::string output_window = "GPU Pipeline - Edge Mapping";