diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index dd4cc894e4429251c6fbf8b339885fcecdc249d9..6926e6cfed65679b455152cffc311ae946dab314 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -141,15 +141,15 @@ private: Kernel* kernel; // VISC Runtime API - Constant* llvm_visc_ptx_launch; - Constant* llvm_visc_ptx_wait; - Constant* llvm_visc_ptx_initContext; - Constant* llvm_visc_ptx_clearContext; - Constant* llvm_visc_ptx_argument_scalar; - Constant* llvm_visc_ptx_argument_ptr; - Constant* llvm_visc_ptx_free; - Constant* llvm_visc_ptx_getOutput; - Constant* llvm_visc_ptx_executeNode; + Constant* llvm_visc_ocl_launch; + Constant* llvm_visc_ocl_wait; + Constant* llvm_visc_ocl_initContext; + Constant* llvm_visc_ocl_clearContext; + Constant* llvm_visc_ocl_argument_scalar; + Constant* llvm_visc_ocl_argument_ptr; + Constant* llvm_visc_ocl_free; + Constant* llvm_visc_ocl_getOutput; + Constant* llvm_visc_ocl_executeNode; //Functions std::string getKernelsModuleName(Module &M); @@ -239,15 +239,15 @@ void CGT_NVPTX::initRuntimeAPI() { DEBUG(errs() << "Successfully loaded visc-rt API module\n"); // Get or insert the global declarations for launch/wait functions - DECLARE(llvm_visc_ptx_launch); - DECLARE(llvm_visc_ptx_wait); - DECLARE(llvm_visc_ptx_initContext); - DECLARE(llvm_visc_ptx_clearContext); - DECLARE(llvm_visc_ptx_argument_scalar); - DECLARE(llvm_visc_ptx_argument_ptr); - DECLARE(llvm_visc_ptx_free); - DECLARE(llvm_visc_ptx_getOutput); - DECLARE(llvm_visc_ptx_executeNode); + DECLARE(llvm_visc_ocl_launch); + DECLARE(llvm_visc_ocl_wait); + DECLARE(llvm_visc_ocl_initContext); + DECLARE(llvm_visc_ocl_clearContext); + DECLARE(llvm_visc_ocl_argument_scalar); + DECLARE(llvm_visc_ocl_argument_ptr); + DECLARE(llvm_visc_ocl_free); + DECLARE(llvm_visc_ocl_getOutput); + DECLARE(llvm_visc_ocl_executeNode); // Get or insert timerAPI functions as well if you plan to use timers initTimerAPI(); @@ -260,7 +260,9 @@ void CGT_NVPTX::initRuntimeAPI() { InitCall = cast<Instruction>(*VI->use_begin()); initializeTimerSet(InitCall); switchToTimer(visc_TimerID_INIT_CTX, InitCall); - CallInst::Create(llvm_visc_ptx_initContext, None, "", InitCall); + CallInst::Create(llvm_visc_ocl_initContext, + ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)), + "", InitCall); switchToTimer(visc_TimerID_NONE, InitCall); // Insert print instruction at visc exit @@ -362,7 +364,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi Function* KF = K->KernelLeafNode->getFuncPointer(); // Initialize context //DEBUG(errs() << "Initializing context" << "\n"); - //CallInst::Create(llvm_visc_ptx_initContext, None, "", RI); + //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); DEBUG(errs() << "Initializing commandQ" << "\n"); // Initialize command queue @@ -375,7 +377,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi Value* LaunchInstArgs[] = {fileStr, kernelStr}; DEBUG(errs() << "Inserting launch call" << "\n"); - CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ptx_launch, + CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, ArrayRef<Value*>(LaunchInstArgs, 2), "graph"+KF->getName(), InitCall); @@ -444,7 +446,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi isInput, isOutput }; - Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr, + Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, ArrayRef<Value*>(setInputArgs, 6), "", RI); DevicePointers.push_back(d_ptr); // If this has out attribute, store the returned device pointer in @@ -469,7 +471,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi ConstantInt::get(Type::getInt32Ty(M.getContext()),i), ConstantExpr::getSizeOf(inputVal->getType()) }; - CallInst::Create(llvm_visc_ptx_argument_scalar, + CallInst::Create(llvm_visc_ocl_argument_scalar, ArrayRef<Value*>(setInputArgs, 4), "", RI); } @@ -492,7 +494,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi True }; - d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr, + d_Output = CallInst::Create(llvm_visc_ocl_argument_ptr, ArrayRef<Value*>(setOutputArgs, 6), "d_output."+KF->getName(), RI); @@ -512,14 +514,14 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi LocalWGPtr, GlobalWGPtr }; - CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode, + CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, ArrayRef<Value*>(ExecNodeArgs, 4), "event."+KF->getName(), RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); // Wait for Kernel to Finish - CallInst::Create(llvm_visc_ptx_wait, + CallInst::Create(llvm_visc_ocl_wait, ArrayRef<Value*>(GraphID), "", RI); @@ -532,7 +534,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi d_Output, outputSize }; - CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput, + CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "h_output."+KF->getName()+".addr", RI); @@ -554,7 +556,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; - CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput, + CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "", RI); }*/ @@ -563,12 +565,12 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi DEBUG(errs() << "Clearing context" << "\n"); // Free Device Memory for(auto d_ptr: DevicePointers) { - CallInst::Create(llvm_visc_ptx_free, ArrayRef<Value*>(d_ptr), "", RI); + CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); } switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); // Clear Context LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); - CallInst::Create(llvm_visc_ptx_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); + CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); switchToTimer(visc_TimerID_NONE, CleanupCall); switchToTimer(visc_TimerID_MISC, RI); @@ -660,12 +662,11 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // Keep track of the arguments order. std::vector<unsigned> inmap1 = N->getInArgMap(); std::vector<unsigned> inmap2 = kernel->getInArgMap(); - // TODO: Verify when we have incoming edges from more than one nodes - // The limit is the size of inmap2, because this is the number of kernel arguments - for (unsigned i = 0; i < inmap2.size(); i++) { - inmap2[i] = inmap1[inmap2[i]]; - } - kernel->setInArgMap(inmap2); + // TODO: Verify when we have incoming edges from more than one nodes The + // limit is the size of inmap2, because this is the number of kernel + // arguments + for (unsigned i = 0; i < inmap2.size(); i++) { inmap2[i] = + inmap1[inmap2[i]]; } kernel->setInArgMap(inmap2); // Keep track of the output arguments order. std::vector<unsigned> outmap1 = N->getOutArgMap(); @@ -931,10 +932,10 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { case Intrinsic::visc_getNumNodeInstances_x: case Intrinsic::visc_getNumNodeInstances_y: case Intrinsic::visc_getNumNodeInstances_z: { -//TODO: think about whether this is the best way to go -// there are hw specific registers. therefore it is good to have the intrinsic -// but then, why do we need to keep that info in the graph? -// (only for the kernel configuration during the call) + // TODO: think about whether this is the best way to go there are hw + // specific registers. therefore it is good to have the intrinsic but + // then, why do we need to keep that info in the graph? (only for the + // kernel configuration during the call) DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); diff --git a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp index b29274d4e3a53b8745e7a6b916b303ff4441e1f6..a05fe6080262ccf4657650c6169900ac4e35d6f9 100644 --- a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/Attributes.h" +#include "llvm/SupportVISC/VISCHint.h" #include "llvm/SupportVISC/VISCTimer.h" #include "llvm/SupportVISC/DFG2LLVM.h" @@ -144,15 +145,15 @@ private: Kernel* kernel; // VISC Runtime API - Constant* llvm_visc_ptx_launch; - Constant* llvm_visc_ptx_wait; - Constant* llvm_visc_ptx_initContext; - Constant* llvm_visc_ptx_clearContext; - Constant* llvm_visc_ptx_argument_scalar; - Constant* llvm_visc_ptx_argument_ptr; - Constant* llvm_visc_ptx_free; - Constant* llvm_visc_ptx_getOutput; - Constant* llvm_visc_ptx_executeNode; + Constant* llvm_visc_ocl_launch; + Constant* llvm_visc_ocl_wait; + Constant* llvm_visc_ocl_initContext; + Constant* llvm_visc_ocl_clearContext; + Constant* llvm_visc_ocl_argument_scalar; + Constant* llvm_visc_ocl_argument_ptr; + Constant* llvm_visc_ocl_free; + Constant* llvm_visc_ocl_getOutput; + Constant* llvm_visc_ocl_executeNode; //Functions std::string getKernelsModuleName(Module &M); @@ -243,15 +244,15 @@ void CGT_SPIR::initRuntimeAPI() { DEBUG(errs() << "Successfully loaded visc-rt API module\n"); // Get or insert the global declarations for launch/wait functions - DECLARE(llvm_visc_ptx_launch); - DECLARE(llvm_visc_ptx_wait); - DECLARE(llvm_visc_ptx_initContext); - DECLARE(llvm_visc_ptx_clearContext); - DECLARE(llvm_visc_ptx_argument_scalar); - DECLARE(llvm_visc_ptx_argument_ptr); - DECLARE(llvm_visc_ptx_free); - DECLARE(llvm_visc_ptx_getOutput); - DECLARE(llvm_visc_ptx_executeNode); + DECLARE(llvm_visc_ocl_launch); + DECLARE(llvm_visc_ocl_wait); + DECLARE(llvm_visc_ocl_initContext); + DECLARE(llvm_visc_ocl_clearContext); + DECLARE(llvm_visc_ocl_argument_scalar); + DECLARE(llvm_visc_ocl_argument_ptr); + DECLARE(llvm_visc_ocl_free); + DECLARE(llvm_visc_ocl_getOutput); + DECLARE(llvm_visc_ocl_executeNode); // Get or insert timerAPI functions as well if you plan to use timers initTimerAPI(); @@ -264,7 +265,9 @@ void CGT_SPIR::initRuntimeAPI() { InitCall = cast<Instruction>(*VI->use_begin()); initializeTimerSet(InitCall); switchToTimer(visc_TimerID_INIT_CTX, InitCall); - CallInst::Create(llvm_visc_ptx_initContext, None, "", InitCall); + CallInst::Create(llvm_visc_ocl_initContext, + ArrayRef<Value*>(getTargetID(M, visc::SPIR_TARGET)), + "", InitCall); switchToTimer(visc_TimerID_NONE, InitCall); // Insert print instruction at visc exit @@ -337,13 +340,13 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil if(!N->isRoot()) addIdxDimArgs(F_X86); - /* TODO: Use this code to verufy if this is a good pattern for PTX kernel + /* TODO: Use this code to verufy if this is a good pattern for OCL kernel // Sort children in topological order before code generation for kernel call N->getChildGraph()->sortChildren(); // The DFNode N has the property that it has only one child (leaving Entry - // and Exit dummy nodes). This child is the PTX kernel. This simplifies code + // and Exit dummy nodes). This child is the OCL kernel. This simplifies code // generation for kernel calls significantly. All the inputs to this child // node would either be constants or from the parent node N. @@ -366,7 +369,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil Function* KF = K->KernelLeafNode->getFuncPointer(); // Initialize context //DEBUG(errs() << "Initializing context" << "\n"); - //CallInst::Create(llvm_visc_ptx_initContext, None, "", RI); + //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); DEBUG(errs() << "Initializing commandQ" << "\n"); // Initialize command queue @@ -379,7 +382,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil Value* LaunchInstArgs[] = {fileStr, kernelStr}; DEBUG(errs() << "Inserting launch call" << "\n"); - CallInst* SPIR_Ctx = CallInst::Create(llvm_visc_ptx_launch, + CallInst* SPIR_Ctx = CallInst::Create(llvm_visc_ocl_launch, ArrayRef<Value*>(LaunchInstArgs, 2), "graph"+KF->getName(), InitCall); @@ -448,7 +451,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil isInput, isOutput }; - Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr, + Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, ArrayRef<Value*>(setInputArgs, 6), "", RI); DevicePointers.push_back(d_ptr); // If this has out attribute, store the returned device pointer in @@ -473,7 +476,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil ConstantInt::get(Type::getInt32Ty(M.getContext()),i), ConstantExpr::getSizeOf(inputVal->getType()) }; - CallInst::Create(llvm_visc_ptx_argument_scalar, + CallInst::Create(llvm_visc_ocl_argument_scalar, ArrayRef<Value*>(setInputArgs, 4), "", RI); } @@ -496,7 +499,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil True }; - d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr, + d_Output = CallInst::Create(llvm_visc_ocl_argument_ptr, ArrayRef<Value*>(setOutputArgs, 6), "d_output."+KF->getName(), RI); @@ -516,14 +519,14 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil LocalWGPtr, GlobalWGPtr }; - CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode, + CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, ArrayRef<Value*>(ExecNodeArgs, 4), "event."+KF->getName(), RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); // Wait for Kernel to Finish - CallInst::Create(llvm_visc_ptx_wait, + CallInst::Create(llvm_visc_ocl_wait, ArrayRef<Value*>(GraphID), "", RI); @@ -536,7 +539,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil d_Output, outputSize }; - CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput, + CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "h_output."+KF->getName()+".addr", RI); @@ -558,7 +561,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; - CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput, + CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "", RI); }*/ @@ -567,12 +570,12 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil DEBUG(errs() << "Clearing context" << "\n"); // Free Device Memory for(auto d_ptr: DevicePointers) { - CallInst::Create(llvm_visc_ptx_free, ArrayRef<Value*>(d_ptr), "", RI); + CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); } switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); // Clear Context LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); - CallInst::Create(llvm_visc_ptx_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); + CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); switchToTimer(visc_TimerID_NONE, CleanupCall); switchToTimer(visc_TimerID_MISC, RI); @@ -769,33 +772,33 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { // Look up if we have visited this function before. If we have, then just // get the cloned function pointer from DFNode. Otherwise, create the cloned // function and add it to the DFNode GenFunc. - Function *F_nvptx = N->getGenFunc(); - if(F_nvptx == NULL) { + Function *F_spir = N->getGenFunc(); + if(F_spir == NULL) { // Clone the function ValueToValueMapTy VMap; - F_nvptx = CloneFunction(F, VMap, true); + F_spir = CloneFunction(F, VMap, true); // Insert the cloned function into the kernels module - KernelM.getFunctionList().push_back(F_nvptx); + KernelM.getFunctionList().push_back(F_spir); - //TODO: Iterate over all the instructions of F_nvptx and identify the + //TODO: Iterate over all the instructions of F_spir and identify the //callees and clone them into this module. - DEBUG(errs() << *F_nvptx->getType()); - DEBUG(errs() << *F_nvptx); + DEBUG(errs() << *F_spir->getType()); + DEBUG(errs() << *F_spir); //Add generated function info to DFNode - N->setGenFunc(F_nvptx, visc::GPU_TARGET); + N->setGenFunc(F_spir, visc::SPIR_TARGET); } else { errs() << "WARNING: Visiting a node for which code already generated!\n"; } - transformFunctionToVoid(F_nvptx); - removeInOutAttributes(F_nvptx); + transformFunctionToVoid(F_spir); + removeInOutAttributes(F_spir); // Go through all the instructions - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + for (inst_iterator i = inst_begin(F_spir), e = inst_end(F_spir); i != e; ++i) { Instruction *I = &(*i); // Leaf nodes should not contain VISC graph intrinsics or launch assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); @@ -811,7 +814,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { switch (II->getIntrinsicID()) { /**************************** llvm.visc.getNode() *****************************/ case Intrinsic::visc_getNode: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); + DEBUG(errs() << F_spir->getName() << "\t: Handling getNode\n"); // add mapping <intrinsic, this node> to the node-specific map Leaf_HandleToDFNodeMap[II] = N; IItoRemove.push_back(II); @@ -819,7 +822,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { break; /************************* llvm.visc.getParentNode() **************************/ case Intrinsic::visc_getParentNode: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); + DEBUG(errs() << F_spir->getName() << "\t: Handling getParentNode\n"); // get the parent node of the arg node // get argument node ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); @@ -835,7 +838,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { break; /*************************** llvm.visc.getNumDims() ***************************/ case Intrinsic::visc_getNumDims: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); + DEBUG(errs() << F_spir->getName() << "\t: Handling getNumDims\n"); // get node from map // get the appropriate field ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); @@ -856,7 +859,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { case Intrinsic::visc_getNodeInstanceID_x: case Intrinsic::visc_getNodeInstanceID_y: case Intrinsic::visc_getNodeInstanceID_z: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n"); + DEBUG(errs() << F_spir->getName() << "\t: Handling getNodeInstanceID\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; assert(ArgDFNode && "Arg node is NULL"); @@ -936,7 +939,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { // but then, why do we need to keep that info in the graph? // (only for the kernel configuration during the call) - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); + DEBUG(errs() << F_spir->getName() << "\t: Handling getNumNodeInstances\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; // A leaf node always has a parent @@ -1026,8 +1029,8 @@ void CGT_SPIR::codeGen(DFLeafNode* N) { re = IItoRemove.rend(); ri != re; ++ri) (*ri)->eraseFromParent(); - addCLMetadata(F_nvptx); - kernel->KernelFunction = F_nvptx; + addCLMetadata(F_spir); + kernel->KernelFunction = F_spir; errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; DEBUG(errs() << KernelM); diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index 6170edf20bdd0ac76268fc666a3d08ae7a134372..f713105b917ecb71b3f3eacf9fe8b9fb6ba58c41 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -30,7 +30,7 @@ typedef struct { cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; -} DFNodeContext_PTX; +} DFNodeContext_OCL; cl_context globalGPUContext; cl_command_queue globalCommandQue; @@ -106,8 +106,8 @@ void llvm_visc_untrack_mem(void* ptr) { } -static void* llvm_visc_ptx_request_mem(void* ptr, size_t size, DFNodeContext_PTX* Context, bool isInput, bool isOutput) { - DEBUG(cout << "[PTX] Request memory: " << ptr << " for context: " << Context->clGPUContext << "\n"); +static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) { + DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clGPUContext << "\n"); MemTrackerEntry* MTE = MTracker.lookup(ptr); if (MTE == NULL) { MTracker.print(); @@ -116,7 +116,7 @@ static void* llvm_visc_ptx_request_mem(void* ptr, size_t size, DFNodeContext_PTX } // If already on device if (MTE->getLocation() == MemTrackerEntry::DEVICE && - ((DFNodeContext_PTX*)MTE->getContext())->clGPUContext == Context->clGPUContext) { + ((DFNodeContext_OCL*)MTE->getContext())->clGPUContext == Context->clGPUContext) { DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << "\n"); return MTE->getAddress(); } @@ -159,7 +159,7 @@ void* llvm_visc_request_mem(void* ptr, size_t size) { // Else copy from device and update table DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << "\n"); DEBUG(cout << "\tCopying ..."); - cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_PTX*)MTE->getContext())->clCommandQue, + cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue, (cl_mem) MTE->getAddress(), CL_TRUE, 0, @@ -951,7 +951,9 @@ void llvm_visc_x86_wait(void* graphID) { DEBUG(cout << "\t... pthread Done!\n"); } -void* llvm_visc_ptx_initContext() { +void* llvm_visc_ocl_initContext(enum visc::Target T) { + DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR"); + DEBUG(cout << "Initializing Context for " << Target << " device\n"); cl_uint numPlatforms; cl_int errcode; errcode = clGetPlatformIDs(0, NULL, &numPlatforms); @@ -985,9 +987,11 @@ void* llvm_visc_ptx_initContext() { assert(numPlatforms >= 2 && "Expecting two OpenCL platforms"); // Choose second one which is X86 AVX cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, - (long) platforms[1], + (long) platforms[T == visc::GPU_TARGET? 0 : 1], 0}; - globalGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_CPU, + globalGPUContext = clCreateContextFromType(properties, + T == visc::GPU_TARGET? + CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL, NULL, &errcode); free(platforms); DEBUG(cout << "\tContext " << globalGPUContext << "\n"); @@ -996,9 +1000,9 @@ void* llvm_visc_ptx_initContext() { return globalGPUContext; } -void llvm_visc_ptx_clearContext(void* graphID) { +void llvm_visc_ocl_clearContext(void* graphID) { DEBUG(cout << "Clear Context\n"); - DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; // FIXME: Have separate function to release command queue and clear context. // Would be useful when a context has multiple command queues clReleaseKernel(Context->clKernel); @@ -1007,30 +1011,30 @@ void llvm_visc_ptx_clearContext(void* graphID) { //clReleaseContext(globalGPUContext); //DEBUG(cout << "Released context at: " << globalGPUContext); free(Context); - DEBUG(cout << "Done with PTX kernel\n"); + DEBUG(cout << "Done with OCL kernel\n"); } -void llvm_visc_ptx_argument_scalar(void* graphID, void* input, int arg_index, size_t size) { +void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) { DEBUG(cout << "Set Scalar Input:"); DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << "\n"); - DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; DEBUG(cout << "Using Context: " << Context << "\n"); DEBUG(cout << "Using clKernel: " << Context->clKernel << "\n"); cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); } -void* llvm_visc_ptx_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) { +void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) { DEBUG(cout << "Set Pointer Input:"); DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << "\n"); // Size should be non-zero assert(size != 0 && "Size of data pointed to has to be non-zero!"); DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << "\n"); - DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; // Check with runtime the location of this memory - cl_mem d_input = (cl_mem) llvm_visc_ptx_request_mem(input, size, Context, isInput, isOutput); + cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput); // Set Kernel Argument cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); @@ -1039,25 +1043,25 @@ void* llvm_visc_ptx_argument_ptr(void* graphID, void* input, int arg_index, size return d_input; } -void llvm_visc_ptx_free(void* ptr) { +void llvm_visc_ocl_free(void* ptr) { DEBUG(cout << "Release Device Pointer: " << ptr << "\n"); //cl_mem d_ptr = (cl_mem) ptr; //clReleaseMemObject(d_ptr); } -void* llvm_visc_ptx_getOutput(void* graphID, void* h_output, void* d_output, size_t size) { +void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) { DEBUG(cout << "Get Output:\n"); DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << "\n"); if(h_output == NULL) h_output = malloc(size); - DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, h_output, 0, NULL, NULL); checkErr(errcode, CL_SUCCESS, "Failure to read output"); return h_output; } -void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* +void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) { size_t GlobalWG[3]; @@ -1077,7 +1081,7 @@ void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* } } - DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; // TODO: Would like to use event to ensure better scheduling of kernels. // Currently passing the event paratemeter results in seg fault with // clEnqueueNDRangeKernel. @@ -1152,12 +1156,12 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength) return cSourceString; } -void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { - DEBUG(cout << "Launch PTX Kernel\n"); +void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) { + DEBUG(cout << "Launch OCL Kernel\n"); // Initialize OpenCL // OpenCL specific variables - DFNodeContext_PTX *Context = (DFNodeContext_PTX *) malloc(sizeof(DFNodeContext_PTX)); + DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL)); size_t dataBytes; size_t kernelLength; @@ -1222,9 +1226,9 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { } -void llvm_visc_ptx_wait(void* graphID) { +void llvm_visc_ocl_wait(void* graphID) { DEBUG(cout << "Wait\n"); - DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID; + DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID; clFinish(Context->clCommandQue); } diff --git a/llvm/projects/visc-rt/visc-rt.h b/llvm/projects/visc-rt/visc-rt.h index db999bd6c5bd91725ac7ba979efbb1b5f9fca847..799e3618b9eccd5ec84e6ef1a345dd7fd20ff4ab 100644 --- a/llvm/projects/visc-rt/visc-rt.h +++ b/llvm/projects/visc-rt/visc-rt.h @@ -1,4 +1,5 @@ /* + * * (c) 2010 The Board of Trustees of the University of Illinois. */ #ifndef VISC_RT_HEADER @@ -8,6 +9,7 @@ #include <map> #include <ctime> #include "llvm/Support/CommandLine.h" +#include "llvm/SupportVISC/VISCHint.h" #include "llvm/SupportVISC/VISCTimer.h" using namespace std; @@ -157,16 +159,16 @@ void* llvm_visc_request_mem(void*, size_t); /*********************** OPENCL & PTHREAD API **************************/ void* llvm_visc_x86_launch(void* (void*), void*); void llvm_visc_x86_wait(void*); -void* llvm_visc_ptx_initContext(); - -void llvm_visc_ptx_clearContext(void*); -void llvm_visc_ptx_argument_scalar(void*, void*, int, size_t); -void* llvm_visc_ptx_argument_ptr(void*, void*, int, size_t, bool, bool); -void llvm_visc_ptx_free(void*); -void* llvm_visc_ptx_getOutput(void*, void*, void*, size_t); -void* llvm_visc_ptx_executeNode(void*, unsigned, const size_t*, const size_t*); -void* llvm_visc_ptx_launch(const char*, const char*); -void llvm_visc_ptx_wait(void*); +void* llvm_visc_ocl_initContext(enum visc::Target); + +void llvm_visc_ocl_clearContext(void*); +void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t); +void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool); +void llvm_visc_ocl_free(void*); +void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t); +void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*); +void* llvm_visc_ocl_launch(const char*, const char*); +void llvm_visc_ocl_wait(void*); void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID); void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL);